.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Copyright 2018 Google Inc. |
---|
3 | 4 | * Author: Eric Dumazet (edumazet@google.com) |
---|
.. | .. |
---|
44 | 45 | * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches |
---|
45 | 46 | * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit |
---|
46 | 47 | * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches |
---|
47 | | - * |
---|
48 | | - * License (GPLv2): |
---|
49 | | - * |
---|
50 | | - * This program is free software; you can redistribute it and/or modify it |
---|
51 | | - * under the terms and conditions of the GNU General Public License, |
---|
52 | | - * version 2, as published by the Free Software Foundation. |
---|
53 | | - * |
---|
54 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
---|
55 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
---|
56 | | - * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for |
---|
57 | | - * more details. |
---|
58 | | - * |
---|
59 | | - * You should have received a copy of the GNU General Public License along with |
---|
60 | | - * this program; if not, write to the Free Software Foundation, Inc., |
---|
61 | | - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. |
---|
62 | 48 | */ |
---|
63 | 49 | #define _GNU_SOURCE |
---|
64 | 50 | #include <pthread.h> |
---|
.. | .. |
---|
85 | 71 | #define MSG_ZEROCOPY 0x4000000 |
---|
86 | 72 | #endif |
---|
87 | 73 | |
---|
88 | | -#define FILE_SZ (1UL << 35) |
---|
| 74 | +#define FILE_SZ (1ULL << 35) |
---|
89 | 75 | static int cfg_family = AF_INET6; |
---|
90 | 76 | static socklen_t cfg_alen = sizeof(struct sockaddr_in6); |
---|
91 | 77 | static int cfg_port = 8787; |
---|
.. | .. |
---|
96 | 82 | static int xflg; /* hash received data (simple xor) (-h option) */ |
---|
97 | 83 | static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */ |
---|
98 | 84 | |
---|
99 | | -static int chunk_size = 512*1024; |
---|
| 85 | +static size_t chunk_size = 512*1024; |
---|
| 86 | + |
---|
| 87 | +static size_t map_align; |
---|
100 | 88 | |
---|
101 | 89 | unsigned long htotal; |
---|
102 | 90 | |
---|
.. | .. |
---|
132 | 120 | htotal = temp; |
---|
133 | 121 | } |
---|
134 | 122 | |
---|
/*
 * ALIGN_UP(x, align_to): round x up to the next multiple of align_to.
 * The mask arithmetic is only correct when align_to is a non-zero power
 * of two.  align_to is expanded twice, so do not pass an expression with
 * side effects.
 *
 * ALIGN_PTR_UP(p, ptr_align_to): apply the same rounding to the address
 * held in pointer p; the result has the type of p.  __typeof__ is spelled
 * out (rather than typeof) so the macro also builds in strict ISO modes
 * such as -std=c11, where typeof is not a keyword.
 */
#define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
#define ALIGN_PTR_UP(p, ptr_align_to)	\
	((__typeof__(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
---|
| 125 | + |
---|
| 126 | + |
---|
| 127 | +static void *mmap_large_buffer(size_t need, size_t *allocated) |
---|
| 128 | +{ |
---|
| 129 | + void *buffer; |
---|
| 130 | + size_t sz; |
---|
| 131 | + |
---|
| 132 | + /* Attempt to use huge pages if possible. */ |
---|
| 133 | + sz = ALIGN_UP(need, map_align); |
---|
| 134 | + buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE, |
---|
| 135 | + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); |
---|
| 136 | + |
---|
| 137 | + if (buffer == (void *)-1) { |
---|
| 138 | + sz = need; |
---|
| 139 | + buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE, |
---|
| 140 | + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
---|
| 141 | + if (buffer != (void *)-1) |
---|
| 142 | + fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n"); |
---|
| 143 | + } |
---|
| 144 | + *allocated = sz; |
---|
| 145 | + return buffer; |
---|
| 146 | +} |
---|
| 147 | + |
---|
135 | 148 | void *child_thread(void *arg) |
---|
136 | 149 | { |
---|
137 | 150 | unsigned long total_mmap = 0, total = 0; |
---|
.. | .. |
---|
140 | 153 | int flags = MAP_SHARED; |
---|
141 | 154 | struct timeval t0, t1; |
---|
142 | 155 | char *buffer = NULL; |
---|
| 156 | + void *raddr = NULL; |
---|
143 | 157 | void *addr = NULL; |
---|
144 | 158 | double throughput; |
---|
145 | 159 | struct rusage ru; |
---|
| 160 | + size_t buffer_sz; |
---|
146 | 161 | int lu, fd; |
---|
147 | 162 | |
---|
148 | 163 | fd = (int)(unsigned long)arg; |
---|
.. | .. |
---|
150 | 165 | gettimeofday(&t0, NULL); |
---|
151 | 166 | |
---|
152 | 167 | fcntl(fd, F_SETFL, O_NDELAY); |
---|
153 | | - buffer = malloc(chunk_size); |
---|
154 | | - if (!buffer) { |
---|
155 | | - perror("malloc"); |
---|
| 168 | + buffer = mmap_large_buffer(chunk_size, &buffer_sz); |
---|
| 169 | + if (buffer == (void *)-1) { |
---|
| 170 | + perror("mmap"); |
---|
156 | 171 | goto error; |
---|
157 | 172 | } |
---|
158 | 173 | if (zflg) { |
---|
159 | | - addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0); |
---|
160 | | - if (addr == (void *)-1) |
---|
| 174 | + raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0); |
---|
| 175 | + if (raddr == (void *)-1) { |
---|
| 176 | + perror("mmap"); |
---|
161 | 177 | zflg = 0; |
---|
| 178 | + } else { |
---|
| 179 | + addr = ALIGN_PTR_UP(raddr, map_align); |
---|
| 180 | + } |
---|
162 | 181 | } |
---|
163 | 182 | while (1) { |
---|
164 | 183 | struct pollfd pfd = { .fd = fd, .events = POLLIN, }; |
---|
.. | .. |
---|
169 | 188 | socklen_t zc_len = sizeof(zc); |
---|
170 | 189 | int res; |
---|
171 | 190 | |
---|
172 | | - zc.address = (__u64)addr; |
---|
| 191 | + memset(&zc, 0, sizeof(zc)); |
---|
| 192 | + zc.address = (__u64)((unsigned long)addr); |
---|
173 | 193 | zc.length = chunk_size; |
---|
174 | | - zc.recv_skip_hint = 0; |
---|
| 194 | + |
---|
175 | 195 | res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, |
---|
176 | 196 | &zc, &zc_len); |
---|
177 | 197 | if (res == -1) |
---|
.. | .. |
---|
182 | 202 | total_mmap += zc.length; |
---|
183 | 203 | if (xflg) |
---|
184 | 204 | hash_zone(addr, zc.length); |
---|
| 205 | + /* It is more efficient to unmap the pages right now, |
---|
| 206 | + * instead of doing this in next TCP_ZEROCOPY_RECEIVE. |
---|
| 207 | + */ |
---|
| 208 | + madvise(addr, zc.length, MADV_DONTNEED); |
---|
185 | 209 | total += zc.length; |
---|
186 | 210 | } |
---|
187 | 211 | if (zc.recv_skip_hint) { |
---|
.. | .. |
---|
233 | 257 | ru.ru_nvcsw); |
---|
234 | 258 | } |
---|
235 | 259 | error: |
---|
236 | | - free(buffer); |
---|
| 260 | + munmap(buffer, buffer_sz); |
---|
237 | 261 | close(fd); |
---|
238 | 262 | if (zflg) |
---|
239 | | - munmap(addr, chunk_size); |
---|
| 263 | + munmap(raddr, chunk_size + map_align); |
---|
240 | 264 | pthread_exit(0); |
---|
241 | 265 | } |
---|
242 | 266 | |
---|
.. | .. |
---|
284 | 308 | |
---|
285 | 309 | static void do_accept(int fdlisten) |
---|
286 | 310 | { |
---|
| 311 | + pthread_attr_t attr; |
---|
| 312 | + int rcvlowat; |
---|
| 313 | + |
---|
| 314 | + pthread_attr_init(&attr); |
---|
| 315 | + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); |
---|
| 316 | + |
---|
| 317 | + rcvlowat = chunk_size; |
---|
287 | 318 | if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, |
---|
288 | | - &chunk_size, sizeof(chunk_size)) == -1) { |
---|
| 319 | + &rcvlowat, sizeof(rcvlowat)) == -1) { |
---|
289 | 320 | perror("setsockopt SO_RCVLOWAT"); |
---|
290 | 321 | } |
---|
291 | 322 | |
---|
.. | .. |
---|
302 | 333 | perror("accept"); |
---|
303 | 334 | continue; |
---|
304 | 335 | } |
---|
305 | | - res = pthread_create(&th, NULL, child_thread, |
---|
| 336 | + res = pthread_create(&th, &attr, child_thread, |
---|
306 | 337 | (void *)(unsigned long)fd); |
---|
307 | 338 | if (res) { |
---|
308 | 339 | errno = res; |
---|
.. | .. |
---|
312 | 343 | } |
---|
313 | 344 | } |
---|
314 | 345 | |
---|
/* Each thread should reserve a big enough vma to avoid
 * spinlock collisions in ptl locks.
 * This size is 2MB on x86_64, and is exported in /proc/meminfo.
 *
 * Scans /proc/meminfo for the "Hugepagesize:" line and returns its value
 * converted from kB to bytes.  Returns 0 when the file cannot be opened
 * or no such line is found.
 */
static unsigned long default_huge_page_size(void)
{
	unsigned long bytes = 0;
	unsigned long kb = 0;
	char *text = NULL;
	size_t cap = 0;
	FILE *meminfo;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&text, &cap, meminfo) <= 0)
			break;
		if (sscanf(text, "Hugepagesize: %lu kB", &kb) != 1)
			continue;
		/* The value is reported in kB; convert to bytes. */
		bytes = kb << 10;
		break;
	}

	free(text);
	fclose(meminfo);
	return bytes;
}
---|
| 369 | + |
---|
315 | 370 | int main(int argc, char *argv[]) |
---|
316 | 371 | { |
---|
317 | 372 | struct sockaddr_storage listenaddr, addr; |
---|
318 | 373 | unsigned int max_pacing_rate = 0; |
---|
319 | | - unsigned long total = 0; |
---|
| 374 | + uint64_t total = 0; |
---|
320 | 375 | char *host = NULL; |
---|
321 | 376 | int fd, c, on = 1; |
---|
| 377 | + size_t buffer_sz; |
---|
322 | 378 | char *buffer; |
---|
323 | 379 | int sflg = 0; |
---|
324 | 380 | int mss = 0; |
---|
325 | 381 | |
---|
326 | | - while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) { |
---|
| 382 | + while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) { |
---|
327 | 383 | switch (c) { |
---|
328 | 384 | case '4': |
---|
329 | 385 | cfg_family = PF_INET; |
---|
.. | .. |
---|
363 | 419 | case 'P': |
---|
364 | 420 | max_pacing_rate = atoi(optarg) ; |
---|
365 | 421 | break; |
---|
| 422 | + case 'C': |
---|
| 423 | + chunk_size = atol(optarg); |
---|
| 424 | + break; |
---|
| 425 | + case 'a': |
---|
| 426 | + map_align = atol(optarg); |
---|
| 427 | + break; |
---|
366 | 428 | default: |
---|
367 | 429 | exit(1); |
---|
368 | 430 | } |
---|
| 431 | + } |
---|
| 432 | + if (!map_align) { |
---|
| 433 | + map_align = default_huge_page_size(); |
---|
| 434 | + /* if really /proc/meminfo is not helping, |
---|
| 435 | + * we use the default x86_64 hugepagesize. |
---|
| 436 | + */ |
---|
| 437 | + if (!map_align) |
---|
| 438 | + map_align = 2*1024*1024; |
---|
369 | 439 | } |
---|
370 | 440 | if (sflg) { |
---|
371 | 441 | int fdlisten = socket(cfg_family, SOCK_STREAM, 0); |
---|
.. | .. |
---|
395 | 465 | } |
---|
396 | 466 | do_accept(fdlisten); |
---|
397 | 467 | } |
---|
398 | | - buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, |
---|
399 | | - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
---|
| 468 | + |
---|
| 469 | + buffer = mmap_large_buffer(chunk_size, &buffer_sz); |
---|
400 | 470 | if (buffer == (char *)-1) { |
---|
401 | 471 | perror("mmap"); |
---|
402 | 472 | exit(1); |
---|
.. | .. |
---|
431 | 501 | zflg = 0; |
---|
432 | 502 | } |
---|
433 | 503 | while (total < FILE_SZ) { |
---|
434 | | - long wr = FILE_SZ - total; |
---|
| 504 | + int64_t wr = FILE_SZ - total; |
---|
435 | 505 | |
---|
436 | 506 | if (wr > chunk_size) |
---|
437 | 507 | wr = chunk_size; |
---|
438 | 508 | /* Note : we just want to fill the pipe with 0 bytes */ |
---|
439 | | - wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); |
---|
| 509 | + wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0); |
---|
440 | 510 | if (wr <= 0) |
---|
441 | 511 | break; |
---|
442 | 512 | total += wr; |
---|
443 | 513 | } |
---|
444 | 514 | close(fd); |
---|
445 | | - munmap(buffer, chunk_size); |
---|
| 515 | + munmap(buffer, buffer_sz); |
---|
446 | 516 | return 0; |
---|
447 | 517 | } |
---|