| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Copyright 2018 Google Inc. |
|---|
| 3 | 4 | * Author: Eric Dumazet (edumazet@google.com) |
|---|
| .. | .. |
|---|
| 44 | 45 | * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches |
|---|
| 45 | 46 | * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit |
|---|
| 46 | 47 | * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches |
|---|
| 47 | | - * |
|---|
| 48 | | - * License (GPLv2): |
|---|
| 49 | | - * |
|---|
| 50 | | - * This program is free software; you can redistribute it and/or modify it |
|---|
| 51 | | - * under the terms and conditions of the GNU General Public License, |
|---|
| 52 | | - * version 2, as published by the Free Software Foundation. |
|---|
| 53 | | - * |
|---|
| 54 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
|---|
| 55 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|---|
| 56 | | - * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for |
|---|
| 57 | | - * more details. |
|---|
| 58 | | - * |
|---|
| 59 | | - * You should have received a copy of the GNU General Public License along with |
|---|
| 60 | | - * this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 61 | | - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. |
|---|
| 62 | 48 | */ |
|---|
| 63 | 49 | #define _GNU_SOURCE |
|---|
| 64 | 50 | #include <pthread.h> |
|---|
| .. | .. |
|---|
| 85 | 71 | #define MSG_ZEROCOPY 0x4000000 |
|---|
| 86 | 72 | #endif |
|---|
| 87 | 73 | |
|---|
| 88 | | -#define FILE_SZ (1UL << 35) |
|---|
| 74 | +#define FILE_SZ (1ULL << 35) |
|---|
| 89 | 75 | static int cfg_family = AF_INET6; |
|---|
| 90 | 76 | static socklen_t cfg_alen = sizeof(struct sockaddr_in6); |
|---|
| 91 | 77 | static int cfg_port = 8787; |
|---|
| .. | .. |
|---|
| 96 | 82 | static int xflg; /* hash received data (simple xor) (-x option) */ |
|---|
| 97 | 83 | static int keepflag; /* -k option: receiver shall keep the entire received file in memory (no munmap() calls) */ |
|---|
| 98 | 84 | |
|---|
| 99 | | -static int chunk_size = 512*1024; |
|---|
| 85 | +static size_t chunk_size = 512*1024; |
|---|
| 86 | + |
|---|
| 87 | +static size_t map_align; |
|---|
| 100 | 88 | |
|---|
| 101 | 89 | unsigned long htotal; |
|---|
| 102 | 90 | |
|---|
| .. | .. |
|---|
| 132 | 120 | htotal = temp; |
|---|
| 133 | 121 | } |
|---|
| 134 | 122 | |
|---|
| 123 | +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) |
|---|
| 124 | +#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) |
|---|
| 125 | + |
|---|
| 126 | + |
|---|
| 127 | +static void *mmap_large_buffer(size_t need, size_t *allocated) |
|---|
| 128 | +{ |
|---|
| 129 | + void *buffer; |
|---|
| 130 | + size_t sz; |
|---|
| 131 | + |
|---|
| 132 | + /* Attempt to use huge pages if possible. */ |
|---|
| 133 | + sz = ALIGN_UP(need, map_align); |
|---|
| 134 | + buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE, |
|---|
| 135 | + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); |
|---|
| 136 | + |
|---|
| 137 | + if (buffer == (void *)-1) { |
|---|
| 138 | + sz = need; |
|---|
| 139 | + buffer = mmap(NULL, sz, PROT_READ | PROT_WRITE, |
|---|
| 140 | + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
|---|
| 141 | + if (buffer != (void *)-1) |
|---|
| 142 | + fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n"); |
|---|
| 143 | + } |
|---|
| 144 | + *allocated = sz; |
|---|
| 145 | + return buffer; |
|---|
| 146 | +} |
|---|
| 147 | + |
|---|
| 135 | 148 | void *child_thread(void *arg) |
|---|
| 136 | 149 | { |
|---|
| 137 | 150 | unsigned long total_mmap = 0, total = 0; |
|---|
| .. | .. |
|---|
| 140 | 153 | int flags = MAP_SHARED; |
|---|
| 141 | 154 | struct timeval t0, t1; |
|---|
| 142 | 155 | char *buffer = NULL; |
|---|
| 156 | + void *raddr = NULL; |
|---|
| 143 | 157 | void *addr = NULL; |
|---|
| 144 | 158 | double throughput; |
|---|
| 145 | 159 | struct rusage ru; |
|---|
| 160 | + size_t buffer_sz; |
|---|
| 146 | 161 | int lu, fd; |
|---|
| 147 | 162 | |
|---|
| 148 | 163 | fd = (int)(unsigned long)arg; |
|---|
| .. | .. |
|---|
| 150 | 165 | gettimeofday(&t0, NULL); |
|---|
| 151 | 166 | |
|---|
| 152 | 167 | fcntl(fd, F_SETFL, O_NDELAY); |
|---|
| 153 | | - buffer = malloc(chunk_size); |
|---|
| 154 | | - if (!buffer) { |
|---|
| 155 | | - perror("malloc"); |
|---|
| 168 | + buffer = mmap_large_buffer(chunk_size, &buffer_sz); |
|---|
| 169 | + if (buffer == (void *)-1) { |
|---|
| 170 | + perror("mmap"); |
|---|
| 156 | 171 | goto error; |
|---|
| 157 | 172 | } |
|---|
| 158 | 173 | if (zflg) { |
|---|
| 159 | | - addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0); |
|---|
| 160 | | - if (addr == (void *)-1) |
|---|
| 174 | + raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0); |
|---|
| 175 | + if (raddr == (void *)-1) { |
|---|
| 176 | + perror("mmap"); |
|---|
| 161 | 177 | zflg = 0; |
|---|
| 178 | + } else { |
|---|
| 179 | + addr = ALIGN_PTR_UP(raddr, map_align); |
|---|
| 180 | + } |
|---|
| 162 | 181 | } |
|---|
| 163 | 182 | while (1) { |
|---|
| 164 | 183 | struct pollfd pfd = { .fd = fd, .events = POLLIN, }; |
|---|
| .. | .. |
|---|
| 169 | 188 | socklen_t zc_len = sizeof(zc); |
|---|
| 170 | 189 | int res; |
|---|
| 171 | 190 | |
|---|
| 172 | | - zc.address = (__u64)addr; |
|---|
| 191 | + memset(&zc, 0, sizeof(zc)); |
|---|
| 192 | + zc.address = (__u64)((unsigned long)addr); |
|---|
| 173 | 193 | zc.length = chunk_size; |
|---|
| 174 | | - zc.recv_skip_hint = 0; |
|---|
| 194 | + |
|---|
| 175 | 195 | res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, |
|---|
| 176 | 196 | &zc, &zc_len); |
|---|
| 177 | 197 | if (res == -1) |
|---|
| .. | .. |
|---|
| 182 | 202 | total_mmap += zc.length; |
|---|
| 183 | 203 | if (xflg) |
|---|
| 184 | 204 | hash_zone(addr, zc.length); |
|---|
| 205 | + /* It is more efficient to unmap the pages right now, |
|---|
| 206 | + * instead of doing this in the next TCP_ZEROCOPY_RECEIVE call. |
|---|
| 207 | + */ |
|---|
| 208 | + madvise(addr, zc.length, MADV_DONTNEED); |
|---|
| 185 | 209 | total += zc.length; |
|---|
| 186 | 210 | } |
|---|
| 187 | 211 | if (zc.recv_skip_hint) { |
|---|
| .. | .. |
|---|
| 233 | 257 | ru.ru_nvcsw); |
|---|
| 234 | 258 | } |
|---|
| 235 | 259 | error: |
|---|
| 236 | | - free(buffer); |
|---|
| 260 | + munmap(buffer, buffer_sz); |
|---|
| 237 | 261 | close(fd); |
|---|
| 238 | 262 | if (zflg) |
|---|
| 239 | | - munmap(addr, chunk_size); |
|---|
| 263 | + munmap(raddr, chunk_size + map_align); |
|---|
| 240 | 264 | pthread_exit(0); |
|---|
| 241 | 265 | } |
|---|
| 242 | 266 | |
|---|
| .. | .. |
|---|
| 284 | 308 | |
|---|
| 285 | 309 | static void do_accept(int fdlisten) |
|---|
| 286 | 310 | { |
|---|
| 311 | + pthread_attr_t attr; |
|---|
| 312 | + int rcvlowat; |
|---|
| 313 | + |
|---|
| 314 | + pthread_attr_init(&attr); |
|---|
| 315 | + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); |
|---|
| 316 | + |
|---|
| 317 | + rcvlowat = chunk_size; |
|---|
| 287 | 318 | if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, |
|---|
| 288 | | - &chunk_size, sizeof(chunk_size)) == -1) { |
|---|
| 319 | + &rcvlowat, sizeof(rcvlowat)) == -1) { |
|---|
| 289 | 320 | perror("setsockopt SO_RCVLOWAT"); |
|---|
| 290 | 321 | } |
|---|
| 291 | 322 | |
|---|
| .. | .. |
|---|
| 302 | 333 | perror("accept"); |
|---|
| 303 | 334 | continue; |
|---|
| 304 | 335 | } |
|---|
| 305 | | - res = pthread_create(&th, NULL, child_thread, |
|---|
| 336 | + res = pthread_create(&th, &attr, child_thread, |
|---|
| 306 | 337 | (void *)(unsigned long)fd); |
|---|
| 307 | 338 | if (res) { |
|---|
| 308 | 339 | errno = res; |
|---|
| .. | .. |
|---|
| 312 | 343 | } |
|---|
| 313 | 344 | } |
|---|
| 314 | 345 | |
|---|
| 346 | +/* Each thread should reserve a big enough vma to avoid |
|---|
| 347 | + * spinlock collisions in ptl locks. |
|---|
| 348 | + * This size (the default huge page size) is 2MB on x86_64, and is exported in /proc/meminfo. |
|---|
| 349 | + */ |
|---|
| 350 | +static unsigned long default_huge_page_size(void) |
|---|
| 351 | +{ |
|---|
| 352 | + FILE *f = fopen("/proc/meminfo", "r"); |
|---|
| 353 | + unsigned long hps = 0; |
|---|
| 354 | + size_t linelen = 0; |
|---|
| 355 | + char *line = NULL; |
|---|
| 356 | + |
|---|
| 357 | + if (!f) |
|---|
| 358 | + return 0; |
|---|
| 359 | + while (getline(&line, &linelen, f) > 0) { |
|---|
| 360 | + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { |
|---|
| 361 | + hps <<= 10; |
|---|
| 362 | + break; |
|---|
| 363 | + } |
|---|
| 364 | + } |
|---|
| 365 | + free(line); |
|---|
| 366 | + fclose(f); |
|---|
| 367 | + return hps; |
|---|
| 368 | +} |
|---|
| 369 | + |
|---|
| 315 | 370 | int main(int argc, char *argv[]) |
|---|
| 316 | 371 | { |
|---|
| 317 | 372 | struct sockaddr_storage listenaddr, addr; |
|---|
| 318 | 373 | unsigned int max_pacing_rate = 0; |
|---|
| 319 | | - unsigned long total = 0; |
|---|
| 374 | + uint64_t total = 0; |
|---|
| 320 | 375 | char *host = NULL; |
|---|
| 321 | 376 | int fd, c, on = 1; |
|---|
| 377 | + size_t buffer_sz; |
|---|
| 322 | 378 | char *buffer; |
|---|
| 323 | 379 | int sflg = 0; |
|---|
| 324 | 380 | int mss = 0; |
|---|
| 325 | 381 | |
|---|
| 326 | | - while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) { |
|---|
| 382 | + while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) { |
|---|
| 327 | 383 | switch (c) { |
|---|
| 328 | 384 | case '4': |
|---|
| 329 | 385 | cfg_family = PF_INET; |
|---|
| .. | .. |
|---|
| 363 | 419 | case 'P': |
|---|
| 364 | 420 | max_pacing_rate = atoi(optarg) ; |
|---|
| 365 | 421 | break; |
|---|
| 422 | + case 'C': |
|---|
| 423 | + chunk_size = atol(optarg); |
|---|
| 424 | + break; |
|---|
| 425 | + case 'a': |
|---|
| 426 | + map_align = atol(optarg); |
|---|
| 427 | + break; |
|---|
| 366 | 428 | default: |
|---|
| 367 | 429 | exit(1); |
|---|
| 368 | 430 | } |
|---|
| 431 | + } |
|---|
| 432 | + if (!map_align) { |
|---|
| 433 | + map_align = default_huge_page_size(); |
|---|
| 434 | + /* If /proc/meminfo does not report a huge page size, |
|---|
| 435 | + * fall back to the default x86_64 huge page size (2MB). |
|---|
| 436 | + */ |
|---|
| 437 | + if (!map_align) |
|---|
| 438 | + map_align = 2*1024*1024; |
|---|
| 369 | 439 | } |
|---|
| 370 | 440 | if (sflg) { |
|---|
| 371 | 441 | int fdlisten = socket(cfg_family, SOCK_STREAM, 0); |
|---|
| .. | .. |
|---|
| 395 | 465 | } |
|---|
| 396 | 466 | do_accept(fdlisten); |
|---|
| 397 | 467 | } |
|---|
| 398 | | - buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, |
|---|
| 399 | | - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
|---|
| 468 | + |
|---|
| 469 | + buffer = mmap_large_buffer(chunk_size, &buffer_sz); |
|---|
| 400 | 470 | if (buffer == (char *)-1) { |
|---|
| 401 | 471 | perror("mmap"); |
|---|
| 402 | 472 | exit(1); |
|---|
| .. | .. |
|---|
| 431 | 501 | zflg = 0; |
|---|
| 432 | 502 | } |
|---|
| 433 | 503 | while (total < FILE_SZ) { |
|---|
| 434 | | - long wr = FILE_SZ - total; |
|---|
| 504 | + int64_t wr = FILE_SZ - total; |
|---|
| 435 | 505 | |
|---|
| 436 | 506 | if (wr > chunk_size) |
|---|
| 437 | 507 | wr = chunk_size; |
|---|
| 438 | 508 | /* Note : we just want to fill the pipe with 0 bytes */ |
|---|
| 439 | | - wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); |
|---|
| 509 | + wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0); |
|---|
| 440 | 510 | if (wr <= 0) |
|---|
| 441 | 511 | break; |
|---|
| 442 | 512 | total += wr; |
|---|
| 443 | 513 | } |
|---|
| 444 | 514 | close(fd); |
|---|
| 445 | | - munmap(buffer, chunk_size); |
|---|
| 515 | + munmap(buffer, buffer_sz); |
|---|
| 446 | 516 | return 0; |
|---|
| 447 | 517 | } |
|---|