.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * TCP CUBIC: Binary Increase Congestion control for TCP v2.3 |
---|
3 | 4 | * Home page: |
---|
.. | .. |
---|
39 | 40 | |
---|
40 | 41 | /* Number of delay samples for detecting the increase of delay */ |
---|
41 | 42 | #define HYSTART_MIN_SAMPLES 8 |
---|
42 | | -#define HYSTART_DELAY_MIN (4U<<3) |
---|
43 | | -#define HYSTART_DELAY_MAX (16U<<3) |
---|
| 43 | +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ |
---|
| 44 | +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ |
---|
44 | 45 | #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) |
---|
45 | 46 | |
---|
46 | 47 | static int fast_convergence __read_mostly = 1; |
---|
.. | .. |
---|
52 | 53 | static int hystart __read_mostly = 1; |
---|
53 | 54 | static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; |
---|
54 | 55 | static int hystart_low_window __read_mostly = 16; |
---|
55 | | -static int hystart_ack_delta __read_mostly = 2; |
---|
| 56 | +static int hystart_ack_delta_us __read_mostly = 2000; |
---|
56 | 57 | |
---|
57 | 58 | static u32 cube_rtt_scale __read_mostly; |
---|
58 | 59 | static u32 beta_scale __read_mostly; |
---|
.. | .. |
---|
76 | 77 | " 1: packet-train 2: delay 3: both packet-train and delay"); |
---|
77 | 78 | module_param(hystart_low_window, int, 0644); |
---|
78 | 79 | MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); |
---|
79 | | -module_param(hystart_ack_delta, int, 0644); |
---|
80 | | -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); |
---|
| 80 | +module_param(hystart_ack_delta_us, int, 0644); |
---|
| 81 | +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); |
---|
81 | 82 | |
---|
82 | 83 | /* BIC TCP Parameters */ |
---|
83 | 84 | struct bictcp { |
---|
.. | .. |
---|
88 | 89 | u32 bic_origin_point;/* origin point of bic function */ |
---|
89 | 90 | u32 bic_K; /* time to origin point |
---|
90 | 91 | from the beginning of the current epoch */ |
---|
91 | | - u32 delay_min; /* min delay (msec << 3) */ |
---|
| 92 | + u32 delay_min; /* min delay (usec) */ |
---|
92 | 93 | u32 epoch_start; /* beginning of an epoch */ |
---|
93 | 94 | u32 ack_cnt; /* number of acks */ |
---|
94 | 95 | u32 tcp_cwnd; /* estimated tcp cwnd */ |
---|
.. | .. |
---|
116 | 117 | ca->found = 0; |
---|
117 | 118 | } |
---|
118 | 119 | |
---|
119 | | -static inline u32 bictcp_clock(void) |
---|
| 120 | +static inline u32 bictcp_clock_us(const struct sock *sk) |
---|
120 | 121 | { |
---|
121 | | -#if HZ < 1000 |
---|
122 | | - return ktime_to_ms(ktime_get_real()); |
---|
123 | | -#else |
---|
124 | | - return jiffies_to_msecs(jiffies); |
---|
125 | | -#endif |
---|
| 122 | + return tcp_sk(sk)->tcp_mstamp; |
---|
126 | 123 | } |
---|
127 | 124 | |
---|
128 | 125 | static inline void bictcp_hystart_reset(struct sock *sk) |
---|
.. | .. |
---|
130 | 127 | struct tcp_sock *tp = tcp_sk(sk); |
---|
131 | 128 | struct bictcp *ca = inet_csk_ca(sk); |
---|
132 | 129 | |
---|
133 | | - ca->round_start = ca->last_ack = bictcp_clock(); |
---|
| 130 | + ca->round_start = ca->last_ack = bictcp_clock_us(sk); |
---|
134 | 131 | ca->end_seq = tp->snd_nxt; |
---|
135 | | - ca->curr_rtt = 0; |
---|
| 132 | + ca->curr_rtt = ~0U; |
---|
136 | 133 | ca->sample_cnt = 0; |
---|
137 | 134 | } |
---|
138 | 135 | |
---|
.. | .. |
---|
275 | 272 | */ |
---|
276 | 273 | |
---|
277 | 274 | t = (s32)(tcp_jiffies32 - ca->epoch_start); |
---|
278 | | - t += msecs_to_jiffies(ca->delay_min >> 3); |
---|
| 275 | + t += usecs_to_jiffies(ca->delay_min); |
---|
279 | 276 | /* change the unit from HZ to bictcp_HZ */ |
---|
280 | 277 | t <<= BICTCP_HZ; |
---|
281 | 278 | do_div(t, HZ); |
---|
.. | .. |
---|
373 | 370 | } |
---|
374 | 371 | } |
---|
375 | 372 | |
---|
| 373 | +/* Account for TSO/GRO delays. |
---|
| 374 | + * Otherwise short RTT flows could get too small ssthresh, since during |
---|
| 375 | + * slow start we begin with small TSO packets and ca->delay_min would |
---|
| 376 | + * not account for long aggregation delay when TSO packets get bigger. |
---|
| 377 | + * Ideally even with a very small RTT we would like to have at least one |
---|
| 378 | + * TSO packet being sent and received by GRO, and another one in qdisc layer. |
---|
| 379 | + * We apply another 100% factor because @rate is doubled at this point. |
---|
| 380 | + * The cushion is returned in usec and capped to 1ms (USEC_PER_MSEC). |
---|
| 381 | + */ |
---|
| 382 | +static u32 hystart_ack_delay(struct sock *sk) |
---|
| 383 | +{ |
---|
| 384 | + unsigned long rate; |
---|
| 385 | + |
---|
| 386 | + rate = READ_ONCE(sk->sk_pacing_rate); |
---|
| 387 | + if (!rate) |
---|
| 388 | + return 0; /* no pacing rate: no cushion (also avoids a zero divisor below) */ |
---|
| 389 | + return min_t(u64, USEC_PER_MSEC, |
---|
| 390 | + div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); |
---|
| 391 | +} |
---|
| 392 | + |
---|
376 | 393 | static void hystart_update(struct sock *sk, u32 delay) |
---|
377 | 394 | { |
---|
378 | 395 | struct tcp_sock *tp = tcp_sk(sk); |
---|
379 | 396 | struct bictcp *ca = inet_csk_ca(sk); |
---|
380 | | - |
---|
381 | | - if (ca->found & hystart_detect) |
---|
382 | | - return; |
---|
| 397 | + u32 threshold; |
---|
383 | 398 | |
---|
384 | 399 | if (after(tp->snd_una, ca->end_seq)) |
---|
385 | 400 | bictcp_hystart_reset(sk); |
---|
386 | 401 | |
---|
387 | 402 | if (hystart_detect & HYSTART_ACK_TRAIN) { |
---|
388 | | - u32 now = bictcp_clock(); |
---|
| 403 | + u32 now = bictcp_clock_us(sk); |
---|
389 | 404 | |
---|
390 | 405 | /* first detection parameter - ack-train detection */ |
---|
391 | | - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { |
---|
| 406 | + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { |
---|
392 | 407 | ca->last_ack = now; |
---|
393 | | - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { |
---|
394 | | - ca->found |= HYSTART_ACK_TRAIN; |
---|
| 408 | + |
---|
| 409 | + threshold = ca->delay_min + hystart_ack_delay(sk); |
---|
| 410 | + |
---|
| 411 | + /* Without pacing, hystart ack train triggers if we get ack |
---|
| 412 | + * past (ca->delay_min + ack delay)/2. |
---|
| 413 | + * With pacing the threshold is not halved, since pacing might |
---|
| 414 | + * have delayed packets up to RTT/2 during slow start. |
---|
| 415 | + */ |
---|
| 416 | + if (sk->sk_pacing_status == SK_PACING_NONE) |
---|
| 417 | + threshold >>= 1; |
---|
| 418 | + |
---|
| 419 | + if ((s32)(now - ca->round_start) > threshold) { |
---|
| 420 | + ca->found = 1; |
---|
| 421 | + pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", |
---|
| 422 | + now - ca->round_start, threshold, |
---|
| 423 | + ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); |
---|
395 | 424 | NET_INC_STATS(sock_net(sk), |
---|
396 | 425 | LINUX_MIB_TCPHYSTARTTRAINDETECT); |
---|
397 | 426 | NET_ADD_STATS(sock_net(sk), |
---|
.. | .. |
---|
407 | 436 | if (ca->curr_rtt > delay) |
---|
408 | 437 | ca->curr_rtt = delay; |
---|
409 | 438 | if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { |
---|
410 | | - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) |
---|
411 | | - ca->curr_rtt = delay; |
---|
412 | | - |
---|
413 | 439 | ca->sample_cnt++; |
---|
414 | 440 | } else { |
---|
415 | 441 | if (ca->curr_rtt > ca->delay_min + |
---|
416 | 442 | HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { |
---|
417 | | - ca->found |= HYSTART_DELAY; |
---|
| 443 | + ca->found = 1; |
---|
418 | 444 | NET_INC_STATS(sock_net(sk), |
---|
419 | 445 | LINUX_MIB_TCPHYSTARTDELAYDETECT); |
---|
420 | 446 | NET_ADD_STATS(sock_net(sk), |
---|
.. | .. |
---|
426 | 452 | } |
---|
427 | 453 | } |
---|
428 | 454 | |
---|
429 | | -/* Track delayed acknowledgment ratio using sliding window |
---|
430 | | - * ratio = (15*ratio + sample) / 16 |
---|
431 | | - */ |
---|
432 | 455 | static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) |
---|
433 | 456 | { |
---|
434 | 457 | const struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
443 | 466 | if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) |
---|
444 | 467 | return; |
---|
445 | 468 | |
---|
446 | | - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; |
---|
| 469 | + delay = sample->rtt_us; |
---|
447 | 470 | if (delay == 0) |
---|
448 | 471 | delay = 1; |
---|
449 | 472 | |
---|
.. | .. |
---|
452 | 475 | ca->delay_min = delay; |
---|
453 | 476 | |
---|
454 | 477 | /* hystart triggers when cwnd is larger than some threshold */ |
---|
455 | | - if (hystart && tcp_in_slow_start(tp) && |
---|
| 478 | + if (!ca->found && tcp_in_slow_start(tp) && hystart && |
---|
456 | 479 | tp->snd_cwnd >= hystart_low_window) |
---|
457 | 480 | hystart_update(sk, delay); |
---|
458 | 481 | } |
---|