2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/net/ipv4/tcp_cubic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
  * Home page:
@@ -39,8 +40,8 @@
 
 /* Number of delay samples for detecting the increase of delay */
 #define HYSTART_MIN_SAMPLES	8
-#define HYSTART_DELAY_MIN	(4U<<3)
-#define HYSTART_DELAY_MAX	(16U<<3)
+#define HYSTART_DELAY_MIN	(4000U)		/* 4 ms */
+#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
 #define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
 
 static int fast_convergence __read_mostly = 1;
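Note: the old bounds were kept in 1/8 ms units, so 4U<<3 = 32 and 16U<<3 = 128 encoded the same 4 ms and 16 ms limits that the new constants state directly in microseconds (4000U and 16000U), matching the microsecond clock introduced further down.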
@@ -52,7 +53,7 @@
 static int hystart __read_mostly = 1;
 static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
 static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
 
 static u32 cube_rtt_scale __read_mostly;
 static u32 beta_scale __read_mostly;
@@ -76,8 +77,8 @@
 		 " 1: packet-train 2: delay 3: both packet-train and delay");
 module_param(hystart_low_window, int, 0644);
 MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -88,7 +89,7 @@
 	u32	bic_origin_point;/* origin point of bic function */
 	u32	bic_K;		/* time to origin point
 				   from the beginning of the current epoch */
-	u32	delay_min;	/* min delay (msec << 3) */
+	u32	delay_min;	/* min delay (usec) */
 	u32	epoch_start;	/* beginning of an epoch */
 	u32	ack_cnt;	/* number of acks */
 	u32	tcp_cwnd;	/* estimated tcp cwnd */
@@ -116,13 +117,9 @@
 	ca->found = 0;
 }
 
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
 {
-#if HZ < 1000
-	return ktime_to_ms(ktime_get_real());
-#else
-	return jiffies_to_msecs(jiffies);
-#endif
+	return tcp_sk(sk)->tcp_mstamp;
 }
 
 static inline void bictcp_hystart_reset(struct sock *sk)
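Note: bictcp_clock_us() now returns the microsecond timestamp TCP already caches in tp->tcp_mstamp, so the HZ-dependent millisecond sources (ktime/jiffies) go away and every hystart timestamp and delta below is a plain u32 microsecond value.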
@@ -130,9 +127,9 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	ca->round_start = ca->last_ack = bictcp_clock();
+	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
 	ca->end_seq = tp->snd_nxt;
-	ca->curr_rtt = 0;
+	ca->curr_rtt = ~0U;
 	ca->sample_cnt = 0;
 }
 
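Note: starting each round with ca->curr_rtt = ~0U (all ones) lets the per-round tracking below use a plain running minimum, "if (ca->curr_rtt > delay) ca->curr_rtt = delay;", which is why the old "curr_rtt == 0" special case can be dropped in the delay-detection hunk further down.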
@@ -275,7 +272,7 @@
 	 */
 
 	t = (s32)(tcp_jiffies32 - ca->epoch_start);
-	t += msecs_to_jiffies(ca->delay_min >> 3);
+	t += usecs_to_jiffies(ca->delay_min);
 	/* change the unit from HZ to bictcp_HZ */
 	t <<= BICTCP_HZ;
 	do_div(t, HZ);
@@ -373,25 +370,57 @@
 	}
 }
 
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(struct sock *sk)
+{
+	unsigned long rate;
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	if (!rate)
+		return 0;
+	return min_t(u64, USEC_PER_MSEC,
+		     div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+}
+
 static void hystart_update(struct sock *sk, u32 delay)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
-
-	if (ca->found & hystart_detect)
-		return;
+	u32 threshold;
 
 	if (after(tp->snd_una, ca->end_seq))
 		bictcp_hystart_reset(sk);
 
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
-		u32 now = bictcp_clock();
+		u32 now = bictcp_clock_us(sk);
 
 		/* first detection parameter - ack-train detection */
-		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
 			ca->last_ack = now;
-			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
-				ca->found |= HYSTART_ACK_TRAIN;
+
+			threshold = ca->delay_min + hystart_ack_delay(sk);
+
+			/* Hystart ack train triggers if we get ack past
+			 * ca->delay_min/2.
+			 * Pacing might have delayed packets up to RTT/2
+			 * during slow start.
+			 */
+			if (sk->sk_pacing_status == SK_PACING_NONE)
+				threshold >>= 1;
+
+			if ((s32)(now - ca->round_start) > threshold) {
+				ca->found = 1;
+				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+					 now - ca->round_start, threshold,
+					 ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
 				NET_ADD_STATS(sock_net(sk),
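For scale, an illustrative calculation (not part of the patch, assuming the usual 64 KB GSO_MAX_SIZE): at a pacing rate of 1.25 GB/s (10 Gbit/s) the cushion is 65536 * 4 * USEC_PER_SEC / rate, roughly 210 us, and below roughly 262 MB/s (about 2 Gbit/s) it saturates at the 1 ms cap. The cushion is added to ca->delay_min to form the ack-train threshold; only non-paced sockets halve that threshold, since a paced sender may legitimately spread its packets over up to half an RTT.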
@@ -407,14 +436,11 @@
 		if (ca->curr_rtt > delay)
 			ca->curr_rtt = delay;
 		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
-			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
-				ca->curr_rtt = delay;
-
 			ca->sample_cnt++;
 		} else {
 			if (ca->curr_rtt > ca->delay_min +
 			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
-				ca->found |= HYSTART_DELAY;
+				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
 				NET_ADD_STATS(sock_net(sk),
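As a worked example with illustrative numbers: for ca->delay_min = 20000 us, delay_min >> 3 = 2500 us, which HYSTART_DELAY_THRESH() clamps up to HYSTART_DELAY_MIN = 4000 us, so the delay detector fires once the smallest of the eight sampled RTTs in a round exceeds 24000 us (24 ms). For a 100 ms path the same math gives 100000 + 12500 = 112500 us, since 12500 falls inside the 4-16 ms clamp window.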
@@ -426,9 +452,6 @@
 	}
 }
 
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
 static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -443,7 +466,7 @@
 	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
 		return;
 
-	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+	delay = sample->rtt_us;
 	if (delay == 0)
 		delay = 1;
 
@@ -452,7 +475,7 @@
 		ca->delay_min = delay;
 
 	/* hystart triggers when cwnd is larger than some threshold */
-	if (hystart && tcp_in_slow_start(tp) &&
+	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
 	    tp->snd_cwnd >= hystart_low_window)
 		hystart_update(sk, delay);
 }
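Usage note: both knobs remain 0644 module parameters, so the renamed one shows up under /sys/module/tcp_cubic/parameters/hystart_ack_delta_us, and values written there (or passed at load time) are now interpreted as microseconds rather than milliseconds; tooling that previously set hystart_ack_delta needs the new name and unit.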