2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/net/ipv4/tcp_bbr.c
@@ -136,6 +136,14 @@
 /* Skip TSO below the following bandwidth (bits/sec): */
 static const int bbr_min_tso_rate = 1200000;
 
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
 /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
@@ -235,17 +243,15 @@
 {
         unsigned int mss = tcp_sk(sk)->mss_cache;
 
-        if (!tcp_needs_internal_pacing(sk))
-                mss = tcp_mss_to_mtu(sk, mss);
         rate *= mss;
         rate *= gain;
         rate >>= BBR_SCALE;
-        rate *= USEC_PER_SEC;
+        rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
         return rate >> BW_SCALE;
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
         u64 rate = bw;
 
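For reference, a minimal userspace sketch of the fixed-point math in the rate function above after this change. It is illustrative only: BW_SCALE, BBR_SCALE and BBR_UNIT carry the values defined elsewhere in tcp_bbr.c, and the traffic figures are invented.

#include <stdint.h>
#include <stdio.h>

#define BW_SCALE        24              /* bw is packets per usec << BW_SCALE */
#define BBR_SCALE       8               /* gain is scaled by BBR_UNIT */
#define BBR_UNIT        (1 << BBR_SCALE)
#define USEC_PER_SEC    1000000ULL

static const int bbr_pacing_margin_percent = 1;

/* Mirrors the arithmetic of the patched rate computation above. */
static uint64_t rate_bytes_per_sec(uint64_t bw, int gain, unsigned int mss)
{
        uint64_t rate = bw;

        rate *= mss;
        rate *= gain;
        rate >>= BBR_SCALE;
        rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
        return rate >> BW_SCALE;
}

int main(void)
{
        /* ~100 Mbit/s with a 1448-byte MSS: ~8633 packets per second. */
        uint64_t bw = ((uint64_t)8633 << BW_SCALE) / USEC_PER_SEC;

        /* Prints ~12.37 MB/s, i.e. ~1% below the ~12.5 MB/s bw estimate. */
        printf("pacing rate: %llu bytes/sec\n",
               (unsigned long long)rate_bytes_per_sec(bw, BBR_UNIT, 1448));
        return 0;
}

The factor USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent) evaluates to 990000 for a 1% margin, so the margin is folded into the existing per-second conversion rather than costing an extra divide.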
@@ -273,18 +279,12 @@
         sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
 }
 
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
 static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct bbr *bbr = inet_csk_ca(sk);
-        u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+        unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
 
         if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
                 bbr_init_pacing_rate_from_rtt(sk);
@@ -306,7 +306,8 @@
         /* Sort of tcp_tso_autosize() but ignoring
          * driver provided sk_gso_max_size.
          */
-        bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+        bytes = min_t(unsigned long,
+                      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
                       GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
         segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
@@ -346,7 +347,7 @@
 
 /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
  *
- * bdp = bw * min_rtt * gain
+ * bdp = ceil(bw * min_rtt * gain)
  *
  * The key factor, gain, controls the amount of queue. While a small gain
  * builds a smaller queue, it becomes more vulnerable to noise in RTT
@@ -370,7 +371,9 @@
 
         w = (u64)bw * bbr->min_rtt_us;
 
-        /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+        /* Apply a gain to the given value, remove the BW_SCALE shift, and
+         * round the value up to avoid a negative feedback loop.
+         */
         bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
 
         return bdp;
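A worked example of the round-up, as a self-contained sketch. bdp_packets() is a stand-in for the surrounding BDP helper; the bandwidth and RTT values are invented, while the scale constants match those defined elsewhere in tcp_bbr.c.

#include <stdint.h>
#include <stdio.h>

#define BW_SCALE        24
#define BW_UNIT         (1ULL << BW_SCALE)
#define BBR_SCALE       8
#define BBR_UNIT        (1 << BBR_SCALE)

/* Mirrors the rounded-up bdp computation above. */
static uint32_t bdp_packets(uint64_t bw, uint32_t min_rtt_us, int gain)
{
        uint64_t w = bw * min_rtt_us;

        return (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
}

int main(void)
{
        /* ~100 Mbit/s with a 1448-byte MSS and a 10 ms min RTT. */
        uint64_t bw = ((uint64_t)8633 << BW_SCALE) / 1000000;
        uint32_t min_rtt_us = 10000;

        /* The un-rounded BDP is ~86.3 packets: round-up gives 87, truncation 86. */
        printf("bdp = %u packets\n", bdp_packets(bw, min_rtt_us, BBR_UNIT));
        return 0;
}

With these inputs, truncating to 86 would let cwnd, and therefore the next bw sample, ratchet slowly downward; rounding up is what breaks that negative feedback loop mentioned in the comment above.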
@@ -386,7 +389,7 @@
  * which allows 2 outstanding 2-packet sequences, to try to keep pipe
  * full even with ACK-every-other-packet delayed ACKs.
  */
-static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
 {
         struct bbr *bbr = inet_csk_ca(sk);
 
@@ -397,7 +400,7 @@
         cwnd = (cwnd + 1) & ~1U;
 
         /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
-        if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+        if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
                 cwnd += 2;
 
         return cwnd;
@@ -409,9 +412,42 @@
         u32 inflight;
 
         inflight = bbr_bdp(sk, bw, gain);
-        inflight = bbr_quantization_budget(sk, inflight, gain);
+        inflight = bbr_quantization_budget(sk, inflight);
 
         return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct bbr *bbr = inet_csk_ca(sk);
+        u64 now_ns, edt_ns, interval_us;
+        u32 interval_delivered, inflight_at_edt;
+
+        now_ns = tp->tcp_clock_cache;
+        edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+        interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+        interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+        inflight_at_edt = inflight_now;
+        if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+                inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+        if (interval_delivered >= inflight_at_edt)
+                return 0;
+        return inflight_at_edt - interval_delivered;
 }
 
 /* Find the cwnd increment based on estimate of ack aggregation */
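A rough, self-contained sketch of the in_network_at_edt estimate added above. packets_in_net_at_edt() and the tso_segs_goal parameter stand in for bbr_packets_in_net_at_edt() and bbr_tso_segs_goal(); all numeric values are invented.

#include <stdint.h>
#include <stdio.h>

#define BW_SCALE        24
#define BBR_SCALE       8
#define BBR_UNIT        (1 << BBR_SCALE)
#define NSEC_PER_USEC   1000ULL

/* Estimate packets still in the network at the next skb's departure time. */
static uint32_t packets_in_net_at_edt(uint32_t inflight_now, uint64_t bw,
                                      uint64_t now_ns, uint64_t wstamp_ns,
                                      int pacing_gain, uint32_t tso_segs_goal)
{
        uint64_t edt_ns = wstamp_ns > now_ns ? wstamp_ns : now_ns;
        uint64_t interval_us = (edt_ns - now_ns) / NSEC_PER_USEC;
        uint32_t interval_delivered = (bw * interval_us) >> BW_SCALE;
        uint32_t inflight_at_edt = inflight_now;

        if (pacing_gain > BBR_UNIT)             /* probing up: count the EDT skb */
                inflight_at_edt += tso_segs_goal;
        if (interval_delivered >= inflight_at_edt)
                return 0;
        return inflight_at_edt - interval_delivered;
}

int main(void)
{
        uint64_t bw = ((uint64_t)8633 << BW_SCALE) / 1000000; /* ~100 Mbit/s */
        uint64_t now_ns = 0, edt_ns = 2 * 1000 * 1000;        /* EDT is 2 ms away */

        /* ~17 packets should drain before the EDT, so only ~83 of the 100
         * packets currently in flight are still "in the network" then.
         */
        printf("in_network_at_edt = %u\n",
               packets_in_net_at_edt(100, bw, now_ns, edt_ns, BBR_UNIT, 2));
        return 0;
}

Later in the patch, the PROBE_BW gain-cycling check and the DRAIN exit check compare this estimate, rather than the raw in-flight count, against the target inflight.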
@@ -496,7 +532,7 @@
          * due to aggregation (of data and/or ACKs) visible in the ACK stream.
          */
         target_cwnd += bbr_ack_aggregation_cwnd(sk);
-        target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+        target_cwnd = bbr_quantization_budget(sk, target_cwnd);
 
         /* If we're below target cwnd, slow start cwnd toward target cwnd. */
         if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
@@ -528,7 +564,7 @@
         if (bbr->pacing_gain == BBR_UNIT)
                 return is_full_length;          /* just use wall clock time */
 
-        inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
+        inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
         bw = bbr_max_bw(sk);
 
         /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -556,8 +592,6 @@
 
         bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
         bbr->cycle_mstamp = tp->delivered_mstamp;
-        bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
-                                            bbr_pacing_gain[bbr->cycle_idx];
 }
 
 /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -575,8 +609,6 @@
         struct bbr *bbr = inet_csk_ca(sk);
 
         bbr->mode = BBR_STARTUP;
-        bbr->pacing_gain = bbr_high_gain;
-        bbr->cwnd_gain = bbr_high_gain;
 }
 
 static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -584,8 +616,6 @@
         struct bbr *bbr = inet_csk_ca(sk);
 
         bbr->mode = BBR_PROBE_BW;
-        bbr->pacing_gain = BBR_UNIT;
-        bbr->cwnd_gain = bbr_cwnd_gain;
         bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
         bbr_advance_cycle_phase(sk);    /* flip to next phase of gain cycle */
 }
@@ -863,13 +893,11 @@
 
         if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
                 bbr->mode = BBR_DRAIN;  /* drain queue we created */
-                bbr->pacing_gain = bbr_drain_gain;      /* pace slow to drain */
-                bbr->cwnd_gain = bbr_high_gain;         /* maintain cwnd */
                 tcp_sk(sk)->snd_ssthresh =
                                 bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
         }       /* fall through to check if in-flight is already small: */
         if (bbr->mode == BBR_DRAIN &&
-            tcp_packets_in_flight(tcp_sk(sk)) <=
+            bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
             bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
                 bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
 }
@@ -926,8 +954,6 @@
         if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
             !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
                 bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
-                bbr->pacing_gain = BBR_UNIT;
-                bbr->cwnd_gain = BBR_UNIT;
                 bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
                 bbr->probe_rtt_done_stamp = 0;
         }
@@ -955,6 +981,35 @@
         bbr->idle_restart = 0;
 }
 
+static void bbr_update_gains(struct sock *sk)
+{
+        struct bbr *bbr = inet_csk_ca(sk);
+
+        switch (bbr->mode) {
+        case BBR_STARTUP:
+                bbr->pacing_gain = bbr_high_gain;
+                bbr->cwnd_gain = bbr_high_gain;
+                break;
+        case BBR_DRAIN:
+                bbr->pacing_gain = bbr_drain_gain;      /* slow, to drain */
+                bbr->cwnd_gain = bbr_high_gain;         /* keep cwnd */
+                break;
+        case BBR_PROBE_BW:
+                bbr->pacing_gain = (bbr->lt_use_bw ?
+                                    BBR_UNIT :
+                                    bbr_pacing_gain[bbr->cycle_idx]);
+                bbr->cwnd_gain = bbr_cwnd_gain;
+                break;
+        case BBR_PROBE_RTT:
+                bbr->pacing_gain = BBR_UNIT;
+                bbr->cwnd_gain = BBR_UNIT;
+                break;
+        default:
+                WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+                break;
+        }
+}
+
 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 {
         bbr_update_bw(sk, rs);
@@ -963,6 +1018,7 @@
         bbr_check_full_bw_reached(sk, rs);
         bbr_check_drain(sk, rs);
         bbr_update_min_rtt(sk, rs);
+        bbr_update_gains(sk);
 }
 
 static void bbr_main(struct sock *sk, const struct rate_sample *rs)