From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/net/ipv4/tcp_bbr.c | 118 +++++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 87 insertions(+), 31 deletions(-)
diff --git a/kernel/net/ipv4/tcp_bbr.c b/kernel/net/ipv4/tcp_bbr.c
index 1740de0..6274462 100644
--- a/kernel/net/ipv4/tcp_bbr.c
+++ b/kernel/net/ipv4/tcp_bbr.c
@@ -136,6 +136,14 @@
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
@@ -235,17 +243,15 @@
{
unsigned int mss = tcp_sk(sk)->mss_cache;
- if (!tcp_needs_internal_pacing(sk))
- mss = tcp_mss_to_mtu(sk, mss);
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
- rate *= USEC_PER_SEC;
+ rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
return rate >> BW_SCALE;
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
@@ -273,18 +279,12 @@
sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
@@ -306,7 +306,8 @@
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
@@ -346,7 +347,7 @@
/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * bdp = bw * min_rtt * gain
+ * bdp = ceil(bw * min_rtt * gain)
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
@@ -370,7 +371,9 @@
w = (u64)bw * bbr->min_rtt_us;
- /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
return bdp;
@@ -386,7 +389,7 @@
* which allows 2 outstanding 2-packet sequences, to try to keep pipe
* full even with ACK-every-other-packet delayed ACKs.
*/
-static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
struct bbr *bbr = inet_csk_ca(sk);
@@ -397,7 +400,7 @@
cwnd = (cwnd + 1) & ~1U;
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
- if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
cwnd += 2;
return cwnd;
@@ -409,9 +412,42 @@
u32 inflight;
inflight = bbr_bdp(sk, bw, gain);
- inflight = bbr_quantization_budget(sk, inflight, gain);
+ inflight = bbr_quantization_budget(sk, inflight);
return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 now_ns, edt_ns, interval_us;
+ u32 interval_delivered, inflight_at_edt;
+
+ now_ns = tp->tcp_clock_cache;
+ edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+ inflight_at_edt = inflight_now;
+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+ if (interval_delivered >= inflight_at_edt)
+ return 0;
+ return inflight_at_edt - interval_delivered;
}
/* Find the cwnd increment based on estimate of ack aggregation */
@@ -496,7 +532,7 @@
* due to aggregation (of data and/or ACKs) visible in the ACK stream.
*/
target_cwnd += bbr_ack_aggregation_cwnd(sk);
- target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
@@ -528,7 +564,7 @@
if (bbr->pacing_gain == BBR_UNIT)
return is_full_length; /* just use wall clock time */
- inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
bw = bbr_max_bw(sk);
/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -556,8 +592,6 @@
bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
bbr->cycle_mstamp = tp->delivered_mstamp;
- bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
- bbr_pacing_gain[bbr->cycle_idx];
}
/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -575,8 +609,6 @@
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_STARTUP;
- bbr->pacing_gain = bbr_high_gain;
- bbr->cwnd_gain = bbr_high_gain;
}
static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -584,8 +616,6 @@
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_PROBE_BW;
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = bbr_cwnd_gain;
bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
}
@@ -863,13 +893,11 @@
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
- bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
- bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
tcp_sk(sk)->snd_ssthresh =
bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
- tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
@@ -926,8 +954,6 @@
if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = BBR_UNIT;
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
bbr->probe_rtt_done_stamp = 0;
}
@@ -955,6 +981,35 @@
bbr->idle_restart = 0;
}
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = (bbr->lt_use_bw ?
+ BBR_UNIT :
+ bbr_pacing_gain[bbr->cycle_idx]);
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
@@ -963,6 +1018,7 @@
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
bbr_update_min_rtt(sk, rs);
+ bbr_update_gains(sk);
}
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
--
Gitblit v1.6.2