.. | .. |
---|
136 | 136 | /* Skip TSO below the following bandwidth (bits/sec): */ |
---|
137 | 137 | static const int bbr_min_tso_rate = 1200000; |
---|
138 | 138 | |
---|
| 139 | +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. |
---|
| 140 | + * In order to help drive the network toward lower queues and low latency while |
---|
| 141 | + * maintaining high utilization, the average pacing rate aims to be slightly |
---|
| 142 | + * lower than the estimated bandwidth. This is an important aspect of the |
---|
| 143 | + * design. |
---|
| 144 | + */ |
---|
| 145 | +static const int bbr_pacing_margin_percent = 1; |
---|
| 146 | + |
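A quick back-of-the-envelope check of what this margin means (numbers are illustrative, not from the patch): with a bandwidth estimate of 100 Mbit/s and `bbr_pacing_margin_percent = 1`, the long-run pacing target works out to about 99 Mbit/s.

```c
/* Illustration only: the 1% pacing margin applied to a made-up bw estimate.
 * The kernel folds this margin into the rate conversion changed further down
 * in this patch, not via a helper like this one.
 */
#include <stdio.h>
#include <stdint.h>

static const int bbr_pacing_margin_percent = 1;

int main(void)
{
	uint64_t bw_bps = 100000000ULL;		/* estimated bw: 100 Mbit/s */
	uint64_t pace_bps = bw_bps / 100 * (100 - bbr_pacing_margin_percent);

	printf("bw %llu bit/s -> pacing target %llu bit/s\n",
	       (unsigned long long)bw_bps, (unsigned long long)pace_bps);
	return 0;
}
```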
---|
139 | 147 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain |
---|
140 | 148 | * that will allow a smoothly increasing pacing rate that will double each RTT |
---|
141 | 149 | * and send the same number of packets per RTT that an un-paced, slow-starting |
---|
.. | .. |
---|
235 | 243 | { |
---|
236 | 244 | unsigned int mss = tcp_sk(sk)->mss_cache; |
---|
237 | 245 | |
---|
238 | | - if (!tcp_needs_internal_pacing(sk)) |
---|
239 | | - mss = tcp_mss_to_mtu(sk, mss); |
---|
240 | 246 | rate *= mss; |
---|
241 | 247 | rate *= gain; |
---|
242 | 248 | rate >>= BBR_SCALE; |
---|
243 | | - rate *= USEC_PER_SEC; |
---|
| 249 | + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); |
---|
244 | 250 | return rate >> BW_SCALE; |
---|
245 | 251 | } |
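To make the fixed-point arithmetic in the helper above concrete, here is a small userspace sketch of the same steps. It assumes the BW_SCALE = 24 and BBR_SCALE = 8 definitions used elsewhere in tcp_bbr.c; the sample bw, MSS, and gain values are made up. The rate comes in as packets per microsecond left-shifted by BW_SCALE, and the ~1% margin is folded into the usec-to-sec multiplier.

```c
/* Userspace sketch of the rate conversion above.  Assumes BW_SCALE = 24 and
 * BBR_SCALE = 8 as defined earlier in tcp_bbr.c; the bw, mss and gain values
 * below are illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE	24
#define BBR_SCALE	8
#define BBR_UNIT	(1 << BBR_SCALE)
#define USEC_PER_SEC	1000000L

static const int bbr_pacing_margin_percent = 1;

static uint64_t rate_bytes_per_sec(uint64_t bw, unsigned int mss, int gain)
{
	uint64_t rate = bw;		/* bw: pkts/usec << BW_SCALE */

	rate *= mss;
	rate *= gain;
	rate >>= BBR_SCALE;
	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
	return rate >> BW_SCALE;
}

int main(void)
{
	/* 10,000 pkts/sec = 0.01 pkt/usec, in BW_SCALE fixed point: */
	uint64_t bw = (uint64_t)(0.01 * (1 << BW_SCALE));

	/* 10,000 pkts/s * 1448 B ~= 14.48 MB/s; with the ~1% margin the
	 * result lands just above 14.33 MB/s.
	 */
	printf("%llu bytes/sec\n",
	       (unsigned long long)rate_bytes_per_sec(bw, 1448, BBR_UNIT));
	return 0;
}
```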
---|
246 | 252 | |
---|
247 | 253 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ |
---|
248 | | -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
---|
| 254 | +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
---|
249 | 255 | { |
---|
250 | 256 | u64 rate = bw; |
---|
251 | 257 | |
---|
.. | .. |
---|
273 | 279 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); |
---|
274 | 280 | } |
---|
275 | 281 | |
---|
276 | | -/* Pace using current bw estimate and a gain factor. In order to help drive the |
---|
277 | | - * network toward lower queues while maintaining high utilization and low |
---|
278 | | - * latency, the average pacing rate aims to be slightly (~1%) lower than the |
---|
279 | | - * estimated bandwidth. This is an important aspect of the design. In this |
---|
280 | | - * implementation this slightly lower pacing rate is achieved implicitly by not |
---|
281 | | - * including link-layer headers in the packet size used for the pacing rate. |
---|
282 | | - */ |
---|
| 282 | +/* Pace using current bw estimate and a gain factor. */ |
---|
283 | 283 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) |
---|
284 | 284 | { |
---|
285 | 285 | struct tcp_sock *tp = tcp_sk(sk); |
---|
286 | 286 | struct bbr *bbr = inet_csk_ca(sk); |
---|
287 | | - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
---|
| 287 | + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
---|
288 | 288 | |
---|
289 | 289 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) |
---|
290 | 290 | bbr_init_pacing_rate_from_rtt(sk); |
---|
.. | .. |
---|
306 | 306 | /* Sort of tcp_tso_autosize() but ignoring |
---|
307 | 307 | * driver provided sk_gso_max_size. |
---|
308 | 308 | */ |
---|
309 | | - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, |
---|
| 309 | + bytes = min_t(unsigned long, |
---|
| 310 | + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), |
---|
310 | 311 | GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); |
---|
311 | 312 | segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); |
---|
312 | 313 | |
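As a rough illustration of the sizing above (assuming the default sk_pacing_shift of 10, i.e. roughly 1 ms of data per burst; the rate, MSS, and minimum segment count below are made up, and the GSO_MAX_SIZE clamp is omitted since the example stays far below it): a ~100 Mbit/s pacing rate gives a burst budget of about 12 KB, or 8 MSS-sized segments.

```c
/* Rough userspace sketch of the TSO sizing math above.  Assumes the default
 * sk_pacing_shift of 10 (~1 ms of data per burst); all values below are
 * illustrative only.
 */
#include <stdio.h>

int main(void)
{
	unsigned long pacing_rate = 12500000UL;	/* ~100 Mbit/s in bytes/sec */
	unsigned int pacing_shift = 10;		/* assumed sk_pacing_shift */
	unsigned int mss = 1448;
	unsigned int min_tso_segs = 2;		/* stand-in for bbr_min_tso_segs() */
	unsigned long bytes, segs;

	bytes = pacing_rate >> pacing_shift;	/* ~1 ms worth of bytes */
	segs = bytes / mss;
	if (segs < min_tso_segs)
		segs = min_tso_segs;

	printf("burst budget: %lu bytes -> %lu segments per TSO skb\n",
	       bytes, segs);
	return 0;
}
```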
---|
.. | .. |
---|
346 | 347 | |
---|
347 | 348 | /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: |
---|
348 | 349 | * |
---|
349 | | - * bdp = bw * min_rtt * gain |
---|
| 350 | + * bdp = ceil(bw * min_rtt * gain) |
---|
350 | 351 | * |
---|
351 | 352 | * The key factor, gain, controls the amount of queue. While a small gain |
---|
352 | 353 | * builds a smaller queue, it becomes more vulnerable to noise in RTT |
---|
.. | .. |
---|
370 | 371 | |
---|
371 | 372 | w = (u64)bw * bbr->min_rtt_us; |
---|
372 | 373 | |
---|
373 | | - /* Apply a gain to the given value, then remove the BW_SCALE shift. */ |
---|
| 374 | + /* Apply a gain to the given value, remove the BW_SCALE shift, and |
---|
| 375 | + * round the value up to avoid a negative feedback loop. |
---|
| 376 | + */ |
---|
374 | 377 | bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; |
---|
375 | 378 | |
---|
376 | 379 | return bdp; |
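A worked example of this fixed-point path, as a userspace sketch (assuming BW_SCALE = 24, BW_UNIT = 1 << 24, and BBR_UNIT = 1 << 8 as defined near the top of tcp_bbr.c; the bw and min_rtt values are illustrative): at 10,000 packets per second and a 10 ms min RTT, a gain of 1.0 should give a BDP of about 100 packets, and the round-up keeps fixed-point truncation from shaving that to 99.

```c
/* Userspace sketch of the bdp calculation above.  Assumes BW_SCALE = 24,
 * BW_UNIT = 1 << 24 and BBR_UNIT = 1 << 8; sample bw and min_rtt are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE	24
#define BW_UNIT		(1ULL << BW_SCALE)
#define BBR_SCALE	8
#define BBR_UNIT	(1 << BBR_SCALE)

static uint32_t bdp_packets(uint32_t bw, uint32_t min_rtt_us, int gain)
{
	uint64_t w = (uint64_t)bw * min_rtt_us;

	/* Round up so a BDP-sized cwnd never under-fills the estimated pipe. */
	return (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
}

int main(void)
{
	/* 10,000 pkts/sec = 0.01 pkt/usec in BW_SCALE fixed point; 10 ms RTT. */
	uint32_t bw = (uint32_t)(0.01 * BW_UNIT);
	uint32_t min_rtt_us = 10000;

	/* Expect ~100 packets of BDP at gain 1.0 (BBR_UNIT). */
	printf("bdp = %u packets\n", bdp_packets(bw, min_rtt_us, BBR_UNIT));
	return 0;
}
```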
---|
.. | .. |
---|
386 | 389 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe |
---|
387 | 390 | * full even with ACK-every-other-packet delayed ACKs. |
---|
388 | 391 | */ |
---|
389 | | -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) |
---|
| 392 | +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) |
---|
390 | 393 | { |
---|
391 | 394 | struct bbr *bbr = inet_csk_ca(sk); |
---|
392 | 395 | |
---|
.. | .. |
---|
397 | 400 | cwnd = (cwnd + 1) & ~1U; |
---|
398 | 401 | |
---|
399 | 402 | /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ |
---|
400 | | - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) |
---|
| 403 | + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) |
---|
401 | 404 | cwnd += 2; |
---|
402 | 405 | |
---|
403 | 406 | return cwnd; |
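For a sense of the arithmetic (made-up numbers; any budget steps hidden in the elided lines above are not reproduced): an odd target such as 41 packets is rounded up to 42, and during PROBE_BW's probing phase (cycle_idx == 0) two more packets are added, giving 44.

```c
/* Tiny illustration of the two budget steps visible above: round the cwnd up
 * to an even packet count (so two 2-packet sequences can be outstanding under
 * ACK-every-other-packet delayed ACKs), then add 2 packets while PROBE_BW is
 * in its probing phase (cycle_idx == 0).  Steps in the elided lines are
 * skipped here.
 */
#include <stdio.h>

static unsigned int quantization_budget(unsigned int cwnd, int probing)
{
	cwnd = (cwnd + 1) & ~1U;	/* round up to an even packet count */
	if (probing)			/* BBR_PROBE_BW with cycle_idx == 0 */
		cwnd += 2;
	return cwnd;
}

int main(void)
{
	printf("41 -> %u (cruising), 41 -> %u (probing)\n",
	       quantization_budget(41, 0), quantization_budget(41, 1));
	return 0;
}
```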
---|
.. | .. |
---|
409 | 412 | u32 inflight; |
---|
410 | 413 | |
---|
411 | 414 | inflight = bbr_bdp(sk, bw, gain); |
---|
412 | | - inflight = bbr_quantization_budget(sk, inflight, gain); |
---|
| 415 | + inflight = bbr_quantization_budget(sk, inflight); |
---|
413 | 416 | |
---|
414 | 417 | return inflight; |
---|
| 418 | +} |
---|
| 419 | + |
---|
| 420 | +/* With pacing at lower layers, there's often less data "in the network" than |
---|
| 421 | + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), |
---|
| 422 | + * we often have several skbs queued in the pacing layer with a pre-scheduled |
---|
| 423 | + * earliest departure time (EDT). BBR adapts its pacing rate based on the |
---|
| 424 | + * inflight level that it estimates has already been "baked in" by previous |
---|
| 425 | + * departure time decisions. We calculate a rough estimate of the number of our |
---|
| 426 | + * packets that might be in the network at the earliest departure time for the |
---|
| 427 | + * next skb scheduled: |
---|
| 428 | + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw |
---|
| 429 | + * If we're increasing inflight, then we want to know if the transmit of the |
---|
| 430 | + * EDT skb will push inflight above the target, so inflight_at_edt includes |
---|
| 431 | + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, |
---|
| 432 | + * then estimate if inflight will sink too low just before the EDT transmit. |
---|
| 433 | + */ |
---|
| 434 | +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) |
---|
| 435 | +{ |
---|
| 436 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 437 | + struct bbr *bbr = inet_csk_ca(sk); |
---|
| 438 | + u64 now_ns, edt_ns, interval_us; |
---|
| 439 | + u32 interval_delivered, inflight_at_edt; |
---|
| 440 | + |
---|
| 441 | + now_ns = tp->tcp_clock_cache; |
---|
| 442 | + edt_ns = max(tp->tcp_wstamp_ns, now_ns); |
---|
| 443 | + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); |
---|
| 444 | + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; |
---|
| 445 | + inflight_at_edt = inflight_now; |
---|
| 446 | + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ |
---|
| 447 | + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ |
---|
| 448 | + if (interval_delivered >= inflight_at_edt) |
---|
| 449 | + return 0; |
---|
| 450 | + return inflight_at_edt - interval_delivered; |
---|
415 | 451 | } |
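The same estimate as a standalone userspace sketch (assuming BW_SCALE = 24; the timestamps, bw, inflight, and TSO goal values are invented): if the next skb's earliest departure time is 2 ms away and the bw estimate is 10,000 packets per second, roughly 20 packets drain from the network before that skb leaves, so the in-network estimate ends up noticeably below the raw inflight count.

```c
/* Userspace sketch of the in_network_at_edt estimate above.  Assumes
 * BW_SCALE = 24 as in tcp_bbr.c; all sample values are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE	24
#define NSEC_PER_USEC	1000ULL

int main(void)
{
	uint64_t now_ns = 1000000000ULL;		/* "now" */
	uint64_t edt_ns = now_ns + 2000000ULL;		/* next EDT: 2 ms later */
	uint32_t bw = (uint32_t)(0.01 * (1 << BW_SCALE)); /* 0.01 pkt/usec */
	uint32_t inflight_now = 120, tso_segs_goal = 2;
	int increasing_inflight = 1;			/* pacing_gain > 1.0 */
	uint64_t interval_us = (edt_ns - now_ns) / NSEC_PER_USEC;
	uint32_t interval_delivered = (uint64_t)bw * interval_us >> BW_SCALE;
	uint32_t inflight_at_edt = inflight_now;

	if (increasing_inflight)
		inflight_at_edt += tso_segs_goal;	/* include the EDT skb */

	/* Packets still expected in the network when the EDT skb departs. */
	printf("in_network_at_edt = %u\n",
	       interval_delivered >= inflight_at_edt ?
	       0 : inflight_at_edt - interval_delivered);
	return 0;
}
```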
---|
416 | 452 | |
---|
417 | 453 | /* Find the cwnd increment based on estimate of ack aggregation */ |
---|
.. | .. |
---|
496 | 532 | * due to aggregation (of data and/or ACKs) visible in the ACK stream. |
---|
497 | 533 | */ |
---|
498 | 534 | target_cwnd += bbr_ack_aggregation_cwnd(sk); |
---|
499 | | - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); |
---|
| 535 | + target_cwnd = bbr_quantization_budget(sk, target_cwnd); |
---|
500 | 536 | |
---|
501 | 537 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ |
---|
502 | 538 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ |
---|
.. | .. |
---|
528 | 564 | if (bbr->pacing_gain == BBR_UNIT) |
---|
529 | 565 | return is_full_length; /* just use wall clock time */ |
---|
530 | 566 | |
---|
531 | | - inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ |
---|
| 567 | + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); |
---|
532 | 568 | bw = bbr_max_bw(sk); |
---|
533 | 569 | |
---|
534 | 570 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at |
---|
.. | .. |
---|
556 | 592 | |
---|
557 | 593 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); |
---|
558 | 594 | bbr->cycle_mstamp = tp->delivered_mstamp; |
---|
559 | | - bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : |
---|
560 | | - bbr_pacing_gain[bbr->cycle_idx]; |
---|
561 | 595 | } |
---|
562 | 596 | |
---|
563 | 597 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ |
---|
.. | .. |
---|
575 | 609 | struct bbr *bbr = inet_csk_ca(sk); |
---|
576 | 610 | |
---|
577 | 611 | bbr->mode = BBR_STARTUP; |
---|
578 | | - bbr->pacing_gain = bbr_high_gain; |
---|
579 | | - bbr->cwnd_gain = bbr_high_gain; |
---|
580 | 612 | } |
---|
581 | 613 | |
---|
582 | 614 | static void bbr_reset_probe_bw_mode(struct sock *sk) |
---|
.. | .. |
---|
584 | 616 | struct bbr *bbr = inet_csk_ca(sk); |
---|
585 | 617 | |
---|
586 | 618 | bbr->mode = BBR_PROBE_BW; |
---|
587 | | - bbr->pacing_gain = BBR_UNIT; |
---|
588 | | - bbr->cwnd_gain = bbr_cwnd_gain; |
---|
589 | 619 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); |
---|
590 | 620 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ |
---|
591 | 621 | } |
---|
.. | .. |
---|
863 | 893 | |
---|
864 | 894 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { |
---|
865 | 895 | bbr->mode = BBR_DRAIN; /* drain queue we created */ |
---|
866 | | - bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ |
---|
867 | | - bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ |
---|
868 | 896 | tcp_sk(sk)->snd_ssthresh = |
---|
869 | 897 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); |
---|
870 | 898 | } /* fall through to check if in-flight is already small: */ |
---|
871 | 899 | if (bbr->mode == BBR_DRAIN && |
---|
872 | | - tcp_packets_in_flight(tcp_sk(sk)) <= |
---|
| 900 | + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= |
---|
873 | 901 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) |
---|
874 | 902 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ |
---|
875 | 903 | } |
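Why the switch away from raw tcp_packets_in_flight() matters here: skbs already handed to the pacing layer with a future departure time still count as in flight, but part of the true in-network data will have drained by the time the next skb departs. Comparing the EDT-adjusted estimate against the BDP target avoids lingering in DRAIN after the queue is effectively gone. A made-up example (all numbers invented):

```c
/* Illustration of the DRAIN-exit check with invented numbers: the EDT-adjusted
 * in-network estimate (what bbr_packets_in_net_at_edt() would return) is
 * compared against a BDP-sized target (bbr_inflight() at gain 1.0).
 */
#include <stdio.h>

int main(void)
{
	unsigned int packets_in_flight = 130;	/* raw tcp_packets_in_flight() */
	unsigned int drained_by_edt = 35;	/* expected to drain before next EDT */
	unsigned int in_net_at_edt = packets_in_flight - drained_by_edt; /* 95 */
	unsigned int bdp = 100;			/* BDP-sized inflight target */

	if (in_net_at_edt <= bdp)
		printf("queue estimated drained: leave DRAIN for PROBE_BW\n");
	else
		printf("keep draining\n");
	return 0;
}
```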
---|
.. | .. |
---|
926 | 954 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && |
---|
927 | 955 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { |
---|
928 | 956 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ |
---|
929 | | - bbr->pacing_gain = BBR_UNIT; |
---|
930 | | - bbr->cwnd_gain = BBR_UNIT; |
---|
931 | 957 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ |
---|
932 | 958 | bbr->probe_rtt_done_stamp = 0; |
---|
933 | 959 | } |
---|
.. | .. |
---|
955 | 981 | bbr->idle_restart = 0; |
---|
956 | 982 | } |
---|
957 | 983 | |
---|
| 984 | +static void bbr_update_gains(struct sock *sk) |
---|
| 985 | +{ |
---|
| 986 | + struct bbr *bbr = inet_csk_ca(sk); |
---|
| 987 | + |
---|
| 988 | + switch (bbr->mode) { |
---|
| 989 | + case BBR_STARTUP: |
---|
| 990 | + bbr->pacing_gain = bbr_high_gain; |
---|
| 991 | + bbr->cwnd_gain = bbr_high_gain; |
---|
| 992 | + break; |
---|
| 993 | + case BBR_DRAIN: |
---|
| 994 | + bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ |
---|
| 995 | + bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ |
---|
| 996 | + break; |
---|
| 997 | + case BBR_PROBE_BW: |
---|
| 998 | + bbr->pacing_gain = (bbr->lt_use_bw ? |
---|
| 999 | + BBR_UNIT : |
---|
| 1000 | + bbr_pacing_gain[bbr->cycle_idx]); |
---|
| 1001 | + bbr->cwnd_gain = bbr_cwnd_gain; |
---|
| 1002 | + break; |
---|
| 1003 | + case BBR_PROBE_RTT: |
---|
| 1004 | + bbr->pacing_gain = BBR_UNIT; |
---|
| 1005 | + bbr->cwnd_gain = BBR_UNIT; |
---|
| 1006 | + break; |
---|
| 1007 | + default: |
---|
| 1008 | + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); |
---|
| 1009 | + break; |
---|
| 1010 | + } |
---|
| 1011 | +} |
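For reference, with the gain constants defined earlier in tcp_bbr.c (not shown in this hunk, so treat the exact values below as an assumption), the mapping resolves to roughly 2.89x pacing and cwnd gain in STARTUP, ~0.35x pacing with 2.89x cwnd in DRAIN, a cycling pacing gain (or a flat 1.0 when lt_use_bw is set) with 2x cwnd in PROBE_BW, and 1.0x for both in PROBE_RTT. A small sketch that prints those numbers:

```c
/* Rough numeric view of the mode -> gain mapping, assuming the usual gain
 * constants from tcp_bbr.c (BBR_UNIT = 256, high gain ~= 2/ln(2), drain gain
 * its inverse, cwnd gain 2x).  Purely illustrative.
 */
#include <stdio.h>

#define BBR_UNIT 256

int main(void)
{
	int high_gain  = BBR_UNIT * 2885 / 1000 + 1;	/* ~2.89x: 739 */
	int drain_gain = BBR_UNIT * 1000 / 2885;	/* ~0.35x:  88 */
	int cwnd_gain  = BBR_UNIT * 2;			/*   2x:   512 */

	printf("STARTUP:   pacing %d/256, cwnd %d/256\n", high_gain, high_gain);
	printf("DRAIN:     pacing %d/256, cwnd %d/256\n", drain_gain, high_gain);
	printf("PROBE_BW:  pacing cycles (e.g. 5/4, 3/4, 1.0), cwnd %d/256\n",
	       cwnd_gain);
	printf("PROBE_RTT: pacing %d/256, cwnd %d/256\n", BBR_UNIT, BBR_UNIT);
	return 0;
}
```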
---|
| 1012 | + |
---|
958 | 1013 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) |
---|
959 | 1014 | { |
---|
960 | 1015 | bbr_update_bw(sk, rs); |
---|
.. | .. |
---|
963 | 1018 | bbr_check_full_bw_reached(sk, rs); |
---|
964 | 1019 | bbr_check_drain(sk, rs); |
---|
965 | 1020 | bbr_update_min_rtt(sk, rs); |
---|
| 1021 | + bbr_update_gains(sk); |
---|
966 | 1022 | } |
---|
967 | 1023 | |
---|
968 | 1024 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) |
---|