| .. | .. |
|---|
| 136 | 136 | /* Skip TSO below the following bandwidth (bits/sec): */ |
|---|
| 137 | 137 | static const int bbr_min_tso_rate = 1200000; |
|---|
| 138 | 138 | |
|---|
| 139 | +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. |
|---|
| 140 | + * In order to help drive the network toward lower queues and low latency while |
|---|
| 141 | + * maintaining high utilization, the average pacing rate aims to be slightly |
|---|
| 142 | + * lower than the estimated bandwidth. This is an important aspect of the |
|---|
| 143 | + * design. |
|---|
| 144 | + */ |
|---|
| 145 | +static const int bbr_pacing_margin_percent = 1; |
|---|
| 146 | + |
|---|
| 139 | 147 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain |
|---|
| 140 | 148 | * that will allow a smoothly increasing pacing rate that will double each RTT |
|---|
| 141 | 149 | * and send the same number of packets per RTT that an un-paced, slow-starting |
|---|
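
To make the new `bbr_pacing_margin_percent` concrete: with a gain of 1.0 and a 1% margin, the long-run pacing rate settles at 99% of the bandwidth estimate, which is what lets the bottleneck queue drain on average. The 2/ln(2) ≈ 2.885 startup gain discussed in the following comment is layered on top of this; the margin is applied to whatever gained rate is computed. The sketch below is a standalone userspace illustration, not part of the patch, and the bandwidth figure is made up.

```c
/* Illustration only (userspace C, not part of the patch): the effect of a
 * 1% pacing margin on an estimated bandwidth.
 */
#include <stdio.h>
#include <stdint.h>

static const int bbr_pacing_margin_percent = 1;

int main(void)
{
	uint64_t est_bw_bytes_per_sec = 1250000;	/* ~10 Mbit/s estimate */
	uint64_t pacing;

	/* Pace slightly below the estimate so the bottleneck queue drains
	 * on average instead of slowly building up.
	 */
	pacing = est_bw_bytes_per_sec / 100 * (100 - bbr_pacing_margin_percent);

	printf("estimate: %llu B/s, paced at: %llu B/s (99%%)\n",
	       (unsigned long long)est_bw_bytes_per_sec,
	       (unsigned long long)pacing);
	return 0;
}
```
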
| .. | .. |
|---|
| 235 | 243 | { |
|---|
| 236 | 244 | unsigned int mss = tcp_sk(sk)->mss_cache; |
|---|
| 237 | 245 | |
|---|
| 238 | | - if (!tcp_needs_internal_pacing(sk)) |
|---|
| 239 | | - mss = tcp_mss_to_mtu(sk, mss); |
|---|
| 240 | 246 | rate *= mss; |
|---|
| 241 | 247 | rate *= gain; |
|---|
| 242 | 248 | rate >>= BBR_SCALE; |
|---|
| 243 | | - rate *= USEC_PER_SEC; |
|---|
| 249 | + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); |
|---|
| 244 | 250 | return rate >> BW_SCALE; |
|---|
| 245 | 251 | } |
|---|
| 246 | 252 | |
|---|
| 247 | 253 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ |
|---|
| 248 | | -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
|---|
| 254 | +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
|---|
| 249 | 255 | { |
|---|
| 250 | 256 | u64 rate = bw; |
|---|
| 251 | 257 | |
|---|
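
A worked version of the bbr_rate_bytes_per_sec() conversion above, runnable in userspace. Two assumptions are imported from the rest of tcp_bbr.c rather than from this hunk: bw is in packets per microsecond shifted left by BW_SCALE (24), and gains use BBR_UNIT (1 << 8) fixed point. Note that USEC_PER_SEC is divisible by 100, so `USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent)` is the exact constant 990000 for a 1% margin; dividing first loses no precision.

```c
/* Userspace sketch of the rate conversion; BW_SCALE/BBR_SCALE values and
 * the "pkts/us << BW_SCALE" unit for bw are assumptions taken from the
 * rest of tcp_bbr.c, not from this hunk.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1ULL << BW_SCALE)
#define BBR_SCALE 8
#define BBR_UNIT (1 << BBR_SCALE)
#define USEC_PER_SEC 1000000ULL

static const int bbr_pacing_margin_percent = 1;

/* bw: packets per usec, << BW_SCALE; gain: BBR_UNIT == 1.0 */
static uint64_t rate_bytes_per_sec(uint64_t bw, uint32_t mss, int gain)
{
	uint64_t rate = bw;

	rate *= mss;
	rate *= gain;
	rate >>= BBR_SCALE;
	/* 1000000 is divisible by 100, so this constant is exactly 990000 */
	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
	return rate >> BW_SCALE;
}

int main(void)
{
	uint32_t mss = 1448;
	/* ~10 Mbit/s: 1.25e6 B/s / 1448 B/pkt ~= 863 pkts/s = 0.000863 pkts/us */
	uint64_t bw = (uint64_t)(0.000863 * BW_UNIT);

	/* Prints a rate ~1% below the raw 1.25e6 B/s estimate */
	printf("pacing rate ~= %llu bytes/sec\n",
	       (unsigned long long)rate_bytes_per_sec(bw, mss, BBR_UNIT));
	return 0;
}
```
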
| .. | .. |
|---|
| 273 | 279 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); |
|---|
| 274 | 280 | } |
|---|
| 275 | 281 | |
|---|
| 276 | | -/* Pace using current bw estimate and a gain factor. In order to help drive the |
|---|
| 277 | | - * network toward lower queues while maintaining high utilization and low |
|---|
| 278 | | - * latency, the average pacing rate aims to be slightly (~1%) lower than the |
|---|
| 279 | | - * estimated bandwidth. This is an important aspect of the design. In this |
|---|
| 280 | | - * implementation this slightly lower pacing rate is achieved implicitly by not |
|---|
| 281 | | - * including link-layer headers in the packet size used for the pacing rate. |
|---|
| 282 | | - */ |
|---|
| 282 | +/* Pace using current bw estimate and a gain factor. */ |
|---|
| 283 | 283 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) |
|---|
| 284 | 284 | { |
|---|
| 285 | 285 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 286 | 286 | struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 287 | | - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
|---|
| 287 | + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
|---|
| 288 | 288 | |
|---|
| 289 | 289 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) |
|---|
| 290 | 290 | bbr_init_pacing_rate_from_rtt(sk); |
|---|
| .. | .. |
|---|
| 306 | 306 | /* Sort of tcp_tso_autosize() but ignoring |
|---|
| 307 | 307 | * driver provided sk_gso_max_size. |
|---|
| 308 | 308 | */ |
|---|
| 309 | | - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, |
|---|
| 309 | + bytes = min_t(unsigned long, |
|---|
| 310 | + sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), |
|---|
| 310 | 311 | GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); |
|---|
| 311 | 312 | segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); |
|---|
| 312 | 313 | |
|---|
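
Rough userspace sketch of the burst sizing in bbr_tso_segs_goal(): one TSO/GSO burst should cover roughly one pacing interval of data (about 1ms with the socket layer's usual pacing shift of 10), capped below the GSO limit and floored at bbr_min_tso_segs(). The shift of 10, the GSO limit, and the header allowance below are assumptions about typical defaults, not values taken from this hunk.

```c
/* Userspace sketch of the TSO burst sizing above; constants are assumed
 * socket-layer defaults, not taken from this hunk.
 */
#include <stdio.h>
#include <stdint.h>

#define GSO_MAX_SIZE	65536
#define MAX_TCP_HEADER	320		/* rough figure for the sketch */

static uint32_t tso_segs_goal(uint64_t pacing_rate, uint32_t pacing_shift,
			      uint32_t mss, uint32_t min_segs)
{
	uint64_t bytes;
	uint32_t segs;

	/* Roughly: how many bytes does one pacing interval cover? */
	bytes = pacing_rate >> pacing_shift;
	if (bytes > GSO_MAX_SIZE - 1 - MAX_TCP_HEADER)
		bytes = GSO_MAX_SIZE - 1 - MAX_TCP_HEADER;

	segs = bytes / mss;
	return segs > min_segs ? segs : min_segs;
}

int main(void)
{
	/* ~10 Mbit/s with the assumed default shift of 10: the per-interval
	 * budget is small, so the floor of 2 segments applies.
	 */
	printf("TSO goal: %u segs\n", tso_segs_goal(1237500, 10, 1448, 2));
	return 0;
}
```
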
| .. | .. |
|---|
| 346 | 347 | |
|---|
| 347 | 348 | /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: |
|---|
| 348 | 349 | * |
|---|
| 349 | | - * bdp = bw * min_rtt * gain |
|---|
| 350 | + * bdp = ceil(bw * min_rtt * gain) |
|---|
| 350 | 351 | * |
|---|
| 351 | 352 | * The key factor, gain, controls the amount of queue. While a small gain |
|---|
| 352 | 353 | * builds a smaller queue, it becomes more vulnerable to noise in RTT |
|---|
| .. | .. |
|---|
| 370 | 371 | |
|---|
| 371 | 372 | w = (u64)bw * bbr->min_rtt_us; |
|---|
| 372 | 373 | |
|---|
| 373 | | - /* Apply a gain to the given value, then remove the BW_SCALE shift. */ |
|---|
| 374 | + /* Apply a gain to the given value, remove the BW_SCALE shift, and |
|---|
| 375 | + * round the value up to avoid a negative feedback loop. |
|---|
| 376 | + */ |
|---|
| 374 | 377 | bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; |
|---|
| 375 | 378 | |
|---|
| 376 | 379 | return bdp; |
|---|
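
The `+ BW_UNIT - 1` turns the final division into a ceiling, matching the `bdp = ceil(...)` line in the comment: if the target were rounded down instead, inflight would sit just below the true BDP, the pipe would be slightly under-filled, and the next bandwidth sample would shrink in turn, which is the negative feedback loop the comment warns about. Below is a userspace sketch of the same fixed-point arithmetic, with BW_SCALE/BBR_SCALE values assumed from the rest of the file.

```c
/* Userspace sketch of the BDP computation above, showing the ceiling. */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1ULL << BW_SCALE)
#define BBR_SCALE 8
#define BBR_UNIT (1 << BBR_SCALE)

static uint32_t bdp_pkts(uint64_t bw, uint32_t min_rtt_us, int gain)
{
	uint64_t w = bw * min_rtt_us;

	/* Round up: a target just below the real BDP would under-fill the
	 * pipe and drag the next bandwidth sample down with it.
	 */
	return (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
}

int main(void)
{
	/* ~863 pkts/s over a 20ms path: true BDP ~= 17.3 packets */
	uint64_t bw = (uint64_t)(0.000863 * BW_UNIT);	/* pkts/us << BW_SCALE */
	uint32_t rtt = 20000;				/* 20 ms in usec */

	printf("bdp(gain=1.0) = %u pkts\n", bdp_pkts(bw, rtt, BBR_UNIT));
	printf("bdp(gain=2.0) = %u pkts\n", bdp_pkts(bw, rtt, 2 * BBR_UNIT));
	return 0;
}
```
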
| .. | .. |
|---|
| 386 | 389 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe |
|---|
| 387 | 390 | * full even with ACK-every-other-packet delayed ACKs. |
|---|
| 388 | 391 | */ |
|---|
| 389 | | -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) |
|---|
| 392 | +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) |
|---|
| 390 | 393 | { |
|---|
| 391 | 394 | struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 392 | 395 | |
|---|
| .. | .. |
|---|
| 397 | 400 | cwnd = (cwnd + 1) & ~1U; |
|---|
| 398 | 401 | |
|---|
| 399 | 402 | /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ |
|---|
| 400 | | - if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) |
|---|
| 403 | + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) |
|---|
| 401 | 404 | cwnd += 2; |
|---|
| 402 | 405 | |
|---|
| 403 | 406 | return cwnd; |
|---|
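
A quick trace of the two visible budget steps, as a userspace sketch (the padding added in the elided lines above is not reproduced here): rounding cwnd up to an even value means a receiver that ACKs every other packet never strands a lone final segment, and the extra two segments in the first PROBE_BW phase (cycle_idx == 0, the phase that probes with a pacing gain above 1.0) let probing push inflight above the BDP even when the BDP is only a few packets.

```c
/* Userspace trace of the visible parts of the quantization budget. */
#include <stdio.h>
#include <stdint.h>

static uint32_t budget(uint32_t cwnd, int probing_phase)
{
	/* Round up to an even cwnd so an ACK-every-other-packet receiver
	 * never leaves a lone unacknowledged segment stalling the window.
	 */
	cwnd = (cwnd + 1) & ~1U;

	/* Let the probing phase push inflight above the BDP even when the
	 * BDP itself is only a couple of packets.
	 */
	if (probing_phase)
		cwnd += 2;
	return cwnd;
}

int main(void)
{
	printf("bdp=5, cruising: %u\n", budget(5, 0));	/* -> 6 */
	printf("bdp=5, probing:  %u\n", budget(5, 1));	/* -> 8 */
	return 0;
}
```
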
| .. | .. |
|---|
| 409 | 412 | u32 inflight; |
|---|
| 410 | 413 | |
|---|
| 411 | 414 | inflight = bbr_bdp(sk, bw, gain); |
|---|
| 412 | | - inflight = bbr_quantization_budget(sk, inflight, gain); |
|---|
| 415 | + inflight = bbr_quantization_budget(sk, inflight); |
|---|
| 413 | 416 | |
|---|
| 414 | 417 | return inflight; |
|---|
| 418 | +} |
|---|
| 419 | + |
|---|
| 420 | +/* With pacing at lower layers, there's often less data "in the network" than |
|---|
| 421 | + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), |
|---|
| 422 | + * we often have several skbs queued in the pacing layer with a pre-scheduled |
|---|
| 423 | + * earliest departure time (EDT). BBR adapts its pacing rate based on the |
|---|
| 424 | + * inflight level that it estimates has already been "baked in" by previous |
|---|
| 425 | + * departure time decisions. We calculate a rough estimate of the number of our |
|---|
| 426 | + * packets that might be in the network at the earliest departure time for the |
|---|
| 427 | + * next skb scheduled: |
|---|
| 428 | + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw |
|---|
| 429 | + * If we're increasing inflight, then we want to know if the transmit of the |
|---|
| 430 | + * EDT skb will push inflight above the target, so inflight_at_edt includes |
|---|
| 431 | + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, |
|---|
| 432 | + * then estimate if inflight will sink too low just before the EDT transmit. |
|---|
| 433 | + */ |
|---|
| 434 | +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) |
|---|
| 435 | +{ |
|---|
| 436 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 437 | + struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 438 | + u64 now_ns, edt_ns, interval_us; |
|---|
| 439 | + u32 interval_delivered, inflight_at_edt; |
|---|
| 440 | + |
|---|
| 441 | + now_ns = tp->tcp_clock_cache; |
|---|
| 442 | + edt_ns = max(tp->tcp_wstamp_ns, now_ns); |
|---|
| 443 | + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); |
|---|
| 444 | + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; |
|---|
| 445 | + inflight_at_edt = inflight_now; |
|---|
| 446 | + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ |
|---|
| 447 | + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ |
|---|
| 448 | + if (interval_delivered >= inflight_at_edt) |
|---|
| 449 | + return 0; |
|---|
| 450 | + return inflight_at_edt - interval_delivered; |
|---|
| 415 | 451 | } |
|---|
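
A userspace rendering of the estimate described in the comment above: of the packets counted as in flight now, roughly (EDT - now) * bw of them should have been delivered by the time the next pre-scheduled skb actually departs, so they are subtracted out; when probing (pacing_gain > 1.0) the departing skb's own segments are added first. The bw unit (packets per usec << BW_SCALE) is an assumption carried over from the rest of the file, and the numbers are made up.

```c
/* Userspace sketch of the in_network_at_edt estimate above. */
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1ULL << BW_SCALE)

static uint32_t in_net_at_edt(uint32_t inflight_now, uint64_t bw,
			      uint64_t edt_minus_now_us,
			      uint32_t tso_segs, int increasing_inflight)
{
	/* Packets we expect to be delivered before the next skb departs */
	uint32_t delivered = (bw * edt_minus_now_us) >> BW_SCALE;
	uint32_t inflight_at_edt = inflight_now;

	if (increasing_inflight)	/* count the skb departing at EDT */
		inflight_at_edt += tso_segs;
	if (delivered >= inflight_at_edt)
		return 0;
	return inflight_at_edt - delivered;
}

int main(void)
{
	uint64_t bw = (uint64_t)(0.000863 * BW_UNIT);	/* ~863 pkts/s */

	/* 20 packets out, next departure scheduled 5ms from now: roughly 4
	 * of them should have left the network by then.
	 */
	printf("in network at EDT: %u pkts\n",
	       in_net_at_edt(20, bw, 5000, 2, 1));
	return 0;
}
```
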
| 416 | 452 | |
|---|
| 417 | 453 | /* Find the cwnd increment based on estimate of ack aggregation */ |
|---|
| .. | .. |
|---|
| 496 | 532 | * due to aggregation (of data and/or ACKs) visible in the ACK stream. |
|---|
| 497 | 533 | */ |
|---|
| 498 | 534 | target_cwnd += bbr_ack_aggregation_cwnd(sk); |
|---|
| 499 | | - target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); |
|---|
| 535 | + target_cwnd = bbr_quantization_budget(sk, target_cwnd); |
|---|
| 500 | 536 | |
|---|
| 501 | 537 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ |
|---|
| 502 | 538 | if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ |
|---|
| .. | .. |
|---|
| 528 | 564 | if (bbr->pacing_gain == BBR_UNIT) |
|---|
| 529 | 565 | return is_full_length; /* just use wall clock time */ |
|---|
| 530 | 566 | |
|---|
| 531 | | - inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ |
|---|
| 567 | + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); |
|---|
| 532 | 568 | bw = bbr_max_bw(sk); |
|---|
| 533 | 569 | |
|---|
| 534 | 570 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at |
|---|
| .. | .. |
|---|
| 556 | 592 | |
|---|
| 557 | 593 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); |
|---|
| 558 | 594 | bbr->cycle_mstamp = tp->delivered_mstamp; |
|---|
| 559 | | - bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : |
|---|
| 560 | | - bbr_pacing_gain[bbr->cycle_idx]; |
|---|
| 561 | 595 | } |
|---|
| 562 | 596 | |
|---|
| 563 | 597 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ |
|---|
| .. | .. |
|---|
| 575 | 609 | struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 576 | 610 | |
|---|
| 577 | 611 | bbr->mode = BBR_STARTUP; |
|---|
| 578 | | - bbr->pacing_gain = bbr_high_gain; |
|---|
| 579 | | - bbr->cwnd_gain = bbr_high_gain; |
|---|
| 580 | 612 | } |
|---|
| 581 | 613 | |
|---|
| 582 | 614 | static void bbr_reset_probe_bw_mode(struct sock *sk) |
|---|
| .. | .. |
|---|
| 584 | 616 | struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 585 | 617 | |
|---|
| 586 | 618 | bbr->mode = BBR_PROBE_BW; |
|---|
| 587 | | - bbr->pacing_gain = BBR_UNIT; |
|---|
| 588 | | - bbr->cwnd_gain = bbr_cwnd_gain; |
|---|
| 589 | 619 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); |
|---|
| 590 | 620 | bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ |
|---|
| 591 | 621 | } |
|---|
| .. | .. |
|---|
| 863 | 893 | |
|---|
| 864 | 894 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { |
|---|
| 865 | 895 | bbr->mode = BBR_DRAIN; /* drain queue we created */ |
|---|
| 866 | | - bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ |
|---|
| 867 | | - bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ |
|---|
| 868 | 896 | tcp_sk(sk)->snd_ssthresh = |
|---|
| 869 | 897 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); |
|---|
| 870 | 898 | } /* fall through to check if in-flight is already small: */ |
|---|
| 871 | 899 | if (bbr->mode == BBR_DRAIN && |
|---|
| 872 | | - tcp_packets_in_flight(tcp_sk(sk)) <= |
|---|
| 900 | + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= |
|---|
| 873 | 901 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) |
|---|
| 874 | 902 | bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ |
|---|
| 875 | 903 | } |
|---|
| .. | .. |
|---|
| 926 | 954 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && |
|---|
| 927 | 955 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { |
|---|
| 928 | 956 | bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ |
|---|
| 929 | | - bbr->pacing_gain = BBR_UNIT; |
|---|
| 930 | | - bbr->cwnd_gain = BBR_UNIT; |
|---|
| 931 | 957 | bbr_save_cwnd(sk); /* note cwnd so we can restore it */ |
|---|
| 932 | 958 | bbr->probe_rtt_done_stamp = 0; |
|---|
| 933 | 959 | } |
|---|
| .. | .. |
|---|
| 955 | 981 | bbr->idle_restart = 0; |
|---|
| 956 | 982 | } |
|---|
| 957 | 983 | |
|---|
| 984 | +static void bbr_update_gains(struct sock *sk) |
|---|
| 985 | +{ |
|---|
| 986 | + struct bbr *bbr = inet_csk_ca(sk); |
|---|
| 987 | + |
|---|
| 988 | + switch (bbr->mode) { |
|---|
| 989 | + case BBR_STARTUP: |
|---|
| 990 | + bbr->pacing_gain = bbr_high_gain; |
|---|
| 991 | + bbr->cwnd_gain = bbr_high_gain; |
|---|
| 992 | + break; |
|---|
| 993 | + case BBR_DRAIN: |
|---|
| 994 | + bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ |
|---|
| 995 | + bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ |
|---|
| 996 | + break; |
|---|
| 997 | + case BBR_PROBE_BW: |
|---|
| 998 | + bbr->pacing_gain = (bbr->lt_use_bw ? |
|---|
| 999 | + BBR_UNIT : |
|---|
| 1000 | + bbr_pacing_gain[bbr->cycle_idx]); |
|---|
| 1001 | + bbr->cwnd_gain = bbr_cwnd_gain; |
|---|
| 1002 | + break; |
|---|
| 1003 | + case BBR_PROBE_RTT: |
|---|
| 1004 | + bbr->pacing_gain = BBR_UNIT; |
|---|
| 1005 | + bbr->cwnd_gain = BBR_UNIT; |
|---|
| 1006 | + break; |
|---|
| 1007 | + default: |
|---|
| 1008 | + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); |
|---|
| 1009 | + break; |
|---|
| 1010 | + } |
|---|
| 1011 | +} |
|---|
| 1012 | + |
|---|
| 958 | 1013 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) |
|---|
| 959 | 1014 | { |
|---|
| 960 | 1015 | bbr_update_bw(sk, rs); |
|---|
| .. | .. |
|---|
| 963 | 1018 | bbr_check_full_bw_reached(sk, rs); |
|---|
| 964 | 1019 | bbr_check_drain(sk, rs); |
|---|
| 965 | 1020 | bbr_update_min_rtt(sk, rs); |
|---|
| 1021 | + bbr_update_gains(sk); |
|---|
| 966 | 1022 | } |
|---|
| 967 | 1023 | |
|---|
| 968 | 1024 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) |
|---|