| Old | New | Code |
|---|---|---|
| .. | .. |  |
| 136 | 136 | /* Skip TSO below the following bandwidth (bits/sec): */ |
| 137 | 137 | static const int bbr_min_tso_rate = 1200000; |
| 138 | 138 |  |
|  | 139 | +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. |
|  | 140 | + * In order to help drive the network toward lower queues and low latency while |
|  | 141 | + * maintaining high utilization, the average pacing rate aims to be slightly |
|  | 142 | + * lower than the estimated bandwidth. This is an important aspect of the |
|  | 143 | + * design. |
|  | 144 | + */ |
|  | 145 | +static const int bbr_pacing_margin_percent = 1; |
|  | 146 | + |
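The new margin folds into the pacing-rate conversion as a (100 - margin) / 100 multiplier, as a later hunk shows. A standalone sketch (hypothetical user-space helper, not kernel code) of the effect on a sample rate:

```c
/* Sketch: apply a 1% pacing margin to a rate in bytes/sec, mirroring the
 * (100 - bbr_pacing_margin_percent) / 100 scaling the patch introduces.
 */
#include <stdio.h>

static const int pacing_margin_percent = 1;	/* mirrors bbr_pacing_margin_percent */

static unsigned long apply_margin(unsigned long rate)
{
	return rate / 100 * (100 - pacing_margin_percent);
}

int main(void)
{
	/* ~100 Mbit/s (12.5 MB/s) estimated -> paced ~1% lower */
	printf("%lu\n", apply_margin(12500000UL));	/* prints 12375000 */
	return 0;
}
```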
| Old | New | Code |
|---|---|---|
| 139 | 147 | /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain |
| 140 | 148 | * that will allow a smoothly increasing pacing rate that will double each RTT |
| 141 | 149 | * and send the same number of packets per RTT that an un-paced, slow-starting |
| .. | .. |  |
| 235 | 243 | { |
| 236 | 244 | unsigned int mss = tcp_sk(sk)->mss_cache; |
| 237 | 245 |  |
| 238 |  | -	if (!tcp_needs_internal_pacing(sk)) |
| 239 |  | -		mss = tcp_mss_to_mtu(sk, mss); |
| 240 | 246 | rate *= mss; |
| 241 | 247 | rate *= gain; |
| 242 | 248 | rate >>= BBR_SCALE; |
| 243 |  | -	rate *= USEC_PER_SEC; |
|  | 249 | +	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); |
| 244 | 250 | return rate >> BW_SCALE; |
| 245 | 251 | } |
| 246 | 252 |  |
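For reference, a standalone sketch (not kernel code) of the fixed-point conversion above, with constants mirroring the kernel's BW_SCALE/BBR_SCALE and made-up sample numbers; bw is packets per usec in BW_UNIT fixed point, gain is in BBR_UNIT fixed point:

```c
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE	24
#define BW_UNIT		(1 << BW_SCALE)	/* bw fixed point: pkts/usec << 24 */
#define BBR_SCALE	8
#define BBR_UNIT	(1 << BBR_SCALE)	/* gain fixed point: 1.0 == 256 */
#define USEC_PER_SEC	1000000L
static const int bbr_pacing_margin_percent = 1;

static uint64_t rate_bytes_per_sec(uint64_t rate, unsigned int mss, int gain)
{
	rate *= mss;
	rate *= gain;
	rate >>= BBR_SCALE;
	/* fold the ~1% margin into the usec->sec conversion, as above */
	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
	return rate >> BW_SCALE;
}

int main(void)
{
	/* 1 pkt/usec, mss 1000, gain 1.0: ideal 1e9 bytes/sec,
	 * paced 1% lower -> prints 990000000.
	 */
	printf("%llu\n",
	       (unsigned long long)rate_bytes_per_sec(1ULL * BW_UNIT, 1000, BBR_UNIT));
	return 0;
}
```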
| Old | New | Code |
|---|---|---|
| 247 | 253 | /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ |
| 248 |  | -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
|  | 254 | +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) |
| 249 | 255 | { |
| 250 | 256 | u64 rate = bw; |
| 251 | 257 |  |
| .. | .. |  |
| 273 | 279 | sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); |
| 274 | 280 | } |
| 275 | 281 |  |
| 276 |  | -/* Pace using current bw estimate and a gain factor. In order to help drive the |
| 277 |  | - * network toward lower queues while maintaining high utilization and low |
| 278 |  | - * latency, the average pacing rate aims to be slightly (~1%) lower than the |
| 279 |  | - * estimated bandwidth. This is an important aspect of the design. In this |
| 280 |  | - * implementation this slightly lower pacing rate is achieved implicitly by not |
| 281 |  | - * including link-layer headers in the packet size used for the pacing rate. |
| 282 |  | - */ |
|  | 282 | +/* Pace using current bw estimate and a gain factor. */ |
| 283 | 283 | static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) |
| 284 | 284 | { |
| 285 | 285 | struct tcp_sock *tp = tcp_sk(sk); |
| 286 | 286 | struct bbr *bbr = inet_csk_ca(sk); |
| 287 |  | -	u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
|  | 287 | +	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); |
| 288 | 288 |  |
| 289 | 289 | if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) |
| 290 | 290 | bbr_init_pacing_rate_from_rtt(sk); |
| .. | .. |  |
| 306 | 306 | /* Sort of tcp_tso_autosize() but ignoring |
| 307 | 307 | * driver provided sk_gso_max_size. |
| 308 | 308 | */ |
| 309 |  | -	bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, |
|  | 309 | +	bytes = min_t(unsigned long, |
|  | 310 | +		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), |
| 310 | 311 | GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); |
| 311 | 312 | segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); |
| 312 | 313 |  |
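The autosizing above budgets roughly 1 ms of data per TSO burst (the pacing rate shifted right by sk_pacing_shift, which defaults to 10), bounded below by the minimum segment goal and above by the GSO limit. A standalone sketch with placeholder values; the gso_cap argument stands in for the kernel's GSO_MAX_SIZE - 1 - MAX_TCP_HEADER bound:

```c
#include <stdio.h>

static unsigned int tso_segs(unsigned long pacing_rate_bytes_per_sec,
			     unsigned int pacing_shift, unsigned int mss,
			     unsigned int min_segs, unsigned long gso_cap)
{
	/* ~1ms worth of bytes at the default shift of 10 */
	unsigned long bytes = pacing_rate_bytes_per_sec >> pacing_shift;

	if (bytes > gso_cap)
		bytes = gso_cap;
	return bytes / mss > min_segs ? bytes / mss : min_segs;
}

int main(void)
{
	/* ~12.4 MB/s, shift 10, mss 1448, floor of 2 segs -> prints 8 */
	printf("%u\n", tso_segs(12375000UL, 10, 1448, 2, 65000UL));
	return 0;
}
```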
| Old | New | Code |
|---|---|---|
| .. | .. |  |
| 346 | 347 |  |
| 347 | 348 | /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth: |
| 348 | 349 | * |
| 349 |  | - * bdp = bw * min_rtt * gain |
|  | 350 | + * bdp = ceil(bw * min_rtt * gain) |
| 350 | 351 | * |
| 351 | 352 | * The key factor, gain, controls the amount of queue. While a small gain |
| 352 | 353 | * builds a smaller queue, it becomes more vulnerable to noise in RTT |
| .. | .. |  |
| 370 | 371 |  |
| 371 | 372 | w = (u64)bw * bbr->min_rtt_us; |
| 372 | 373 |  |
| 373 |  | -	/* Apply a gain to the given value, then remove the BW_SCALE shift. */ |
|  | 374 | +	/* Apply a gain to the given value, remove the BW_SCALE shift, and |
|  | 375 | +	 * round the value up to avoid a negative feedback loop. |
|  | 376 | +	 */ |
| 374 | 377 | bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; |
| 375 | 378 |  |
| 376 | 379 | return bdp; |
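A standalone sketch (not kernel code) of the rounded-up BDP computation, using the kernel's scaling constants and made-up inputs; the + BW_UNIT - 1 term is what makes the final division round up instead of truncating:

```c
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE	24
#define BW_UNIT		(1 << BW_SCALE)
#define BBR_SCALE	8
#define BBR_UNIT	(1 << BBR_SCALE)

static uint32_t bdp_pkts(uint32_t bw, uint32_t min_rtt_us, int gain)
{
	uint64_t w = (uint64_t)bw * min_rtt_us;

	/* Round up so tiny products still yield at least one packet. */
	return (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
}

int main(void)
{
	/* bw = 0.5 pkt/usec, min_rtt = 10ms, gain = 2.0 -> prints 10000 */
	printf("%u\n", bdp_pkts(BW_UNIT / 2, 10000, 2 * BBR_UNIT));
	/* a sub-packet product rounds up to 1 rather than collapsing to 0 */
	printf("%u\n", bdp_pkts(1, 1, BBR_UNIT));
	return 0;
}
```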
| Old | New | Code |
|---|---|---|
| .. | .. |  |
| 386 | 389 | * which allows 2 outstanding 2-packet sequences, to try to keep pipe |
| 387 | 390 | * full even with ACK-every-other-packet delayed ACKs. |
| 388 | 391 | */ |
| 389 |  | -static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain) |
|  | 392 | +static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) |
| 390 | 393 | { |
| 391 | 394 | struct bbr *bbr = inet_csk_ca(sk); |
| 392 | 395 |  |
| .. | .. |  |
| 397 | 400 | cwnd = (cwnd + 1) & ~1U; |
| 398 | 401 |  |
| 399 | 402 | /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ |
| 400 |  | -	if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT) |
|  | 403 | +	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) |
| 401 | 404 | cwnd += 2; |
| 402 | 405 |  |
| 403 | 406 | return cwnd; |
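The (cwnd + 1) & ~1U step rounds cwnd up to the next even number, so ACK-every-other-packet delayed ACKs never strand an odd final segment. A minimal demonstration (not kernel code):

```c
#include <stdio.h>

int main(void)
{
	for (unsigned int cwnd = 4; cwnd <= 7; cwnd++)
		printf("%u -> %u\n", cwnd, (cwnd + 1) & ~1U);
	/* prints: 4 -> 4, 5 -> 6, 6 -> 6, 7 -> 8 */
	return 0;
}
```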
| Old | New | Code |
|---|---|---|
| .. | .. |  |
| 409 | 412 | u32 inflight; |
| 410 | 413 |  |
| 411 | 414 | inflight = bbr_bdp(sk, bw, gain); |
| 412 |  | -	inflight = bbr_quantization_budget(sk, inflight, gain); |
|  | 415 | +	inflight = bbr_quantization_budget(sk, inflight); |
| 413 | 416 |  |
| 414 | 417 | return inflight; |
|  | 418 | +} |
|  | 419 | + |
|  | 420 | +/* With pacing at lower layers, there's often less data "in the network" than |
|  | 421 | + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), |
|  | 422 | + * we often have several skbs queued in the pacing layer with a pre-scheduled |
|  | 423 | + * earliest departure time (EDT). BBR adapts its pacing rate based on the |
|  | 424 | + * inflight level that it estimates has already been "baked in" by previous |
|  | 425 | + * departure time decisions. We calculate a rough estimate of the number of our |
|  | 426 | + * packets that might be in the network at the earliest departure time for the |
|  | 427 | + * next skb scheduled: |
|  | 428 | + *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw |
|  | 429 | + * If we're increasing inflight, then we want to know if the transmit of the |
|  | 430 | + * EDT skb will push inflight above the target, so inflight_at_edt includes |
|  | 431 | + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, |
|  | 432 | + * then estimate if inflight will sink too low just before the EDT transmit. |
|  | 433 | + */ |
|  | 434 | +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) |
|  | 435 | +{ |
|  | 436 | +	struct tcp_sock *tp = tcp_sk(sk); |
|  | 437 | +	struct bbr *bbr = inet_csk_ca(sk); |
|  | 438 | +	u64 now_ns, edt_ns, interval_us; |
|  | 439 | +	u32 interval_delivered, inflight_at_edt; |
|  | 440 | + |
|  | 441 | +	now_ns = tp->tcp_clock_cache; |
|  | 442 | +	edt_ns = max(tp->tcp_wstamp_ns, now_ns); |
|  | 443 | +	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); |
|  | 444 | +	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; |
|  | 445 | +	inflight_at_edt = inflight_now; |
|  | 446 | +	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */ |
|  | 447 | +		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */ |
|  | 448 | +	if (interval_delivered >= inflight_at_edt) |
|  | 449 | +		return 0; |
|  | 450 | +	return inflight_at_edt - interval_delivered; |
| 415 | 451 | } |
| 416 | 452 |  |
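A standalone sketch (not kernel code) of the in_network_at_edt arithmetic with made-up numbers; edt_minus_now_us and probing stand in for the tcp_wstamp_ns/tcp_clock_cache delta and the pacing_gain > BBR_UNIT test above:

```c
#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT  (1 << BW_SCALE)

static uint32_t packets_in_net_at_edt(uint32_t inflight_now,
				      uint64_t edt_minus_now_us, uint32_t bw,
				      int probing, uint32_t tso_segs_goal)
{
	/* packets the network will have absorbed by the EDT */
	uint64_t delivered = (uint64_t)bw * edt_minus_now_us >> BW_SCALE;
	uint32_t inflight_at_edt = inflight_now;

	if (probing)	/* the skb departing at EDT adds its segments */
		inflight_at_edt += tso_segs_goal;
	if (delivered >= inflight_at_edt)
		return 0;
	return inflight_at_edt - delivered;
}

int main(void)
{
	/* 100 pkts in flight, EDT 200us away, bw = 0.25 pkt/usec, probing:
	 * 100 + 8 - 50 = 58 packets estimated in the network at EDT.
	 */
	printf("%u\n", packets_in_net_at_edt(100, 200, BW_UNIT / 4, 1, 8));
	return 0;
}
```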
| Old | New | Code |
|---|---|---|
| 417 | 453 | /* Find the cwnd increment based on estimate of ack aggregation */ |
| .. | .. |  |
| 496 | 532 | * due to aggregation (of data and/or ACKs) visible in the ACK stream. |
| 497 | 533 | */ |
| 498 | 534 | target_cwnd += bbr_ack_aggregation_cwnd(sk); |
| 499 |  | -	target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain); |
|  | 535 | +	target_cwnd = bbr_quantization_budget(sk, target_cwnd); |
| 500 | 536 |  |
| 501 | 537 | /* If we're below target cwnd, slow start cwnd toward target cwnd. */ |
| 502 | 538 | if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */ |
| .. | .. |  |
| 528 | 564 | if (bbr->pacing_gain == BBR_UNIT) |
| 529 | 565 | return is_full_length;		/* just use wall clock time */ |
| 530 | 566 |  |
| 531 |  | -	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */ |
|  | 567 | +	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); |
| 532 | 568 | bw = bbr_max_bw(sk); |
| 533 | 569 |  |
| 534 | 570 | /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at |
| .. | .. |  |
| 556 | 592 |  |
| 557 | 593 | bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); |
| 558 | 594 | bbr->cycle_mstamp = tp->delivered_mstamp; |
| 559 |  | -	bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : |
| 560 |  | -					    bbr_pacing_gain[bbr->cycle_idx]; |
| 561 | 595 | } |
| 562 | 596 |  |
| 563 | 597 | /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ |
| .. | .. |  |
| 575 | 609 | struct bbr *bbr = inet_csk_ca(sk); |
| 576 | 610 |  |
| 577 | 611 | bbr->mode = BBR_STARTUP; |
| 578 |  | -	bbr->pacing_gain = bbr_high_gain; |
| 579 |  | -	bbr->cwnd_gain	 = bbr_high_gain; |
| 580 | 612 | } |
| 581 | 613 |  |
| 582 | 614 | static void bbr_reset_probe_bw_mode(struct sock *sk) |
| .. | .. |  |
| 584 | 616 | struct bbr *bbr = inet_csk_ca(sk); |
| 585 | 617 |  |
| 586 | 618 | bbr->mode = BBR_PROBE_BW; |
| 587 |  | -	bbr->pacing_gain = BBR_UNIT; |
| 588 |  | -	bbr->cwnd_gain = bbr_cwnd_gain; |
| 589 | 619 | bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); |
| 590 | 620 | bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */ |
| 591 | 621 | } |
| .. | .. |  |
| 863 | 893 |  |
| 864 | 894 | if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { |
| 865 | 895 | bbr->mode = BBR_DRAIN;	/* drain queue we created */ |
| 866 |  | -		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */ |
| 867 |  | -		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */ |
| 868 | 896 | tcp_sk(sk)->snd_ssthresh = |
| 869 | 897 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); |
| 870 | 898 | }	/* fall through to check if in-flight is already small: */ |
| 871 | 899 | if (bbr->mode == BBR_DRAIN && |
| 872 |  | -	    tcp_packets_in_flight(tcp_sk(sk)) <= |
|  | 900 | +	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= |
| 873 | 901 | bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) |
| 874 | 902 | bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */ |
| 875 | 903 | } |
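Condensed, the checks above implement the STARTUP -> DRAIN -> PROBE_BW leg of BBR's mode machine, now keyed on the EDT-adjusted in-network estimate. A simplified sketch, not kernel code; full_bw_reached, in_net and bdp stand in for bbr_full_bw_reached(), bbr_packets_in_net_at_edt() and bbr_inflight() at a gain of BBR_UNIT:

```c
enum mode { STARTUP, DRAIN, PROBE_BW, PROBE_RTT };

static enum mode check_drain(enum mode mode, int full_bw_reached,
			     unsigned int in_net, unsigned int bdp)
{
	if (mode == STARTUP && full_bw_reached)
		mode = DRAIN;		/* drain the queue STARTUP built */
	if (mode == DRAIN && in_net <= bdp)
		mode = PROBE_BW;	/* queue estimated drained */
	return mode;
}
```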
| Old | New | Code |
|---|---|---|
| .. | .. |  |
| 926 | 954 | if (bbr_probe_rtt_mode_ms > 0 && filter_expired && |
| 927 | 955 | !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { |
| 928 | 956 | bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */ |
| 929 |  | -		bbr->pacing_gain = BBR_UNIT; |
| 930 |  | -		bbr->cwnd_gain = BBR_UNIT; |
| 931 | 957 | bbr_save_cwnd(sk);  /* note cwnd so we can restore it */ |
| 932 | 958 | bbr->probe_rtt_done_stamp = 0; |
| 933 | 959 | } |
| .. | .. |  |
| 955 | 981 | bbr->idle_restart = 0; |
| 956 | 982 | } |
| 957 | 983 |  |
|  | 984 | +static void bbr_update_gains(struct sock *sk) |
|  | 985 | +{ |
|  | 986 | +	struct bbr *bbr = inet_csk_ca(sk); |
|  | 987 | + |
|  | 988 | +	switch (bbr->mode) { |
|  | 989 | +	case BBR_STARTUP: |
|  | 990 | +		bbr->pacing_gain = bbr_high_gain; |
|  | 991 | +		bbr->cwnd_gain	 = bbr_high_gain; |
|  | 992 | +		break; |
|  | 993 | +	case BBR_DRAIN: |
|  | 994 | +		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */ |
|  | 995 | +		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */ |
|  | 996 | +		break; |
|  | 997 | +	case BBR_PROBE_BW: |
|  | 998 | +		bbr->pacing_gain = (bbr->lt_use_bw ? |
|  | 999 | +				    BBR_UNIT : |
|  | 1000 | +				    bbr_pacing_gain[bbr->cycle_idx]); |
|  | 1001 | +		bbr->cwnd_gain	 = bbr_cwnd_gain; |
|  | 1002 | +		break; |
|  | 1003 | +	case BBR_PROBE_RTT: |
|  | 1004 | +		bbr->pacing_gain = BBR_UNIT; |
|  | 1005 | +		bbr->cwnd_gain	 = BBR_UNIT; |
|  | 1006 | +		break; |
|  | 1007 | +	default: |
|  | 1008 | +		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); |
|  | 1009 | +		break; |
|  | 1010 | +	} |
|  | 1011 | +} |
|  | 1012 | + |
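The new helper consolidates the per-mode gain assignments that the rest of the patch deletes from the individual mode-transition sites. For reference, a standalone sketch printing the effective multipliers per mode; the gain constants here mirror tcp_bbr.c's definitions as I understand them (high_gain = 2/ln(2) ~= 2.89, drain_gain its inverse, cwnd_gain = 2), and PROBE_BW's pacing gain is omitted since it cycles through bbr_pacing_gain[]:

```c
#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT  (1 << BBR_SCALE)

static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;	/* ~2.89x */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;	/* ~0.35x */
static const int bbr_cwnd_gain  = BBR_UNIT * 2;

int main(void)
{
	printf("STARTUP:   pacing %.3f cwnd %.3f\n",
	       bbr_high_gain / (double)BBR_UNIT, bbr_high_gain / (double)BBR_UNIT);
	printf("DRAIN:     pacing %.3f cwnd %.3f\n",
	       bbr_drain_gain / (double)BBR_UNIT, bbr_high_gain / (double)BBR_UNIT);
	printf("PROBE_RTT: pacing %.3f cwnd %.3f\n", 1.0, 1.0);
	return 0;
}
```

As the final hunk shows, the helper runs once per ACK at the end of bbr_update_model(), after all mode updates, so the gains always reflect the mode chosen on this ACK.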
| 958 | 1013 | static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) | 
|---|
| 959 | 1014 | { | 
|---|
| 960 | 1015 | bbr_update_bw(sk, rs); | 
|---|
| .. | .. | 
|---|
| 963 | 1018 | bbr_check_full_bw_reached(sk, rs); | 
|---|
| 964 | 1019 | bbr_check_drain(sk, rs); | 
|---|
| 965 | 1020 | bbr_update_min_rtt(sk, rs); | 
|---|
|  | 1021 | +	bbr_update_gains(sk); | 
|---|
| 966 | 1022 | } | 
|---|
| 967 | 1023 |  | 
|---|
| 968 | 1024 | static void bbr_main(struct sock *sk, const struct rate_sample *rs) | 
|---|