| .. | .. |
|---|
| 77 | 77 | #include <asm/unaligned.h> |
|---|
| 78 | 78 | #include <linux/errqueue.h> |
|---|
| 79 | 79 | #include <trace/events/tcp.h> |
|---|
| 80 | | -#include <linux/static_key.h> |
|---|
| 80 | +#include <linux/jump_label_ratelimit.h> |
|---|
| 81 | 81 | #include <net/busy_poll.h> |
|---|
| 82 | +#include <net/mptcp.h> |
|---|
| 83 | +#include <trace/hooks/net.h> |
|---|
| 82 | 84 | |
|---|
| 83 | 85 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
|---|
| 84 | 86 | |
|---|
| .. | .. |
|---|
| 113 | 115 | #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ |
|---|
| 114 | 116 | |
|---|
| 115 | 117 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
|---|
| 116 | | -static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); |
|---|
| 118 | +static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); |
|---|
| 117 | 119 | |
|---|
| 118 | 120 | void clean_acked_data_enable(struct inet_connection_sock *icsk, |
|---|
| 119 | 121 | void (*cad)(struct sock *sk, u32 ack_seq)) |
|---|
| 120 | 122 | { |
|---|
| 121 | 123 | icsk->icsk_clean_acked = cad; |
|---|
| 122 | | - static_branch_inc(&clean_acked_data_enabled); |
|---|
| 124 | + static_branch_deferred_inc(&clean_acked_data_enabled); |
|---|
| 123 | 125 | } |
|---|
| 124 | 126 | EXPORT_SYMBOL_GPL(clean_acked_data_enable); |
|---|
| 125 | 127 | |
|---|
| 126 | 128 | void clean_acked_data_disable(struct inet_connection_sock *icsk) |
|---|
| 127 | 129 | { |
|---|
| 128 | | - static_branch_dec(&clean_acked_data_enabled); |
|---|
| 130 | + static_branch_slow_dec_deferred(&clean_acked_data_enabled); |
|---|
| 129 | 131 | icsk->icsk_clean_acked = NULL; |
|---|
| 130 | 132 | } |
|---|
| 131 | 133 | EXPORT_SYMBOL_GPL(clean_acked_data_disable); |
|---|
| 134 | + |
|---|
| 135 | +void clean_acked_data_flush(void) |
|---|
| 136 | +{ |
|---|
| 137 | + static_key_deferred_flush(&clean_acked_data_enabled); |
|---|
| 138 | +} |
|---|
| 139 | +EXPORT_SYMBOL_GPL(clean_acked_data_flush); |
|---|
| 140 | +#endif |
|---|
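The TLS-device hook above moves from a plain static key to a rate-limited one (hence the switch to `<linux/jump_label_ratelimit.h>`): `static_branch_slow_dec_deferred()` only flips the key back to false once the `HZ` window expires, so short-lived offloaded connections do not patch kernel text on every enable/disable cycle, and the new `clean_acked_data_flush()` lets callers force the pending decrement. A minimal sketch of the pattern, using a hypothetical key that is not part of this file:

```c
#include <linux/jump_label_ratelimit.h>

/* Hypothetical key for illustration: increments take effect immediately,
 * decrements are deferred by up to HZ.
 */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(example_enabled, HZ);

static void example_enable(void)
{
	static_branch_deferred_inc(&example_enabled);
}

static void example_disable(void)
{
	static_branch_slow_dec_deferred(&example_enabled);
}

static bool example_fast_path_enabled(void)
{
	/* The branch tests the .key member of the deferred wrapper,
	 * the same way tcp_ack() tests clean_acked_data_enabled.key.
	 */
	return static_branch_unlikely(&example_enabled.key);
}
```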
| 141 | + |
|---|
| 142 | +#ifdef CONFIG_CGROUP_BPF |
|---|
| 143 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
|---|
| 144 | +{ |
|---|
| 145 | + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && |
|---|
| 146 | + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
|---|
| 147 | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); |
|---|
| 148 | + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
|---|
| 149 | + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); |
|---|
| 150 | + struct bpf_sock_ops_kern sock_ops; |
|---|
| 151 | + |
|---|
| 152 | + if (likely(!unknown_opt && !parse_all_opt)) |
|---|
| 153 | + return; |
|---|
| 154 | + |
|---|
| 155 | + /* The skb will be handled in the |
|---|
| 156 | + * bpf_skops_established() or |
|---|
| 157 | + * bpf_skops_write_hdr_opt(). |
|---|
| 158 | + */ |
|---|
| 159 | + switch (sk->sk_state) { |
|---|
| 160 | + case TCP_SYN_RECV: |
|---|
| 161 | + case TCP_SYN_SENT: |
|---|
| 162 | + case TCP_LISTEN: |
|---|
| 163 | + return; |
|---|
| 164 | + } |
|---|
| 165 | + |
|---|
| 166 | + sock_owned_by_me(sk); |
|---|
| 167 | + |
|---|
| 168 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
|---|
| 169 | + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; |
|---|
| 170 | + sock_ops.is_fullsock = 1; |
|---|
| 171 | + sock_ops.sk = sk; |
|---|
| 172 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
|---|
| 173 | + |
|---|
| 174 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
|---|
| 175 | +} |
|---|
| 176 | + |
|---|
| 177 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
|---|
| 178 | + struct sk_buff *skb) |
|---|
| 179 | +{ |
|---|
| 180 | + struct bpf_sock_ops_kern sock_ops; |
|---|
| 181 | + |
|---|
| 182 | + sock_owned_by_me(sk); |
|---|
| 183 | + |
|---|
| 184 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
|---|
| 185 | + sock_ops.op = bpf_op; |
|---|
| 186 | + sock_ops.is_fullsock = 1; |
|---|
| 187 | + sock_ops.sk = sk; |
|---|
| 188 | + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ |
|---|
| 189 | + if (skb) |
|---|
| 190 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
|---|
| 191 | + |
|---|
| 192 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
|---|
| 193 | +} |
|---|
| 194 | +#else |
|---|
| 195 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
|---|
| 196 | +{ |
|---|
| 197 | +} |
|---|
| 198 | + |
|---|
| 199 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
|---|
| 200 | + struct sk_buff *skb) |
|---|
| 201 | +{ |
|---|
| 202 | +} |
|---|
| 132 | 203 | #endif |
|---|
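`bpf_skops_parse_hdr()` gives cgroup `sock_ops` BPF programs a look at incoming TCP headers on established sockets: it only runs when the program asked for it via `BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG` (and option parsing actually saw an unrecognised option) or `BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG`, and it skips SYN/LISTEN states, whose skbs are handed to `bpf_skops_established()` or the header-writing path instead. A hedged sketch of a program that would trigger this callback; the section name and loader details are assumptions, only the op and flag names come from the UAPI used above:

```c
/* Sketch only: a cgroup sock_ops program requesting header-option callbacks. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int watch_hdr_opts(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Ask for a BPF_SOCK_OPS_PARSE_HDR_OPT_CB on every segment. */
		bpf_sock_ops_cb_flags_set(skops,
			skops->bpf_sock_ops_cb_flags |
			BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
		/* skops->skb_data .. skops->skb_data_end spans the TCP header
		 * exposed by bpf_skops_init_skb() in bpf_skops_parse_hdr().
		 */
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
```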
| 133 | 204 | |
|---|
| 134 | 205 | static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 172 | 243 | if (unlikely(len > icsk->icsk_ack.rcv_mss + |
|---|
| 173 | 244 | MAX_TCP_OPTION_SPACE)) |
|---|
| 174 | 245 | tcp_gro_dev_warn(sk, skb, len); |
|---|
| 246 | + /* If the skb has a len of exactly 1*MSS and has the PSH bit |
|---|
| 247 | + * set then it is likely the end of an application write. So |
|---|
| 248 | + * more data may not be arriving soon, and yet the data sender |
|---|
| 249 | + * may be waiting for an ACK if cwnd-bound or using TX zero |
|---|
| 250 | + * copy. So we set ICSK_ACK_PUSHED here so that |
|---|
| 251 | + * tcp_cleanup_rbuf() will send an ACK immediately if the app |
|---|
| 252 | + * reads all of the data and is not ping-pong. If len > MSS |
|---|
| 253 | + * then this logic does not matter (and does not hurt) because |
|---|
| 254 | + * tcp_cleanup_rbuf() will always ACK immediately if the app |
|---|
| 255 | + * reads data and there is more than an MSS of unACKed data. |
|---|
| 256 | + */ |
|---|
| 257 | + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) |
|---|
| 258 | + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
|---|
| 175 | 259 | } else { |
|---|
| 176 | 260 | /* Otherwise, we make more careful check taking into account, |
|---|
| 177 | 261 | * that SACKs block is variable. |
|---|
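The PSH handling added above only records a pending-ACK hint; the ACK itself is sent from the receive path once the application has drained the socket. Roughly, the consumer side looks like the sketch below (paraphrased, not the exact `tcp_cleanup_rbuf()` code, which has additional conditions), which is why the hint matters even when less than one MSS of data is unACKed:

```c
/* Paraphrased sketch of the receive-side check that consumes ICSK_ACK_PUSHED. */
static bool should_ack_now(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	/* Sender hinted "end of write" (PSH) and we are not in interactive
	 * ping-pong mode: ACK as soon as the app has read everything, so a
	 * cwnd-bound or TX-zerocopy sender is not left waiting.
	 */
	return (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
	       !inet_csk_in_pingpong_mode(sk) &&
	       !atomic_read(&sk->sk_rmem_alloc);
}
```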
| .. | .. |
|---|
| 216 | 300 | icsk->icsk_ack.quick = quickacks; |
|---|
| 217 | 301 | } |
|---|
| 218 | 302 | |
|---|
| 219 | | -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) |
|---|
| 303 | +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) |
|---|
| 220 | 304 | { |
|---|
| 221 | 305 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 222 | 306 | |
|---|
| 223 | 307 | tcp_incr_quickack(sk, max_quickacks); |
|---|
| 224 | | - icsk->icsk_ack.pingpong = 0; |
|---|
| 308 | + inet_csk_exit_pingpong_mode(sk); |
|---|
| 225 | 309 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
|---|
| 226 | 310 | } |
|---|
| 227 | | -EXPORT_SYMBOL(tcp_enter_quickack_mode); |
|---|
| 228 | 311 | |
|---|
| 229 | 312 | /* Send ACKs quickly, if "quick" count is not exhausted |
|---|
| 230 | 313 | * and the session is not interactive. |
|---|
| .. | .. |
|---|
| 236 | 319 | const struct dst_entry *dst = __sk_dst_get(sk); |
|---|
| 237 | 320 | |
|---|
| 238 | 321 | return (dst && dst_metric(dst, RTAX_QUICKACK)) || |
|---|
| 239 | | - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); |
|---|
| 322 | + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); |
|---|
| 240 | 323 | } |
|---|
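Direct pokes at `icsk->icsk_ack.pingpong` are replaced with the `inet_csk_*_pingpong_mode()` accessors in this hunk. The helpers are expected to look roughly like the sketch below (in `include/net/inet_connection_sock.h`); the exact definition may differ, for example later kernels turn `pingpong` into a counter compared against a threshold rather than a plain 0/1 flag:

```c
/* Sketch of the accessors assumed above, simplified to the 0/1-flag form. */
static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 1;
}

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong;
}
```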
| 241 | 324 | |
|---|
| 242 | 325 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
|---|
| .. | .. |
|---|
| 354 | 437 | sndmem *= nr_segs * per_mss; |
|---|
| 355 | 438 | |
|---|
| 356 | 439 | if (sk->sk_sndbuf < sndmem) |
|---|
| 357 | | - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); |
|---|
| 440 | + WRITE_ONCE(sk->sk_sndbuf, |
|---|
| 441 | + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); |
|---|
| 358 | 442 | } |
|---|
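The `READ_ONCE()`/`WRITE_ONCE()` annotations that begin here recur through the rest of the file: sysctl values can be rewritten at any time from `/proc`, and `sk_sndbuf`/`sk_rcvbuf` are read without the socket lock (for example by poll and diag code), so both sides of each data race are marked to keep the accesses single-copy atomic and visible to KCSAN. The pattern in isolation, mirroring the hunk above:

```c
/* Load the racy value once, use the snapshot, store with a single write. */
static void tune_sndbuf(struct sock *sk, int sndmem)
{
	int wmem_max = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);

	if (sk->sk_sndbuf < sndmem)
		WRITE_ONCE(sk->sk_sndbuf, min(sndmem, wmem_max));
}
```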
| 359 | 443 | |
|---|
| 360 | 444 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
|---|
| .. | .. |
|---|
| 383 | 467 | */ |
|---|
| 384 | 468 | |
|---|
| 385 | 469 | /* Slow part of check#2. */ |
|---|
| 386 | | -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) |
|---|
| 470 | +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, |
|---|
| 471 | + unsigned int skbtruesize) |
|---|
| 387 | 472 | { |
|---|
| 388 | 473 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 389 | 474 | /* Optimize this! */ |
|---|
| 390 | | - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; |
|---|
| 391 | | - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
|---|
| 475 | + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; |
|---|
| 476 | + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; |
|---|
| 392 | 477 | |
|---|
| 393 | 478 | while (tp->rcv_ssthresh <= window) { |
|---|
| 394 | 479 | if (truesize <= skb->len) |
|---|
| .. | .. |
|---|
| 400 | 485 | return 0; |
|---|
| 401 | 486 | } |
|---|
| 402 | 487 | |
|---|
| 403 | | -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) |
|---|
| 488 | +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing |
|---|
| 489 | + * can play nice with us, as sk_buff and skb->head might be either |
|---|
| 490 | + * freed or shared with up to MAX_SKB_FRAGS segments. |
|---|
| 491 | + * Only give a boost to drivers using page frag(s) to hold the frame(s), |
|---|
| 492 | + * and if no payload was pulled in skb->head before reaching us. |
|---|
| 493 | + */ |
|---|
| 494 | +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) |
|---|
| 495 | +{ |
|---|
| 496 | + u32 truesize = skb->truesize; |
|---|
| 497 | + |
|---|
| 498 | + if (adjust && !skb_headlen(skb)) { |
|---|
| 499 | + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); |
|---|
| 500 | + /* paranoid check, some drivers might be buggy */ |
|---|
| 501 | + if (unlikely((int)truesize < (int)skb->len)) |
|---|
| 502 | + truesize = skb->truesize; |
|---|
| 503 | + } |
|---|
| 504 | + return truesize; |
|---|
| 505 | +} |
|---|
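`truesize_adjust()` only discounts the memory overhead of an skb whose payload lives entirely in page frags (`skb_headlen() == 0`), since such a head can be freed or shared after coalescing. A rough worked example, with sizes that are purely illustrative:

```c
/*
 * Illustrative numbers only (actual values depend on driver and arch):
 *
 *   skb->len          = 1500   payload, all of it in a page frag
 *   skb->truesize     = 2816   frag slot (2048) + sk_buff/head overhead (~768)
 *   skb_end_offset()  =  320   size of the unused linear area
 *   SKB_TRUESIZE(320) ~=  896  overhead attributable to that unused head
 *
 *   adjusted truesize ~= 2816 - 896 = 1920
 *
 * so __tcp_grow_window() judges the 1500-byte payload against ~1920 bytes of
 * memory instead of 2816 and lets rcv_ssthresh grow faster for drivers that
 * keep the payload in frags.  If the subtraction would drop below skb->len
 * (a buggy driver), the original truesize is kept.
 */
```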
| 506 | + |
|---|
| 507 | +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, |
|---|
| 508 | + bool adjust) |
|---|
| 404 | 509 | { |
|---|
| 405 | 510 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 406 | 511 | int room; |
|---|
| .. | .. |
|---|
| 409 | 514 | |
|---|
| 410 | 515 | /* Check #1 */ |
|---|
| 411 | 516 | if (room > 0 && !tcp_under_memory_pressure(sk)) { |
|---|
| 517 | + unsigned int truesize = truesize_adjust(adjust, skb); |
|---|
| 412 | 518 | int incr; |
|---|
| 413 | 519 | |
|---|
| 414 | 520 | /* Check #2. Increase window, if skb with such overhead |
|---|
| 415 | 521 | * will fit to rcvbuf in future. |
|---|
| 416 | 522 | */ |
|---|
| 417 | | - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) |
|---|
| 523 | + if (tcp_win_from_space(sk, truesize) <= skb->len) |
|---|
| 418 | 524 | incr = 2 * tp->advmss; |
|---|
| 419 | 525 | else |
|---|
| 420 | | - incr = __tcp_grow_window(sk, skb); |
|---|
| 526 | + incr = __tcp_grow_window(sk, skb, truesize); |
|---|
| 421 | 527 | |
|---|
| 422 | 528 | if (incr) { |
|---|
| 423 | 529 | incr = max_t(int, incr, 2 * skb->len); |
|---|
| .. | .. |
|---|
| 430 | 536 | /* 3. Try to fixup all. It is made immediately after connection enters |
|---|
| 431 | 537 | * established state. |
|---|
| 432 | 538 | */ |
|---|
| 433 | | -void tcp_init_buffer_space(struct sock *sk) |
|---|
| 539 | +static void tcp_init_buffer_space(struct sock *sk) |
|---|
| 434 | 540 | { |
|---|
| 435 | | - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; |
|---|
| 541 | + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); |
|---|
| 436 | 542 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 437 | 543 | int maxwin; |
|---|
| 438 | 544 | |
|---|
| .. | .. |
|---|
| 472 | 578 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 473 | 579 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 474 | 580 | struct net *net = sock_net(sk); |
|---|
| 581 | + int rmem2; |
|---|
| 475 | 582 | |
|---|
| 476 | 583 | icsk->icsk_ack.quick = 0; |
|---|
| 584 | + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 477 | 585 | |
|---|
| 478 | | - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && |
|---|
| 586 | + if (sk->sk_rcvbuf < rmem2 && |
|---|
| 479 | 587 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
|---|
| 480 | 588 | !tcp_under_memory_pressure(sk) && |
|---|
| 481 | 589 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { |
|---|
| 482 | | - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
|---|
| 483 | | - net->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 590 | + WRITE_ONCE(sk->sk_rcvbuf, |
|---|
| 591 | + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); |
|---|
| 484 | 592 | } |
|---|
| 485 | 593 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
|---|
| 486 | 594 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); |
|---|
| .. | .. |
|---|
| 510 | 618 | * |
|---|
| 511 | 619 | * The algorithm for RTT estimation w/o timestamps is based on |
|---|
| 512 | 620 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. |
|---|
| 513 | | - * <http://public.lanl.gov/radiant/pubs.html#DRS> |
|---|
| 621 | + * <https://public.lanl.gov/radiant/pubs.html#DRS> |
|---|
| 514 | 622 | * |
|---|
| 515 | 623 | * More detail on this code can be found at |
|---|
| 516 | 624 | * <http://staff.psc.edu/jheffner/>, |
|---|
| .. | .. |
|---|
| 621 | 729 | * <prev RTT . ><current RTT .. ><next RTT .... > |
|---|
| 622 | 730 | */ |
|---|
| 623 | 731 | |
|---|
| 624 | | - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && |
|---|
| 732 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && |
|---|
| 625 | 733 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { |
|---|
| 626 | 734 | int rcvmem, rcvbuf; |
|---|
| 627 | 735 | u64 rcvwin, grow; |
|---|
| .. | .. |
|---|
| 642 | 750 | |
|---|
| 643 | 751 | do_div(rcvwin, tp->advmss); |
|---|
| 644 | 752 | rcvbuf = min_t(u64, rcvwin * rcvmem, |
|---|
| 645 | | - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 753 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); |
|---|
| 646 | 754 | if (rcvbuf > sk->sk_rcvbuf) { |
|---|
| 647 | | - sk->sk_rcvbuf = rcvbuf; |
|---|
| 755 | + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); |
|---|
| 648 | 756 | |
|---|
| 649 | 757 | /* Make the window clamp follow along. */ |
|---|
| 650 | 758 | tp->window_clamp = tcp_win_from_space(sk, rcvbuf); |
|---|
| .. | .. |
|---|
| 710 | 818 | tcp_ecn_check_ce(sk, skb); |
|---|
| 711 | 819 | |
|---|
| 712 | 820 | if (skb->len >= 128) |
|---|
| 713 | | - tcp_grow_window(sk, skb); |
|---|
| 821 | + tcp_grow_window(sk, skb, true); |
|---|
| 714 | 822 | } |
|---|
| 715 | 823 | |
|---|
| 716 | 824 | /* Called to compute a smoothed rtt estimate. The data fed to this |
|---|
| .. | .. |
|---|
| 774 | 882 | tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
|---|
| 775 | 883 | tp->rtt_seq = tp->snd_nxt; |
|---|
| 776 | 884 | tp->mdev_max_us = tcp_rto_min_us(sk); |
|---|
| 885 | + |
|---|
| 886 | + tcp_bpf_rtt(sk); |
|---|
| 777 | 887 | } |
|---|
| 778 | 888 | } else { |
|---|
| 779 | 889 | /* no previous measure. */ |
|---|
| .. | .. |
|---|
| 782 | 892 | tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); |
|---|
| 783 | 893 | tp->mdev_max_us = tp->rttvar_us; |
|---|
| 784 | 894 | tp->rtt_seq = tp->snd_nxt; |
|---|
| 895 | + |
|---|
| 896 | + tcp_bpf_rtt(sk); |
|---|
| 785 | 897 | } |
|---|
| 786 | 898 | tp->srtt_us = max(1U, srtt); |
|---|
| 787 | 899 | } |
|---|
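The two `tcp_bpf_rtt(sk)` calls hand every RTT sample, both the steady-state update and the very first measurement, to BPF: the helper is assumed to fire a `BPF_SOCK_OPS_RTT_CB` sock_ops callback when the program has requested `BPF_SOCK_OPS_RTT_CB_FLAG`, so monitoring or CC programs can observe `srtt_us` without polling. A hedged sketch of such a consumer (section name and scaling comment are assumptions):

```c
/* Sketch: a sock_ops program sampling smoothed RTT on every measurement. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int rtt_sampler(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		bpf_sock_ops_cb_flags_set(skops,
			skops->bpf_sock_ops_cb_flags |
			BPF_SOCK_OPS_RTT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_RTT_CB:
		/* Assumed to mirror tp->srtt_us, i.e. usecs scaled by 8. */
		bpf_printk("srtt=%u us", skops->srtt_us >> 3);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
```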
| .. | .. |
|---|
| 859 | 971 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
|---|
| 860 | 972 | } |
|---|
| 861 | 973 | |
|---|
| 862 | | -/* Take a notice that peer is sending D-SACKs */ |
|---|
| 863 | | -static void tcp_dsack_seen(struct tcp_sock *tp) |
|---|
| 974 | +struct tcp_sacktag_state { |
|---|
| 975 | + /* Timestamps for earliest and latest never-retransmitted segment |
|---|
| 976 | + * that was SACKed. RTO needs the earliest RTT to stay conservative, |
|---|
| 977 | + * but congestion control should still get an accurate delay signal. |
|---|
| 978 | + */ |
|---|
| 979 | + u64 first_sackt; |
|---|
| 980 | + u64 last_sackt; |
|---|
| 981 | + u32 reord; |
|---|
| 982 | + u32 sack_delivered; |
|---|
| 983 | + int flag; |
|---|
| 984 | + unsigned int mss_now; |
|---|
| 985 | + struct rate_sample *rate; |
|---|
| 986 | +}; |
|---|
| 987 | + |
|---|
| 988 | +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery |
|---|
| 989 | + * and spurious retransmission information if this DSACK is unlikely caused by |
|---|
| 990 | + * sender's action: |
|---|
| 991 | + * - DSACKed sequence range is larger than maximum receiver's window. |
|---|
| 992 | + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. |
|---|
| 993 | + */ |
|---|
| 994 | +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, |
|---|
| 995 | + u32 end_seq, struct tcp_sacktag_state *state) |
|---|
| 864 | 996 | { |
|---|
| 997 | + u32 seq_len, dup_segs = 1; |
|---|
| 998 | + |
|---|
| 999 | + if (!before(start_seq, end_seq)) |
|---|
| 1000 | + return 0; |
|---|
| 1001 | + |
|---|
| 1002 | + seq_len = end_seq - start_seq; |
|---|
| 1003 | + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ |
|---|
| 1004 | + if (seq_len > tp->max_window) |
|---|
| 1005 | + return 0; |
|---|
| 1006 | + if (seq_len > tp->mss_cache) |
|---|
| 1007 | + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); |
|---|
| 1008 | + |
|---|
| 1009 | + tp->dsack_dups += dup_segs; |
|---|
| 1010 | + /* Skip the DSACK if dup segs weren't retransmitted by sender */ |
|---|
| 1011 | + if (tp->dsack_dups > tp->total_retrans) |
|---|
| 1012 | + return 0; |
|---|
| 1013 | + |
|---|
| 865 | 1014 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
|---|
| 866 | 1015 | tp->rack.dsack_seen = 1; |
|---|
| 867 | | - tp->dsack_dups++; |
|---|
| 1016 | + |
|---|
| 1017 | + state->flag |= FLAG_DSACKING_ACK; |
|---|
| 1018 | + /* A spurious retransmission is delivered */ |
|---|
| 1019 | + state->sack_delivered += dup_segs; |
|---|
| 1020 | + |
|---|
| 1021 | + return dup_segs; |
|---|
| 868 | 1022 | } |
|---|
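`tcp_dsack_seen()` now converts the DSACKed byte range into a segment count and filters out DSACKs that the sender cannot plausibly have caused. A worked example with round numbers:

```c
/*
 * Example (illustrative values):
 *
 *   start_seq = 1000, end_seq = 4000  ->  seq_len  = 3000 bytes
 *   tp->mss_cache = 1460              ->  dup_segs = DIV_ROUND_UP(3000, 1460) = 3
 *
 * The DSACK is treated as dubious and skipped when either
 *   - seq_len > tp->max_window (a range larger than anything we could have
 *     had in flight), or
 *   - tp->dsack_dups would exceed tp->total_retrans (more duplicates reported
 *     than segments we ever retransmitted),
 * in which case tcp_check_dsack() bumps LINUX_MIB_TCPDSACKIGNOREDDUBIOUS and
 * the ACK is not flagged as carrying DSACK information.
 */
```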
| 869 | 1023 | |
|---|
| 870 | 1024 | /* It's reordering when higher sequence was delivered (i.e. sacked) before |
|---|
| .. | .. |
|---|
| 893 | 1047 | tp->undo_marker ? tp->undo_retrans : 0); |
|---|
| 894 | 1048 | #endif |
|---|
| 895 | 1049 | tp->reordering = min_t(u32, (metric + mss - 1) / mss, |
|---|
| 896 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
|---|
| 1050 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
|---|
| 897 | 1051 | } |
|---|
| 898 | 1052 | |
|---|
| 899 | 1053 | /* This exciting event is worth to be remembered. 8) */ |
|---|
| .. | .. |
|---|
| 902 | 1056 | ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); |
|---|
| 903 | 1057 | } |
|---|
| 904 | 1058 | |
|---|
| 905 | | -/* This must be called before lost_out is incremented */ |
|---|
| 1059 | + /* This must be called before lost_out or retrans_out are updated |
|---|
| 1060 | + * on a new loss, because we want to know if all skbs previously |
|---|
| 1061 | + * known to be lost have already been retransmitted, indicating |
|---|
| 1062 | + * that this newly lost skb is our next skb to retransmit. |
|---|
| 1063 | + */ |
|---|
| 906 | 1064 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 907 | 1065 | { |
|---|
| 908 | 1066 | if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || |
|---|
| .. | .. |
|---|
| 912 | 1070 | tp->retransmit_skb_hint = skb; |
|---|
| 913 | 1071 | } |
|---|
| 914 | 1072 | |
|---|
| 915 | | -/* Sum the number of packets on the wire we have marked as lost. |
|---|
| 916 | | - * There are two cases we care about here: |
|---|
| 917 | | - * a) Packet hasn't been marked lost (nor retransmitted), |
|---|
| 918 | | - * and this is the first loss. |
|---|
| 919 | | - * b) Packet has been marked both lost and retransmitted, |
|---|
| 920 | | - * and this means we think it was lost again. |
|---|
| 1073 | +/* Sum the number of packets on the wire we have marked as lost, and |
|---|
| 1074 | + * notify the congestion control module that the given skb was marked lost. |
|---|
| 921 | 1075 | */ |
|---|
| 922 | | -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 1076 | +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) |
|---|
| 1077 | +{ |
|---|
| 1078 | + tp->lost += tcp_skb_pcount(skb); |
|---|
| 1079 | +} |
|---|
| 1080 | + |
|---|
| 1081 | +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) |
|---|
| 923 | 1082 | { |
|---|
| 924 | 1083 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
|---|
| 1084 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 925 | 1085 | |
|---|
| 926 | | - if (!(sacked & TCPCB_LOST) || |
|---|
| 927 | | - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) |
|---|
| 928 | | - tp->lost += tcp_skb_pcount(skb); |
|---|
| 929 | | -} |
|---|
| 1086 | + if (sacked & TCPCB_SACKED_ACKED) |
|---|
| 1087 | + return; |
|---|
| 930 | 1088 | |
|---|
| 931 | | -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 932 | | -{ |
|---|
| 933 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
|---|
| 934 | | - tcp_verify_retransmit_hint(tp, skb); |
|---|
| 935 | | - |
|---|
| 936 | | - tp->lost_out += tcp_skb_pcount(skb); |
|---|
| 937 | | - tcp_sum_lost(tp, skb); |
|---|
| 938 | | - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
|---|
| 939 | | - } |
|---|
| 940 | | -} |
|---|
| 941 | | - |
|---|
| 942 | | -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 943 | | -{ |
|---|
| 944 | 1089 | tcp_verify_retransmit_hint(tp, skb); |
|---|
| 945 | | - |
|---|
| 946 | | - tcp_sum_lost(tp, skb); |
|---|
| 947 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
|---|
| 1090 | + if (sacked & TCPCB_LOST) { |
|---|
| 1091 | + if (sacked & TCPCB_SACKED_RETRANS) { |
|---|
| 1092 | + /* Account for retransmits that are lost again */ |
|---|
| 1093 | + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
|---|
| 1094 | + tp->retrans_out -= tcp_skb_pcount(skb); |
|---|
| 1095 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, |
|---|
| 1096 | + tcp_skb_pcount(skb)); |
|---|
| 1097 | + tcp_notify_skb_loss_event(tp, skb); |
|---|
| 1098 | + } |
|---|
| 1099 | + } else { |
|---|
| 948 | 1100 | tp->lost_out += tcp_skb_pcount(skb); |
|---|
| 949 | 1101 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
|---|
| 1102 | + tcp_notify_skb_loss_event(tp, skb); |
|---|
| 950 | 1103 | } |
|---|
| 1104 | +} |
|---|
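`tcp_mark_skb_lost()` folds the old trio of helpers (`tcp_sum_lost()`, `tcp_skb_mark_lost()`, `tcp_skb_mark_lost_uncond_verify()`) into one place: SACKed skbs are never marked, a first-time loss adds to `lost_out` and sets `TCPCB_LOST`, and a lost retransmission instead clears `TCPCB_SACKED_RETRANS`, drops `retrans_out` and counts `LINUX_MIB_TCPLOSTRETRANSMIT`, with `tcp_notify_skb_loss_event()` feeding the CC-visible `tp->lost` counter in both cases. Callers that want the old "mark only if not already lost" behaviour now guard the call themselves, as in `tcp_mark_head_lost()` later in this diff:

```c
/* Typical caller pattern after the consolidation (see tcp_mark_head_lost()). */
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
	tcp_mark_skb_lost(sk, skb);
```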
| 1105 | + |
|---|
| 1106 | +/* Updates the delivered and delivered_ce counts */ |
|---|
| 1107 | +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, |
|---|
| 1108 | + bool ece_ack) |
|---|
| 1109 | +{ |
|---|
| 1110 | + tp->delivered += delivered; |
|---|
| 1111 | + if (ece_ack) |
|---|
| 1112 | + tp->delivered_ce += delivered; |
|---|
| 951 | 1113 | } |
|---|
| 952 | 1114 | |
|---|
| 953 | 1115 | /* This procedure tags the retransmission queue when SACKs arrive. |
|---|
| .. | .. |
|---|
| 1082 | 1244 | |
|---|
| 1083 | 1245 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, |
|---|
| 1084 | 1246 | struct tcp_sack_block_wire *sp, int num_sacks, |
|---|
| 1085 | | - u32 prior_snd_una) |
|---|
| 1247 | + u32 prior_snd_una, struct tcp_sacktag_state *state) |
|---|
| 1086 | 1248 | { |
|---|
| 1087 | 1249 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1088 | 1250 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
|---|
| 1089 | 1251 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
|---|
| 1090 | | - bool dup_sack = false; |
|---|
| 1252 | + u32 dup_segs; |
|---|
| 1091 | 1253 | |
|---|
| 1092 | 1254 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
|---|
| 1093 | | - dup_sack = true; |
|---|
| 1094 | | - tcp_dsack_seen(tp); |
|---|
| 1095 | 1255 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
|---|
| 1096 | 1256 | } else if (num_sacks > 1) { |
|---|
| 1097 | 1257 | u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); |
|---|
| 1098 | 1258 | u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); |
|---|
| 1099 | 1259 | |
|---|
| 1100 | | - if (!after(end_seq_0, end_seq_1) && |
|---|
| 1101 | | - !before(start_seq_0, start_seq_1)) { |
|---|
| 1102 | | - dup_sack = true; |
|---|
| 1103 | | - tcp_dsack_seen(tp); |
|---|
| 1104 | | - NET_INC_STATS(sock_net(sk), |
|---|
| 1105 | | - LINUX_MIB_TCPDSACKOFORECV); |
|---|
| 1106 | | - } |
|---|
| 1260 | + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) |
|---|
| 1261 | + return false; |
|---|
| 1262 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); |
|---|
| 1263 | + } else { |
|---|
| 1264 | + return false; |
|---|
| 1107 | 1265 | } |
|---|
| 1108 | 1266 | |
|---|
| 1267 | + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); |
|---|
| 1268 | + if (!dup_segs) { /* Skip dubious DSACK */ |
|---|
| 1269 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); |
|---|
| 1270 | + return false; |
|---|
| 1271 | + } |
|---|
| 1272 | + |
|---|
| 1273 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); |
|---|
| 1274 | + |
|---|
| 1109 | 1275 | /* D-SACK for already forgotten data... Do dumb counting. */ |
|---|
| 1110 | | - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && |
|---|
| 1276 | + if (tp->undo_marker && tp->undo_retrans > 0 && |
|---|
| 1111 | 1277 | !after(end_seq_0, prior_snd_una) && |
|---|
| 1112 | 1278 | after(end_seq_0, tp->undo_marker)) |
|---|
| 1113 | | - tp->undo_retrans--; |
|---|
| 1279 | + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); |
|---|
| 1114 | 1280 | |
|---|
| 1115 | | - return dup_sack; |
|---|
| 1281 | + return true; |
|---|
| 1116 | 1282 | } |
|---|
| 1117 | | - |
|---|
| 1118 | | -struct tcp_sacktag_state { |
|---|
| 1119 | | - u32 reord; |
|---|
| 1120 | | - /* Timestamps for earliest and latest never-retransmitted segment |
|---|
| 1121 | | - * that was SACKed. RTO needs the earliest RTT to stay conservative, |
|---|
| 1122 | | - * but congestion control should still get an accurate delay signal. |
|---|
| 1123 | | - */ |
|---|
| 1124 | | - u64 first_sackt; |
|---|
| 1125 | | - u64 last_sackt; |
|---|
| 1126 | | - struct rate_sample *rate; |
|---|
| 1127 | | - int flag; |
|---|
| 1128 | | - unsigned int mss_now; |
|---|
| 1129 | | -}; |
|---|
| 1130 | 1283 | |
|---|
| 1131 | 1284 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
|---|
| 1132 | 1285 | * the incoming SACK may not exactly match but we can find smaller MSS |
|---|
| .. | .. |
|---|
| 1246 | 1399 | sacked |= TCPCB_SACKED_ACKED; |
|---|
| 1247 | 1400 | state->flag |= FLAG_DATA_SACKED; |
|---|
| 1248 | 1401 | tp->sacked_out += pcount; |
|---|
| 1249 | | - tp->delivered += pcount; /* Out-of-order packets delivered */ |
|---|
| 1402 | + /* Out-of-order packets delivered */ |
|---|
| 1403 | + state->sack_delivered += pcount; |
|---|
| 1250 | 1404 | |
|---|
| 1251 | 1405 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
|---|
| 1252 | 1406 | if (tp->lost_skb_hint && |
|---|
| .. | .. |
|---|
| 1289 | 1443 | */ |
|---|
| 1290 | 1444 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
|---|
| 1291 | 1445 | start_seq, end_seq, dup_sack, pcount, |
|---|
| 1292 | | - skb->skb_mstamp); |
|---|
| 1446 | + tcp_skb_timestamp_us(skb)); |
|---|
| 1293 | 1447 | tcp_rate_skb_delivered(sk, skb, state->rate); |
|---|
| 1294 | 1448 | |
|---|
| 1295 | 1449 | if (skb == tp->lost_skb_hint) |
|---|
| .. | .. |
|---|
| 1413 | 1567 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
|---|
| 1414 | 1568 | goto fallback; |
|---|
| 1415 | 1569 | |
|---|
| 1416 | | - if (!tcp_skb_can_collapse_to(prev)) |
|---|
| 1570 | + if (!tcp_skb_can_collapse(prev, skb)) |
|---|
| 1417 | 1571 | goto fallback; |
|---|
| 1418 | 1572 | |
|---|
| 1419 | 1573 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
|---|
| .. | .. |
|---|
| 1502 | 1656 | (mss != tcp_skb_seglen(skb))) |
|---|
| 1503 | 1657 | goto out; |
|---|
| 1504 | 1658 | |
|---|
| 1659 | + if (!tcp_skb_can_collapse(prev, skb)) |
|---|
| 1660 | + goto out; |
|---|
| 1505 | 1661 | len = skb->len; |
|---|
| 1506 | 1662 | pcount = tcp_skb_pcount(skb); |
|---|
| 1507 | 1663 | if (tcp_skb_shift(prev, skb, pcount, len)) |
|---|
| .. | .. |
|---|
| 1578 | 1734 | TCP_SKB_CB(skb)->end_seq, |
|---|
| 1579 | 1735 | dup_sack, |
|---|
| 1580 | 1736 | tcp_skb_pcount(skb), |
|---|
| 1581 | | - skb->skb_mstamp); |
|---|
| 1737 | + tcp_skb_timestamp_us(skb)); |
|---|
| 1582 | 1738 | tcp_rate_skb_delivered(sk, skb, state->rate); |
|---|
| 1583 | 1739 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 1584 | 1740 | list_del_init(&skb->tcp_tsorted_anchor); |
|---|
| .. | .. |
|---|
| 1591 | 1747 | return skb; |
|---|
| 1592 | 1748 | } |
|---|
| 1593 | 1749 | |
|---|
| 1594 | | -static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, |
|---|
| 1595 | | - struct tcp_sacktag_state *state, |
|---|
| 1596 | | - u32 seq) |
|---|
| 1750 | +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) |
|---|
| 1597 | 1751 | { |
|---|
| 1598 | 1752 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; |
|---|
| 1599 | 1753 | struct sk_buff *skb; |
|---|
| .. | .. |
|---|
| 1615 | 1769 | } |
|---|
| 1616 | 1770 | |
|---|
| 1617 | 1771 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
|---|
| 1618 | | - struct tcp_sacktag_state *state, |
|---|
| 1619 | 1772 | u32 skip_to_seq) |
|---|
| 1620 | 1773 | { |
|---|
| 1621 | 1774 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
|---|
| 1622 | 1775 | return skb; |
|---|
| 1623 | 1776 | |
|---|
| 1624 | | - return tcp_sacktag_bsearch(sk, state, skip_to_seq); |
|---|
| 1777 | + return tcp_sacktag_bsearch(sk, skip_to_seq); |
|---|
| 1625 | 1778 | } |
|---|
| 1626 | 1779 | |
|---|
| 1627 | 1780 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 1634 | 1787 | return skb; |
|---|
| 1635 | 1788 | |
|---|
| 1636 | 1789 | if (before(next_dup->start_seq, skip_to_seq)) { |
|---|
| 1637 | | - skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); |
|---|
| 1790 | + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); |
|---|
| 1638 | 1791 | skb = tcp_sacktag_walk(skb, sk, NULL, state, |
|---|
| 1639 | 1792 | next_dup->start_seq, next_dup->end_seq, |
|---|
| 1640 | 1793 | 1); |
|---|
| .. | .. |
|---|
| 1672 | 1825 | tcp_highest_sack_reset(sk); |
|---|
| 1673 | 1826 | |
|---|
| 1674 | 1827 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
|---|
| 1675 | | - num_sacks, prior_snd_una); |
|---|
| 1676 | | - if (found_dup_sack) { |
|---|
| 1677 | | - state->flag |= FLAG_DSACKING_ACK; |
|---|
| 1678 | | - tp->delivered++; /* A spurious retransmission is delivered */ |
|---|
| 1679 | | - } |
|---|
| 1828 | + num_sacks, prior_snd_una, state); |
|---|
| 1680 | 1829 | |
|---|
| 1681 | 1830 | /* Eliminate too old ACKs, but take into |
|---|
| 1682 | 1831 | * account more or less fresh ones, they can |
|---|
| .. | .. |
|---|
| 1778 | 1927 | |
|---|
| 1779 | 1928 | /* Head todo? */ |
|---|
| 1780 | 1929 | if (before(start_seq, cache->start_seq)) { |
|---|
| 1781 | | - skb = tcp_sacktag_skip(skb, sk, state, |
|---|
| 1782 | | - start_seq); |
|---|
| 1930 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
|---|
| 1783 | 1931 | skb = tcp_sacktag_walk(skb, sk, next_dup, |
|---|
| 1784 | 1932 | state, |
|---|
| 1785 | 1933 | start_seq, |
|---|
| .. | .. |
|---|
| 1805 | 1953 | goto walk; |
|---|
| 1806 | 1954 | } |
|---|
| 1807 | 1955 | |
|---|
| 1808 | | - skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); |
|---|
| 1956 | + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
|---|
| 1809 | 1957 | /* Check overlap against next cached too (past this one already) */ |
|---|
| 1810 | 1958 | cache++; |
|---|
| 1811 | 1959 | continue; |
|---|
| .. | .. |
|---|
| 1816 | 1964 | if (!skb) |
|---|
| 1817 | 1965 | break; |
|---|
| 1818 | 1966 | } |
|---|
| 1819 | | - skb = tcp_sacktag_skip(skb, sk, state, start_seq); |
|---|
| 1967 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
|---|
| 1820 | 1968 | |
|---|
| 1821 | 1969 | walk: |
|---|
| 1822 | 1970 | skb = tcp_sacktag_walk(skb, sk, next_dup, state, |
|---|
| .. | .. |
|---|
| 1878 | 2026 | return; |
|---|
| 1879 | 2027 | |
|---|
| 1880 | 2028 | tp->reordering = min_t(u32, tp->packets_out + addend, |
|---|
| 1881 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
|---|
| 2029 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
|---|
| 1882 | 2030 | tp->reord_seen++; |
|---|
| 1883 | 2031 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); |
|---|
| 1884 | 2032 | } |
|---|
| 1885 | 2033 | |
|---|
| 1886 | 2034 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
|---|
| 1887 | 2035 | |
|---|
| 1888 | | -static void tcp_add_reno_sack(struct sock *sk) |
|---|
| 2036 | +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) |
|---|
| 1889 | 2037 | { |
|---|
| 1890 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1891 | | - u32 prior_sacked = tp->sacked_out; |
|---|
| 2038 | + if (num_dupack) { |
|---|
| 2039 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2040 | + u32 prior_sacked = tp->sacked_out; |
|---|
| 2041 | + s32 delivered; |
|---|
| 1892 | 2042 | |
|---|
| 1893 | | - tp->sacked_out++; |
|---|
| 1894 | | - tcp_check_reno_reordering(sk, 0); |
|---|
| 1895 | | - if (tp->sacked_out > prior_sacked) |
|---|
| 1896 | | - tp->delivered++; /* Some out-of-order packet is delivered */ |
|---|
| 1897 | | - tcp_verify_left_out(tp); |
|---|
| 2043 | + tp->sacked_out += num_dupack; |
|---|
| 2044 | + tcp_check_reno_reordering(sk, 0); |
|---|
| 2045 | + delivered = tp->sacked_out - prior_sacked; |
|---|
| 2046 | + if (delivered > 0) |
|---|
| 2047 | + tcp_count_delivered(tp, delivered, ece_ack); |
|---|
| 2048 | + tcp_verify_left_out(tp); |
|---|
| 2049 | + } |
|---|
| 1898 | 2050 | } |
|---|
| 1899 | 2051 | |
|---|
| 1900 | 2052 | /* Account for ACK, ACKing some data in Reno Recovery phase. */ |
|---|
| 1901 | 2053 | |
|---|
| 1902 | | -static void tcp_remove_reno_sacks(struct sock *sk, int acked) |
|---|
| 2054 | +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) |
|---|
| 1903 | 2055 | { |
|---|
| 1904 | 2056 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1905 | 2057 | |
|---|
| 1906 | 2058 | if (acked > 0) { |
|---|
| 1907 | 2059 | /* One ACK acked hole. The rest eat duplicate ACKs. */ |
|---|
| 1908 | | - tp->delivered += max_t(int, acked - tp->sacked_out, 1); |
|---|
| 2060 | + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), |
|---|
| 2061 | + ece_ack); |
|---|
| 1909 | 2062 | if (acked - 1 >= tp->sacked_out) |
|---|
| 1910 | 2063 | tp->sacked_out = 0; |
|---|
| 1911 | 2064 | else |
|---|
| .. | .. |
|---|
| 1938 | 2091 | |
|---|
| 1939 | 2092 | static bool tcp_is_rack(const struct sock *sk) |
|---|
| 1940 | 2093 | { |
|---|
| 1941 | | - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; |
|---|
| 2094 | + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & |
|---|
| 2095 | + TCP_RACK_LOSS_DETECTION; |
|---|
| 1942 | 2096 | } |
|---|
| 1943 | 2097 | |
|---|
| 1944 | 2098 | /* If we detect SACK reneging, forget all SACK information |
|---|
| .. | .. |
|---|
| 1982 | 2136 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1983 | 2137 | struct net *net = sock_net(sk); |
|---|
| 1984 | 2138 | bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; |
|---|
| 2139 | + u8 reordering; |
|---|
| 1985 | 2140 | |
|---|
| 1986 | 2141 | tcp_timeout_mark_lost(sk); |
|---|
| 1987 | 2142 | |
|---|
| .. | .. |
|---|
| 2002 | 2157 | /* Timeout in disordered state after receiving substantial DUPACKs |
|---|
| 2003 | 2158 | * suggests that the degree of reordering is over-estimated. |
|---|
| 2004 | 2159 | */ |
|---|
| 2160 | + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); |
|---|
| 2005 | 2161 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && |
|---|
| 2006 | | - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) |
|---|
| 2162 | + tp->sacked_out >= reordering) |
|---|
| 2007 | 2163 | tp->reordering = min_t(unsigned int, tp->reordering, |
|---|
| 2008 | | - net->ipv4.sysctl_tcp_reordering); |
|---|
| 2164 | + reordering); |
|---|
| 2165 | + |
|---|
| 2009 | 2166 | tcp_set_ca_state(sk, TCP_CA_Loss); |
|---|
| 2010 | 2167 | tp->high_seq = tp->snd_nxt; |
|---|
| 2011 | 2168 | tcp_ecn_queue_cwr(tp); |
|---|
| .. | .. |
|---|
| 2014 | 2171 | * loss recovery is underway except recurring timeout(s) on |
|---|
| 2015 | 2172 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing |
|---|
| 2016 | 2173 | */ |
|---|
| 2017 | | - tp->frto = net->ipv4.sysctl_tcp_frto && |
|---|
| 2174 | + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && |
|---|
| 2018 | 2175 | (new_recovery || icsk->icsk_retransmits) && |
|---|
| 2019 | 2176 | !inet_csk(sk)->icsk_mtup.probe_size; |
|---|
| 2020 | 2177 | } |
|---|
| .. | .. |
|---|
| 2031 | 2188 | */ |
|---|
| 2032 | 2189 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) |
|---|
| 2033 | 2190 | { |
|---|
| 2034 | | - if (flag & FLAG_SACK_RENEGING) { |
|---|
| 2191 | + if (flag & FLAG_SACK_RENEGING && |
|---|
| 2192 | + flag & FLAG_SND_UNA_ADVANCED) { |
|---|
| 2035 | 2193 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2036 | 2194 | unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), |
|---|
| 2037 | 2195 | msecs_to_jiffies(10)); |
|---|
| .. | .. |
|---|
| 2172 | 2330 | } |
|---|
| 2173 | 2331 | |
|---|
| 2174 | 2332 | /* Detect loss in event "A" above by marking head of queue up as lost. |
|---|
| 2175 | | - * For non-SACK(Reno) senders, the first "packets" number of segments |
|---|
| 2176 | | - * are considered lost. For RFC3517 SACK, a segment is considered lost if it |
|---|
| 2333 | + * For RFC3517 SACK, a segment is considered lost if it |
|---|
| 2177 | 2334 | * has at least tp->reordering SACKed segments above it; "packets" refers to |
|---|
| 2178 | 2335 | * the maximum SACKed segments to pass before reaching this limit. |
|---|
| 2179 | 2336 | */ |
|---|
| .. | .. |
|---|
| 2181 | 2338 | { |
|---|
| 2182 | 2339 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2183 | 2340 | struct sk_buff *skb; |
|---|
| 2184 | | - int cnt, oldcnt, lost; |
|---|
| 2185 | | - unsigned int mss; |
|---|
| 2341 | + int cnt; |
|---|
| 2186 | 2342 | /* Use SACK to deduce losses of new sequences sent during recovery */ |
|---|
| 2187 | | - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; |
|---|
| 2343 | + const u32 loss_high = tp->snd_nxt; |
|---|
| 2188 | 2344 | |
|---|
| 2189 | 2345 | WARN_ON(packets > tp->packets_out); |
|---|
| 2190 | 2346 | skb = tp->lost_skb_hint; |
|---|
| .. | .. |
|---|
| 2207 | 2363 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) |
|---|
| 2208 | 2364 | break; |
|---|
| 2209 | 2365 | |
|---|
| 2210 | | - oldcnt = cnt; |
|---|
| 2211 | | - if (tcp_is_reno(tp) || |
|---|
| 2212 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
|---|
| 2366 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 2213 | 2367 | cnt += tcp_skb_pcount(skb); |
|---|
| 2214 | 2368 | |
|---|
| 2215 | | - if (cnt > packets) { |
|---|
| 2216 | | - if (tcp_is_sack(tp) || |
|---|
| 2217 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
|---|
| 2218 | | - (oldcnt >= packets)) |
|---|
| 2219 | | - break; |
|---|
| 2369 | + if (cnt > packets) |
|---|
| 2370 | + break; |
|---|
| 2220 | 2371 | |
|---|
| 2221 | | - mss = tcp_skb_mss(skb); |
|---|
| 2222 | | - /* If needed, chop off the prefix to mark as lost. */ |
|---|
| 2223 | | - lost = (packets - oldcnt) * mss; |
|---|
| 2224 | | - if (lost < skb->len && |
|---|
| 2225 | | - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
|---|
| 2226 | | - lost, mss, GFP_ATOMIC) < 0) |
|---|
| 2227 | | - break; |
|---|
| 2228 | | - cnt = packets; |
|---|
| 2229 | | - } |
|---|
| 2230 | | - |
|---|
| 2231 | | - tcp_skb_mark_lost(tp, skb); |
|---|
| 2372 | + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) |
|---|
| 2373 | + tcp_mark_skb_lost(sk, skb); |
|---|
| 2232 | 2374 | |
|---|
| 2233 | 2375 | if (mark_head) |
|---|
| 2234 | 2376 | break; |
|---|
| .. | .. |
|---|
| 2272 | 2414 | */ |
|---|
| 2273 | 2415 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
|---|
| 2274 | 2416 | { |
|---|
| 2275 | | - return !tp->retrans_stamp || |
|---|
| 2417 | + return tp->retrans_stamp && |
|---|
| 2276 | 2418 | tcp_tsopt_ecr_before(tp, tp->retrans_stamp); |
|---|
| 2277 | 2419 | } |
|---|
| 2278 | 2420 | |
|---|
| .. | .. |
|---|
| 2368 | 2510 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
|---|
| 2369 | 2511 | } |
|---|
| 2370 | 2512 | |
|---|
| 2513 | +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) |
|---|
| 2514 | +{ |
|---|
| 2515 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2516 | + |
|---|
| 2517 | + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
|---|
| 2518 | + /* Hold old state until something *above* high_seq |
|---|
| 2519 | + * is ACKed. For Reno it is MUST to prevent false |
|---|
| 2520 | + * fast retransmits (RFC2582). SACK TCP is safe. */ |
|---|
| 2521 | + if (!tcp_any_retrans_done(sk)) |
|---|
| 2522 | + tp->retrans_stamp = 0; |
|---|
| 2523 | + return true; |
|---|
| 2524 | + } |
|---|
| 2525 | + return false; |
|---|
| 2526 | +} |
|---|
| 2527 | + |
|---|
| 2371 | 2528 | /* People celebrate: "We love our President!" */ |
|---|
| 2372 | 2529 | static bool tcp_try_undo_recovery(struct sock *sk) |
|---|
| 2373 | 2530 | { |
|---|
| .. | .. |
|---|
| 2390 | 2547 | } else if (tp->rack.reo_wnd_persist) { |
|---|
| 2391 | 2548 | tp->rack.reo_wnd_persist--; |
|---|
| 2392 | 2549 | } |
|---|
| 2393 | | - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
|---|
| 2394 | | - /* Hold old state until something *above* high_seq |
|---|
| 2395 | | - * is ACKed. For Reno it is MUST to prevent false |
|---|
| 2396 | | - * fast retransmits (RFC2582). SACK TCP is safe. */ |
|---|
| 2397 | | - if (!tcp_any_retrans_done(sk)) |
|---|
| 2398 | | - tp->retrans_stamp = 0; |
|---|
| 2550 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
|---|
| 2399 | 2551 | return true; |
|---|
| 2400 | | - } |
|---|
| 2401 | 2552 | tcp_set_ca_state(sk, TCP_CA_Open); |
|---|
| 2402 | 2553 | tp->is_sack_reneg = 0; |
|---|
| 2403 | 2554 | return false; |
|---|
| .. | .. |
|---|
| 2433 | 2584 | NET_INC_STATS(sock_net(sk), |
|---|
| 2434 | 2585 | LINUX_MIB_TCPSPURIOUSRTOS); |
|---|
| 2435 | 2586 | inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 2587 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
|---|
| 2588 | + return true; |
|---|
| 2436 | 2589 | if (frto_undo || tcp_is_sack(tp)) { |
|---|
| 2437 | 2590 | tcp_set_ca_state(sk, TCP_CA_Open); |
|---|
| 2438 | 2591 | tp->is_sack_reneg = 0; |
|---|
| .. | .. |
|---|
| 2479 | 2632 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + |
|---|
| 2480 | 2633 | tp->prior_cwnd - 1; |
|---|
| 2481 | 2634 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; |
|---|
| 2482 | | - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && |
|---|
| 2483 | | - !(flag & FLAG_LOST_RETRANS)) { |
|---|
| 2635 | + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == |
|---|
| 2636 | + FLAG_RETRANS_DATA_ACKED) { |
|---|
| 2484 | 2637 | sndcnt = min_t(int, delta, |
|---|
| 2485 | 2638 | max_t(int, tp->prr_delivered - tp->prr_out, |
|---|
| 2486 | 2639 | newly_acked_sacked) + 1); |
|---|
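The branch above is the proportional part of PRR (Proportional Rate Reduction); the rewritten second condition is logically equivalent to the old one, just expressed as a single mask test for "retransmitted data was ACKed and no retransmission was lost". A worked example of the proportional part:

```c
/*
 * PRR, worked example (illustrative values):
 *
 *   prior_cwnd    = 10 segments at the time of loss
 *   snd_ssthresh  =  7 (e.g. a CC module with beta ~ 0.7)
 *   prr_delivered =  4 segments delivered so far in recovery
 *   prr_out       =  2 segments (re)transmitted so far in recovery
 *
 *   dividend = ssthresh * prr_delivered + prior_cwnd - 1
 *            = 7 * 4 + 10 - 1 = 37
 *   sndcnt   = dividend / prior_cwnd - prr_out
 *            = 37 / 10 - 2 = 3 - 2 = 1
 *
 * Sending is clocked so that prr_out tracks roughly ssthresh/prior_cwnd of
 * prr_delivered, converging on cwnd == ssthresh by the end of recovery.
 */
```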
| .. | .. |
|---|
| 2566 | 2719 | { |
|---|
| 2567 | 2720 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2568 | 2721 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 2722 | + u64 val; |
|---|
| 2569 | 2723 | |
|---|
| 2570 | | - /* FIXME: breaks with very large cwnd */ |
|---|
| 2571 | 2724 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
|---|
| 2572 | | - tp->snd_cwnd = tp->snd_cwnd * |
|---|
| 2573 | | - tcp_mss_to_mtu(sk, tp->mss_cache) / |
|---|
| 2574 | | - icsk->icsk_mtup.probe_size; |
|---|
| 2725 | + |
|---|
| 2726 | + val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache); |
|---|
| 2727 | + do_div(val, icsk->icsk_mtup.probe_size); |
|---|
| 2728 | + WARN_ON_ONCE((u32)val != val); |
|---|
| 2729 | + tp->snd_cwnd = max_t(u32, 1U, val); |
|---|
| 2730 | + |
|---|
| 2575 | 2731 | tp->snd_cwnd_cnt = 0; |
|---|
| 2576 | 2732 | tp->snd_cwnd_stamp = tcp_jiffies32; |
|---|
| 2577 | 2733 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
|---|
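The rewritten cwnd rescale after a successful MTU probe does the multiply in 64 bits: the old `snd_cwnd * tcp_mss_to_mtu()` product is a `u32` expression and can wrap for very large windows, silently corrupting cwnd instead of scaling it by the old/new MTU ratio; the new code also clamps the result to at least 1 and warns if it no longer fits in 32 bits. A quick illustration of the wrap the `do_div()` form avoids:

```c
/*
 * Illustration (values chosen to show the wrap, not taken from a trace):
 *
 *   tp->snd_cwnd                      = 4,000,000 segments (extreme but possible)
 *   tcp_mss_to_mtu(sk, tp->mss_cache) = 1,500 bytes
 *
 *   32-bit product: 4,000,000 * 1,500 = 6,000,000,000  >  2^32 (~4.29e9)
 *                   -> wraps to ~1.7e9 before the divide, corrupting cwnd.
 *
 *   64-bit form:    val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
 *                   do_div(val, icsk->icsk_mtup.probe_size);
 *                   tp->snd_cwnd = max_t(u32, 1U, val);
 */
```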
| .. | .. |
|---|
| 2594 | 2750 | unsigned int mss = tcp_current_mss(sk); |
|---|
| 2595 | 2751 | |
|---|
| 2596 | 2752 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
|---|
| 2597 | | - if (tcp_skb_seglen(skb) > mss && |
|---|
| 2598 | | - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
|---|
| 2599 | | - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
|---|
| 2600 | | - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
|---|
| 2601 | | - tp->retrans_out -= tcp_skb_pcount(skb); |
|---|
| 2602 | | - } |
|---|
| 2603 | | - tcp_skb_mark_lost_uncond_verify(tp, skb); |
|---|
| 2604 | | - } |
|---|
| 2753 | + if (tcp_skb_seglen(skb) > mss) |
|---|
| 2754 | + tcp_mark_skb_lost(sk, skb); |
|---|
| 2605 | 2755 | } |
|---|
| 2606 | 2756 | |
|---|
| 2607 | 2757 | tcp_clear_retrans_hints_partial(tp); |
|---|
| .. | .. |
|---|
| 2656 | 2806 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
|---|
| 2657 | 2807 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
|---|
| 2658 | 2808 | */ |
|---|
| 2659 | | -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, |
|---|
| 2809 | +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, |
|---|
| 2660 | 2810 | int *rexmit) |
|---|
| 2661 | 2811 | { |
|---|
| 2662 | 2812 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2663 | 2813 | bool recovered = !before(tp->snd_una, tp->high_seq); |
|---|
| 2664 | 2814 | |
|---|
| 2665 | | - if ((flag & FLAG_SND_UNA_ADVANCED) && |
|---|
| 2815 | + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && |
|---|
| 2666 | 2816 | tcp_try_undo_loss(sk, false)) |
|---|
| 2667 | 2817 | return; |
|---|
| 2668 | 2818 | |
|---|
| .. | .. |
|---|
| 2675 | 2825 | return; |
|---|
| 2676 | 2826 | |
|---|
| 2677 | 2827 | if (after(tp->snd_nxt, tp->high_seq)) { |
|---|
| 2678 | | - if (flag & FLAG_DATA_SACKED || is_dupack) |
|---|
| 2828 | + if (flag & FLAG_DATA_SACKED || num_dupack) |
|---|
| 2679 | 2829 | tp->frto = 0; /* Step 3.a. loss was real */ |
|---|
| 2680 | 2830 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { |
|---|
| 2681 | 2831 | tp->high_seq = tp->snd_nxt; |
|---|
| .. | .. |
|---|
| 2701 | 2851 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
|---|
| 2702 | 2852 | * delivered. Lower inflight to clock out (re)transmissions. |
|---|
| 2703 | 2853 | */ |
|---|
| 2704 | | - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) |
|---|
| 2705 | | - tcp_add_reno_sack(sk); |
|---|
| 2854 | + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) |
|---|
| 2855 | + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); |
|---|
| 2706 | 2856 | else if (flag & FLAG_SND_UNA_ADVANCED) |
|---|
| 2707 | 2857 | tcp_reset_reno_sack(tp); |
|---|
| 2708 | 2858 | } |
|---|
| 2709 | 2859 | *rexmit = REXMIT_LOST; |
|---|
| 2710 | 2860 | } |
|---|
| 2711 | 2861 | |
|---|
| 2862 | +static bool tcp_force_fast_retransmit(struct sock *sk) |
|---|
| 2863 | +{ |
|---|
| 2864 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2865 | + |
|---|
| 2866 | + return after(tcp_highest_sack_seq(tp), |
|---|
| 2867 | + tp->snd_una + tp->reordering * tp->mss_cache); |
|---|
| 2868 | +} |
|---|
| 2869 | + |
|---|
| 2712 | 2870 | /* Undo during fast recovery after partial ACK. */ |
|---|
| 2713 | | -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) |
|---|
| 2871 | +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, |
|---|
| 2872 | + bool *do_lost) |
|---|
| 2714 | 2873 | { |
|---|
| 2715 | 2874 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2716 | 2875 | |
|---|
| .. | .. |
|---|
| 2735 | 2894 | tcp_undo_cwnd_reduction(sk, true); |
|---|
| 2736 | 2895 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
|---|
| 2737 | 2896 | tcp_try_keep_open(sk); |
|---|
| 2738 | | - return true; |
|---|
| 2897 | + } else { |
|---|
| 2898 | + /* Partial ACK arrived. Force fast retransmit. */ |
|---|
| 2899 | + *do_lost = tcp_force_fast_retransmit(sk); |
|---|
| 2739 | 2900 | } |
|---|
| 2740 | 2901 | return false; |
|---|
| 2741 | 2902 | } |
|---|
| .. | .. |
|---|
| 2759 | 2920 | } |
|---|
| 2760 | 2921 | } |
|---|
| 2761 | 2922 | |
|---|
| 2762 | | -static bool tcp_force_fast_retransmit(struct sock *sk) |
|---|
| 2763 | | -{ |
|---|
| 2764 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2765 | | - |
|---|
| 2766 | | - return after(tcp_highest_sack_seq(tp), |
|---|
| 2767 | | - tp->snd_una + tp->reordering * tp->mss_cache); |
|---|
| 2768 | | -} |
|---|
| 2769 | | - |
|---|
| 2770 | 2923 | /* Process an event, which can update packets-in-flight not trivially. |
|---|
| 2771 | 2924 | * Main goal of this function is to calculate new estimate for left_out, |
|---|
| 2772 | 2925 | * taking into account both packets sitting in receiver's buffer and |
|---|
| .. | .. |
|---|
| 2780 | 2933 | * tcp_xmit_retransmit_queue(). |
|---|
| 2781 | 2934 | */ |
|---|
| 2782 | 2935 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
|---|
| 2783 | | - bool is_dupack, int *ack_flag, int *rexmit) |
|---|
| 2936 | + int num_dupack, int *ack_flag, int *rexmit) |
|---|
| 2784 | 2937 | { |
|---|
| 2785 | 2938 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 2786 | 2939 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2787 | 2940 | int fast_rexmit = 0, flag = *ack_flag; |
|---|
| 2788 | | - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
|---|
| 2789 | | - tcp_force_fast_retransmit(sk)); |
|---|
| 2941 | + bool ece_ack = flag & FLAG_ECE; |
|---|
| 2942 | + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && |
|---|
| 2943 | + tcp_force_fast_retransmit(sk)); |
|---|
| 2790 | 2944 | |
|---|
| 2791 | 2945 | if (!tp->packets_out && tp->sacked_out) |
|---|
| 2792 | 2946 | tp->sacked_out = 0; |
|---|
| 2793 | 2947 | |
|---|
| 2794 | 2948 | /* Now state machine starts. |
|---|
| 2795 | 2949 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ |
|---|
| 2796 | | - if (flag & FLAG_ECE) |
|---|
| 2950 | + if (ece_ack) |
|---|
| 2797 | 2951 | tp->prior_ssthresh = 0; |
|---|
| 2798 | 2952 | |
|---|
| 2799 | 2953 | /* B. In all the states check for reneging SACKs. */ |
|---|
| .. | .. |
|---|
| 2833 | 2987 | switch (icsk->icsk_ca_state) { |
|---|
| 2834 | 2988 | case TCP_CA_Recovery: |
|---|
| 2835 | 2989 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
|---|
| 2836 | | - if (tcp_is_reno(tp) && is_dupack) |
|---|
| 2837 | | - tcp_add_reno_sack(sk); |
|---|
| 2838 | | - } else { |
|---|
| 2839 | | - if (tcp_try_undo_partial(sk, prior_snd_una)) |
|---|
| 2840 | | - return; |
|---|
| 2841 | | - /* Partial ACK arrived. Force fast retransmit. */ |
|---|
| 2842 | | - do_lost = tcp_is_reno(tp) || |
|---|
| 2843 | | - tcp_force_fast_retransmit(sk); |
|---|
| 2844 | | - } |
|---|
| 2845 | | - if (tcp_try_undo_dsack(sk)) { |
|---|
| 2846 | | - tcp_try_keep_open(sk); |
|---|
| 2990 | + if (tcp_is_reno(tp)) |
|---|
| 2991 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
|---|
| 2992 | + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) |
|---|
| 2847 | 2993 | return; |
|---|
| 2848 | | - } |
|---|
| 2994 | + |
|---|
| 2995 | + if (tcp_try_undo_dsack(sk)) |
|---|
| 2996 | + tcp_try_keep_open(sk); |
|---|
| 2997 | + |
|---|
| 2849 | 2998 | tcp_identify_packet_loss(sk, ack_flag); |
|---|
| 2999 | + if (icsk->icsk_ca_state != TCP_CA_Recovery) { |
|---|
| 3000 | + if (!tcp_time_to_recover(sk, flag)) |
|---|
| 3001 | + return; |
|---|
| 3002 | + /* Undo reverts the recovery state. If loss is evident, |
|---|
| 3003 | + * starts a new recovery (e.g. reordering then loss); |
|---|
| 3004 | + */ |
|---|
| 3005 | + tcp_enter_recovery(sk, ece_ack); |
|---|
| 3006 | + } |
|---|
| 2850 | 3007 | break; |
|---|
| 2851 | 3008 | case TCP_CA_Loss: |
|---|
| 2852 | | - tcp_process_loss(sk, flag, is_dupack, rexmit); |
|---|
| 3009 | + tcp_process_loss(sk, flag, num_dupack, rexmit); |
|---|
| 2853 | 3010 | tcp_identify_packet_loss(sk, ack_flag); |
|---|
| 2854 | 3011 | if (!(icsk->icsk_ca_state == TCP_CA_Open || |
|---|
| 2855 | 3012 | (*ack_flag & FLAG_LOST_RETRANS))) |
|---|
| 2856 | 3013 | return; |
|---|
| 2857 | 3014 | /* Change state if cwnd is undone or retransmits are lost */ |
|---|
| 2858 | | - /* fall through */ |
|---|
| 3015 | + fallthrough; |
|---|
| 2859 | 3016 | default: |
|---|
| 2860 | 3017 | if (tcp_is_reno(tp)) { |
|---|
| 2861 | 3018 | if (flag & FLAG_SND_UNA_ADVANCED) |
|---|
| 2862 | 3019 | tcp_reset_reno_sack(tp); |
|---|
| 2863 | | - if (is_dupack) |
|---|
| 2864 | | - tcp_add_reno_sack(sk); |
|---|
| 3020 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
|---|
| 2865 | 3021 | } |
|---|
| 2866 | 3022 | |
|---|
| 2867 | 3023 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
|---|
| .. | .. |
|---|
| 2885 | 3041 | } |
|---|
| 2886 | 3042 | |
|---|
| 2887 | 3043 | /* Otherwise enter Recovery state */ |
|---|
| 2888 | | - tcp_enter_recovery(sk, (flag & FLAG_ECE)); |
|---|
| 3044 | + tcp_enter_recovery(sk, ece_ack); |
|---|
| 2889 | 3045 | fast_rexmit = 1; |
|---|
| 2890 | 3046 | } |
|---|
| 2891 | 3047 | |
|---|
| .. | .. |
|---|
| 2896 | 3052 | |
|---|
| 2897 | 3053 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) |
|---|
| 2898 | 3054 | { |
|---|
| 2899 | | - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; |
|---|
| 3055 | + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; |
|---|
| 2900 | 3056 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2901 | 3057 | |
|---|
| 2902 | 3058 | if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { |
|---|
| .. | .. |
|---|
| 2935 | 3091 | u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; |
|---|
| 2936 | 3092 | |
|---|
| 2937 | 3093 | if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { |
|---|
| 3094 | + if (!delta) |
|---|
| 3095 | + delta = 1; |
|---|
| 2938 | 3096 | seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); |
|---|
| 2939 | 3097 | ca_rtt_us = seq_rtt_us; |
|---|
| 2940 | 3098 | } |
|---|
| .. | .. |
|---|
| 2988 | 3146 | /* If the retrans timer is currently being used by Fast Open |
|---|
| 2989 | 3147 | * for SYN-ACK retrans purpose, stay put. |
|---|
| 2990 | 3148 | */ |
|---|
| 2991 | | - if (tp->fastopen_rsk) |
|---|
| 3149 | + if (rcu_access_pointer(tp->fastopen_rsk)) |
|---|
| 2992 | 3150 | return; |
|---|
| 2993 | 3151 | |
|---|
| 2994 | 3152 | if (!tp->packets_out) { |
|---|
| .. | .. |
|---|
| 3004 | 3162 | */ |
|---|
| 3005 | 3163 | rto = usecs_to_jiffies(max_t(int, delta_us, 1)); |
|---|
| 3006 | 3164 | } |
|---|
| 3007 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
|---|
| 3008 | | - TCP_RTO_MAX); |
|---|
| 3165 | + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
|---|
| 3166 | + TCP_RTO_MAX); |
|---|
| 3009 | 3167 | } |
|---|
| 3010 | 3168 | } |
|---|
| 3011 | 3169 | |
|---|
| .. | .. |
|---|
| 3061 | 3219 | */ |
|---|
| 3062 | 3220 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
|---|
| 3063 | 3221 | u32 prior_snd_una, |
|---|
| 3064 | | - struct tcp_sacktag_state *sack) |
|---|
| 3222 | + struct tcp_sacktag_state *sack, bool ece_ack) |
|---|
| 3065 | 3223 | { |
|---|
| 3066 | 3224 | const struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 3067 | 3225 | u64 first_ackt, last_ackt; |
|---|
| .. | .. |
|---|
| 3086 | 3244 | u8 sacked = scb->sacked; |
|---|
| 3087 | 3245 | u32 acked_pcount; |
|---|
| 3088 | 3246 | |
|---|
| 3089 | | - tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3090 | | - |
|---|
| 3091 | 3247 | /* Determine how many packets and what bytes were acked, tso and else */ |
|---|
| 3092 | 3248 | if (after(scb->end_seq, tp->snd_una)) { |
|---|
| 3093 | 3249 | if (tcp_skb_pcount(skb) == 1 || |
|---|
| .. | .. |
|---|
| 3107 | 3263 | tp->retrans_out -= acked_pcount; |
|---|
| 3108 | 3264 | flag |= FLAG_RETRANS_DATA_ACKED; |
|---|
| 3109 | 3265 | } else if (!(sacked & TCPCB_SACKED_ACKED)) { |
|---|
| 3110 | | - last_ackt = skb->skb_mstamp; |
|---|
| 3266 | + last_ackt = tcp_skb_timestamp_us(skb); |
|---|
| 3111 | 3267 | WARN_ON_ONCE(last_ackt == 0); |
|---|
| 3112 | 3268 | if (!first_ackt) |
|---|
| 3113 | 3269 | first_ackt = last_ackt; |
|---|
| .. | .. |
|---|
| 3122 | 3278 | if (sacked & TCPCB_SACKED_ACKED) { |
|---|
| 3123 | 3279 | tp->sacked_out -= acked_pcount; |
|---|
| 3124 | 3280 | } else if (tcp_is_sack(tp)) { |
|---|
| 3125 | | - tp->delivered += acked_pcount; |
|---|
| 3281 | + tcp_count_delivered(tp, acked_pcount, ece_ack); |
|---|
| 3126 | 3282 | if (!tcp_skb_spurious_retrans(tp, skb)) |
|---|
| 3127 | 3283 | tcp_rack_advance(tp, sacked, scb->end_seq, |
|---|
| 3128 | | - skb->skb_mstamp); |
|---|
| 3284 | + tcp_skb_timestamp_us(skb)); |
|---|
| 3129 | 3285 | } |
|---|
| 3130 | 3286 | if (sacked & TCPCB_LOST) |
|---|
| 3131 | 3287 | tp->lost_out -= acked_pcount; |
|---|
| .. | .. |
|---|
| 3151 | 3307 | if (!fully_acked) |
|---|
| 3152 | 3308 | break; |
|---|
| 3153 | 3309 | |
|---|
| 3310 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3311 | + |
|---|
| 3154 | 3312 | next = skb_rb_next(skb); |
|---|
| 3155 | 3313 | if (unlikely(skb == tp->retransmit_skb_hint)) |
|---|
| 3156 | 3314 | tp->retransmit_skb_hint = NULL; |
|---|
| .. | .. |
|---|
| 3166 | 3324 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) |
|---|
| 3167 | 3325 | tp->snd_up = tp->snd_una; |
|---|
| 3168 | 3326 | |
|---|
| 3169 | | - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
|---|
| 3170 | | - flag |= FLAG_SACK_RENEGING; |
|---|
| 3327 | + if (skb) { |
|---|
| 3328 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3329 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 3330 | + flag |= FLAG_SACK_RENEGING; |
|---|
| 3331 | + } |
|---|
| 3171 | 3332 | |
|---|
| 3172 | 3333 | if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { |
|---|
| 3173 | 3334 | seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); |
|---|
| .. | .. |
|---|
| 3199 | 3360 | } |
|---|
| 3200 | 3361 | |
|---|
| 3201 | 3362 | if (tcp_is_reno(tp)) { |
|---|
| 3202 | | - tcp_remove_reno_sacks(sk, pkts_acked); |
|---|
| 3363 | + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); |
|---|
| 3203 | 3364 | |
|---|
| 3204 | 3365 | /* If any of the cumulatively ACKed segments was |
|---|
| 3205 | 3366 | * retransmitted, non-SACK case cannot confirm that |
|---|
| .. | .. |
|---|
| 3220 | 3381 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
|---|
| 3221 | 3382 | } |
|---|
| 3222 | 3383 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
|---|
| 3223 | | - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { |
|---|
| 3384 | + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, |
|---|
| 3385 | + tcp_skb_timestamp_us(skb))) { |
|---|
| 3224 | 3386 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
|---|
| 3225 | 3387 | * after when the head was last (re)transmitted. Otherwise the |
|---|
| 3226 | 3388 | * timeout may continue to extend in loss recovery. |
|---|
| .. | .. |
|---|
| 3273 | 3435 | return; |
|---|
| 3274 | 3436 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { |
|---|
| 3275 | 3437 | icsk->icsk_backoff = 0; |
|---|
| 3438 | + icsk->icsk_probes_tstamp = 0; |
|---|
| 3276 | 3439 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
|---|
| 3277 | 3440 | /* Socket must be waked up by subsequent tcp_data_snd_check(). |
|---|
| 3278 | 3441 | * This function is not for random using! |
|---|
| .. | .. |
|---|
| 3280 | 3443 | } else { |
|---|
| 3281 | 3444 | unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); |
|---|
| 3282 | 3445 | |
|---|
| 3283 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
|---|
| 3284 | | - when, TCP_RTO_MAX); |
|---|
| 3446 | + when = tcp_clamp_probe0_to_user_timeout(sk, when); |
|---|
| 3447 | + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); |
|---|
| 3285 | 3448 | } |
|---|
| 3286 | 3449 | } |
|---|
| 3287 | 3450 | |
|---|
| .. | .. |
|---|
| 3300 | 3463 | * new SACK or ECE mark may first advance cwnd here and later reduce |
|---|
| 3301 | 3464 | * cwnd in tcp_fastretrans_alert() based on more states. |
|---|
| 3302 | 3465 | */ |
|---|
| 3303 | | - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) |
|---|
| 3466 | + if (tcp_sk(sk)->reordering > |
|---|
| 3467 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) |
|---|
| 3304 | 3468 | return flag & FLAG_FORWARD_PROGRESS; |
|---|
| 3305 | 3469 | |
|---|
| 3306 | 3470 | return flag & FLAG_DATA_ACKED; |
|---|
| .. | .. |
|---|
| 3409 | 3573 | static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, |
|---|
| 3410 | 3574 | u32 *last_oow_ack_time) |
|---|
| 3411 | 3575 | { |
|---|
| 3412 | | - if (*last_oow_ack_time) { |
|---|
| 3413 | | - s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
|---|
| 3576 | + /* Paired with the WRITE_ONCE() in this function. */ |
|---|
| 3577 | + u32 val = READ_ONCE(*last_oow_ack_time); |
|---|
| 3414 | 3578 | |
|---|
| 3415 | | - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
|---|
| 3579 | + if (val) { |
|---|
| 3580 | + s32 elapsed = (s32)(tcp_jiffies32 - val); |
|---|
| 3581 | + |
|---|
| 3582 | + if (0 <= elapsed && |
|---|
| 3583 | + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { |
|---|
| 3416 | 3584 | NET_INC_STATS(net, mib_idx); |
|---|
| 3417 | 3585 | return true; /* rate-limited: don't send yet! */ |
|---|
| 3418 | 3586 | } |
|---|
| 3419 | 3587 | } |
|---|
| 3420 | 3588 | |
|---|
| 3421 | | - *last_oow_ack_time = tcp_jiffies32; |
|---|
| 3589 | + /* Paired with the prior READ_ONCE() and with itself, |
|---|
| 3590 | + * as we might be lockless. |
|---|
| 3591 | + */ |
|---|
| 3592 | + WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); |
|---|
| 3422 | 3593 | |
|---|
| 3423 | 3594 | return false; /* not rate-limited: go ahead, send dupack now! */ |
|---|
| 3424 | 3595 | } |
|---|
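The __tcp_oow_rate_limited() hunk above turns the last-ACK timestamp into a lockless READ_ONCE()/WRITE_ONCE() pair. As a rough userspace analogue of that pattern (not kernel code), the sketch below uses C11 relaxed atomics in place of READ_ONCE()/WRITE_ONCE(); the millisecond clock and the 500 ms limit are illustrative assumptions, not the sysctl default.

```c
/* Minimal sketch of a time-based "out-of-window reply" rate limiter.
 * atomic_load/store with memory_order_relaxed stand in for the kernel's
 * READ_ONCE()/WRITE_ONCE(); nothing here is kernel API.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint32_t)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

/* Return true when the caller must stay silent (rate-limited). */
static bool oow_rate_limited(_Atomic uint32_t *last_ms, uint32_t limit_ms)
{
	uint32_t val = atomic_load_explicit(last_ms, memory_order_relaxed);

	if (val) {
		int32_t elapsed = (int32_t)(now_ms() - val);

		if (elapsed >= 0 && elapsed < (int32_t)limit_ms)
			return true;
	}
	atomic_store_explicit(last_ms, now_ms(), memory_order_relaxed);
	return false;
}

int main(void)
{
	_Atomic uint32_t last = 0;

	printf("first:  %s\n", oow_rate_limited(&last, 500) ? "limited" : "send");
	printf("second: %s\n", oow_rate_limited(&last, 500) ? "limited" : "send");
	return 0;
}
```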
| .. | .. |
|---|
| 3459 | 3630 | |
|---|
| 3460 | 3631 | /* Then check host-wide RFC 5961 rate limit. */ |
|---|
| 3461 | 3632 | now = jiffies / HZ; |
|---|
| 3462 | | - if (now != challenge_timestamp) { |
|---|
| 3463 | | - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
|---|
| 3633 | + if (now != READ_ONCE(challenge_timestamp)) { |
|---|
| 3634 | + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); |
|---|
| 3464 | 3635 | u32 half = (ack_limit + 1) >> 1; |
|---|
| 3465 | 3636 | |
|---|
| 3466 | | - challenge_timestamp = now; |
|---|
| 3637 | + WRITE_ONCE(challenge_timestamp, now); |
|---|
| 3467 | 3638 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
|---|
| 3468 | 3639 | } |
|---|
| 3469 | 3640 | count = READ_ONCE(challenge_count); |
|---|
| .. | .. |
|---|
| 3544 | 3715 | { |
|---|
| 3545 | 3716 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 3546 | 3717 | |
|---|
| 3547 | | - if (rexmit == REXMIT_NONE) |
|---|
| 3718 | + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) |
|---|
| 3548 | 3719 | return; |
|---|
| 3549 | 3720 | |
|---|
| 3550 | | - if (unlikely(rexmit == 2)) { |
|---|
| 3721 | + if (unlikely(rexmit == REXMIT_NEW)) { |
|---|
| 3551 | 3722 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), |
|---|
| 3552 | 3723 | TCP_NAGLE_OFF); |
|---|
| 3553 | 3724 | if (after(tp->snd_nxt, tp->high_seq)) |
|---|
| .. | .. |
|---|
| 3566 | 3737 | |
|---|
| 3567 | 3738 | delivered = tp->delivered - prior_delivered; |
|---|
| 3568 | 3739 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); |
|---|
| 3569 | | - if (flag & FLAG_ECE) { |
|---|
| 3570 | | - tp->delivered_ce += delivered; |
|---|
| 3740 | + if (flag & FLAG_ECE) |
|---|
| 3571 | 3741 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); |
|---|
| 3572 | | - } |
|---|
| 3742 | + |
|---|
| 3573 | 3743 | return delivered; |
|---|
| 3574 | 3744 | } |
|---|
| 3575 | 3745 | |
|---|
| .. | .. |
|---|
| 3584 | 3754 | bool is_sack_reneg = tp->is_sack_reneg; |
|---|
| 3585 | 3755 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
|---|
| 3586 | 3756 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
|---|
| 3587 | | - bool is_dupack = false; |
|---|
| 3757 | + int num_dupack = 0; |
|---|
| 3588 | 3758 | int prior_packets = tp->packets_out; |
|---|
| 3589 | 3759 | u32 delivered = tp->delivered; |
|---|
| 3590 | 3760 | u32 lost = tp->lost; |
|---|
| .. | .. |
|---|
| 3593 | 3763 | |
|---|
| 3594 | 3764 | sack_state.first_sackt = 0; |
|---|
| 3595 | 3765 | sack_state.rate = &rs; |
|---|
| 3766 | + sack_state.sack_delivered = 0; |
|---|
| 3596 | 3767 | |
|---|
| 3597 | 3768 | /* We very likely will need to access rtx queue. */ |
|---|
| 3598 | 3769 | prefetch(sk->tcp_rtx_queue.rb_node); |
|---|
| .. | .. |
|---|
| 3614 | 3785 | * this segment (RFC793 Section 3.9). |
|---|
| 3615 | 3786 | */ |
|---|
| 3616 | 3787 | if (after(ack, tp->snd_nxt)) |
|---|
| 3617 | | - goto invalid_ack; |
|---|
| 3788 | + return -1; |
|---|
| 3618 | 3789 | |
|---|
| 3619 | 3790 | if (after(ack, prior_snd_una)) { |
|---|
| 3620 | 3791 | flag |= FLAG_SND_UNA_ADVANCED; |
|---|
| 3621 | 3792 | icsk->icsk_retransmits = 0; |
|---|
| 3622 | 3793 | |
|---|
| 3623 | 3794 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
|---|
| 3624 | | - if (static_branch_unlikely(&clean_acked_data_enabled)) |
|---|
| 3795 | + if (static_branch_unlikely(&clean_acked_data_enabled.key)) |
|---|
| 3625 | 3796 | if (icsk->icsk_clean_acked) |
|---|
| 3626 | 3797 | icsk->icsk_clean_acked(sk, ack); |
|---|
| 3627 | 3798 | #endif |
|---|
| .. | .. |
|---|
| 3636 | 3807 | if (flag & FLAG_UPDATE_TS_RECENT) |
|---|
| 3637 | 3808 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); |
|---|
| 3638 | 3809 | |
|---|
| 3639 | | - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
|---|
| 3810 | + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == |
|---|
| 3811 | + FLAG_SND_UNA_ADVANCED) { |
|---|
| 3640 | 3812 | /* Window is constant, pure forward advance. |
|---|
| 3641 | 3813 | * No more checks are required. |
|---|
| 3642 | 3814 | * Note, we use the fact that SND.UNA>=SND.WL2. |
|---|
| .. | .. |
|---|
| 3667 | 3839 | ack_ev_flags |= CA_ACK_ECE; |
|---|
| 3668 | 3840 | } |
|---|
| 3669 | 3841 | |
|---|
| 3842 | + if (sack_state.sack_delivered) |
|---|
| 3843 | + tcp_count_delivered(tp, sack_state.sack_delivered, |
|---|
| 3844 | + flag & FLAG_ECE); |
|---|
| 3845 | + |
|---|
| 3670 | 3846 | if (flag & FLAG_WIN_UPDATE) |
|---|
| 3671 | 3847 | ack_ev_flags |= CA_ACK_WIN_UPDATE; |
|---|
| 3672 | 3848 | |
|---|
| .. | .. |
|---|
| 3692 | 3868 | goto no_queue; |
|---|
| 3693 | 3869 | |
|---|
| 3694 | 3870 | /* See if we can take anything off of the retransmit queue. */ |
|---|
| 3695 | | - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); |
|---|
| 3871 | + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, |
|---|
| 3872 | + flag & FLAG_ECE); |
|---|
| 3696 | 3873 | |
|---|
| 3697 | 3874 | tcp_rack_update_reo_wnd(sk, &rs); |
|---|
| 3698 | 3875 | |
|---|
| .. | .. |
|---|
| 3700 | 3877 | tcp_process_tlp_ack(sk, ack, flag); |
|---|
| 3701 | 3878 | |
|---|
| 3702 | 3879 | if (tcp_ack_is_dubious(sk, flag)) { |
|---|
| 3703 | | - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
|---|
| 3704 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3880 | + if (!(flag & (FLAG_SND_UNA_ADVANCED | |
|---|
| 3881 | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { |
|---|
| 3882 | + num_dupack = 1; |
|---|
| 3883 | + /* Consider if pure acks were aggregated in tcp_add_backlog() */ |
|---|
| 3884 | + if (!(flag & FLAG_DATA)) |
|---|
| 3885 | + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
|---|
| 3886 | + } |
|---|
| 3887 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3705 | 3888 | &rexmit); |
|---|
| 3706 | 3889 | } |
|---|
| 3707 | 3890 | |
|---|
| .. | .. |
|---|
| 3723 | 3906 | no_queue: |
|---|
| 3724 | 3907 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
|---|
| 3725 | 3908 | if (flag & FLAG_DSACKING_ACK) { |
|---|
| 3726 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3909 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3727 | 3910 | &rexmit); |
|---|
| 3728 | 3911 | tcp_newly_delivered(sk, delivered, flag); |
|---|
| 3729 | 3912 | } |
|---|
| .. | .. |
|---|
| 3737 | 3920 | tcp_process_tlp_ack(sk, ack, flag); |
|---|
| 3738 | 3921 | return 1; |
|---|
| 3739 | 3922 | |
|---|
| 3740 | | -invalid_ack: |
|---|
| 3741 | | - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
|---|
| 3742 | | - return -1; |
|---|
| 3743 | | - |
|---|
| 3744 | 3923 | old_ack: |
|---|
| 3745 | 3924 | /* If data was SACKed, tag it and see if we should send more data. |
|---|
| 3746 | 3925 | * If data was DSACKed, see if we can undo a cwnd reduction. |
|---|
| .. | .. |
|---|
| 3748 | 3927 | if (TCP_SKB_CB(skb)->sacked) { |
|---|
| 3749 | 3928 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
|---|
| 3750 | 3929 | &sack_state); |
|---|
| 3751 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3930 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3752 | 3931 | &rexmit); |
|---|
| 3753 | 3932 | tcp_newly_delivered(sk, delivered, flag); |
|---|
| 3754 | 3933 | tcp_xmit_recovery(sk, rexmit); |
|---|
| 3755 | 3934 | } |
|---|
| 3756 | 3935 | |
|---|
| 3757 | | - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
|---|
| 3758 | 3936 | return 0; |
|---|
| 3759 | 3937 | } |
|---|
| 3760 | 3938 | |
|---|
| .. | .. |
|---|
| 3775 | 3953 | foc->exp = exp_opt; |
|---|
| 3776 | 3954 | } |
|---|
| 3777 | 3955 | |
|---|
| 3778 | | -static void smc_parse_options(const struct tcphdr *th, |
|---|
| 3956 | +static bool smc_parse_options(const struct tcphdr *th, |
|---|
| 3779 | 3957 | struct tcp_options_received *opt_rx, |
|---|
| 3780 | 3958 | const unsigned char *ptr, |
|---|
| 3781 | 3959 | int opsize) |
|---|
| .. | .. |
|---|
| 3784 | 3962 | if (static_branch_unlikely(&tcp_have_smc)) { |
|---|
| 3785 | 3963 | if (th->syn && !(opsize & 1) && |
|---|
| 3786 | 3964 | opsize >= TCPOLEN_EXP_SMC_BASE && |
|---|
| 3787 | | - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) |
|---|
| 3965 | + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { |
|---|
| 3788 | 3966 | opt_rx->smc_ok = 1; |
|---|
| 3967 | + return true; |
|---|
| 3968 | + } |
|---|
| 3789 | 3969 | } |
|---|
| 3790 | 3970 | #endif |
|---|
| 3971 | + return false; |
|---|
| 3972 | +} |
|---|
| 3973 | + |
|---|
| 3974 | +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped |
|---|
| 3975 | + * value on success. |
|---|
| 3976 | + */ |
|---|
| 3977 | +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) |
|---|
| 3978 | +{ |
|---|
| 3979 | + const unsigned char *ptr = (const unsigned char *)(th + 1); |
|---|
| 3980 | + int length = (th->doff * 4) - sizeof(struct tcphdr); |
|---|
| 3981 | + u16 mss = 0; |
|---|
| 3982 | + |
|---|
| 3983 | + while (length > 0) { |
|---|
| 3984 | + int opcode = *ptr++; |
|---|
| 3985 | + int opsize; |
|---|
| 3986 | + |
|---|
| 3987 | + switch (opcode) { |
|---|
| 3988 | + case TCPOPT_EOL: |
|---|
| 3989 | + return mss; |
|---|
| 3990 | + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
|---|
| 3991 | + length--; |
|---|
| 3992 | + continue; |
|---|
| 3993 | + default: |
|---|
| 3994 | + if (length < 2) |
|---|
| 3995 | + return mss; |
|---|
| 3996 | + opsize = *ptr++; |
|---|
| 3997 | + if (opsize < 2) /* "silly options" */ |
|---|
| 3998 | + return mss; |
|---|
| 3999 | + if (opsize > length) |
|---|
| 4000 | + return mss; /* fail on partial options */ |
|---|
| 4001 | + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { |
|---|
| 4002 | + u16 in_mss = get_unaligned_be16(ptr); |
|---|
| 4003 | + |
|---|
| 4004 | + if (in_mss) { |
|---|
| 4005 | + if (user_mss && user_mss < in_mss) |
|---|
| 4006 | + in_mss = user_mss; |
|---|
| 4007 | + mss = in_mss; |
|---|
| 4008 | + } |
|---|
| 4009 | + } |
|---|
| 4010 | + ptr += opsize - 2; |
|---|
| 4011 | + length -= opsize; |
|---|
| 4012 | + } |
|---|
| 4013 | + } |
|---|
| 4014 | + return mss; |
|---|
| 3791 | 4015 | } |
|---|
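tcp_parse_mss_option() above walks the raw TCP option space: EOL terminates, NOP consumes one byte, and every other option must carry a length of at least 2 that fits in the remaining space, which is what makes the parse safe on untrusted SYNs. Below is a minimal userspace sketch of the same walk, with illustrative constants and a hand-built option buffer; nothing in it is kernel API.

```c
/* Userspace sketch of the EOL/NOP/length-checked option walk used by
 * tcp_parse_mss_option(). The sample buffer in main() is illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL  0
#define TCPOPT_NOP  1
#define TCPOPT_MSS  2
#define TCPOLEN_MSS 4

static uint16_t parse_mss(const uint8_t *ptr, int length, uint16_t user_mss)
{
	uint16_t mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return mss;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return mss;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return mss;	/* silly or partial option */
			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				uint16_t in_mss = (uint16_t)((ptr[0] << 8) | ptr[1]);

				if (in_mss)
					mss = (user_mss && user_mss < in_mss) ?
					      user_mss : in_mss;
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

int main(void)
{
	/* NOP, NOP, MSS(1460) */
	const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

	printf("mss=%u\n", parse_mss(opts, (int)sizeof(opts), 0));
	return 0;
}
```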
| 3792 | 4016 | |
|---|
| 3793 | 4017 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
|---|
| .. | .. |
|---|
| 3805 | 4029 | |
|---|
| 3806 | 4030 | ptr = (const unsigned char *)(th + 1); |
|---|
| 3807 | 4031 | opt_rx->saw_tstamp = 0; |
|---|
| 4032 | + opt_rx->saw_unknown = 0; |
|---|
| 3808 | 4033 | |
|---|
| 3809 | 4034 | while (length > 0) { |
|---|
| 3810 | 4035 | int opcode = *ptr++; |
|---|
| .. | .. |
|---|
| 3817 | 4042 | length--; |
|---|
| 3818 | 4043 | continue; |
|---|
| 3819 | 4044 | default: |
|---|
| 4045 | + if (length < 2) |
|---|
| 4046 | + return; |
|---|
| 3820 | 4047 | opsize = *ptr++; |
|---|
| 3821 | 4048 | if (opsize < 2) /* "silly options" */ |
|---|
| 3822 | 4049 | return; |
|---|
| .. | .. |
|---|
| 3836 | 4063 | break; |
|---|
| 3837 | 4064 | case TCPOPT_WINDOW: |
|---|
| 3838 | 4065 | if (opsize == TCPOLEN_WINDOW && th->syn && |
|---|
| 3839 | | - !estab && net->ipv4.sysctl_tcp_window_scaling) { |
|---|
| 4066 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { |
|---|
| 3840 | 4067 | __u8 snd_wscale = *(__u8 *)ptr; |
|---|
| 3841 | 4068 | opt_rx->wscale_ok = 1; |
|---|
| 3842 | 4069 | if (snd_wscale > TCP_MAX_WSCALE) { |
|---|
| .. | .. |
|---|
| 3852 | 4079 | case TCPOPT_TIMESTAMP: |
|---|
| 3853 | 4080 | if ((opsize == TCPOLEN_TIMESTAMP) && |
|---|
| 3854 | 4081 | ((estab && opt_rx->tstamp_ok) || |
|---|
| 3855 | | - (!estab && net->ipv4.sysctl_tcp_timestamps))) { |
|---|
| 4082 | + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { |
|---|
| 3856 | 4083 | opt_rx->saw_tstamp = 1; |
|---|
| 3857 | 4084 | opt_rx->rcv_tsval = get_unaligned_be32(ptr); |
|---|
| 3858 | 4085 | opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); |
|---|
| .. | .. |
|---|
| 3860 | 4087 | break; |
|---|
| 3861 | 4088 | case TCPOPT_SACK_PERM: |
|---|
| 3862 | 4089 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
|---|
| 3863 | | - !estab && net->ipv4.sysctl_tcp_sack) { |
|---|
| 4090 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { |
|---|
| 3864 | 4091 | opt_rx->sack_ok = TCP_SACK_SEEN; |
|---|
| 3865 | 4092 | tcp_sack_reset(opt_rx); |
|---|
| 3866 | 4093 | } |
|---|
| .. | .. |
|---|
| 3893 | 4120 | */ |
|---|
| 3894 | 4121 | if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && |
|---|
| 3895 | 4122 | get_unaligned_be16(ptr) == |
|---|
| 3896 | | - TCPOPT_FASTOPEN_MAGIC) |
|---|
| 4123 | + TCPOPT_FASTOPEN_MAGIC) { |
|---|
| 3897 | 4124 | tcp_parse_fastopen_option(opsize - |
|---|
| 3898 | 4125 | TCPOLEN_EXP_FASTOPEN_BASE, |
|---|
| 3899 | 4126 | ptr + 2, th->syn, foc, true); |
|---|
| 3900 | | - else |
|---|
| 3901 | | - smc_parse_options(th, opt_rx, ptr, |
|---|
| 3902 | | - opsize); |
|---|
| 4127 | + break; |
|---|
| 4128 | + } |
|---|
| 4129 | + |
|---|
| 4130 | + if (smc_parse_options(th, opt_rx, ptr, opsize)) |
|---|
| 4131 | + break; |
|---|
| 4132 | + |
|---|
| 4133 | + opt_rx->saw_unknown = 1; |
|---|
| 3903 | 4134 | break; |
|---|
| 3904 | 4135 | |
|---|
| 4136 | + default: |
|---|
| 4137 | + opt_rx->saw_unknown = 1; |
|---|
| 3905 | 4138 | } |
|---|
| 3906 | 4139 | ptr += opsize-2; |
|---|
| 3907 | 4140 | length -= opsize; |
|---|
| .. | .. |
|---|
| 4109 | 4342 | |
|---|
| 4110 | 4343 | inet_csk_schedule_ack(sk); |
|---|
| 4111 | 4344 | |
|---|
| 4112 | | - sk->sk_shutdown |= RCV_SHUTDOWN; |
|---|
| 4345 | + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); |
|---|
| 4113 | 4346 | sock_set_flag(sk, SOCK_DONE); |
|---|
| 4114 | 4347 | |
|---|
| 4115 | 4348 | switch (sk->sk_state) { |
|---|
| .. | .. |
|---|
| 4117 | 4350 | case TCP_ESTABLISHED: |
|---|
| 4118 | 4351 | /* Move to CLOSE_WAIT */ |
|---|
| 4119 | 4352 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
|---|
| 4120 | | - inet_csk(sk)->icsk_ack.pingpong = 1; |
|---|
| 4353 | + inet_csk_enter_pingpong_mode(sk); |
|---|
| 4121 | 4354 | break; |
|---|
| 4122 | 4355 | |
|---|
| 4123 | 4356 | case TCP_CLOSE_WAIT: |
|---|
| .. | .. |
|---|
| 4189 | 4422 | { |
|---|
| 4190 | 4423 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 4191 | 4424 | |
|---|
| 4192 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
|---|
| 4425 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
|---|
| 4193 | 4426 | int mib_idx; |
|---|
| 4194 | 4427 | |
|---|
| 4195 | 4428 | if (before(seq, tp->rcv_nxt)) |
|---|
| .. | .. |
|---|
| 4215 | 4448 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
|---|
| 4216 | 4449 | } |
|---|
| 4217 | 4450 | |
|---|
| 4451 | +static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) |
|---|
| 4452 | +{ |
|---|
| 4453 | + /* When the ACK path fails or drops most ACKs, the sender would |
|---|
| 4454 | + * timeout and spuriously retransmit the same segment repeatedly. |
|---|
| 4455 | + * The receiver remembers and reflects via DSACKs. Leverage the |
|---|
| 4456 | + * DSACK state and change the txhash to re-route speculatively. |
|---|
| 4457 | + */ |
|---|
| 4458 | + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && |
|---|
| 4459 | + sk_rethink_txhash(sk)) |
|---|
| 4460 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); |
|---|
| 4461 | +} |
|---|
| 4462 | + |
|---|
| 4218 | 4463 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) |
|---|
| 4219 | 4464 | { |
|---|
| 4220 | 4465 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 4224 | 4469 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
|---|
| 4225 | 4470 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
|---|
| 4226 | 4471 | |
|---|
| 4227 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
|---|
| 4472 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
|---|
| 4228 | 4473 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
|---|
| 4229 | 4474 | |
|---|
| 4475 | + tcp_rcv_spurious_retrans(sk, skb); |
|---|
| 4230 | 4476 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
|---|
| 4231 | 4477 | end_seq = tp->rcv_nxt; |
|---|
| 4232 | 4478 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); |
|---|
| .. | .. |
|---|
| 4260 | 4506 | sp[i] = sp[i + 1]; |
|---|
| 4261 | 4507 | continue; |
|---|
| 4262 | 4508 | } |
|---|
| 4263 | | - this_sack++, swalk++; |
|---|
| 4509 | + this_sack++; |
|---|
| 4510 | + swalk++; |
|---|
| 4264 | 4511 | } |
|---|
| 4265 | 4512 | } |
|---|
| 4513 | + |
|---|
| 4514 | +static void tcp_sack_compress_send_ack(struct sock *sk) |
|---|
| 4515 | +{ |
|---|
| 4516 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 4517 | + |
|---|
| 4518 | + if (!tp->compressed_ack) |
|---|
| 4519 | + return; |
|---|
| 4520 | + |
|---|
| 4521 | + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
|---|
| 4522 | + __sock_put(sk); |
|---|
| 4523 | + |
|---|
| 4524 | + /* Since we have to send one ack finally, |
|---|
| 4525 | + * subtract one from tp->compressed_ack to keep |
|---|
| 4526 | + * LINUX_MIB_TCPACKCOMPRESSED accurate. |
|---|
| 4527 | + */ |
|---|
| 4528 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
|---|
| 4529 | + tp->compressed_ack - 1); |
|---|
| 4530 | + |
|---|
| 4531 | + tp->compressed_ack = 0; |
|---|
| 4532 | + tcp_send_ack(sk); |
|---|
| 4533 | +} |
|---|
| 4534 | + |
|---|
| 4535 | +/* Reasonable amount of sack blocks included in TCP SACK option |
|---|
| 4536 | + * The max is 4, but this becomes 3 if TCP timestamps are there. |
|---|
| 4537 | + * Given that SACK packets might be lost, be conservative and use 2. |
|---|
| 4538 | + */ |
|---|
| 4539 | +#define TCP_SACK_BLOCKS_EXPECTED 2 |
|---|
| 4266 | 4540 | |
|---|
| 4267 | 4541 | static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) |
|---|
| 4268 | 4542 | { |
|---|
| .. | .. |
|---|
| 4276 | 4550 | |
|---|
| 4277 | 4551 | for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { |
|---|
| 4278 | 4552 | if (tcp_sack_extend(sp, seq, end_seq)) { |
|---|
| 4553 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
|---|
| 4554 | + tcp_sack_compress_send_ack(sk); |
|---|
| 4279 | 4555 | /* Rotate this_sack to the first one. */ |
|---|
| 4280 | 4556 | for (; this_sack > 0; this_sack--, sp--) |
|---|
| 4281 | 4557 | swap(*sp, *(sp - 1)); |
|---|
| .. | .. |
|---|
| 4285 | 4561 | } |
|---|
| 4286 | 4562 | } |
|---|
| 4287 | 4563 | |
|---|
| 4564 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
|---|
| 4565 | + tcp_sack_compress_send_ack(sk); |
|---|
| 4566 | + |
|---|
| 4288 | 4567 | /* Could not find an adjacent existing SACK, build a new one, |
|---|
| 4289 | 4568 | * put it at the front, and shift everyone else down. We |
|---|
| 4290 | 4569 | * always know there is at least one SACK present already here. |
|---|
| .. | .. |
|---|
| 4292 | 4571 | * If the sack array is full, forget about the last one. |
|---|
| 4293 | 4572 | */ |
|---|
| 4294 | 4573 | if (this_sack >= TCP_NUM_SACKS) { |
|---|
| 4295 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
|---|
| 4296 | | - tcp_send_ack(sk); |
|---|
| 4297 | 4574 | this_sack--; |
|---|
| 4298 | 4575 | tp->rx_opt.num_sacks--; |
|---|
| 4299 | 4576 | sp--; |
|---|
| .. | .. |
|---|
| 4345 | 4622 | /** |
|---|
| 4346 | 4623 | * tcp_try_coalesce - try to merge skb to prior one |
|---|
| 4347 | 4624 | * @sk: socket |
|---|
| 4348 | | - * @dest: destination queue |
|---|
| 4349 | 4625 | * @to: prior buffer |
|---|
| 4350 | 4626 | * @from: buffer to add in queue |
|---|
| 4351 | 4627 | * @fragstolen: pointer to boolean |
|---|
| .. | .. |
|---|
| 4367 | 4643 | |
|---|
| 4368 | 4644 | /* Its possible this segment overlaps with prior segment in queue */ |
|---|
| 4369 | 4645 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) |
|---|
| 4646 | + return false; |
|---|
| 4647 | + |
|---|
| 4648 | + if (!mptcp_skb_can_collapse(to, from)) |
|---|
| 4370 | 4649 | return false; |
|---|
| 4371 | 4650 | |
|---|
| 4372 | 4651 | #ifdef CONFIG_TLS_DEVICE |
|---|
| .. | .. |
|---|
| 4412 | 4691 | |
|---|
| 4413 | 4692 | static void tcp_drop(struct sock *sk, struct sk_buff *skb) |
|---|
| 4414 | 4693 | { |
|---|
| 4694 | + trace_android_vh_kfree_skb(skb); |
|---|
| 4415 | 4695 | sk_drops_add(sk, skb); |
|---|
| 4416 | 4696 | __kfree_skb(skb); |
|---|
| 4417 | 4697 | } |
|---|
| .. | .. |
|---|
| 4443 | 4723 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
|---|
| 4444 | 4724 | |
|---|
| 4445 | 4725 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
|---|
| 4446 | | - SOCK_DEBUG(sk, "ofo packet was already received\n"); |
|---|
| 4447 | 4726 | tcp_drop(sk, skb); |
|---|
| 4448 | 4727 | continue; |
|---|
| 4449 | 4728 | } |
|---|
| 4450 | | - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", |
|---|
| 4451 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
|---|
| 4452 | | - TCP_SKB_CB(skb)->end_seq); |
|---|
| 4453 | 4729 | |
|---|
| 4454 | 4730 | tail = skb_peek_tail(&sk->sk_receive_queue); |
|---|
| 4455 | 4731 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
|---|
| .. | .. |
|---|
| 4511 | 4787 | tp->pred_flags = 0; |
|---|
| 4512 | 4788 | inet_csk_schedule_ack(sk); |
|---|
| 4513 | 4789 | |
|---|
| 4790 | + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
|---|
| 4514 | 4791 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); |
|---|
| 4515 | 4792 | seq = TCP_SKB_CB(skb)->seq; |
|---|
| 4516 | 4793 | end_seq = TCP_SKB_CB(skb)->end_seq; |
|---|
| 4517 | | - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
|---|
| 4518 | | - tp->rcv_nxt, seq, end_seq); |
|---|
| 4519 | 4794 | |
|---|
| 4520 | 4795 | p = &tp->out_of_order_queue.rb_node; |
|---|
| 4521 | 4796 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
|---|
| .. | .. |
|---|
| 4541 | 4816 | * and trigger fast retransmit. |
|---|
| 4542 | 4817 | */ |
|---|
| 4543 | 4818 | if (tcp_is_sack(tp)) |
|---|
| 4544 | | - tcp_grow_window(sk, skb); |
|---|
| 4819 | + tcp_grow_window(sk, skb, true); |
|---|
| 4545 | 4820 | kfree_skb_partial(skb, fragstolen); |
|---|
| 4546 | 4821 | skb = NULL; |
|---|
| 4547 | 4822 | goto add_sack; |
|---|
| .. | .. |
|---|
| 4629 | 4904 | * and trigger fast retransmit. |
|---|
| 4630 | 4905 | */ |
|---|
| 4631 | 4906 | if (tcp_is_sack(tp)) |
|---|
| 4632 | | - tcp_grow_window(sk, skb); |
|---|
| 4907 | + tcp_grow_window(sk, skb, false); |
|---|
| 4633 | 4908 | skb_condense(skb); |
|---|
| 4634 | 4909 | skb_set_owner_r(skb, sk); |
|---|
| 4635 | 4910 | } |
|---|
| 4636 | 4911 | } |
|---|
| 4637 | 4912 | |
|---|
| 4638 | | -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, |
|---|
| 4639 | | - bool *fragstolen) |
|---|
| 4913 | +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, |
|---|
| 4914 | + bool *fragstolen) |
|---|
| 4640 | 4915 | { |
|---|
| 4641 | 4916 | int eaten; |
|---|
| 4642 | 4917 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); |
|---|
| 4643 | 4918 | |
|---|
| 4644 | | - __skb_pull(skb, hdrlen); |
|---|
| 4645 | 4919 | eaten = (tail && |
|---|
| 4646 | 4920 | tcp_try_coalesce(sk, tail, |
|---|
| 4647 | 4921 | skb, fragstolen)) ? 1 : 0; |
|---|
| .. | .. |
|---|
| 4692 | 4966 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
|---|
| 4693 | 4967 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
|---|
| 4694 | 4968 | |
|---|
| 4695 | | - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
|---|
| 4969 | + if (tcp_queue_rcv(sk, skb, &fragstolen)) { |
|---|
| 4696 | 4970 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
|---|
| 4697 | 4971 | __kfree_skb(skb); |
|---|
| 4698 | 4972 | } |
|---|
| .. | .. |
|---|
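For context, the challenge-ACK path touched above keeps a host-wide per-second budget: on each new second the budget is reset to half the sysctl limit plus a random share, and every challenge ACK sent consumes one unit. The sketch below is a hedged userspace rendering of that accounting only; the names, the limit value, the one-second clock and the plain decrement are assumptions, and the kernel's READ_ONCE()/WRITE_ONCE() handling is omitted.

```c
/* Illustrative per-second challenge-ACK budget, loosely modelled on the
 * RFC 5961 limiter above. Not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static uint32_t challenge_timestamp;
static uint32_t challenge_count;

static bool challenge_ack_allowed(uint32_t ack_limit)
{
	uint32_t now = (uint32_t)time(NULL);

	if (now != challenge_timestamp) {
		uint32_t half = (ack_limit + 1) >> 1;

		challenge_timestamp = now;
		challenge_count = half + (uint32_t)rand() % ack_limit;
	}
	if (challenge_count > 0) {
		challenge_count--;
		return true;	/* send the challenge ACK */
	}
	return false;		/* budget exhausted for this second */
}

int main(void)
{
	int sent = 0;

	srand((unsigned int)time(NULL));
	for (int i = 0; i < 2000; i++)
		sent += challenge_ack_allowed(1000);
	printf("sent %d challenge ACKs this second\n", sent);
	return 0;
}
```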
| 4724 | 4998 | bool fragstolen; |
|---|
| 4725 | 4999 | int eaten; |
|---|
| 4726 | 5000 | |
|---|
| 5001 | + if (sk_is_mptcp(sk)) |
|---|
| 5002 | + mptcp_incoming_options(sk, skb); |
|---|
| 5003 | + |
|---|
| 4727 | 5004 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { |
|---|
| 4728 | 5005 | __kfree_skb(skb); |
|---|
| 4729 | 5006 | return; |
|---|
| .. | .. |
|---|
| 4753 | 5030 | goto drop; |
|---|
| 4754 | 5031 | } |
|---|
| 4755 | 5032 | |
|---|
| 4756 | | - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); |
|---|
| 5033 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
|---|
| 4757 | 5034 | if (skb->len) |
|---|
| 4758 | 5035 | tcp_event_data_recv(sk, skb); |
|---|
| 4759 | 5036 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
|---|
| .. | .. |
|---|
| 4782 | 5059 | } |
|---|
| 4783 | 5060 | |
|---|
| 4784 | 5061 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
|---|
| 5062 | + tcp_rcv_spurious_retrans(sk, skb); |
|---|
| 4785 | 5063 | /* A retransmit, 2nd most common case. Force an immediate ack. */ |
|---|
| 4786 | 5064 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
|---|
| 4787 | 5065 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
|---|
| .. | .. |
|---|
| 4800 | 5078 | |
|---|
| 4801 | 5079 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
|---|
| 4802 | 5080 | /* Partial packet, seq < rcv_next < end_seq */ |
|---|
| 4803 | | - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", |
|---|
| 4804 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
|---|
| 4805 | | - TCP_SKB_CB(skb)->end_seq); |
|---|
| 4806 | | - |
|---|
| 4807 | 5081 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); |
|---|
| 4808 | 5082 | |
|---|
| 4809 | 5083 | /* If window is closed, drop tail of packet. But after |
|---|
| .. | .. |
|---|
| 4897 | 5171 | /* The first skb to collapse is: |
|---|
| 4898 | 5172 | * - not SYN/FIN and |
|---|
| 4899 | 5173 | * - bloated or contains data before "start" or |
|---|
| 4900 | | - * overlaps to the next one. |
|---|
| 5174 | + * overlaps to the next one and mptcp allow collapsing. |
|---|
| 4901 | 5175 | */ |
|---|
| 4902 | 5176 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
|---|
| 4903 | 5177 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
|---|
| .. | .. |
|---|
| 4906 | 5180 | break; |
|---|
| 4907 | 5181 | } |
|---|
| 4908 | 5182 | |
|---|
| 4909 | | - if (n && n != tail && |
|---|
| 5183 | + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && |
|---|
| 4910 | 5184 | TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { |
|---|
| 4911 | 5185 | end_of_skbs = false; |
|---|
| 4912 | 5186 | break; |
|---|
| .. | .. |
|---|
| 4939 | 5213 | else |
|---|
| 4940 | 5214 | __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ |
|---|
| 4941 | 5215 | skb_set_owner_r(nskb, sk); |
|---|
| 5216 | + mptcp_skb_ext_move(nskb, skb); |
|---|
| 4942 | 5217 | |
|---|
| 4943 | 5218 | /* Copy data, releasing collapsed skbs. */ |
|---|
| 4944 | 5219 | while (copy > 0) { |
|---|
| .. | .. |
|---|
| 4958 | 5233 | skb = tcp_collapse_one(sk, skb, list, root); |
|---|
| 4959 | 5234 | if (!skb || |
|---|
| 4960 | 5235 | skb == tail || |
|---|
| 5236 | + !mptcp_skb_can_collapse(nskb, skb) || |
|---|
| 4961 | 5237 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
|---|
| 4962 | 5238 | goto end; |
|---|
| 4963 | 5239 | #ifdef CONFIG_TLS_DEVICE |
|---|
| .. | .. |
|---|
| 5082 | 5358 | { |
|---|
| 5083 | 5359 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5084 | 5360 | |
|---|
| 5085 | | - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); |
|---|
| 5086 | | - |
|---|
| 5087 | 5361 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); |
|---|
| 5088 | 5362 | |
|---|
| 5089 | 5363 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
|---|
| .. | .. |
|---|
| 5149 | 5423 | return true; |
|---|
| 5150 | 5424 | } |
|---|
| 5151 | 5425 | |
|---|
| 5152 | | -/* When incoming ACK allowed to free some skb from write_queue, |
|---|
| 5153 | | - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket |
|---|
| 5154 | | - * on the exit from tcp input handler. |
|---|
| 5155 | | - * |
|---|
| 5156 | | - * PROBLEM: sndbuf expansion does not work well with largesend. |
|---|
| 5157 | | - */ |
|---|
| 5158 | 5426 | static void tcp_new_space(struct sock *sk) |
|---|
| 5159 | 5427 | { |
|---|
| 5160 | 5428 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 5167 | 5435 | sk->sk_write_space(sk); |
|---|
| 5168 | 5436 | } |
|---|
| 5169 | 5437 | |
|---|
| 5170 | | -static void tcp_check_space(struct sock *sk) |
|---|
| 5438 | +/* Caller made space either from: |
|---|
| 5439 | + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) |
|---|
| 5440 | + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) |
|---|
| 5441 | + * |
|---|
| 5442 | + * We might be able to generate EPOLLOUT to the application if: |
|---|
| 5443 | + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 |
|---|
| 5444 | + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became |
|---|
| 5445 | + * small enough that tcp_stream_memory_free() decides it |
|---|
| 5446 | + * is time to generate EPOLLOUT. |
|---|
| 5447 | + */ |
|---|
| 5448 | +void tcp_check_space(struct sock *sk) |
|---|
| 5171 | 5449 | { |
|---|
| 5172 | | - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { |
|---|
| 5173 | | - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); |
|---|
| 5174 | | - /* pairs with tcp_poll() */ |
|---|
| 5175 | | - smp_mb(); |
|---|
| 5176 | | - if (sk->sk_socket && |
|---|
| 5177 | | - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
|---|
| 5178 | | - tcp_new_space(sk); |
|---|
| 5179 | | - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
|---|
| 5180 | | - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
|---|
| 5181 | | - } |
|---|
| 5450 | + /* pairs with tcp_poll() */ |
|---|
| 5451 | + smp_mb(); |
|---|
| 5452 | + if (sk->sk_socket && |
|---|
| 5453 | + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
|---|
| 5454 | + tcp_new_space(sk); |
|---|
| 5455 | + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
|---|
| 5456 | + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
|---|
| 5182 | 5457 | } |
|---|
| 5183 | 5458 | } |
|---|
| 5184 | 5459 | |
|---|
| .. | .. |
|---|
| 5220 | 5495 | } |
|---|
| 5221 | 5496 | |
|---|
| 5222 | 5497 | if (!tcp_is_sack(tp) || |
|---|
| 5223 | | - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) |
|---|
| 5498 | + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) |
|---|
| 5224 | 5499 | goto send_now; |
|---|
| 5225 | 5500 | |
|---|
| 5226 | 5501 | if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { |
|---|
| 5227 | 5502 | tp->compressed_ack_rcv_nxt = tp->rcv_nxt; |
|---|
| 5228 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
|---|
| 5229 | | - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
|---|
| 5230 | | - tp->compressed_ack - TCP_FASTRETRANS_THRESH); |
|---|
| 5231 | | - tp->compressed_ack = 0; |
|---|
| 5503 | + tp->dup_ack_counter = 0; |
|---|
| 5232 | 5504 | } |
|---|
| 5233 | | - |
|---|
| 5234 | | - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) |
|---|
| 5505 | + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { |
|---|
| 5506 | + tp->dup_ack_counter++; |
|---|
| 5235 | 5507 | goto send_now; |
|---|
| 5236 | | - |
|---|
| 5508 | + } |
|---|
| 5509 | + tp->compressed_ack++; |
|---|
| 5237 | 5510 | if (hrtimer_is_queued(&tp->compressed_ack_timer)) |
|---|
| 5238 | 5511 | return; |
|---|
| 5239 | 5512 | |
|---|
| .. | .. |
|---|
| 5243 | 5516 | if (tp->srtt_us && tp->srtt_us < rtt) |
|---|
| 5244 | 5517 | rtt = tp->srtt_us; |
|---|
| 5245 | 5518 | |
|---|
| 5246 | | - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, |
|---|
| 5519 | + delay = min_t(unsigned long, |
|---|
| 5520 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), |
|---|
| 5247 | 5521 | rtt * (NSEC_PER_USEC >> 3)/20); |
|---|
| 5248 | 5522 | sock_hold(sk); |
|---|
| 5249 | | - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), |
|---|
| 5250 | | - HRTIMER_MODE_REL_PINNED_SOFT); |
|---|
| 5523 | + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), |
|---|
| 5524 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), |
|---|
| 5525 | + HRTIMER_MODE_REL_PINNED_SOFT); |
|---|
| 5251 | 5526 | } |
|---|
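The timer armed above fires after roughly srtt/20: tp->srtt_us stores the smoothed RTT in microseconds shifted left by 3, so rtt * (NSEC_PER_USEC >> 3) / 20 works out to about one twentieth of the RTT in nanoseconds, clamped by the comp_sack_delay sysctl. A small worked sketch of that arithmetic follows; the 1 ms cap used in main() is an assumed value, not the sysctl default.

```c
/* Worked example of the compressed-ACK delay computation shown above. */
#include <stdio.h>

#define NSEC_PER_USEC 1000UL

static unsigned long comp_sack_delay(unsigned long srtt_us_shifted3,
				     unsigned long sysctl_delay_ns)
{
	unsigned long delay = srtt_us_shifted3 * (NSEC_PER_USEC >> 3) / 20;

	return delay < sysctl_delay_ns ? delay : sysctl_delay_ns;
}

int main(void)
{
	/* srtt of 10 ms -> srtt_us << 3 == 80000; assumed 1 ms cap. */
	printf("delay=%lu ns\n", comp_sack_delay(80000, 1000000));
	return 0;
}
```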
| 5252 | 5527 | |
|---|
| 5253 | 5528 | static inline void tcp_ack_snd_check(struct sock *sk) |
|---|
| .. | .. |
|---|
| 5274 | 5549 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5275 | 5550 | u32 ptr = ntohs(th->urg_ptr); |
|---|
| 5276 | 5551 | |
|---|
| 5277 | | - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
|---|
| 5552 | + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) |
|---|
| 5278 | 5553 | ptr--; |
|---|
| 5279 | 5554 | ptr += ntohl(th->seq); |
|---|
| 5280 | 5555 | |
|---|
| .. | .. |
|---|
| 5328 | 5603 | } |
|---|
| 5329 | 5604 | |
|---|
| 5330 | 5605 | tp->urg_data = TCP_URG_NOTYET; |
|---|
| 5331 | | - tp->urg_seq = ptr; |
|---|
| 5606 | + WRITE_ONCE(tp->urg_seq, ptr); |
|---|
| 5332 | 5607 | |
|---|
| 5333 | 5608 | /* Disable header prediction. */ |
|---|
| 5334 | 5609 | tp->pred_flags = 0; |
|---|
| .. | .. |
|---|
| 5481 | 5756 | goto discard; |
|---|
| 5482 | 5757 | } |
|---|
| 5483 | 5758 | |
|---|
| 5759 | + bpf_skops_parse_hdr(sk, skb); |
|---|
| 5760 | + |
|---|
| 5484 | 5761 | return true; |
|---|
| 5485 | 5762 | |
|---|
| 5486 | 5763 | discard: |
|---|
| .. | .. |
|---|
| 5521 | 5798 | trace_tcp_probe(sk, skb); |
|---|
| 5522 | 5799 | |
|---|
| 5523 | 5800 | tcp_mstamp_refresh(tp); |
|---|
| 5524 | | - if (unlikely(!sk->sk_rx_dst)) |
|---|
| 5801 | + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) |
|---|
| 5525 | 5802 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); |
|---|
| 5526 | 5803 | /* |
|---|
| 5527 | 5804 | * Header prediction. |
|---|
| .. | .. |
|---|
| 5628 | 5905 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); |
|---|
| 5629 | 5906 | |
|---|
| 5630 | 5907 | /* Bulk data transfer: receiver */ |
|---|
| 5631 | | - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, |
|---|
| 5632 | | - &fragstolen); |
|---|
| 5908 | + __skb_pull(skb, tcp_header_len); |
|---|
| 5909 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
|---|
| 5633 | 5910 | |
|---|
| 5634 | 5911 | tcp_event_data_recv(sk, skb); |
|---|
| 5635 | 5912 | |
|---|
| .. | .. |
|---|
| 5691 | 5968 | } |
|---|
| 5692 | 5969 | EXPORT_SYMBOL(tcp_rcv_established); |
|---|
| 5693 | 5970 | |
|---|
| 5971 | +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) |
|---|
| 5972 | +{ |
|---|
| 5973 | + struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 5974 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5975 | + |
|---|
| 5976 | + tcp_mtup_init(sk); |
|---|
| 5977 | + icsk->icsk_af_ops->rebuild_header(sk); |
|---|
| 5978 | + tcp_init_metrics(sk); |
|---|
| 5979 | + |
|---|
| 5980 | + /* Initialize the congestion window to start the transfer. |
|---|
| 5981 | + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
|---|
| 5982 | + * retransmitted. In light of RFC6298 more aggressive 1sec |
|---|
| 5983 | + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
|---|
| 5984 | + * retransmission has occurred. |
|---|
| 5985 | + */ |
|---|
| 5986 | + if (tp->total_retrans > 1 && tp->undo_marker) |
|---|
| 5987 | + tp->snd_cwnd = 1; |
|---|
| 5988 | + else |
|---|
| 5989 | + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); |
|---|
| 5990 | + tp->snd_cwnd_stamp = tcp_jiffies32; |
|---|
| 5991 | + |
|---|
| 5992 | + bpf_skops_established(sk, bpf_op, skb); |
|---|
| 5993 | + /* Initialize congestion control unless BPF initialized it already: */ |
|---|
| 5994 | + if (!icsk->icsk_ca_initialized) |
|---|
| 5995 | + tcp_init_congestion_control(sk); |
|---|
| 5996 | + tcp_init_buffer_space(sk); |
|---|
| 5997 | +} |
|---|
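The comment in tcp_init_transfer() above boils down to a small rule: start from cwnd = 1 per RFC 5681 only when more than one SYN/SYN-ACK retransmission happened (undo_marker set), otherwise use the normal initial window. A trivial sketch of that decision, where the initial window of 10 is only an assumed default:

```c
/* Illustrative initial-cwnd choice mirroring the rule described above. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int initial_cwnd(unsigned int total_retrans, bool undo_marker,
				 unsigned int init_cwnd)
{
	return (total_retrans > 1 && undo_marker) ? 1 : init_cwnd;
}

int main(void)
{
	printf("fresh connect:      cwnd=%u\n", initial_cwnd(0, false, 10));
	printf("retransmitted SYN:  cwnd=%u\n", initial_cwnd(2, true, 10));
	return 0;
}
```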
| 5998 | + |
|---|
| 5694 | 5999 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) |
|---|
| 5695 | 6000 | { |
|---|
| 5696 | 6001 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 5705 | 6010 | sk_mark_napi_id(sk, skb); |
|---|
| 5706 | 6011 | } |
|---|
| 5707 | 6012 | |
|---|
| 5708 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); |
|---|
| 6013 | + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); |
|---|
| 5709 | 6014 | |
|---|
| 5710 | 6015 | /* Prevent spurious tcp_cwnd_restart() on first data |
|---|
| 5711 | 6016 | * packet. |
|---|
| .. | .. |
|---|
| 5760 | 6065 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
|---|
| 5761 | 6066 | |
|---|
| 5762 | 6067 | if (data) { /* Retransmit unacked data in SYN */ |
|---|
| 6068 | + if (tp->total_retrans) |
|---|
| 6069 | + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; |
|---|
| 6070 | + else |
|---|
| 6071 | + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; |
|---|
| 5763 | 6072 | skb_rbtree_walk_from(data) { |
|---|
| 5764 | 6073 | if (__tcp_retransmit_skb(sk, data, 1)) |
|---|
| 5765 | 6074 | break; |
|---|
| .. | .. |
|---|
| 5792 | 6101 | #endif |
|---|
| 5793 | 6102 | } |
|---|
| 5794 | 6103 | |
|---|
| 6104 | +static void tcp_try_undo_spurious_syn(struct sock *sk) |
|---|
| 6105 | +{ |
|---|
| 6106 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 6107 | + u32 syn_stamp; |
|---|
| 6108 | + |
|---|
| 6109 | + /* undo_marker is set when SYN or SYNACK times out. The timeout is |
|---|
| 6110 | + * spurious if the ACK's timestamp option echo value matches the |
|---|
| 6111 | + * original SYN timestamp. |
|---|
| 6112 | + */ |
|---|
| 6113 | + syn_stamp = tp->retrans_stamp; |
|---|
| 6114 | + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && |
|---|
| 6115 | + syn_stamp == tp->rx_opt.rcv_tsecr) |
|---|
| 6116 | + tp->undo_marker = 0; |
|---|
| 6117 | +} |
|---|
| 6118 | + |
|---|
| 5795 | 6119 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
|---|
| 5796 | 6120 | const struct tcphdr *th) |
|---|
| 5797 | 6121 | { |
|---|
| .. | .. |
|---|
| 5815 | 6139 | * the segment and return)" |
|---|
| 5816 | 6140 | */ |
|---|
| 5817 | 6141 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
|---|
| 5818 | | - after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) |
|---|
| 6142 | + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
|---|
| 6143 | + /* Previous FIN/ACK or RST/ACK might be ignored. */ |
|---|
| 6144 | + if (icsk->icsk_retransmits == 0) |
|---|
| 6145 | + inet_csk_reset_xmit_timer(sk, |
|---|
| 6146 | + ICSK_TIME_RETRANS, |
|---|
| 6147 | + TCP_TIMEOUT_MIN, TCP_RTO_MAX); |
|---|
| 5819 | 6148 | goto reset_and_undo; |
|---|
| 6149 | + } |
|---|
| 5820 | 6150 | |
|---|
| 5821 | 6151 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
|---|
| 5822 | 6152 | !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, |
|---|
| .. | .. |
|---|
| 5859 | 6189 | tcp_ecn_rcv_synack(tp, th); |
|---|
| 5860 | 6190 | |
|---|
| 5861 | 6191 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
|---|
| 6192 | + tcp_try_undo_spurious_syn(sk); |
|---|
| 5862 | 6193 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
|---|
| 5863 | 6194 | |
|---|
| 5864 | 6195 | /* Ok.. it's good. Set up sequence numbers and |
|---|
| .. | .. |
|---|
| 5912 | 6243 | return -1; |
|---|
| 5913 | 6244 | if (sk->sk_write_pending || |
|---|
| 5914 | 6245 | icsk->icsk_accept_queue.rskq_defer_accept || |
|---|
| 5915 | | - icsk->icsk_ack.pingpong) { |
|---|
| 6246 | + inet_csk_in_pingpong_mode(sk)) { |
|---|
| 5916 | 6247 | /* Save one ACK. Data will be ready after |
|---|
| 5917 | 6248 | * several ticks, if write_pending is set. |
|---|
| 5918 | 6249 | * |
|---|
| .. | .. |
|---|
| 6017 | 6348 | return 1; |
|---|
| 6018 | 6349 | } |
|---|
| 6019 | 6350 | |
|---|
| 6351 | +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) |
|---|
| 6352 | +{ |
|---|
| 6353 | + struct request_sock *req; |
|---|
| 6354 | + |
|---|
| 6355 | + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows |
|---|
| 6356 | + * undo. If peer SACKs triggered fast recovery, we can't undo here. |
|---|
| 6357 | + */ |
|---|
| 6358 | + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
|---|
| 6359 | + tcp_try_undo_loss(sk, false); |
|---|
| 6360 | + |
|---|
| 6361 | + /* Reset rtx states to prevent spurious retransmits_timed_out() */ |
|---|
| 6362 | + tcp_sk(sk)->retrans_stamp = 0; |
|---|
| 6363 | + inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 6364 | + |
|---|
| 6365 | + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, |
|---|
| 6366 | + * we no longer need req so release it. |
|---|
| 6367 | + */ |
|---|
| 6368 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
|---|
| 6369 | + lockdep_sock_is_held(sk)); |
|---|
| 6370 | + reqsk_fastopen_remove(sk, req, false); |
|---|
| 6371 | + |
|---|
| 6372 | + /* Re-arm the timer because data may have been sent out. |
|---|
| 6373 | + * This is similar to the regular data transmission case |
|---|
| 6374 | + * when new data has just been ack'ed. |
|---|
| 6375 | + * |
|---|
| 6376 | + * (TFO) - we could try to be more aggressive and |
|---|
| 6377 | + * retransmitting any data sooner based on when they |
|---|
| 6378 | + * are sent out. |
|---|
| 6379 | + */ |
|---|
| 6380 | + tcp_rearm_rto(sk); |
|---|
| 6381 | +} |
|---|
| 6382 | + |
|---|
| 6020 | 6383 | /* |
|---|
| 6021 | 6384 | * This function implements the receiving procedure of RFC 793 for |
|---|
| 6022 | 6385 | * all states except ESTABLISHED and TIME_WAIT. |
|---|
| .. | .. |
|---|
| 6079 | 6442 | |
|---|
| 6080 | 6443 | tcp_mstamp_refresh(tp); |
|---|
| 6081 | 6444 | tp->rx_opt.saw_tstamp = 0; |
|---|
| 6082 | | - req = tp->fastopen_rsk; |
|---|
| 6445 | + req = rcu_dereference_protected(tp->fastopen_rsk, |
|---|
| 6446 | + lockdep_sock_is_held(sk)); |
|---|
| 6083 | 6447 | if (req) { |
|---|
| 6084 | 6448 | bool req_stolen; |
|---|
| 6085 | 6449 | |
|---|
| .. | .. |
|---|
| 6113 | 6477 | if (!tp->srtt_us) |
|---|
| 6114 | 6478 | tcp_synack_rtt_meas(sk, req); |
|---|
| 6115 | 6479 | |
|---|
| 6116 | | - /* Once we leave TCP_SYN_RECV, we no longer need req |
|---|
| 6117 | | - * so release it. |
|---|
| 6118 | | - */ |
|---|
| 6119 | 6480 | if (req) { |
|---|
| 6120 | | - inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 6121 | | - reqsk_fastopen_remove(sk, req, false); |
|---|
| 6122 | | - /* Re-arm the timer because data may have been sent out. |
|---|
| 6123 | | - * This is similar to the regular data transmission case |
|---|
| 6124 | | - * when new data has just been ack'ed. |
|---|
| 6125 | | - * |
|---|
| 6126 | | - * (TFO) - we could try to be more aggressive and |
|---|
| 6127 | | - * retransmitting any data sooner based on when they |
|---|
| 6128 | | - * are sent out. |
|---|
| 6129 | | - */ |
|---|
| 6130 | | - tcp_rearm_rto(sk); |
|---|
| 6481 | + tcp_rcv_synrecv_state_fastopen(sk); |
|---|
| 6131 | 6482 | } else { |
|---|
| 6132 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); |
|---|
| 6483 | + tcp_try_undo_spurious_syn(sk); |
|---|
| 6484 | + tp->retrans_stamp = 0; |
|---|
| 6485 | + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, |
|---|
| 6486 | + skb); |
|---|
| 6133 | 6487 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
|---|
| 6134 | 6488 | } |
|---|
| 6135 | 6489 | smp_mb(); |
|---|
| .. | .. |
|---|
| 6163 | 6517 | case TCP_FIN_WAIT1: { |
|---|
| 6164 | 6518 | int tmo; |
|---|
| 6165 | 6519 | |
|---|
| 6166 | | - /* If we enter the TCP_FIN_WAIT1 state and we are a |
|---|
| 6167 | | - * Fast Open socket and this is the first acceptable |
|---|
| 6168 | | - * ACK we have received, this would have acknowledged |
|---|
| 6169 | | - * our SYNACK so stop the SYNACK timer. |
|---|
| 6170 | | - */ |
|---|
| 6171 | | - if (req) { |
|---|
| 6172 | | - /* We no longer need the request sock. */ |
|---|
| 6173 | | - reqsk_fastopen_remove(sk, req, false); |
|---|
| 6174 | | - tcp_rearm_rto(sk); |
|---|
| 6175 | | - } |
|---|
| 6520 | + if (req) |
|---|
| 6521 | + tcp_rcv_synrecv_state_fastopen(sk); |
|---|
| 6522 | + |
|---|
| 6176 | 6523 | if (tp->snd_una != tp->write_seq) |
|---|
| 6177 | 6524 | break; |
|---|
| 6178 | 6525 | |
|---|
| 6179 | 6526 | tcp_set_state(sk, TCP_FIN_WAIT2); |
|---|
| 6180 | | - sk->sk_shutdown |= SEND_SHUTDOWN; |
|---|
| 6527 | + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); |
|---|
| 6181 | 6528 | |
|---|
| 6182 | 6529 | sk_dst_confirm(sk); |
|---|
| 6183 | 6530 | |
|---|
| .. | .. |
|---|
| 6244 | 6591 | case TCP_CLOSE_WAIT: |
|---|
| 6245 | 6592 | case TCP_CLOSING: |
|---|
| 6246 | 6593 | case TCP_LAST_ACK: |
|---|
| 6247 | | - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
|---|
| 6594 | + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
|---|
| 6595 | + if (sk_is_mptcp(sk)) |
|---|
| 6596 | + mptcp_incoming_options(sk, skb); |
|---|
| 6248 | 6597 | break; |
|---|
| 6249 | | - /* fall through */ |
|---|
| 6598 | + } |
|---|
| 6599 | + fallthrough; |
|---|
| 6250 | 6600 | case TCP_FIN_WAIT1: |
|---|
| 6251 | 6601 | case TCP_FIN_WAIT2: |
|---|
| 6252 | 6602 | /* RFC 793 says to queue data in these states, |
|---|
| .. | .. |
|---|
| 6261 | 6611 | return 1; |
|---|
| 6262 | 6612 | } |
|---|
| 6263 | 6613 | } |
|---|
| 6264 | | - /* Fall through */ |
|---|
| 6614 | + fallthrough; |
|---|
| 6265 | 6615 | case TCP_ESTABLISHED: |
|---|
| 6266 | 6616 | tcp_data_queue(sk, skb); |
|---|
| 6267 | 6617 | queued = 1; |
|---|
| .. | .. |
|---|
| 6307 | 6657 | * congestion control: Linux DCTCP asserts ECT on all packets, |
|---|
| 6308 | 6658 | * including SYN, which is most optimal solution; however, |
|---|
| 6309 | 6659 | * others, such as FreeBSD do not. |
|---|
| 6660 | + * |
|---|
| 6661 | + * Exception: At least one of the reserved bits of the TCP header (th->res1) is |
|---|
| 6662 | + * set, indicating the use of a future TCP extension (such as AccECN). See |
|---|
| 6663 | + * RFC8311 §4.3 which updates RFC3168 to allow the development of such |
|---|
| 6664 | + * extensions. |
|---|
| 6310 | 6665 | */ |
|---|
| 6311 | 6666 | static void tcp_ecn_create_request(struct request_sock *req, |
|---|
| 6312 | 6667 | const struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 6326 | 6681 | ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); |
|---|
| 6327 | 6682 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
|---|
| 6328 | 6683 | |
|---|
| 6329 | | - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
|---|
| 6684 | + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
|---|
| 6330 | 6685 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
|---|
| 6331 | 6686 | tcp_bpf_ca_needs_ecn((struct sock *)req)) |
|---|
| 6332 | 6687 | inet_rsk(req)->ecn_ok = 1; |
|---|
| .. | .. |
|---|
| 6339 | 6694 | struct inet_request_sock *ireq = inet_rsk(req); |
|---|
| 6340 | 6695 | |
|---|
| 6341 | 6696 | req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ |
|---|
| 6342 | | - req->cookie_ts = 0; |
|---|
| 6343 | 6697 | tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; |
|---|
| 6344 | 6698 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; |
|---|
| 6345 | | - tcp_rsk(req)->snt_synack = tcp_clock_us(); |
|---|
| 6699 | + tcp_rsk(req)->snt_synack = 0; |
|---|
| 6346 | 6700 | tcp_rsk(req)->last_oow_ack_time = 0; |
|---|
| 6347 | 6701 | req->mss = rx_opt->mss_clamp; |
|---|
| 6348 | 6702 | req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; |
|---|
| .. | .. |
|---|
| 6387 | 6741 | /* |
|---|
| 6388 | 6742 | * Return true if a syncookie should be sent |
|---|
| 6389 | 6743 | */ |
|---|
| 6390 | | -static bool tcp_syn_flood_action(const struct sock *sk, |
|---|
| 6391 | | - const struct sk_buff *skb, |
|---|
| 6392 | | - const char *proto) |
|---|
| 6744 | +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) |
|---|
| 6393 | 6745 | { |
|---|
| 6394 | 6746 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; |
|---|
| 6395 | 6747 | const char *msg = "Dropping request"; |
|---|
| 6396 | | - bool want_cookie = false; |
|---|
| 6397 | 6748 | struct net *net = sock_net(sk); |
|---|
| 6749 | + bool want_cookie = false; |
|---|
| 6750 | + u8 syncookies; |
|---|
| 6751 | + |
|---|
| 6752 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
|---|
| 6398 | 6753 | |
|---|
| 6399 | 6754 | #ifdef CONFIG_SYN_COOKIES |
|---|
| 6400 | | - if (net->ipv4.sysctl_tcp_syncookies) { |
|---|
| 6755 | + if (syncookies) { |
|---|
| 6401 | 6756 | msg = "Sending cookies"; |
|---|
| 6402 | 6757 | want_cookie = true; |
|---|
| 6403 | 6758 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); |
|---|
| .. | .. |
|---|
| 6405 | 6760 | #endif |
|---|
| 6406 | 6761 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
|---|
| 6407 | 6762 | |
|---|
| 6408 | | - if (!queue->synflood_warned && |
|---|
| 6409 | | - net->ipv4.sysctl_tcp_syncookies != 2 && |
|---|
| 6763 | + if (!queue->synflood_warned && syncookies != 2 && |
|---|
| 6410 | 6764 | xchg(&queue->synflood_warned, 1) == 0) |
|---|
| 6411 | 6765 | net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
|---|
| 6412 | | - proto, ntohs(tcp_hdr(skb)->dest), msg); |
|---|
| 6766 | + proto, sk->sk_num, msg); |
|---|
| 6413 | 6767 | |
|---|
| 6414 | 6768 | return want_cookie; |
|---|
| 6415 | 6769 | } |
|---|
| .. | .. |
|---|
| 6420 | 6774 | { |
|---|
| 6421 | 6775 | if (tcp_sk(sk)->save_syn) { |
|---|
| 6422 | 6776 | u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); |
|---|
| 6423 | | - u32 *copy; |
|---|
| 6777 | + struct saved_syn *saved_syn; |
|---|
| 6778 | + u32 mac_hdrlen; |
|---|
| 6779 | + void *base; |
|---|
| 6424 | 6780 | |
|---|
| 6425 | | - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); |
|---|
| 6426 | | - if (copy) { |
|---|
| 6427 | | - copy[0] = len; |
|---|
| 6428 | | - memcpy(©[1], skb_network_header(skb), len); |
|---|
| 6429 | | - req->saved_syn = copy; |
|---|
| 6781 | + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ |
|---|
| 6782 | + base = skb_mac_header(skb); |
|---|
| 6783 | + mac_hdrlen = skb_mac_header_len(skb); |
|---|
| 6784 | + len += mac_hdrlen; |
|---|
| 6785 | + } else { |
|---|
| 6786 | + base = skb_network_header(skb); |
|---|
| 6787 | + mac_hdrlen = 0; |
|---|
| 6788 | + } |
|---|
| 6789 | + |
|---|
| 6790 | + saved_syn = kmalloc(struct_size(saved_syn, data, len), |
|---|
| 6791 | + GFP_ATOMIC); |
|---|
| 6792 | + if (saved_syn) { |
|---|
| 6793 | + saved_syn->mac_hdrlen = mac_hdrlen; |
|---|
| 6794 | + saved_syn->network_hdrlen = skb_network_header_len(skb); |
|---|
| 6795 | + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); |
|---|
| 6796 | + memcpy(saved_syn->data, base, len); |
|---|
| 6797 | + req->saved_syn = saved_syn; |
|---|
| 6430 | 6798 | } |
|---|
| 6431 | 6799 | } |
|---|
| 6432 | 6800 | } |
|---|
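The rewritten tcp_reqsk_record_syn() above copies the SYN's headers into a structure whose trailing bytes are a flexible array member sized with struct_size(). The sketch below shows that allocation pattern in plain userspace C; the struct name, field names and header lengths are illustrative assumptions rather than the kernel's saved_syn layout.

```c
/* Flexible-array "record the headers" pattern, userspace sketch. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct saved_hdrs {
	uint32_t mac_hdrlen;
	uint32_t network_hdrlen;
	uint32_t tcp_hdrlen;
	uint8_t data[];		/* flexible array, like saved_syn->data */
};

static struct saved_hdrs *record_headers(const uint8_t *base, size_t len,
					 uint32_t mac, uint32_t net, uint32_t tcp)
{
	struct saved_hdrs *s = malloc(sizeof(*s) + len);

	if (!s)
		return NULL;
	s->mac_hdrlen = mac;
	s->network_hdrlen = net;
	s->tcp_hdrlen = tcp;
	memcpy(s->data, base, len);
	return s;
}

int main(void)
{
	uint8_t fake_headers[54] = { 0 };	/* eth + ip + tcp sized blob */
	struct saved_hdrs *s = record_headers(fake_headers, sizeof(fake_headers),
					      14, 20, 20);

	if (s)
		printf("saved %u+%u+%u header bytes\n",
		       s->mac_hdrlen, s->network_hdrlen, s->tcp_hdrlen);
	free(s);
	return 0;
}
```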
| 6801 | + |
|---|
| 6802 | +/* If a SYN cookie is required and supported, returns a clamped MSS value to be |
|---|
| 6803 | + * used for SYN cookie generation. |
|---|
| 6804 | + */ |
|---|
| 6805 | +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, |
|---|
| 6806 | + const struct tcp_request_sock_ops *af_ops, |
|---|
| 6807 | + struct sock *sk, struct tcphdr *th) |
|---|
| 6808 | +{ |
|---|
| 6809 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 6810 | + u16 mss; |
|---|
| 6811 | + |
|---|
| 6812 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && |
|---|
| 6813 | + !inet_csk_reqsk_queue_is_full(sk)) |
|---|
| 6814 | + return 0; |
|---|
| 6815 | + |
|---|
| 6816 | + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) |
|---|
| 6817 | + return 0; |
|---|
| 6818 | + |
|---|
| 6819 | + if (sk_acceptq_is_full(sk)) { |
|---|
| 6820 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
|---|
| 6821 | + return 0; |
|---|
| 6822 | + } |
|---|
| 6823 | + |
|---|
| 6824 | + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); |
|---|
| 6825 | + if (!mss) |
|---|
| 6826 | + mss = af_ops->mss_clamp; |
|---|
| 6827 | + |
|---|
| 6828 | + return mss; |
|---|
| 6829 | +} |
|---|
| 6830 | +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); |
|---|
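`tcp_get_syncookie_mss()` packages the preconditions for answering a SYN statelessly outside the normal `tcp_conn_request()` path: cookies must actually be called for (`sysctl_tcp_syncookies == 2` or a full request queue), `tcp_syn_flood_action()` must allow them, and the accept queue must have room; it then returns the MSS to encode in the cookie, clamped by the received MSS option or the address family's `mss_clamp`, with 0 meaning "do not generate a cookie". A hedged caller sketch; `want_stateless_cookie()` and the `my_*_ops` parameters are placeholders, not symbols from this file:

```c
#include <net/tcp.h>

/* Hypothetical wrapper: returns true when this SYN should be answered
 * with a stateless cookie, and reports the MSS value to encode in it.
 */
static bool want_stateless_cookie(struct sock *sk, struct sk_buff *skb,
				  struct request_sock_ops *my_rsk_ops,
				  const struct tcp_request_sock_ops *my_af_ops,
				  u16 *mss)
{
	*mss = tcp_get_syncookie_mss(my_rsk_ops, my_af_ops, sk, tcp_hdr(skb));
	return *mss != 0;
}
```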
| 6433 | 6831 | |
|---|
| 6434 | 6832 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
|---|
| 6435 | 6833 | const struct tcp_request_sock_ops *af_ops, |
|---|
| .. | .. |
|---|
| 6445 | 6843 | bool want_cookie = false; |
|---|
| 6446 | 6844 | struct dst_entry *dst; |
|---|
| 6447 | 6845 | struct flowi fl; |
|---|
| 6846 | + u8 syncookies; |
|---|
| 6847 | + |
|---|
| 6848 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
|---|
| 6448 | 6849 | |
|---|
| 6449 | 6850 | /* TW buckets are converted to open requests without |
|---|
| 6450 | 6851 | * limitations, they conserve resources and peer is |
|---|
| 6451 | 6852 | * evidently real one. |
|---|
| 6452 | 6853 | */ |
|---|
| 6453 | | - if ((net->ipv4.sysctl_tcp_syncookies == 2 || |
|---|
| 6454 | | - inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
|---|
| 6455 | | - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); |
|---|
| 6854 | + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
|---|
| 6855 | + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); |
|---|
| 6456 | 6856 | if (!want_cookie) |
|---|
| 6457 | 6857 | goto drop; |
|---|
| 6458 | 6858 | } |
|---|
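Both this hunk and the `tcp_syn_flood_action()` change above snapshot `sysctl_tcp_syncookies` exactly once into a local `syncookies` via `READ_ONCE()`: the sysctl can be rewritten concurrently from userspace, so a single annotated load keeps every later test in the function consistent and makes the lockless access explicit. A stripped-down sketch of the pattern; `foo_should_cookie()` is a hypothetical helper, while the sysctl field itself is real:

```c
#include <net/net_namespace.h>
#include <net/netns/ipv4.h>

/* Hypothetical: snapshot a racy sysctl once, then branch only on the
 * local copy so a concurrent write cannot make the checks disagree.
 */
static bool foo_should_cookie(struct net *net, bool queue_full)
{
	u8 syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);

	/* 2 == always send cookies; non-zero == send them under pressure. */
	return syncookies == 2 || (syncookies && queue_full);
}
```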
| .. | .. |
|---|
| 6466 | 6866 | if (!req) |
|---|
| 6467 | 6867 | goto drop; |
|---|
| 6468 | 6868 | |
|---|
| 6869 | + req->syncookie = want_cookie; |
|---|
| 6469 | 6870 | tcp_rsk(req)->af_specific = af_ops; |
|---|
| 6470 | 6871 | tcp_rsk(req)->ts_off = 0; |
|---|
| 6872 | +#if IS_ENABLED(CONFIG_MPTCP) |
|---|
| 6873 | + tcp_rsk(req)->is_mptcp = 0; |
|---|
| 6874 | +#endif |
|---|
| 6471 | 6875 | |
|---|
| 6472 | 6876 | tcp_clear_options(&tmp_opt); |
|---|
| 6473 | 6877 | tmp_opt.mss_clamp = af_ops->mss_clamp; |
|---|
| .. | .. |
|---|
| 6501 | 6905 | goto drop_and_free; |
|---|
| 6502 | 6906 | |
|---|
| 6503 | 6907 | if (!want_cookie && !isn) { |
|---|
| 6908 | + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); |
|---|
| 6909 | + |
|---|
| 6504 | 6910 | /* Kill the following clause, if you dislike this way. */ |
|---|
| 6505 | | - if (!net->ipv4.sysctl_tcp_syncookies && |
|---|
| 6506 | | - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
|---|
| 6507 | | - (net->ipv4.sysctl_max_syn_backlog >> 2)) && |
|---|
| 6911 | + if (!syncookies && |
|---|
| 6912 | + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
|---|
| 6913 | + (max_syn_backlog >> 2)) && |
|---|
| 6508 | 6914 | !tcp_peer_is_proven(req, dst)) { |
|---|
| 6509 | 6915 | /* Without syncookies last quarter of |
|---|
| 6510 | 6916 | * backlog is filled with destinations, |
|---|
| .. | .. |
|---|
| 6525 | 6931 | |
|---|
| 6526 | 6932 | if (want_cookie) { |
|---|
| 6527 | 6933 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
|---|
| 6528 | | - req->cookie_ts = tmp_opt.tstamp_ok; |
|---|
| 6529 | 6934 | if (!tmp_opt.tstamp_ok) |
|---|
| 6530 | 6935 | inet_rsk(req)->ecn_ok = 0; |
|---|
| 6531 | 6936 | } |
|---|
| 6532 | 6937 | |
|---|
| 6533 | 6938 | tcp_rsk(req)->snt_isn = isn; |
|---|
| 6534 | 6939 | tcp_rsk(req)->txhash = net_tx_rndhash(); |
|---|
| 6940 | + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
|---|
| 6535 | 6941 | tcp_openreq_init_rwin(req, sk, dst); |
|---|
| 6536 | 6942 | sk_rx_queue_set(req_to_sk(req), skb); |
|---|
| 6537 | 6943 | if (!want_cookie) { |
|---|
| .. | .. |
|---|
| 6540 | 6946 | } |
|---|
| 6541 | 6947 | if (fastopen_sk) { |
|---|
| 6542 | 6948 | af_ops->send_synack(fastopen_sk, dst, &fl, req, |
|---|
| 6543 | | - &foc, TCP_SYNACK_FASTOPEN); |
|---|
| 6949 | + &foc, TCP_SYNACK_FASTOPEN, skb); |
|---|
| 6544 | 6950 | /* Add the child socket directly into the accept queue */ |
|---|
| 6545 | 6951 | if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { |
|---|
| 6546 | 6952 | reqsk_fastopen_remove(fastopen_sk, req, false); |
|---|
| 6547 | 6953 | bh_unlock_sock(fastopen_sk); |
|---|
| 6548 | 6954 | sock_put(fastopen_sk); |
|---|
| 6549 | | - reqsk_put(req); |
|---|
| 6550 | | - goto drop; |
|---|
| 6955 | + goto drop_and_free; |
|---|
| 6551 | 6956 | } |
|---|
| 6552 | 6957 | sk->sk_data_ready(sk); |
|---|
| 6553 | 6958 | bh_unlock_sock(fastopen_sk); |
|---|
| .. | .. |
|---|
| 6559 | 6964 | tcp_timeout_init((struct sock *)req)); |
|---|
| 6560 | 6965 | af_ops->send_synack(sk, dst, &fl, req, &foc, |
|---|
| 6561 | 6966 | !want_cookie ? TCP_SYNACK_NORMAL : |
|---|
| 6562 | | - TCP_SYNACK_COOKIE); |
|---|
| 6967 | + TCP_SYNACK_COOKIE, |
|---|
| 6968 | + skb); |
|---|
| 6563 | 6969 | if (want_cookie) { |
|---|
| 6564 | 6970 | reqsk_free(req); |
|---|
| 6565 | 6971 | return 0; |
|---|
| .. | .. |
|---|
| 6571 | 6977 | drop_and_release: |
|---|
| 6572 | 6978 | dst_release(dst); |
|---|
| 6573 | 6979 | drop_and_free: |
|---|
| 6574 | | - reqsk_free(req); |
|---|
| 6980 | + __reqsk_free(req); |
|---|
| 6575 | 6981 | drop: |
|---|
| 6576 | 6982 | tcp_listendrop(sk); |
|---|
| 6577 | 6983 | return 0; |
|---|