| .. | .. |
|---|
| 77 | 77 | #include <asm/unaligned.h> |
|---|
| 78 | 78 | #include <linux/errqueue.h> |
|---|
| 79 | 79 | #include <trace/events/tcp.h> |
|---|
| 80 | | -#include <linux/static_key.h> |
|---|
| 80 | +#include <linux/jump_label_ratelimit.h> |
|---|
| 81 | 81 | #include <net/busy_poll.h> |
|---|
| 82 | +#include <net/mptcp.h> |
|---|
| 83 | +#include <trace/hooks/net.h> |
|---|
| 82 | 84 | |
|---|
| 83 | 85 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
|---|
| 84 | 86 | |
|---|
| .. | .. |
|---|
| 113 | 115 | #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ |
|---|
| 114 | 116 | |
|---|
| 115 | 117 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
|---|
| 116 | | -static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); |
|---|
| 118 | +static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); |
|---|
| 117 | 119 | |
|---|
| 118 | 120 | void clean_acked_data_enable(struct inet_connection_sock *icsk, |
|---|
| 119 | 121 | void (*cad)(struct sock *sk, u32 ack_seq)) |
|---|
| 120 | 122 | { |
|---|
| 121 | 123 | icsk->icsk_clean_acked = cad; |
|---|
| 122 | | - static_branch_inc(&clean_acked_data_enabled); |
|---|
| 124 | + static_branch_deferred_inc(&clean_acked_data_enabled); |
|---|
| 123 | 125 | } |
|---|
| 124 | 126 | EXPORT_SYMBOL_GPL(clean_acked_data_enable); |
|---|
| 125 | 127 | |
|---|
| 126 | 128 | void clean_acked_data_disable(struct inet_connection_sock *icsk) |
|---|
| 127 | 129 | { |
|---|
| 128 | | - static_branch_dec(&clean_acked_data_enabled); |
|---|
| 130 | + static_branch_slow_dec_deferred(&clean_acked_data_enabled); |
|---|
| 129 | 131 | icsk->icsk_clean_acked = NULL; |
|---|
| 130 | 132 | } |
|---|
| 131 | 133 | EXPORT_SYMBOL_GPL(clean_acked_data_disable); |
|---|
| 134 | + |
|---|
| 135 | +void clean_acked_data_flush(void) |
|---|
| 136 | +{ |
|---|
| 137 | + static_key_deferred_flush(&clean_acked_data_enabled); |
|---|
| 138 | +} |
|---|
| 139 | +EXPORT_SYMBOL_GPL(clean_acked_data_flush); |
|---|
| 140 | +#endif |
|---|
| 141 | + |
|---|
| 142 | +#ifdef CONFIG_CGROUP_BPF |
|---|
| 143 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
|---|
| 144 | +{ |
|---|
| 145 | + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && |
|---|
| 146 | + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
|---|
| 147 | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); |
|---|
| 148 | + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
|---|
| 149 | + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); |
|---|
| 150 | + struct bpf_sock_ops_kern sock_ops; |
|---|
| 151 | + |
|---|
| 152 | + if (likely(!unknown_opt && !parse_all_opt)) |
|---|
| 153 | + return; |
|---|
| 154 | + |
|---|
| 155 | + /* The skb will be handled in the |
|---|
| 156 | + * bpf_skops_established() or |
|---|
| 157 | + * bpf_skops_write_hdr_opt(). |
|---|
| 158 | + */ |
|---|
| 159 | + switch (sk->sk_state) { |
|---|
| 160 | + case TCP_SYN_RECV: |
|---|
| 161 | + case TCP_SYN_SENT: |
|---|
| 162 | + case TCP_LISTEN: |
|---|
| 163 | + return; |
|---|
| 164 | + } |
|---|
| 165 | + |
|---|
| 166 | + sock_owned_by_me(sk); |
|---|
| 167 | + |
|---|
| 168 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
|---|
| 169 | + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; |
|---|
| 170 | + sock_ops.is_fullsock = 1; |
|---|
| 171 | + sock_ops.sk = sk; |
|---|
| 172 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
|---|
| 173 | + |
|---|
| 174 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
|---|
| 175 | +} |
|---|
| 176 | + |
|---|
| 177 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
|---|
| 178 | + struct sk_buff *skb) |
|---|
| 179 | +{ |
|---|
| 180 | + struct bpf_sock_ops_kern sock_ops; |
|---|
| 181 | + |
|---|
| 182 | + sock_owned_by_me(sk); |
|---|
| 183 | + |
|---|
| 184 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
|---|
| 185 | + sock_ops.op = bpf_op; |
|---|
| 186 | + sock_ops.is_fullsock = 1; |
|---|
| 187 | + sock_ops.sk = sk; |
|---|
| 188 | + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ |
|---|
| 189 | + if (skb) |
|---|
| 190 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
|---|
| 191 | + |
|---|
| 192 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
|---|
| 193 | +} |
|---|
| 194 | +#else |
|---|
| 195 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
|---|
| 196 | +{ |
|---|
| 197 | +} |
|---|
| 198 | + |
|---|
| 199 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
|---|
| 200 | + struct sk_buff *skb) |
|---|
| 201 | +{ |
|---|
| 202 | +} |
|---|
| 132 | 203 | #endif |
|---|
| 133 | 204 | |
|---|
| 134 | 205 | static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 221 | 292 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 222 | 293 | |
|---|
| 223 | 294 | tcp_incr_quickack(sk, max_quickacks); |
|---|
| 224 | | - icsk->icsk_ack.pingpong = 0; |
|---|
| 295 | + inet_csk_exit_pingpong_mode(sk); |
|---|
| 225 | 296 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
|---|
| 226 | 297 | } |
|---|
| 227 | 298 | EXPORT_SYMBOL(tcp_enter_quickack_mode); |
|---|
| .. | .. |
|---|
| 236 | 307 | const struct dst_entry *dst = __sk_dst_get(sk); |
|---|
| 237 | 308 | |
|---|
| 238 | 309 | return (dst && dst_metric(dst, RTAX_QUICKACK)) || |
|---|
| 239 | | - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); |
|---|
| 310 | + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); |
|---|
| 240 | 311 | } |
|---|
| 241 | 312 | |
|---|
| 242 | 313 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
|---|
| .. | .. |
|---|
| 354 | 425 | sndmem *= nr_segs * per_mss; |
|---|
| 355 | 426 | |
|---|
| 356 | 427 | if (sk->sk_sndbuf < sndmem) |
|---|
| 357 | | - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); |
|---|
| 428 | + WRITE_ONCE(sk->sk_sndbuf, |
|---|
| 429 | + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); |
|---|
| 358 | 430 | } |
|---|
| 359 | 431 | |
|---|
| 360 | 432 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
|---|
| .. | .. |
|---|
| 383 | 455 | */ |
|---|
| 384 | 456 | |
|---|
| 385 | 457 | /* Slow part of check#2. */ |
|---|
| 386 | | -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) |
|---|
| 458 | +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, |
|---|
| 459 | + unsigned int skbtruesize) |
|---|
| 387 | 460 | { |
|---|
| 388 | 461 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 389 | 462 | /* Optimize this! */ |
|---|
| 390 | | - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; |
|---|
| 391 | | - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
|---|
| 463 | + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; |
|---|
| 464 | + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; |
|---|
| 392 | 465 | |
|---|
| 393 | 466 | while (tp->rcv_ssthresh <= window) { |
|---|
| 394 | 467 | if (truesize <= skb->len) |
|---|
| .. | .. |
|---|
| 400 | 473 | return 0; |
|---|
| 401 | 474 | } |
|---|
| 402 | 475 | |
|---|
| 403 | | -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) |
|---|
| 476 | +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing |
|---|
| 477 | + * can play nice with us, as sk_buff and skb->head might be either |
|---|
| 478 | + * freed or shared with up to MAX_SKB_FRAGS segments. |
|---|
| 479 | + * Only give a boost to drivers using page frag(s) to hold the frame(s), |
|---|
| 480 | + * and if no payload was pulled in skb->head before reaching us. |
|---|
| 481 | + */ |
|---|
| 482 | +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) |
|---|
| 483 | +{ |
|---|
| 484 | + u32 truesize = skb->truesize; |
|---|
| 485 | + |
|---|
| 486 | + if (adjust && !skb_headlen(skb)) { |
|---|
| 487 | + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); |
|---|
| 488 | + /* paranoid check, some drivers might be buggy */ |
|---|
| 489 | + if (unlikely((int)truesize < (int)skb->len)) |
|---|
| 490 | + truesize = skb->truesize; |
|---|
| 491 | + } |
|---|
| 492 | + return truesize; |
|---|
| 493 | +} |
|---|
| 494 | + |
|---|
| 495 | +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, |
|---|
| 496 | + bool adjust) |
|---|
| 404 | 497 | { |
|---|
| 405 | 498 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 406 | 499 | int room; |
|---|
| .. | .. |
|---|
| 409 | 502 | |
|---|
| 410 | 503 | /* Check #1 */ |
|---|
| 411 | 504 | if (room > 0 && !tcp_under_memory_pressure(sk)) { |
|---|
| 505 | + unsigned int truesize = truesize_adjust(adjust, skb); |
|---|
| 412 | 506 | int incr; |
|---|
| 413 | 507 | |
|---|
| 414 | 508 | /* Check #2. Increase window, if skb with such overhead |
|---|
| 415 | 509 | * will fit to rcvbuf in future. |
|---|
| 416 | 510 | */ |
|---|
| 417 | | - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) |
|---|
| 511 | + if (tcp_win_from_space(sk, truesize) <= skb->len) |
|---|
| 418 | 512 | incr = 2 * tp->advmss; |
|---|
| 419 | 513 | else |
|---|
| 420 | | - incr = __tcp_grow_window(sk, skb); |
|---|
| 514 | + incr = __tcp_grow_window(sk, skb, truesize); |
|---|
| 421 | 515 | |
|---|
| 422 | 516 | if (incr) { |
|---|
| 423 | 517 | incr = max_t(int, incr, 2 * skb->len); |
|---|
| .. | .. |
|---|
| 430 | 524 | /* 3. Try to fixup all. It is made immediately after connection enters |
|---|
| 431 | 525 | * established state. |
|---|
| 432 | 526 | */ |
|---|
| 433 | | -void tcp_init_buffer_space(struct sock *sk) |
|---|
| 527 | +static void tcp_init_buffer_space(struct sock *sk) |
|---|
| 434 | 528 | { |
|---|
| 435 | | - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; |
|---|
| 529 | + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); |
|---|
| 436 | 530 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 437 | 531 | int maxwin; |
|---|
| 438 | 532 | |
|---|
| .. | .. |
|---|
| 472 | 566 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 473 | 567 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 474 | 568 | struct net *net = sock_net(sk); |
|---|
| 569 | + int rmem2; |
|---|
| 475 | 570 | |
|---|
| 476 | 571 | icsk->icsk_ack.quick = 0; |
|---|
| 572 | + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 477 | 573 | |
|---|
| 478 | | - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && |
|---|
| 574 | + if (sk->sk_rcvbuf < rmem2 && |
|---|
| 479 | 575 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
|---|
| 480 | 576 | !tcp_under_memory_pressure(sk) && |
|---|
| 481 | 577 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { |
|---|
| 482 | | - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
|---|
| 483 | | - net->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 578 | + WRITE_ONCE(sk->sk_rcvbuf, |
|---|
| 579 | + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); |
|---|
| 484 | 580 | } |
|---|
| 485 | 581 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
|---|
| 486 | 582 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); |
|---|
| .. | .. |
|---|
| 510 | 606 | * |
|---|
| 511 | 607 | * The algorithm for RTT estimation w/o timestamps is based on |
|---|
| 512 | 608 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. |
|---|
| 513 | | - * <http://public.lanl.gov/radiant/pubs.html#DRS> |
|---|
| 609 | + * <https://public.lanl.gov/radiant/pubs.html#DRS> |
|---|
| 514 | 610 | * |
|---|
| 515 | 611 | * More detail on this code can be found at |
|---|
| 516 | 612 | * <http://staff.psc.edu/jheffner/>, |
|---|
| .. | .. |
|---|
| 621 | 717 | * <prev RTT . ><current RTT .. ><next RTT .... > |
|---|
| 622 | 718 | */ |
|---|
| 623 | 719 | |
|---|
| 624 | | - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && |
|---|
| 720 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && |
|---|
| 625 | 721 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { |
|---|
| 626 | 722 | int rcvmem, rcvbuf; |
|---|
| 627 | 723 | u64 rcvwin, grow; |
|---|
| .. | .. |
|---|
| 642 | 738 | |
|---|
| 643 | 739 | do_div(rcvwin, tp->advmss); |
|---|
| 644 | 740 | rcvbuf = min_t(u64, rcvwin * rcvmem, |
|---|
| 645 | | - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
|---|
| 741 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); |
|---|
| 646 | 742 | if (rcvbuf > sk->sk_rcvbuf) { |
|---|
| 647 | | - sk->sk_rcvbuf = rcvbuf; |
|---|
| 743 | + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); |
|---|
| 648 | 744 | |
|---|
| 649 | 745 | /* Make the window clamp follow along. */ |
|---|
| 650 | 746 | tp->window_clamp = tcp_win_from_space(sk, rcvbuf); |
|---|
| .. | .. |
|---|
| 710 | 806 | tcp_ecn_check_ce(sk, skb); |
|---|
| 711 | 807 | |
|---|
| 712 | 808 | if (skb->len >= 128) |
|---|
| 713 | | - tcp_grow_window(sk, skb); |
|---|
| 809 | + tcp_grow_window(sk, skb, true); |
|---|
| 714 | 810 | } |
|---|
| 715 | 811 | |
|---|
| 716 | 812 | /* Called to compute a smoothed rtt estimate. The data fed to this |
|---|
| .. | .. |
|---|
| 774 | 870 | tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
|---|
| 775 | 871 | tp->rtt_seq = tp->snd_nxt; |
|---|
| 776 | 872 | tp->mdev_max_us = tcp_rto_min_us(sk); |
|---|
| 873 | + |
|---|
| 874 | + tcp_bpf_rtt(sk); |
|---|
| 777 | 875 | } |
|---|
| 778 | 876 | } else { |
|---|
| 779 | 877 | /* no previous measure. */ |
|---|
| .. | .. |
|---|
| 782 | 880 | tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); |
|---|
| 783 | 881 | tp->mdev_max_us = tp->rttvar_us; |
|---|
| 784 | 882 | tp->rtt_seq = tp->snd_nxt; |
|---|
| 883 | + |
|---|
| 884 | + tcp_bpf_rtt(sk); |
|---|
| 785 | 885 | } |
|---|
| 786 | 886 | tp->srtt_us = max(1U, srtt); |
|---|
| 787 | 887 | } |
|---|
| .. | .. |
|---|
| 859 | 959 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
|---|
| 860 | 960 | } |
|---|
| 861 | 961 | |
|---|
| 862 | | -/* Take a notice that peer is sending D-SACKs */ |
|---|
| 863 | | -static void tcp_dsack_seen(struct tcp_sock *tp) |
|---|
| 962 | +struct tcp_sacktag_state { |
|---|
| 963 | + /* Timestamps for earliest and latest never-retransmitted segment |
|---|
| 964 | + * that was SACKed. RTO needs the earliest RTT to stay conservative, |
|---|
| 965 | + * but congestion control should still get an accurate delay signal. |
|---|
| 966 | + */ |
|---|
| 967 | + u64 first_sackt; |
|---|
| 968 | + u64 last_sackt; |
|---|
| 969 | + u32 reord; |
|---|
| 970 | + u32 sack_delivered; |
|---|
| 971 | + int flag; |
|---|
| 972 | + unsigned int mss_now; |
|---|
| 973 | + struct rate_sample *rate; |
|---|
| 974 | +}; |
|---|
| 975 | + |
|---|
| 976 | +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery |
|---|
| 977 | + * and spurious retransmission information if this DSACK is unlikely caused by |
|---|
| 978 | + * sender's action: |
|---|
| 979 | + * - DSACKed sequence range is larger than maximum receiver's window. |
|---|
| 980 | + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. |
|---|
| 981 | + */ |
|---|
| 982 | +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, |
|---|
| 983 | + u32 end_seq, struct tcp_sacktag_state *state) |
|---|
| 864 | 984 | { |
|---|
| 985 | + u32 seq_len, dup_segs = 1; |
|---|
| 986 | + |
|---|
| 987 | + if (!before(start_seq, end_seq)) |
|---|
| 988 | + return 0; |
|---|
| 989 | + |
|---|
| 990 | + seq_len = end_seq - start_seq; |
|---|
| 991 | + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ |
|---|
| 992 | + if (seq_len > tp->max_window) |
|---|
| 993 | + return 0; |
|---|
| 994 | + if (seq_len > tp->mss_cache) |
|---|
| 995 | + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); |
|---|
| 996 | + |
|---|
| 997 | + tp->dsack_dups += dup_segs; |
|---|
| 998 | + /* Skip the DSACK if dup segs weren't retransmitted by sender */ |
|---|
| 999 | + if (tp->dsack_dups > tp->total_retrans) |
|---|
| 1000 | + return 0; |
|---|
| 1001 | + |
|---|
| 865 | 1002 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
|---|
| 866 | 1003 | tp->rack.dsack_seen = 1; |
|---|
| 867 | | - tp->dsack_dups++; |
|---|
| 1004 | + |
|---|
| 1005 | + state->flag |= FLAG_DSACKING_ACK; |
|---|
| 1006 | + /* A spurious retransmission is delivered */ |
|---|
| 1007 | + state->sack_delivered += dup_segs; |
|---|
| 1008 | + |
|---|
| 1009 | + return dup_segs; |
|---|
| 868 | 1010 | } |
|---|
| 869 | 1011 | |
|---|
| 870 | 1012 | /* It's reordering when higher sequence was delivered (i.e. sacked) before |
|---|
| .. | .. |
|---|
| 893 | 1035 | tp->undo_marker ? tp->undo_retrans : 0); |
|---|
| 894 | 1036 | #endif |
|---|
| 895 | 1037 | tp->reordering = min_t(u32, (metric + mss - 1) / mss, |
|---|
| 896 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
|---|
| 1038 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
|---|
| 897 | 1039 | } |
|---|
| 898 | 1040 | |
|---|
| 899 | 1041 | /* This exciting event is worth to be remembered. 8) */ |
|---|
| .. | .. |
|---|
| 902 | 1044 | ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); |
|---|
| 903 | 1045 | } |
|---|
| 904 | 1046 | |
|---|
| 905 | | -/* This must be called before lost_out is incremented */ |
|---|
| 1047 | + /* This must be called before lost_out or retrans_out are updated |
|---|
| 1048 | + * on a new loss, because we want to know if all skbs previously |
|---|
| 1049 | + * known to be lost have already been retransmitted, indicating |
|---|
| 1050 | + * that this newly lost skb is our next skb to retransmit. |
|---|
| 1051 | + */ |
|---|
| 906 | 1052 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 907 | 1053 | { |
|---|
| 908 | 1054 | if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || |
|---|
| .. | .. |
|---|
| 912 | 1058 | tp->retransmit_skb_hint = skb; |
|---|
| 913 | 1059 | } |
|---|
| 914 | 1060 | |
|---|
| 915 | | -/* Sum the number of packets on the wire we have marked as lost. |
|---|
| 916 | | - * There are two cases we care about here: |
|---|
| 917 | | - * a) Packet hasn't been marked lost (nor retransmitted), |
|---|
| 918 | | - * and this is the first loss. |
|---|
| 919 | | - * b) Packet has been marked both lost and retransmitted, |
|---|
| 920 | | - * and this means we think it was lost again. |
|---|
| 1061 | +/* Sum the number of packets on the wire we have marked as lost, and |
|---|
| 1062 | + * notify the congestion control module that the given skb was marked lost. |
|---|
| 921 | 1063 | */ |
|---|
| 922 | | -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 1064 | +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) |
|---|
| 1065 | +{ |
|---|
| 1066 | + tp->lost += tcp_skb_pcount(skb); |
|---|
| 1067 | +} |
|---|
| 1068 | + |
|---|
| 1069 | +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) |
|---|
| 923 | 1070 | { |
|---|
| 924 | 1071 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
|---|
| 1072 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 925 | 1073 | |
|---|
| 926 | | - if (!(sacked & TCPCB_LOST) || |
|---|
| 927 | | - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) |
|---|
| 928 | | - tp->lost += tcp_skb_pcount(skb); |
|---|
| 929 | | -} |
|---|
| 1074 | + if (sacked & TCPCB_SACKED_ACKED) |
|---|
| 1075 | + return; |
|---|
| 930 | 1076 | |
|---|
| 931 | | -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 932 | | -{ |
|---|
| 933 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
|---|
| 934 | | - tcp_verify_retransmit_hint(tp, skb); |
|---|
| 935 | | - |
|---|
| 936 | | - tp->lost_out += tcp_skb_pcount(skb); |
|---|
| 937 | | - tcp_sum_lost(tp, skb); |
|---|
| 938 | | - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
|---|
| 939 | | - } |
|---|
| 940 | | -} |
|---|
| 941 | | - |
|---|
| 942 | | -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) |
|---|
| 943 | | -{ |
|---|
| 944 | 1077 | tcp_verify_retransmit_hint(tp, skb); |
|---|
| 945 | | - |
|---|
| 946 | | - tcp_sum_lost(tp, skb); |
|---|
| 947 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
|---|
| 1078 | + if (sacked & TCPCB_LOST) { |
|---|
| 1079 | + if (sacked & TCPCB_SACKED_RETRANS) { |
|---|
| 1080 | + /* Account for retransmits that are lost again */ |
|---|
| 1081 | + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
|---|
| 1082 | + tp->retrans_out -= tcp_skb_pcount(skb); |
|---|
| 1083 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, |
|---|
| 1084 | + tcp_skb_pcount(skb)); |
|---|
| 1085 | + tcp_notify_skb_loss_event(tp, skb); |
|---|
| 1086 | + } |
|---|
| 1087 | + } else { |
|---|
| 948 | 1088 | tp->lost_out += tcp_skb_pcount(skb); |
|---|
| 949 | 1089 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
|---|
| 1090 | + tcp_notify_skb_loss_event(tp, skb); |
|---|
| 950 | 1091 | } |
|---|
| 1092 | +} |
|---|
| 1093 | + |
|---|
| 1094 | +/* Updates the delivered and delivered_ce counts */ |
|---|
| 1095 | +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, |
|---|
| 1096 | + bool ece_ack) |
|---|
| 1097 | +{ |
|---|
| 1098 | + tp->delivered += delivered; |
|---|
| 1099 | + if (ece_ack) |
|---|
| 1100 | + tp->delivered_ce += delivered; |
|---|
| 951 | 1101 | } |
|---|
| 952 | 1102 | |
|---|
| 953 | 1103 | /* This procedure tags the retransmission queue when SACKs arrive. |
|---|
| .. | .. |
|---|
| 1082 | 1232 | |
|---|
| 1083 | 1233 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, |
|---|
| 1084 | 1234 | struct tcp_sack_block_wire *sp, int num_sacks, |
|---|
| 1085 | | - u32 prior_snd_una) |
|---|
| 1235 | + u32 prior_snd_una, struct tcp_sacktag_state *state) |
|---|
| 1086 | 1236 | { |
|---|
| 1087 | 1237 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1088 | 1238 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
|---|
| 1089 | 1239 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
|---|
| 1090 | | - bool dup_sack = false; |
|---|
| 1240 | + u32 dup_segs; |
|---|
| 1091 | 1241 | |
|---|
| 1092 | 1242 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
|---|
| 1093 | | - dup_sack = true; |
|---|
| 1094 | | - tcp_dsack_seen(tp); |
|---|
| 1095 | 1243 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
|---|
| 1096 | 1244 | } else if (num_sacks > 1) { |
|---|
| 1097 | 1245 | u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); |
|---|
| 1098 | 1246 | u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); |
|---|
| 1099 | 1247 | |
|---|
| 1100 | | - if (!after(end_seq_0, end_seq_1) && |
|---|
| 1101 | | - !before(start_seq_0, start_seq_1)) { |
|---|
| 1102 | | - dup_sack = true; |
|---|
| 1103 | | - tcp_dsack_seen(tp); |
|---|
| 1104 | | - NET_INC_STATS(sock_net(sk), |
|---|
| 1105 | | - LINUX_MIB_TCPDSACKOFORECV); |
|---|
| 1106 | | - } |
|---|
| 1248 | + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) |
|---|
| 1249 | + return false; |
|---|
| 1250 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); |
|---|
| 1251 | + } else { |
|---|
| 1252 | + return false; |
|---|
| 1107 | 1253 | } |
|---|
| 1108 | 1254 | |
|---|
| 1255 | + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); |
|---|
| 1256 | + if (!dup_segs) { /* Skip dubious DSACK */ |
|---|
| 1257 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); |
|---|
| 1258 | + return false; |
|---|
| 1259 | + } |
|---|
| 1260 | + |
|---|
| 1261 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); |
|---|
| 1262 | + |
|---|
| 1109 | 1263 | /* D-SACK for already forgotten data... Do dumb counting. */ |
|---|
| 1110 | | - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && |
|---|
| 1264 | + if (tp->undo_marker && tp->undo_retrans > 0 && |
|---|
| 1111 | 1265 | !after(end_seq_0, prior_snd_una) && |
|---|
| 1112 | 1266 | after(end_seq_0, tp->undo_marker)) |
|---|
| 1113 | | - tp->undo_retrans--; |
|---|
| 1267 | + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); |
|---|
| 1114 | 1268 | |
|---|
| 1115 | | - return dup_sack; |
|---|
| 1269 | + return true; |
|---|
| 1116 | 1270 | } |
|---|
| 1117 | | - |
|---|
| 1118 | | -struct tcp_sacktag_state { |
|---|
| 1119 | | - u32 reord; |
|---|
| 1120 | | - /* Timestamps for earliest and latest never-retransmitted segment |
|---|
| 1121 | | - * that was SACKed. RTO needs the earliest RTT to stay conservative, |
|---|
| 1122 | | - * but congestion control should still get an accurate delay signal. |
|---|
| 1123 | | - */ |
|---|
| 1124 | | - u64 first_sackt; |
|---|
| 1125 | | - u64 last_sackt; |
|---|
| 1126 | | - struct rate_sample *rate; |
|---|
| 1127 | | - int flag; |
|---|
| 1128 | | - unsigned int mss_now; |
|---|
| 1129 | | -}; |
|---|
| 1130 | 1271 | |
|---|
| 1131 | 1272 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
|---|
| 1132 | 1273 | * the incoming SACK may not exactly match but we can find smaller MSS |
|---|
| .. | .. |
|---|
| 1246 | 1387 | sacked |= TCPCB_SACKED_ACKED; |
|---|
| 1247 | 1388 | state->flag |= FLAG_DATA_SACKED; |
|---|
| 1248 | 1389 | tp->sacked_out += pcount; |
|---|
| 1249 | | - tp->delivered += pcount; /* Out-of-order packets delivered */ |
|---|
| 1390 | + /* Out-of-order packets delivered */ |
|---|
| 1391 | + state->sack_delivered += pcount; |
|---|
| 1250 | 1392 | |
|---|
| 1251 | 1393 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
|---|
| 1252 | 1394 | if (tp->lost_skb_hint && |
|---|
| .. | .. |
|---|
| 1289 | 1431 | */ |
|---|
| 1290 | 1432 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
|---|
| 1291 | 1433 | start_seq, end_seq, dup_sack, pcount, |
|---|
| 1292 | | - skb->skb_mstamp); |
|---|
| 1434 | + tcp_skb_timestamp_us(skb)); |
|---|
| 1293 | 1435 | tcp_rate_skb_delivered(sk, skb, state->rate); |
|---|
| 1294 | 1436 | |
|---|
| 1295 | 1437 | if (skb == tp->lost_skb_hint) |
|---|
| .. | .. |
|---|
| 1413 | 1555 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
|---|
| 1414 | 1556 | goto fallback; |
|---|
| 1415 | 1557 | |
|---|
| 1416 | | - if (!tcp_skb_can_collapse_to(prev)) |
|---|
| 1558 | + if (!tcp_skb_can_collapse(prev, skb)) |
|---|
| 1417 | 1559 | goto fallback; |
|---|
| 1418 | 1560 | |
|---|
| 1419 | 1561 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
|---|
| .. | .. |
|---|
| 1502 | 1644 | (mss != tcp_skb_seglen(skb))) |
|---|
| 1503 | 1645 | goto out; |
|---|
| 1504 | 1646 | |
|---|
| 1647 | + if (!tcp_skb_can_collapse(prev, skb)) |
|---|
| 1648 | + goto out; |
|---|
| 1505 | 1649 | len = skb->len; |
|---|
| 1506 | 1650 | pcount = tcp_skb_pcount(skb); |
|---|
| 1507 | 1651 | if (tcp_skb_shift(prev, skb, pcount, len)) |
|---|
| .. | .. |
|---|
| 1578 | 1722 | TCP_SKB_CB(skb)->end_seq, |
|---|
| 1579 | 1723 | dup_sack, |
|---|
| 1580 | 1724 | tcp_skb_pcount(skb), |
|---|
| 1581 | | - skb->skb_mstamp); |
|---|
| 1725 | + tcp_skb_timestamp_us(skb)); |
|---|
| 1582 | 1726 | tcp_rate_skb_delivered(sk, skb, state->rate); |
|---|
| 1583 | 1727 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 1584 | 1728 | list_del_init(&skb->tcp_tsorted_anchor); |
|---|
| .. | .. |
|---|
| 1591 | 1735 | return skb; |
|---|
| 1592 | 1736 | } |
|---|
| 1593 | 1737 | |
|---|
| 1594 | | -static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, |
|---|
| 1595 | | - struct tcp_sacktag_state *state, |
|---|
| 1596 | | - u32 seq) |
|---|
| 1738 | +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) |
|---|
| 1597 | 1739 | { |
|---|
| 1598 | 1740 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; |
|---|
| 1599 | 1741 | struct sk_buff *skb; |
|---|
| .. | .. |
|---|
| 1615 | 1757 | } |
|---|
| 1616 | 1758 | |
|---|
| 1617 | 1759 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
|---|
| 1618 | | - struct tcp_sacktag_state *state, |
|---|
| 1619 | 1760 | u32 skip_to_seq) |
|---|
| 1620 | 1761 | { |
|---|
| 1621 | 1762 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
|---|
| 1622 | 1763 | return skb; |
|---|
| 1623 | 1764 | |
|---|
| 1624 | | - return tcp_sacktag_bsearch(sk, state, skip_to_seq); |
|---|
| 1765 | + return tcp_sacktag_bsearch(sk, skip_to_seq); |
|---|
| 1625 | 1766 | } |
|---|
| 1626 | 1767 | |
|---|
| 1627 | 1768 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 1634 | 1775 | return skb; |
|---|
| 1635 | 1776 | |
|---|
| 1636 | 1777 | if (before(next_dup->start_seq, skip_to_seq)) { |
|---|
| 1637 | | - skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); |
|---|
| 1778 | + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); |
|---|
| 1638 | 1779 | skb = tcp_sacktag_walk(skb, sk, NULL, state, |
|---|
| 1639 | 1780 | next_dup->start_seq, next_dup->end_seq, |
|---|
| 1640 | 1781 | 1); |
|---|
| .. | .. |
|---|
| 1672 | 1813 | tcp_highest_sack_reset(sk); |
|---|
| 1673 | 1814 | |
|---|
| 1674 | 1815 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
|---|
| 1675 | | - num_sacks, prior_snd_una); |
|---|
| 1676 | | - if (found_dup_sack) { |
|---|
| 1677 | | - state->flag |= FLAG_DSACKING_ACK; |
|---|
| 1678 | | - tp->delivered++; /* A spurious retransmission is delivered */ |
|---|
| 1679 | | - } |
|---|
| 1816 | + num_sacks, prior_snd_una, state); |
|---|
| 1680 | 1817 | |
|---|
| 1681 | 1818 | /* Eliminate too old ACKs, but take into |
|---|
| 1682 | 1819 | * account more or less fresh ones, they can |
|---|
| .. | .. |
|---|
| 1778 | 1915 | |
|---|
| 1779 | 1916 | /* Head todo? */ |
|---|
| 1780 | 1917 | if (before(start_seq, cache->start_seq)) { |
|---|
| 1781 | | - skb = tcp_sacktag_skip(skb, sk, state, |
|---|
| 1782 | | - start_seq); |
|---|
| 1918 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
|---|
| 1783 | 1919 | skb = tcp_sacktag_walk(skb, sk, next_dup, |
|---|
| 1784 | 1920 | state, |
|---|
| 1785 | 1921 | start_seq, |
|---|
| .. | .. |
|---|
| 1805 | 1941 | goto walk; |
|---|
| 1806 | 1942 | } |
|---|
| 1807 | 1943 | |
|---|
| 1808 | | - skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); |
|---|
| 1944 | + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
|---|
| 1809 | 1945 | /* Check overlap against next cached too (past this one already) */ |
|---|
| 1810 | 1946 | cache++; |
|---|
| 1811 | 1947 | continue; |
|---|
| .. | .. |
|---|
| 1816 | 1952 | if (!skb) |
|---|
| 1817 | 1953 | break; |
|---|
| 1818 | 1954 | } |
|---|
| 1819 | | - skb = tcp_sacktag_skip(skb, sk, state, start_seq); |
|---|
| 1955 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
|---|
| 1820 | 1956 | |
|---|
| 1821 | 1957 | walk: |
|---|
| 1822 | 1958 | skb = tcp_sacktag_walk(skb, sk, next_dup, state, |
|---|
| .. | .. |
|---|
| 1878 | 2014 | return; |
|---|
| 1879 | 2015 | |
|---|
| 1880 | 2016 | tp->reordering = min_t(u32, tp->packets_out + addend, |
|---|
| 1881 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
|---|
| 2017 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
|---|
| 1882 | 2018 | tp->reord_seen++; |
|---|
| 1883 | 2019 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); |
|---|
| 1884 | 2020 | } |
|---|
| 1885 | 2021 | |
|---|
| 1886 | 2022 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
|---|
| 1887 | 2023 | |
|---|
| 1888 | | -static void tcp_add_reno_sack(struct sock *sk) |
|---|
| 2024 | +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) |
|---|
| 1889 | 2025 | { |
|---|
| 1890 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1891 | | - u32 prior_sacked = tp->sacked_out; |
|---|
| 2026 | + if (num_dupack) { |
|---|
| 2027 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2028 | + u32 prior_sacked = tp->sacked_out; |
|---|
| 2029 | + s32 delivered; |
|---|
| 1892 | 2030 | |
|---|
| 1893 | | - tp->sacked_out++; |
|---|
| 1894 | | - tcp_check_reno_reordering(sk, 0); |
|---|
| 1895 | | - if (tp->sacked_out > prior_sacked) |
|---|
| 1896 | | - tp->delivered++; /* Some out-of-order packet is delivered */ |
|---|
| 1897 | | - tcp_verify_left_out(tp); |
|---|
| 2031 | + tp->sacked_out += num_dupack; |
|---|
| 2032 | + tcp_check_reno_reordering(sk, 0); |
|---|
| 2033 | + delivered = tp->sacked_out - prior_sacked; |
|---|
| 2034 | + if (delivered > 0) |
|---|
| 2035 | + tcp_count_delivered(tp, delivered, ece_ack); |
|---|
| 2036 | + tcp_verify_left_out(tp); |
|---|
| 2037 | + } |
|---|
| 1898 | 2038 | } |
|---|
| 1899 | 2039 | |
|---|
| 1900 | 2040 | /* Account for ACK, ACKing some data in Reno Recovery phase. */ |
|---|
| 1901 | 2041 | |
|---|
| 1902 | | -static void tcp_remove_reno_sacks(struct sock *sk, int acked) |
|---|
| 2042 | +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) |
|---|
| 1903 | 2043 | { |
|---|
| 1904 | 2044 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1905 | 2045 | |
|---|
| 1906 | 2046 | if (acked > 0) { |
|---|
| 1907 | 2047 | /* One ACK acked hole. The rest eat duplicate ACKs. */ |
|---|
| 1908 | | - tp->delivered += max_t(int, acked - tp->sacked_out, 1); |
|---|
| 2048 | + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), |
|---|
| 2049 | + ece_ack); |
|---|
| 1909 | 2050 | if (acked - 1 >= tp->sacked_out) |
|---|
| 1910 | 2051 | tp->sacked_out = 0; |
|---|
| 1911 | 2052 | else |
|---|
| .. | .. |
|---|
| 1938 | 2079 | |
|---|
| 1939 | 2080 | static bool tcp_is_rack(const struct sock *sk) |
|---|
| 1940 | 2081 | { |
|---|
| 1941 | | - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; |
|---|
| 2082 | + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & |
|---|
| 2083 | + TCP_RACK_LOSS_DETECTION; |
|---|
| 1942 | 2084 | } |
|---|
| 1943 | 2085 | |
|---|
| 1944 | 2086 | /* If we detect SACK reneging, forget all SACK information |
|---|
| .. | .. |
|---|
| 1982 | 2124 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 1983 | 2125 | struct net *net = sock_net(sk); |
|---|
| 1984 | 2126 | bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; |
|---|
| 2127 | + u8 reordering; |
|---|
| 1985 | 2128 | |
|---|
| 1986 | 2129 | tcp_timeout_mark_lost(sk); |
|---|
| 1987 | 2130 | |
|---|
| .. | .. |
|---|
| 2002 | 2145 | /* Timeout in disordered state after receiving substantial DUPACKs |
|---|
| 2003 | 2146 | * suggests that the degree of reordering is over-estimated. |
|---|
| 2004 | 2147 | */ |
|---|
| 2148 | + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); |
|---|
| 2005 | 2149 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && |
|---|
| 2006 | | - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) |
|---|
| 2150 | + tp->sacked_out >= reordering) |
|---|
| 2007 | 2151 | tp->reordering = min_t(unsigned int, tp->reordering, |
|---|
| 2008 | | - net->ipv4.sysctl_tcp_reordering); |
|---|
| 2152 | + reordering); |
|---|
| 2153 | + |
|---|
| 2009 | 2154 | tcp_set_ca_state(sk, TCP_CA_Loss); |
|---|
| 2010 | 2155 | tp->high_seq = tp->snd_nxt; |
|---|
| 2011 | 2156 | tcp_ecn_queue_cwr(tp); |
|---|
| .. | .. |
|---|
| 2014 | 2159 | * loss recovery is underway except recurring timeout(s) on |
|---|
| 2015 | 2160 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing |
|---|
| 2016 | 2161 | */ |
|---|
| 2017 | | - tp->frto = net->ipv4.sysctl_tcp_frto && |
|---|
| 2162 | + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && |
|---|
| 2018 | 2163 | (new_recovery || icsk->icsk_retransmits) && |
|---|
| 2019 | 2164 | !inet_csk(sk)->icsk_mtup.probe_size; |
|---|
| 2020 | 2165 | } |
|---|
| .. | .. |
|---|
| 2031 | 2176 | */ |
|---|
| 2032 | 2177 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) |
|---|
| 2033 | 2178 | { |
|---|
| 2034 | | - if (flag & FLAG_SACK_RENEGING) { |
|---|
| 2179 | + if (flag & FLAG_SACK_RENEGING && |
|---|
| 2180 | + flag & FLAG_SND_UNA_ADVANCED) { |
|---|
| 2035 | 2181 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2036 | 2182 | unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), |
|---|
| 2037 | 2183 | msecs_to_jiffies(10)); |
|---|
| .. | .. |
|---|
| 2172 | 2318 | } |
|---|
| 2173 | 2319 | |
|---|
| 2174 | 2320 | /* Detect loss in event "A" above by marking head of queue up as lost. |
|---|
| 2175 | | - * For non-SACK(Reno) senders, the first "packets" number of segments |
|---|
| 2176 | | - * are considered lost. For RFC3517 SACK, a segment is considered lost if it |
|---|
| 2321 | + * For RFC3517 SACK, a segment is considered lost if it |
|---|
| 2177 | 2322 | * has at least tp->reordering SACKed segments above it; "packets" refers to |
|---|
| 2178 | 2323 | * the maximum SACKed segments to pass before reaching this limit. |
|---|
| 2179 | 2324 | */ |
|---|
| .. | .. |
|---|
| 2181 | 2326 | { |
|---|
| 2182 | 2327 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2183 | 2328 | struct sk_buff *skb; |
|---|
| 2184 | | - int cnt, oldcnt, lost; |
|---|
| 2185 | | - unsigned int mss; |
|---|
| 2329 | + int cnt; |
|---|
| 2186 | 2330 | /* Use SACK to deduce losses of new sequences sent during recovery */ |
|---|
| 2187 | | - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; |
|---|
| 2331 | + const u32 loss_high = tp->snd_nxt; |
|---|
| 2188 | 2332 | |
|---|
| 2189 | 2333 | WARN_ON(packets > tp->packets_out); |
|---|
| 2190 | 2334 | skb = tp->lost_skb_hint; |
|---|
| .. | .. |
|---|
| 2207 | 2351 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) |
|---|
| 2208 | 2352 | break; |
|---|
| 2209 | 2353 | |
|---|
| 2210 | | - oldcnt = cnt; |
|---|
| 2211 | | - if (tcp_is_reno(tp) || |
|---|
| 2212 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
|---|
| 2354 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 2213 | 2355 | cnt += tcp_skb_pcount(skb); |
|---|
| 2214 | 2356 | |
|---|
| 2215 | | - if (cnt > packets) { |
|---|
| 2216 | | - if (tcp_is_sack(tp) || |
|---|
| 2217 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
|---|
| 2218 | | - (oldcnt >= packets)) |
|---|
| 2219 | | - break; |
|---|
| 2357 | + if (cnt > packets) |
|---|
| 2358 | + break; |
|---|
| 2220 | 2359 | |
|---|
| 2221 | | - mss = tcp_skb_mss(skb); |
|---|
| 2222 | | - /* If needed, chop off the prefix to mark as lost. */ |
|---|
| 2223 | | - lost = (packets - oldcnt) * mss; |
|---|
| 2224 | | - if (lost < skb->len && |
|---|
| 2225 | | - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
|---|
| 2226 | | - lost, mss, GFP_ATOMIC) < 0) |
|---|
| 2227 | | - break; |
|---|
| 2228 | | - cnt = packets; |
|---|
| 2229 | | - } |
|---|
| 2230 | | - |
|---|
| 2231 | | - tcp_skb_mark_lost(tp, skb); |
|---|
| 2360 | + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) |
|---|
| 2361 | + tcp_mark_skb_lost(sk, skb); |
|---|
| 2232 | 2362 | |
|---|
| 2233 | 2363 | if (mark_head) |
|---|
| 2234 | 2364 | break; |
|---|
| .. | .. |
|---|
| 2272 | 2402 | */ |
|---|
| 2273 | 2403 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
|---|
| 2274 | 2404 | { |
|---|
| 2275 | | - return !tp->retrans_stamp || |
|---|
| 2405 | + return tp->retrans_stamp && |
|---|
| 2276 | 2406 | tcp_tsopt_ecr_before(tp, tp->retrans_stamp); |
|---|
| 2277 | 2407 | } |
|---|
| 2278 | 2408 | |
|---|
| .. | .. |
|---|
| 2368 | 2498 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
|---|
| 2369 | 2499 | } |
|---|
| 2370 | 2500 | |
|---|
| 2501 | +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) |
|---|
| 2502 | +{ |
|---|
| 2503 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2504 | + |
|---|
| 2505 | + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
|---|
| 2506 | + /* Hold old state until something *above* high_seq |
|---|
| 2507 | + * is ACKed. For Reno it is MUST to prevent false |
|---|
| 2508 | + * fast retransmits (RFC2582). SACK TCP is safe. */ |
|---|
| 2509 | + if (!tcp_any_retrans_done(sk)) |
|---|
| 2510 | + tp->retrans_stamp = 0; |
|---|
| 2511 | + return true; |
|---|
| 2512 | + } |
|---|
| 2513 | + return false; |
|---|
| 2514 | +} |
|---|
| 2515 | + |
|---|
| 2371 | 2516 | /* People celebrate: "We love our President!" */ |
|---|
| 2372 | 2517 | static bool tcp_try_undo_recovery(struct sock *sk) |
|---|
| 2373 | 2518 | { |
|---|
| .. | .. |
|---|
| 2390 | 2535 | } else if (tp->rack.reo_wnd_persist) { |
|---|
| 2391 | 2536 | tp->rack.reo_wnd_persist--; |
|---|
| 2392 | 2537 | } |
|---|
| 2393 | | - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
|---|
| 2394 | | - /* Hold old state until something *above* high_seq |
|---|
| 2395 | | - * is ACKed. For Reno it is MUST to prevent false |
|---|
| 2396 | | - * fast retransmits (RFC2582). SACK TCP is safe. */ |
|---|
| 2397 | | - if (!tcp_any_retrans_done(sk)) |
|---|
| 2398 | | - tp->retrans_stamp = 0; |
|---|
| 2538 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
|---|
| 2399 | 2539 | return true; |
|---|
| 2400 | | - } |
|---|
| 2401 | 2540 | tcp_set_ca_state(sk, TCP_CA_Open); |
|---|
| 2402 | 2541 | tp->is_sack_reneg = 0; |
|---|
| 2403 | 2542 | return false; |
|---|
| .. | .. |
|---|
| 2433 | 2572 | NET_INC_STATS(sock_net(sk), |
|---|
| 2434 | 2573 | LINUX_MIB_TCPSPURIOUSRTOS); |
|---|
| 2435 | 2574 | inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 2575 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
|---|
| 2576 | + return true; |
|---|
| 2436 | 2577 | if (frto_undo || tcp_is_sack(tp)) { |
|---|
| 2437 | 2578 | tcp_set_ca_state(sk, TCP_CA_Open); |
|---|
| 2438 | 2579 | tp->is_sack_reneg = 0; |
|---|
| .. | .. |
|---|
| 2479 | 2620 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + |
|---|
| 2480 | 2621 | tp->prior_cwnd - 1; |
|---|
| 2481 | 2622 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; |
|---|
| 2482 | | - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && |
|---|
| 2483 | | - !(flag & FLAG_LOST_RETRANS)) { |
|---|
| 2623 | + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == |
|---|
| 2624 | + FLAG_RETRANS_DATA_ACKED) { |
|---|
| 2484 | 2625 | sndcnt = min_t(int, delta, |
|---|
| 2485 | 2626 | max_t(int, tp->prr_delivered - tp->prr_out, |
|---|
| 2486 | 2627 | newly_acked_sacked) + 1); |
|---|
| .. | .. |
|---|
| 2566 | 2707 | { |
|---|
| 2567 | 2708 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2568 | 2709 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 2710 | + u64 val; |
|---|
| 2569 | 2711 | |
|---|
| 2570 | | - /* FIXME: breaks with very large cwnd */ |
|---|
| 2571 | 2712 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
|---|
| 2572 | | - tp->snd_cwnd = tp->snd_cwnd * |
|---|
| 2573 | | - tcp_mss_to_mtu(sk, tp->mss_cache) / |
|---|
| 2574 | | - icsk->icsk_mtup.probe_size; |
|---|
| 2713 | + |
|---|
| 2714 | + val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache); |
|---|
| 2715 | + do_div(val, icsk->icsk_mtup.probe_size); |
|---|
| 2716 | + WARN_ON_ONCE((u32)val != val); |
|---|
| 2717 | + tp->snd_cwnd = max_t(u32, 1U, val); |
|---|
| 2718 | + |
|---|
| 2575 | 2719 | tp->snd_cwnd_cnt = 0; |
|---|
| 2576 | 2720 | tp->snd_cwnd_stamp = tcp_jiffies32; |
|---|
| 2577 | 2721 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
|---|
| .. | .. |
|---|
| 2594 | 2738 | unsigned int mss = tcp_current_mss(sk); |
|---|
| 2595 | 2739 | |
|---|
| 2596 | 2740 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
|---|
| 2597 | | - if (tcp_skb_seglen(skb) > mss && |
|---|
| 2598 | | - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
|---|
| 2599 | | - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
|---|
| 2600 | | - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
|---|
| 2601 | | - tp->retrans_out -= tcp_skb_pcount(skb); |
|---|
| 2602 | | - } |
|---|
| 2603 | | - tcp_skb_mark_lost_uncond_verify(tp, skb); |
|---|
| 2604 | | - } |
|---|
| 2741 | + if (tcp_skb_seglen(skb) > mss) |
|---|
| 2742 | + tcp_mark_skb_lost(sk, skb); |
|---|
| 2605 | 2743 | } |
|---|
| 2606 | 2744 | |
|---|
| 2607 | 2745 | tcp_clear_retrans_hints_partial(tp); |
|---|
| .. | .. |
|---|
| 2656 | 2794 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
|---|
| 2657 | 2795 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
|---|
| 2658 | 2796 | */ |
|---|
| 2659 | | -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, |
|---|
| 2797 | +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, |
|---|
| 2660 | 2798 | int *rexmit) |
|---|
| 2661 | 2799 | { |
|---|
| 2662 | 2800 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2663 | 2801 | bool recovered = !before(tp->snd_una, tp->high_seq); |
|---|
| 2664 | 2802 | |
|---|
| 2665 | | - if ((flag & FLAG_SND_UNA_ADVANCED) && |
|---|
| 2803 | + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && |
|---|
| 2666 | 2804 | tcp_try_undo_loss(sk, false)) |
|---|
| 2667 | 2805 | return; |
|---|
| 2668 | 2806 | |
|---|
| .. | .. |
|---|
| 2675 | 2813 | return; |
|---|
| 2676 | 2814 | |
|---|
| 2677 | 2815 | if (after(tp->snd_nxt, tp->high_seq)) { |
|---|
| 2678 | | - if (flag & FLAG_DATA_SACKED || is_dupack) |
|---|
| 2816 | + if (flag & FLAG_DATA_SACKED || num_dupack) |
|---|
| 2679 | 2817 | tp->frto = 0; /* Step 3.a. loss was real */ |
|---|
| 2680 | 2818 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { |
|---|
| 2681 | 2819 | tp->high_seq = tp->snd_nxt; |
|---|
| .. | .. |
|---|
| 2701 | 2839 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
|---|
| 2702 | 2840 | * delivered. Lower inflight to clock out (re)transmissions. |
|---|
| 2703 | 2841 | */ |
|---|
| 2704 | | - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) |
|---|
| 2705 | | - tcp_add_reno_sack(sk); |
|---|
| 2842 | + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) |
|---|
| 2843 | + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); |
|---|
| 2706 | 2844 | else if (flag & FLAG_SND_UNA_ADVANCED) |
|---|
| 2707 | 2845 | tcp_reset_reno_sack(tp); |
|---|
| 2708 | 2846 | } |
|---|
| 2709 | 2847 | *rexmit = REXMIT_LOST; |
|---|
| 2710 | 2848 | } |
|---|
| 2711 | 2849 | |
|---|
| 2850 | +static bool tcp_force_fast_retransmit(struct sock *sk) |
|---|
| 2851 | +{ |
|---|
| 2852 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2853 | + |
|---|
| 2854 | + return after(tcp_highest_sack_seq(tp), |
|---|
| 2855 | + tp->snd_una + tp->reordering * tp->mss_cache); |
|---|
| 2856 | +} |
|---|
| 2857 | + |
|---|
| 2712 | 2858 | /* Undo during fast recovery after partial ACK. */ |
|---|
| 2713 | | -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) |
|---|
| 2859 | +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, |
|---|
| 2860 | + bool *do_lost) |
|---|
| 2714 | 2861 | { |
|---|
| 2715 | 2862 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2716 | 2863 | |
|---|
| .. | .. |
|---|
| 2735 | 2882 | tcp_undo_cwnd_reduction(sk, true); |
|---|
| 2736 | 2883 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
|---|
| 2737 | 2884 | tcp_try_keep_open(sk); |
|---|
| 2738 | | - return true; |
|---|
| 2885 | + } else { |
|---|
| 2886 | + /* Partial ACK arrived. Force fast retransmit. */ |
|---|
| 2887 | + *do_lost = tcp_force_fast_retransmit(sk); |
|---|
| 2739 | 2888 | } |
|---|
| 2740 | 2889 | return false; |
|---|
| 2741 | 2890 | } |
|---|
| .. | .. |
|---|
| 2759 | 2908 | } |
|---|
| 2760 | 2909 | } |
|---|
| 2761 | 2910 | |
|---|
| 2762 | | -static bool tcp_force_fast_retransmit(struct sock *sk) |
|---|
| 2763 | | -{ |
|---|
| 2764 | | - struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2765 | | - |
|---|
| 2766 | | - return after(tcp_highest_sack_seq(tp), |
|---|
| 2767 | | - tp->snd_una + tp->reordering * tp->mss_cache); |
|---|
| 2768 | | -} |
|---|
| 2769 | | - |
|---|
| 2770 | 2911 | /* Process an event, which can update packets-in-flight not trivially. |
|---|
| 2771 | 2912 | * Main goal of this function is to calculate new estimate for left_out, |
|---|
| 2772 | 2913 | * taking into account both packets sitting in receiver's buffer and |
|---|
| .. | .. |
|---|
| 2780 | 2921 | * tcp_xmit_retransmit_queue(). |
|---|
| 2781 | 2922 | */ |
|---|
| 2782 | 2923 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
|---|
| 2783 | | - bool is_dupack, int *ack_flag, int *rexmit) |
|---|
| 2924 | + int num_dupack, int *ack_flag, int *rexmit) |
|---|
| 2784 | 2925 | { |
|---|
| 2785 | 2926 | struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 2786 | 2927 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2787 | 2928 | int fast_rexmit = 0, flag = *ack_flag; |
|---|
| 2788 | | - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
|---|
| 2789 | | - tcp_force_fast_retransmit(sk)); |
|---|
| 2929 | + bool ece_ack = flag & FLAG_ECE; |
|---|
| 2930 | + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && |
|---|
| 2931 | + tcp_force_fast_retransmit(sk)); |
|---|
| 2790 | 2932 | |
|---|
| 2791 | 2933 | if (!tp->packets_out && tp->sacked_out) |
|---|
| 2792 | 2934 | tp->sacked_out = 0; |
|---|
| 2793 | 2935 | |
|---|
| 2794 | 2936 | /* Now state machine starts. |
|---|
| 2795 | 2937 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ |
|---|
| 2796 | | - if (flag & FLAG_ECE) |
|---|
| 2938 | + if (ece_ack) |
|---|
| 2797 | 2939 | tp->prior_ssthresh = 0; |
|---|
| 2798 | 2940 | |
|---|
| 2799 | 2941 | /* B. In all the states check for reneging SACKs. */ |
|---|
| .. | .. |
|---|
| 2833 | 2975 | switch (icsk->icsk_ca_state) { |
|---|
| 2834 | 2976 | case TCP_CA_Recovery: |
|---|
| 2835 | 2977 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
|---|
| 2836 | | - if (tcp_is_reno(tp) && is_dupack) |
|---|
| 2837 | | - tcp_add_reno_sack(sk); |
|---|
| 2838 | | - } else { |
|---|
| 2839 | | - if (tcp_try_undo_partial(sk, prior_snd_una)) |
|---|
| 2840 | | - return; |
|---|
| 2841 | | - /* Partial ACK arrived. Force fast retransmit. */ |
|---|
| 2842 | | - do_lost = tcp_is_reno(tp) || |
|---|
| 2843 | | - tcp_force_fast_retransmit(sk); |
|---|
| 2844 | | - } |
|---|
| 2845 | | - if (tcp_try_undo_dsack(sk)) { |
|---|
| 2846 | | - tcp_try_keep_open(sk); |
|---|
| 2978 | + if (tcp_is_reno(tp)) |
|---|
| 2979 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
|---|
| 2980 | + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) |
|---|
| 2847 | 2981 | return; |
|---|
| 2848 | | - } |
|---|
| 2982 | + |
|---|
| 2983 | + if (tcp_try_undo_dsack(sk)) |
|---|
| 2984 | + tcp_try_keep_open(sk); |
|---|
| 2985 | + |
|---|
| 2849 | 2986 | tcp_identify_packet_loss(sk, ack_flag); |
|---|
| 2987 | + if (icsk->icsk_ca_state != TCP_CA_Recovery) { |
|---|
| 2988 | + if (!tcp_time_to_recover(sk, flag)) |
|---|
| 2989 | + return; |
|---|
| 2990 | + /* Undo reverts the recovery state. If loss is evident, |
|---|
| 2991 | + * starts a new recovery (e.g. reordering then loss); |
|---|
| 2992 | + */ |
|---|
| 2993 | + tcp_enter_recovery(sk, ece_ack); |
|---|
| 2994 | + } |
|---|
| 2850 | 2995 | break; |
|---|
| 2851 | 2996 | case TCP_CA_Loss: |
|---|
| 2852 | | - tcp_process_loss(sk, flag, is_dupack, rexmit); |
|---|
| 2997 | + tcp_process_loss(sk, flag, num_dupack, rexmit); |
|---|
| 2853 | 2998 | tcp_identify_packet_loss(sk, ack_flag); |
|---|
| 2854 | 2999 | if (!(icsk->icsk_ca_state == TCP_CA_Open || |
|---|
| 2855 | 3000 | (*ack_flag & FLAG_LOST_RETRANS))) |
|---|
| 2856 | 3001 | return; |
|---|
| 2857 | 3002 | /* Change state if cwnd is undone or retransmits are lost */ |
|---|
| 2858 | | - /* fall through */ |
|---|
| 3003 | + fallthrough; |
|---|
| 2859 | 3004 | default: |
|---|
| 2860 | 3005 | if (tcp_is_reno(tp)) { |
|---|
| 2861 | 3006 | if (flag & FLAG_SND_UNA_ADVANCED) |
|---|
| 2862 | 3007 | tcp_reset_reno_sack(tp); |
|---|
| 2863 | | - if (is_dupack) |
|---|
| 2864 | | - tcp_add_reno_sack(sk); |
|---|
| 3008 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
|---|
| 2865 | 3009 | } |
|---|
| 2866 | 3010 | |
|---|
| 2867 | 3011 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
|---|
| .. | .. |
|---|
| 2885 | 3029 | } |
|---|
| 2886 | 3030 | |
|---|
| 2887 | 3031 | /* Otherwise enter Recovery state */ |
|---|
| 2888 | | - tcp_enter_recovery(sk, (flag & FLAG_ECE)); |
|---|
| 3032 | + tcp_enter_recovery(sk, ece_ack); |
|---|
| 2889 | 3033 | fast_rexmit = 1; |
|---|
| 2890 | 3034 | } |
|---|
| 2891 | 3035 | |
|---|
| .. | .. |
|---|
| 2896 | 3040 | |
|---|
| 2897 | 3041 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) |
|---|
| 2898 | 3042 | { |
|---|
| 2899 | | - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; |
|---|
| 3043 | + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; |
|---|
| 2900 | 3044 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 2901 | 3045 | |
|---|
| 2902 | 3046 | if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { |
|---|
| .. | .. |
|---|
| 2935 | 3079 | u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; |
|---|
| 2936 | 3080 | |
|---|
| 2937 | 3081 | if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { |
|---|
| 3082 | + if (!delta) |
|---|
| 3083 | + delta = 1; |
|---|
| 2938 | 3084 | seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); |
|---|
| 2939 | 3085 | ca_rtt_us = seq_rtt_us; |
|---|
| 2940 | 3086 | } |
|---|
| .. | .. |
|---|
| 2988 | 3134 | /* If the retrans timer is currently being used by Fast Open |
|---|
| 2989 | 3135 | * for SYN-ACK retrans purpose, stay put. |
|---|
| 2990 | 3136 | */ |
|---|
| 2991 | | - if (tp->fastopen_rsk) |
|---|
| 3137 | + if (rcu_access_pointer(tp->fastopen_rsk)) |
|---|
| 2992 | 3138 | return; |
|---|
| 2993 | 3139 | |
|---|
| 2994 | 3140 | if (!tp->packets_out) { |
|---|
| .. | .. |
|---|
| 3004 | 3150 | */ |
|---|
| 3005 | 3151 | rto = usecs_to_jiffies(max_t(int, delta_us, 1)); |
|---|
| 3006 | 3152 | } |
|---|
| 3007 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
|---|
| 3008 | | - TCP_RTO_MAX); |
|---|
| 3153 | + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
|---|
| 3154 | + TCP_RTO_MAX); |
|---|
| 3009 | 3155 | } |
|---|
| 3010 | 3156 | } |
|---|
| 3011 | 3157 | |
|---|
| .. | .. |
|---|
| 3061 | 3207 | */ |
|---|
| 3062 | 3208 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
|---|
| 3063 | 3209 | u32 prior_snd_una, |
|---|
| 3064 | | - struct tcp_sacktag_state *sack) |
|---|
| 3210 | + struct tcp_sacktag_state *sack, bool ece_ack) |
|---|
| 3065 | 3211 | { |
|---|
| 3066 | 3212 | const struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 3067 | 3213 | u64 first_ackt, last_ackt; |
|---|
| .. | .. |
|---|
| 3086 | 3232 | u8 sacked = scb->sacked; |
|---|
| 3087 | 3233 | u32 acked_pcount; |
|---|
| 3088 | 3234 | |
|---|
| 3089 | | - tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3090 | | - |
|---|
| 3091 | 3235 | /* Determine how many packets and what bytes were acked, tso and else */ |
|---|
| 3092 | 3236 | if (after(scb->end_seq, tp->snd_una)) { |
|---|
| 3093 | 3237 | if (tcp_skb_pcount(skb) == 1 || |
|---|
| .. | .. |
|---|
| 3107 | 3251 | tp->retrans_out -= acked_pcount; |
|---|
| 3108 | 3252 | flag |= FLAG_RETRANS_DATA_ACKED; |
|---|
| 3109 | 3253 | } else if (!(sacked & TCPCB_SACKED_ACKED)) { |
|---|
| 3110 | | - last_ackt = skb->skb_mstamp; |
|---|
| 3254 | + last_ackt = tcp_skb_timestamp_us(skb); |
|---|
| 3111 | 3255 | WARN_ON_ONCE(last_ackt == 0); |
|---|
| 3112 | 3256 | if (!first_ackt) |
|---|
| 3113 | 3257 | first_ackt = last_ackt; |
|---|
| .. | .. |
|---|
| 3122 | 3266 | if (sacked & TCPCB_SACKED_ACKED) { |
|---|
| 3123 | 3267 | tp->sacked_out -= acked_pcount; |
|---|
| 3124 | 3268 | } else if (tcp_is_sack(tp)) { |
|---|
| 3125 | | - tp->delivered += acked_pcount; |
|---|
| 3269 | + tcp_count_delivered(tp, acked_pcount, ece_ack); |
|---|
| 3126 | 3270 | if (!tcp_skb_spurious_retrans(tp, skb)) |
|---|
| 3127 | 3271 | tcp_rack_advance(tp, sacked, scb->end_seq, |
|---|
| 3128 | | - skb->skb_mstamp); |
|---|
| 3272 | + tcp_skb_timestamp_us(skb)); |
|---|
| 3129 | 3273 | } |
|---|
| 3130 | 3274 | if (sacked & TCPCB_LOST) |
|---|
| 3131 | 3275 | tp->lost_out -= acked_pcount; |
|---|
| .. | .. |
|---|
| 3151 | 3295 | if (!fully_acked) |
|---|
| 3152 | 3296 | break; |
|---|
| 3153 | 3297 | |
|---|
| 3298 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3299 | + |
|---|
| 3154 | 3300 | next = skb_rb_next(skb); |
|---|
| 3155 | 3301 | if (unlikely(skb == tp->retransmit_skb_hint)) |
|---|
| 3156 | 3302 | tp->retransmit_skb_hint = NULL; |
|---|
| .. | .. |
|---|
| 3166 | 3312 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) |
|---|
| 3167 | 3313 | tp->snd_up = tp->snd_una; |
|---|
| 3168 | 3314 | |
|---|
| 3169 | | - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
|---|
| 3170 | | - flag |= FLAG_SACK_RENEGING; |
|---|
| 3315 | + if (skb) { |
|---|
| 3316 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
|---|
| 3317 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
|---|
| 3318 | + flag |= FLAG_SACK_RENEGING; |
|---|
| 3319 | + } |
|---|
| 3171 | 3320 | |
|---|
| 3172 | 3321 | if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { |
|---|
| 3173 | 3322 | seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); |
|---|
| .. | .. |
|---|
| 3199 | 3348 | } |
|---|
| 3200 | 3349 | |
|---|
| 3201 | 3350 | if (tcp_is_reno(tp)) { |
|---|
| 3202 | | - tcp_remove_reno_sacks(sk, pkts_acked); |
|---|
| 3351 | + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); |
|---|
| 3203 | 3352 | |
|---|
| 3204 | 3353 | /* If any of the cumulatively ACKed segments was |
|---|
| 3205 | 3354 | * retransmitted, non-SACK case cannot confirm that |
|---|
| .. | .. |
|---|
| 3220 | 3369 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
|---|
| 3221 | 3370 | } |
|---|
| 3222 | 3371 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
|---|
| 3223 | | - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { |
|---|
| 3372 | + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, |
|---|
| 3373 | + tcp_skb_timestamp_us(skb))) { |
|---|
| 3224 | 3374 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
|---|
| 3225 | 3375 | * after when the head was last (re)transmitted. Otherwise the |
|---|
| 3226 | 3376 | * timeout may continue to extend in loss recovery. |
|---|
| .. | .. |
|---|
| 3273 | 3423 | return; |
|---|
| 3274 | 3424 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { |
|---|
| 3275 | 3425 | icsk->icsk_backoff = 0; |
|---|
| 3426 | + icsk->icsk_probes_tstamp = 0; |
|---|
| 3276 | 3427 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
|---|
| 3277 | 3428 | /* Socket must be waked up by subsequent tcp_data_snd_check(). |
|---|
| 3278 | 3429 | * This function is not for random using! |
|---|
| .. | .. |
|---|
| 3280 | 3431 | } else { |
|---|
| 3281 | 3432 | unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); |
|---|
| 3282 | 3433 | |
|---|
| 3283 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
|---|
| 3284 | | - when, TCP_RTO_MAX); |
|---|
| 3434 | + when = tcp_clamp_probe0_to_user_timeout(sk, when); |
|---|
| 3435 | + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); |
|---|
| 3285 | 3436 | } |
|---|
| 3286 | 3437 | } |
|---|
| 3287 | 3438 | |
|---|
| .. | .. |
|---|
| 3300 | 3451 | * new SACK or ECE mark may first advance cwnd here and later reduce |
|---|
| 3301 | 3452 | * cwnd in tcp_fastretrans_alert() based on more states. |
|---|
| 3302 | 3453 | */ |
|---|
| 3303 | | - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) |
|---|
| 3454 | + if (tcp_sk(sk)->reordering > |
|---|
| 3455 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) |
|---|
| 3304 | 3456 | return flag & FLAG_FORWARD_PROGRESS; |
|---|
| 3305 | 3457 | |
|---|
| 3306 | 3458 | return flag & FLAG_DATA_ACKED; |
|---|
| .. | .. |
|---|
| 3412 | 3564 | if (*last_oow_ack_time) { |
|---|
| 3413 | 3565 | s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
|---|
| 3414 | 3566 | |
|---|
| 3415 | | - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
|---|
| 3567 | + if (0 <= elapsed && |
|---|
| 3568 | + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { |
|---|
| 3416 | 3569 | NET_INC_STATS(net, mib_idx); |
|---|
| 3417 | 3570 | return true; /* rate-limited: don't send yet! */ |
|---|
| 3418 | 3571 | } |
|---|
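The test above suppresses replies to out-of-window segments when fewer than sysctl_tcp_invalid_ratelimit jiffies have elapsed since the last one; the sysctl is now read once with READ_ONCE() because it can change concurrently. A self-contained sketch of the same idea, with the timestamp refresh that the kernel performs outside this hunk folded in for completeness:

```c
/* Sketch of a "one reply per interval" limiter for out-of-window segments.
 * Time is an abstract tick counter here; the kernel uses jiffies.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool rate_limited(uint32_t *last_reply, uint32_t now, uint32_t min_interval)
{
	if (*last_reply) {
		int32_t elapsed = (int32_t)(now - *last_reply);

		if (elapsed >= 0 && (uint32_t)elapsed < min_interval)
			return true;	/* too soon: suppress this reply */
	}
	*last_reply = now;		/* allowed: remember when we replied */
	return false;
}

int main(void)
{
	uint32_t last = 0;

	printf("%d\n", rate_limited(&last, 100, 500));	/* first reply: allowed */
	printf("%d\n", rate_limited(&last, 300, 500));	/* 200 ticks later: limited */
	printf("%d\n", rate_limited(&last, 700, 500));	/* 600 ticks later: allowed */
	return 0;
}
```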
| .. | .. |
|---|
| 3459 | 3612 | |
|---|
| 3460 | 3613 | /* Then check host-wide RFC 5961 rate limit. */ |
|---|
| 3461 | 3614 | now = jiffies / HZ; |
|---|
| 3462 | | - if (now != challenge_timestamp) { |
|---|
| 3463 | | - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
|---|
| 3615 | + if (now != READ_ONCE(challenge_timestamp)) { |
|---|
| 3616 | + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); |
|---|
| 3464 | 3617 | u32 half = (ack_limit + 1) >> 1; |
|---|
| 3465 | 3618 | |
|---|
| 3466 | | - challenge_timestamp = now; |
|---|
| 3619 | + WRITE_ONCE(challenge_timestamp, now); |
|---|
| 3467 | 3620 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
|---|
| 3468 | 3621 | } |
|---|
| 3469 | 3622 | count = READ_ONCE(challenge_count); |
|---|
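The host-wide RFC 5961 limit above draws a fresh challenge-ACK budget once per second, randomized between half the sysctl limit and roughly one and a half times it, so the exact limit cannot be probed; the shared variables use READ_ONCE()/WRITE_ONCE() because many sockets race on them. A single-threaded model of that budget (rand() stands in for prandom_u32_max(), and the consuming side sits outside this hunk):

```c
/* Single-threaded model of the per-second challenge-ACK budget. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t challenge_timestamp;	/* second the current budget is for */
static uint32_t challenge_count;	/* challenge ACKs left in that second */

static bool challenge_ack_allowed(uint32_t now_sec, uint32_t ack_limit)
{
	if (now_sec != challenge_timestamp) {
		uint32_t half = (ack_limit + 1) >> 1;

		challenge_timestamp = now_sec;
		/* new budget in [half, half + ack_limit) */
		challenge_count = half + (uint32_t)rand() % ack_limit;
	}
	if (challenge_count > 0) {
		challenge_count--;
		return true;	/* send the challenge ACK */
	}
	return false;		/* budget exhausted for this second */
}

int main(void)
{
	uint32_t sent = 0;

	for (int i = 0; i < 2000; i++)
		sent += challenge_ack_allowed(42, 1000);
	printf("challenge ACKs sent this second: %u\n", sent);
	return 0;
}
```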
| .. | .. |
|---|
| 3544 | 3697 | { |
|---|
| 3545 | 3698 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 3546 | 3699 | |
|---|
| 3547 | | - if (rexmit == REXMIT_NONE) |
|---|
| 3700 | + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) |
|---|
| 3548 | 3701 | return; |
|---|
| 3549 | 3702 | |
|---|
| 3550 | | - if (unlikely(rexmit == 2)) { |
|---|
| 3703 | + if (unlikely(rexmit == REXMIT_NEW)) { |
|---|
| 3551 | 3704 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), |
|---|
| 3552 | 3705 | TCP_NAGLE_OFF); |
|---|
| 3553 | 3706 | if (after(tp->snd_nxt, tp->high_seq)) |
|---|
| .. | .. |
|---|
| 3566 | 3719 | |
|---|
| 3567 | 3720 | delivered = tp->delivered - prior_delivered; |
|---|
| 3568 | 3721 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); |
|---|
| 3569 | | - if (flag & FLAG_ECE) { |
|---|
| 3570 | | - tp->delivered_ce += delivered; |
|---|
| 3722 | + if (flag & FLAG_ECE) |
|---|
| 3571 | 3723 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); |
|---|
| 3572 | | - } |
|---|
| 3724 | + |
|---|
| 3573 | 3725 | return delivered; |
|---|
| 3574 | 3726 | } |
|---|
| 3575 | 3727 | |
|---|
| .. | .. |
|---|
| 3584 | 3736 | bool is_sack_reneg = tp->is_sack_reneg; |
|---|
| 3585 | 3737 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
|---|
| 3586 | 3738 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
|---|
| 3587 | | - bool is_dupack = false; |
|---|
| 3739 | + int num_dupack = 0; |
|---|
| 3588 | 3740 | int prior_packets = tp->packets_out; |
|---|
| 3589 | 3741 | u32 delivered = tp->delivered; |
|---|
| 3590 | 3742 | u32 lost = tp->lost; |
|---|
| .. | .. |
|---|
| 3593 | 3745 | |
|---|
| 3594 | 3746 | sack_state.first_sackt = 0; |
|---|
| 3595 | 3747 | sack_state.rate = &rs; |
|---|
| 3748 | + sack_state.sack_delivered = 0; |
|---|
| 3596 | 3749 | |
|---|
| 3597 | 3750 | /* We very likely will need to access rtx queue. */ |
|---|
| 3598 | 3751 | prefetch(sk->tcp_rtx_queue.rb_node); |
|---|
| .. | .. |
|---|
| 3614 | 3767 | * this segment (RFC793 Section 3.9). |
|---|
| 3615 | 3768 | */ |
|---|
| 3616 | 3769 | if (after(ack, tp->snd_nxt)) |
|---|
| 3617 | | - goto invalid_ack; |
|---|
| 3770 | + return -1; |
|---|
| 3618 | 3771 | |
|---|
| 3619 | 3772 | if (after(ack, prior_snd_una)) { |
|---|
| 3620 | 3773 | flag |= FLAG_SND_UNA_ADVANCED; |
|---|
| 3621 | 3774 | icsk->icsk_retransmits = 0; |
|---|
| 3622 | 3775 | |
|---|
| 3623 | 3776 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
|---|
| 3624 | | - if (static_branch_unlikely(&clean_acked_data_enabled)) |
|---|
| 3777 | + if (static_branch_unlikely(&clean_acked_data_enabled.key)) |
|---|
| 3625 | 3778 | if (icsk->icsk_clean_acked) |
|---|
| 3626 | 3779 | icsk->icsk_clean_acked(sk, ack); |
|---|
| 3627 | 3780 | #endif |
|---|
| .. | .. |
|---|
| 3636 | 3789 | if (flag & FLAG_UPDATE_TS_RECENT) |
|---|
| 3637 | 3790 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); |
|---|
| 3638 | 3791 | |
|---|
| 3639 | | - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
|---|
| 3792 | + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == |
|---|
| 3793 | + FLAG_SND_UNA_ADVANCED) { |
|---|
| 3640 | 3794 | /* Window is constant, pure forward advance. |
|---|
| 3641 | 3795 | * No more checks are required. |
|---|
| 3642 | 3796 | * Note, we use the fact that SND.UNA>=SND.WL2. |
|---|
| .. | .. |
|---|
| 3667 | 3821 | ack_ev_flags |= CA_ACK_ECE; |
|---|
| 3668 | 3822 | } |
|---|
| 3669 | 3823 | |
|---|
| 3824 | + if (sack_state.sack_delivered) |
|---|
| 3825 | + tcp_count_delivered(tp, sack_state.sack_delivered, |
|---|
| 3826 | + flag & FLAG_ECE); |
|---|
| 3827 | + |
|---|
| 3670 | 3828 | if (flag & FLAG_WIN_UPDATE) |
|---|
| 3671 | 3829 | ack_ev_flags |= CA_ACK_WIN_UPDATE; |
|---|
| 3672 | 3830 | |
|---|
| .. | .. |
|---|
| 3692 | 3850 | goto no_queue; |
|---|
| 3693 | 3851 | |
|---|
| 3694 | 3852 | /* See if we can take anything off of the retransmit queue. */ |
|---|
| 3695 | | - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); |
|---|
| 3853 | + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, |
|---|
| 3854 | + flag & FLAG_ECE); |
|---|
| 3696 | 3855 | |
|---|
| 3697 | 3856 | tcp_rack_update_reo_wnd(sk, &rs); |
|---|
| 3698 | 3857 | |
|---|
| .. | .. |
|---|
| 3700 | 3859 | tcp_process_tlp_ack(sk, ack, flag); |
|---|
| 3701 | 3860 | |
|---|
| 3702 | 3861 | if (tcp_ack_is_dubious(sk, flag)) { |
|---|
| 3703 | | - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
|---|
| 3704 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3862 | + if (!(flag & (FLAG_SND_UNA_ADVANCED | |
|---|
| 3863 | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { |
|---|
| 3864 | + num_dupack = 1; |
|---|
| 3865 | + /* Consider if pure acks were aggregated in tcp_add_backlog() */ |
|---|
| 3866 | + if (!(flag & FLAG_DATA)) |
|---|
| 3867 | + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
|---|
| 3868 | + } |
|---|
| 3869 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3705 | 3870 | &rexmit); |
|---|
| 3706 | 3871 | } |
|---|
| 3707 | 3872 | |
|---|
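is_dupack becomes a count here: a segment that advances nothing, carries no data and is not a DSACK may stand for several pure ACKs coalesced earlier, and its gso_segs then says how many, so fast retransmit is not delayed by the aggregation. A sketch of that counting rule:

```c
/* Sketch of the duplicate-ACK counting rule from the hunk above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int count_dupacks(bool snd_una_advanced, bool not_dup,
				  bool dsacking_ack, bool has_data,
				  uint16_t gso_segs)
{
	if (snd_una_advanced || not_dup || dsacking_ack)
		return 0;			/* not a duplicate ACK */
	if (has_data)
		return 1;			/* single data segment */
	return gso_segs > 1 ? gso_segs : 1;	/* coalesced pure ACKs */
}

int main(void)
{
	printf("%u\n", count_dupacks(false, false, false, false, 4)); /* 4 */
	printf("%u\n", count_dupacks(false, false, false, true, 4));  /* 1 */
	printf("%u\n", count_dupacks(true, false, false, false, 4));  /* 0 */
	return 0;
}
```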
| .. | .. |
|---|
| 3723 | 3888 | no_queue: |
|---|
| 3724 | 3889 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
|---|
| 3725 | 3890 | if (flag & FLAG_DSACKING_ACK) { |
|---|
| 3726 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3891 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3727 | 3892 | &rexmit); |
|---|
| 3728 | 3893 | tcp_newly_delivered(sk, delivered, flag); |
|---|
| 3729 | 3894 | } |
|---|
| .. | .. |
|---|
| 3737 | 3902 | tcp_process_tlp_ack(sk, ack, flag); |
|---|
| 3738 | 3903 | return 1; |
|---|
| 3739 | 3904 | |
|---|
| 3740 | | -invalid_ack: |
|---|
| 3741 | | - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
|---|
| 3742 | | - return -1; |
|---|
| 3743 | | - |
|---|
| 3744 | 3905 | old_ack: |
|---|
| 3745 | 3906 | /* If data was SACKed, tag it and see if we should send more data. |
|---|
| 3746 | 3907 | * If data was DSACKed, see if we can undo a cwnd reduction. |
|---|
| .. | .. |
|---|
| 3748 | 3909 | if (TCP_SKB_CB(skb)->sacked) { |
|---|
| 3749 | 3910 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
|---|
| 3750 | 3911 | &sack_state); |
|---|
| 3751 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
|---|
| 3912 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
|---|
| 3752 | 3913 | &rexmit); |
|---|
| 3753 | 3914 | tcp_newly_delivered(sk, delivered, flag); |
|---|
| 3754 | 3915 | tcp_xmit_recovery(sk, rexmit); |
|---|
| 3755 | 3916 | } |
|---|
| 3756 | 3917 | |
|---|
| 3757 | | - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
|---|
| 3758 | 3918 | return 0; |
|---|
| 3759 | 3919 | } |
|---|
| 3760 | 3920 | |
|---|
| .. | .. |
|---|
| 3775 | 3935 | foc->exp = exp_opt; |
|---|
| 3776 | 3936 | } |
|---|
| 3777 | 3937 | |
|---|
| 3778 | | -static void smc_parse_options(const struct tcphdr *th, |
|---|
| 3938 | +static bool smc_parse_options(const struct tcphdr *th, |
|---|
| 3779 | 3939 | struct tcp_options_received *opt_rx, |
|---|
| 3780 | 3940 | const unsigned char *ptr, |
|---|
| 3781 | 3941 | int opsize) |
|---|
| .. | .. |
|---|
| 3784 | 3944 | if (static_branch_unlikely(&tcp_have_smc)) { |
|---|
| 3785 | 3945 | if (th->syn && !(opsize & 1) && |
|---|
| 3786 | 3946 | opsize >= TCPOLEN_EXP_SMC_BASE && |
|---|
| 3787 | | - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) |
|---|
| 3947 | + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { |
|---|
| 3788 | 3948 | opt_rx->smc_ok = 1; |
|---|
| 3949 | + return true; |
|---|
| 3950 | + } |
|---|
| 3789 | 3951 | } |
|---|
| 3790 | 3952 | #endif |
|---|
| 3953 | + return false; |
|---|
| 3954 | +} |
|---|
| 3955 | + |
|---|
| 3956 | +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped |
|---|
| 3957 | + * value on success. |
|---|
| 3958 | + */ |
|---|
| 3959 | +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) |
|---|
| 3960 | +{ |
|---|
| 3961 | + const unsigned char *ptr = (const unsigned char *)(th + 1); |
|---|
| 3962 | + int length = (th->doff * 4) - sizeof(struct tcphdr); |
|---|
| 3963 | + u16 mss = 0; |
|---|
| 3964 | + |
|---|
| 3965 | + while (length > 0) { |
|---|
| 3966 | + int opcode = *ptr++; |
|---|
| 3967 | + int opsize; |
|---|
| 3968 | + |
|---|
| 3969 | + switch (opcode) { |
|---|
| 3970 | + case TCPOPT_EOL: |
|---|
| 3971 | + return mss; |
|---|
| 3972 | + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
|---|
| 3973 | + length--; |
|---|
| 3974 | + continue; |
|---|
| 3975 | + default: |
|---|
| 3976 | + if (length < 2) |
|---|
| 3977 | + return mss; |
|---|
| 3978 | + opsize = *ptr++; |
|---|
| 3979 | + if (opsize < 2) /* "silly options" */ |
|---|
| 3980 | + return mss; |
|---|
| 3981 | + if (opsize > length) |
|---|
| 3982 | + return mss; /* fail on partial options */ |
|---|
| 3983 | + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { |
|---|
| 3984 | + u16 in_mss = get_unaligned_be16(ptr); |
|---|
| 3985 | + |
|---|
| 3986 | + if (in_mss) { |
|---|
| 3987 | + if (user_mss && user_mss < in_mss) |
|---|
| 3988 | + in_mss = user_mss; |
|---|
| 3989 | + mss = in_mss; |
|---|
| 3990 | + } |
|---|
| 3991 | + } |
|---|
| 3992 | + ptr += opsize - 2; |
|---|
| 3993 | + length -= opsize; |
|---|
| 3994 | + } |
|---|
| 3995 | + } |
|---|
| 3996 | + return mss; |
|---|
| 3791 | 3997 | } |
|---|
| 3792 | 3998 | |
|---|
| 3793 | 3999 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
|---|
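tcp_parse_mss_option() above walks the kind/length encoded option list and clamps the advertised MSS to user_mss. The same walk, reduced to a standalone program fed with a hand-built option block (the buffer and names are illustrative only):

```c
/* Standalone sketch of the TCP option walk used by the new MSS parser:
 * kind 0 = EOL, kind 1 = NOP (one byte), everything else is kind/len/value.
 */
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOLEN_MSS	4

static uint16_t parse_mss(const uint8_t *ptr, int length, uint16_t user_mss)
{
	uint16_t mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return mss;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return mss;	/* truncated option */
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return mss;	/* silly or partial option */
			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				uint16_t in_mss = (uint16_t)(ptr[0] << 8 | ptr[1]);

				if (in_mss) {
					if (user_mss && user_mss < in_mss)
						in_mss = user_mss;
					mss = in_mss;
				}
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

int main(void)
{
	/* MSS=1460, NOP, NOP, EOL */
	const uint8_t opts[] = { 2, 4, 0x05, 0xb4, 1, 1, 0 };

	printf("mss=%u\n", (unsigned)parse_mss(opts, (int)sizeof(opts), 0));    /* 1460 */
	printf("mss=%u\n", (unsigned)parse_mss(opts, (int)sizeof(opts), 1200)); /* 1200 */
	return 0;
}
```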
| .. | .. |
|---|
| 3805 | 4011 | |
|---|
| 3806 | 4012 | ptr = (const unsigned char *)(th + 1); |
|---|
| 3807 | 4013 | opt_rx->saw_tstamp = 0; |
|---|
| 4014 | + opt_rx->saw_unknown = 0; |
|---|
| 3808 | 4015 | |
|---|
| 3809 | 4016 | while (length > 0) { |
|---|
| 3810 | 4017 | int opcode = *ptr++; |
|---|
| .. | .. |
|---|
| 3817 | 4024 | length--; |
|---|
| 3818 | 4025 | continue; |
|---|
| 3819 | 4026 | default: |
|---|
| 4027 | + if (length < 2) |
|---|
| 4028 | + return; |
|---|
| 3820 | 4029 | opsize = *ptr++; |
|---|
| 3821 | 4030 | if (opsize < 2) /* "silly options" */ |
|---|
| 3822 | 4031 | return; |
|---|
| .. | .. |
|---|
| 3836 | 4045 | break; |
|---|
| 3837 | 4046 | case TCPOPT_WINDOW: |
|---|
| 3838 | 4047 | if (opsize == TCPOLEN_WINDOW && th->syn && |
|---|
| 3839 | | - !estab && net->ipv4.sysctl_tcp_window_scaling) { |
|---|
| 4048 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { |
|---|
| 3840 | 4049 | __u8 snd_wscale = *(__u8 *)ptr; |
|---|
| 3841 | 4050 | opt_rx->wscale_ok = 1; |
|---|
| 3842 | 4051 | if (snd_wscale > TCP_MAX_WSCALE) { |
|---|
| .. | .. |
|---|
| 3852 | 4061 | case TCPOPT_TIMESTAMP: |
|---|
| 3853 | 4062 | if ((opsize == TCPOLEN_TIMESTAMP) && |
|---|
| 3854 | 4063 | ((estab && opt_rx->tstamp_ok) || |
|---|
| 3855 | | - (!estab && net->ipv4.sysctl_tcp_timestamps))) { |
|---|
| 4064 | + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { |
|---|
| 3856 | 4065 | opt_rx->saw_tstamp = 1; |
|---|
| 3857 | 4066 | opt_rx->rcv_tsval = get_unaligned_be32(ptr); |
|---|
| 3858 | 4067 | opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); |
|---|
| .. | .. |
|---|
| 3860 | 4069 | break; |
|---|
| 3861 | 4070 | case TCPOPT_SACK_PERM: |
|---|
| 3862 | 4071 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
|---|
| 3863 | | - !estab && net->ipv4.sysctl_tcp_sack) { |
|---|
| 4072 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { |
|---|
| 3864 | 4073 | opt_rx->sack_ok = TCP_SACK_SEEN; |
|---|
| 3865 | 4074 | tcp_sack_reset(opt_rx); |
|---|
| 3866 | 4075 | } |
|---|
| .. | .. |
|---|
| 3893 | 4102 | */ |
|---|
| 3894 | 4103 | if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && |
|---|
| 3895 | 4104 | get_unaligned_be16(ptr) == |
|---|
| 3896 | | - TCPOPT_FASTOPEN_MAGIC) |
|---|
| 4105 | + TCPOPT_FASTOPEN_MAGIC) { |
|---|
| 3897 | 4106 | tcp_parse_fastopen_option(opsize - |
|---|
| 3898 | 4107 | TCPOLEN_EXP_FASTOPEN_BASE, |
|---|
| 3899 | 4108 | ptr + 2, th->syn, foc, true); |
|---|
| 3900 | | - else |
|---|
| 3901 | | - smc_parse_options(th, opt_rx, ptr, |
|---|
| 3902 | | - opsize); |
|---|
| 4109 | + break; |
|---|
| 4110 | + } |
|---|
| 4111 | + |
|---|
| 4112 | + if (smc_parse_options(th, opt_rx, ptr, opsize)) |
|---|
| 4113 | + break; |
|---|
| 4114 | + |
|---|
| 4115 | + opt_rx->saw_unknown = 1; |
|---|
| 3903 | 4116 | break; |
|---|
| 3904 | 4117 | |
|---|
| 4118 | + default: |
|---|
| 4119 | + opt_rx->saw_unknown = 1; |
|---|
| 3905 | 4120 | } |
|---|
| 3906 | 4121 | ptr += opsize-2; |
|---|
| 3907 | 4122 | length -= opsize; |
|---|
| .. | .. |
|---|
| 4117 | 4332 | case TCP_ESTABLISHED: |
|---|
| 4118 | 4333 | /* Move to CLOSE_WAIT */ |
|---|
| 4119 | 4334 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
|---|
| 4120 | | - inet_csk(sk)->icsk_ack.pingpong = 1; |
|---|
| 4335 | + inet_csk_enter_pingpong_mode(sk); |
|---|
| 4121 | 4336 | break; |
|---|
| 4122 | 4337 | |
|---|
| 4123 | 4338 | case TCP_CLOSE_WAIT: |
|---|
| .. | .. |
|---|
| 4189 | 4404 | { |
|---|
| 4190 | 4405 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 4191 | 4406 | |
|---|
| 4192 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
|---|
| 4407 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
|---|
| 4193 | 4408 | int mib_idx; |
|---|
| 4194 | 4409 | |
|---|
| 4195 | 4410 | if (before(seq, tp->rcv_nxt)) |
|---|
| .. | .. |
|---|
| 4215 | 4430 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
|---|
| 4216 | 4431 | } |
|---|
| 4217 | 4432 | |
|---|
| 4433 | +static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) |
|---|
| 4434 | +{ |
|---|
| 4435 | + /* When the ACK path fails or drops most ACKs, the sender would |
|---|
| 4436 | + * timeout and spuriously retransmit the same segment repeatedly. |
|---|
| 4437 | + * The receiver remembers and reflects via DSACKs. Leverage the |
|---|
| 4438 | + * DSACK state and change the txhash to re-route speculatively. |
|---|
| 4439 | + */ |
|---|
| 4440 | + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && |
|---|
| 4441 | + sk_rethink_txhash(sk)) |
|---|
| 4442 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); |
|---|
| 4443 | +} |
|---|
| 4444 | + |
|---|
| 4218 | 4445 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) |
|---|
| 4219 | 4446 | { |
|---|
| 4220 | 4447 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 4224 | 4451 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
|---|
| 4225 | 4452 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
|---|
| 4226 | 4453 | |
|---|
| 4227 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
|---|
| 4454 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
|---|
| 4228 | 4455 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
|---|
| 4229 | 4456 | |
|---|
| 4457 | + tcp_rcv_spurious_retrans(sk, skb); |
|---|
| 4230 | 4458 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
|---|
| 4231 | 4459 | end_seq = tp->rcv_nxt; |
|---|
| 4232 | 4460 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); |
|---|
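tcp_rcv_spurious_retrans() reads the repeated arrival of a segment the peer was already told about (via the first DSACK block) as a hint that this host's own ACKs are being lost on the return path, and re-rolls the socket's transmit hash so ECMP or bonding may pick a different path. A toy model of that decision (what sk_rethink_txhash() does internally is an assumption here, it is not shown in this diff):

```c
/* Toy model of "spurious retransmits seen, try another path". */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_sock {
	uint32_t txhash;		/* feeds ECMP/bonding path selection */
	uint32_t dsack_start_seq;	/* first DSACK block we keep reporting */
};

static bool maybe_rehash_on_dup(struct toy_sock *sk, uint32_t rcv_seq)
{
	/* The incoming segment starts exactly at the sequence we already
	 * DSACKed: our ACKs are probably not reaching the sender, so pick
	 * a fresh hash and hope the next route works better.
	 */
	if (rcv_seq != sk->dsack_start_seq)
		return false;
	sk->txhash = (uint32_t)rand() | 1;	/* never 0: 0 means "unset" */
	return true;
}

int main(void)
{
	struct toy_sock sk = { .txhash = 0x1234, .dsack_start_seq = 1000 };

	printf("rehash=%d txhash=%#x\n", maybe_rehash_on_dup(&sk, 2000), (unsigned)sk.txhash);
	printf("rehash=%d txhash=%#x\n", maybe_rehash_on_dup(&sk, 1000), (unsigned)sk.txhash);
	return 0;
}
```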
| .. | .. |
|---|
| 4260 | 4488 | sp[i] = sp[i + 1]; |
|---|
| 4261 | 4489 | continue; |
|---|
| 4262 | 4490 | } |
|---|
| 4263 | | - this_sack++, swalk++; |
|---|
| 4491 | + this_sack++; |
|---|
| 4492 | + swalk++; |
|---|
| 4264 | 4493 | } |
|---|
| 4265 | 4494 | } |
|---|
| 4495 | + |
|---|
| 4496 | +static void tcp_sack_compress_send_ack(struct sock *sk) |
|---|
| 4497 | +{ |
|---|
| 4498 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 4499 | + |
|---|
| 4500 | + if (!tp->compressed_ack) |
|---|
| 4501 | + return; |
|---|
| 4502 | + |
|---|
| 4503 | + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
|---|
| 4504 | + __sock_put(sk); |
|---|
| 4505 | + |
|---|
| 4506 | + /* Since we have to send one ack finally, |
|---|
| 4507 | + * subtract one from tp->compressed_ack to keep
|---|
| 4508 | + * LINUX_MIB_TCPACKCOMPRESSED accurate. |
|---|
| 4509 | + */ |
|---|
| 4510 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
|---|
| 4511 | + tp->compressed_ack - 1); |
|---|
| 4512 | + |
|---|
| 4513 | + tp->compressed_ack = 0; |
|---|
| 4514 | + tcp_send_ack(sk); |
|---|
| 4515 | +} |
|---|
| 4516 | + |
|---|
| 4517 | +/* Reasonable number of SACK blocks included in the TCP SACK option.
|---|
| 4518 | + * The max is 4, but this becomes 3 if TCP timestamps are there. |
|---|
| 4519 | + * Given that SACK packets might be lost, be conservative and use 2. |
|---|
| 4520 | + */ |
|---|
| 4521 | +#define TCP_SACK_BLOCKS_EXPECTED 2 |
|---|
| 4266 | 4522 | |
|---|
| 4267 | 4523 | static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) |
|---|
| 4268 | 4524 | { |
|---|
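tcp_sack_compress_send_ack() flushes a pending compressed ACK before the SACK table gets crowded; because the flush itself emits one real ACK, only compressed_ack - 1 ACKs were truly saved, and that is what feeds LINUX_MIB_TCPACKCOMPRESSED. The accounting in miniature (toy names, not the kernel's):

```c
/* Sketch of the compressed-ACK flush accounting. */
#include <stdint.h>
#include <stdio.h>

struct toy_tp {
	uint32_t compressed_ack;	/* ACKs currently held back */
	uint32_t mib_ack_compressed;	/* stands in for the MIB counter */
};

static void toy_compress_one_ack(struct toy_tp *tp)
{
	tp->compressed_ack++;		/* defer instead of sending now */
}

static void toy_compress_flush(struct toy_tp *tp)
{
	if (!tp->compressed_ack)
		return;
	tp->mib_ack_compressed += tp->compressed_ack - 1;
	tp->compressed_ack = 0;
	/* a real ACK would be transmitted here */
}

int main(void)
{
	struct toy_tp tp = { 0, 0 };

	for (int i = 0; i < 5; i++)
		toy_compress_one_ack(&tp);
	toy_compress_flush(&tp);
	printf("suppressed ACKs counted: %u\n", tp.mib_ack_compressed); /* 4 */
	return 0;
}
```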
| .. | .. |
|---|
| 4276 | 4532 | |
|---|
| 4277 | 4533 | for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { |
|---|
| 4278 | 4534 | if (tcp_sack_extend(sp, seq, end_seq)) { |
|---|
| 4535 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
|---|
| 4536 | + tcp_sack_compress_send_ack(sk); |
|---|
| 4279 | 4537 | /* Rotate this_sack to the first one. */ |
|---|
| 4280 | 4538 | for (; this_sack > 0; this_sack--, sp--) |
|---|
| 4281 | 4539 | swap(*sp, *(sp - 1)); |
|---|
| .. | .. |
|---|
| 4285 | 4543 | } |
|---|
| 4286 | 4544 | } |
|---|
| 4287 | 4545 | |
|---|
| 4546 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
|---|
| 4547 | + tcp_sack_compress_send_ack(sk); |
|---|
| 4548 | + |
|---|
| 4288 | 4549 | /* Could not find an adjacent existing SACK, build a new one, |
|---|
| 4289 | 4550 | * put it at the front, and shift everyone else down. We |
|---|
| 4290 | 4551 | * always know there is at least one SACK present already here. |
|---|
| .. | .. |
|---|
| 4292 | 4553 | * If the sack array is full, forget about the last one. |
|---|
| 4293 | 4554 | */ |
|---|
| 4294 | 4555 | if (this_sack >= TCP_NUM_SACKS) { |
|---|
| 4295 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
|---|
| 4296 | | - tcp_send_ack(sk); |
|---|
| 4297 | 4556 | this_sack--; |
|---|
| 4298 | 4557 | tp->rx_opt.num_sacks--; |
|---|
| 4299 | 4558 | sp--; |
|---|
| .. | .. |
|---|
| 4345 | 4604 | /** |
|---|
| 4346 | 4605 | * tcp_try_coalesce - try to merge skb to prior one |
|---|
| 4347 | 4606 | * @sk: socket |
|---|
| 4348 | | - * @dest: destination queue |
|---|
| 4349 | 4607 | * @to: prior buffer |
|---|
| 4350 | 4608 | * @from: buffer to add in queue |
|---|
| 4351 | 4609 | * @fragstolen: pointer to boolean |
|---|
| .. | .. |
|---|
| 4367 | 4625 | |
|---|
| 4368 | 4626 | /* Its possible this segment overlaps with prior segment in queue */ |
|---|
| 4369 | 4627 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) |
|---|
| 4628 | + return false; |
|---|
| 4629 | + |
|---|
| 4630 | + if (!mptcp_skb_can_collapse(to, from)) |
|---|
| 4370 | 4631 | return false; |
|---|
| 4371 | 4632 | |
|---|
| 4372 | 4633 | #ifdef CONFIG_TLS_DEVICE |
|---|
| .. | .. |
|---|
| 4412 | 4673 | |
|---|
| 4413 | 4674 | static void tcp_drop(struct sock *sk, struct sk_buff *skb) |
|---|
| 4414 | 4675 | { |
|---|
| 4676 | + trace_android_vh_kfree_skb(skb); |
|---|
| 4415 | 4677 | sk_drops_add(sk, skb); |
|---|
| 4416 | 4678 | __kfree_skb(skb); |
|---|
| 4417 | 4679 | } |
|---|
| .. | .. |
|---|
| 4443 | 4705 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
|---|
| 4444 | 4706 | |
|---|
| 4445 | 4707 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
|---|
| 4446 | | - SOCK_DEBUG(sk, "ofo packet was already received\n"); |
|---|
| 4447 | 4708 | tcp_drop(sk, skb); |
|---|
| 4448 | 4709 | continue; |
|---|
| 4449 | 4710 | } |
|---|
| 4450 | | - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", |
|---|
| 4451 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
|---|
| 4452 | | - TCP_SKB_CB(skb)->end_seq); |
|---|
| 4453 | 4711 | |
|---|
| 4454 | 4712 | tail = skb_peek_tail(&sk->sk_receive_queue); |
|---|
| 4455 | 4713 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
|---|
| .. | .. |
|---|
| 4511 | 4769 | tp->pred_flags = 0; |
|---|
| 4512 | 4770 | inet_csk_schedule_ack(sk); |
|---|
| 4513 | 4771 | |
|---|
| 4772 | + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
|---|
| 4514 | 4773 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); |
|---|
| 4515 | 4774 | seq = TCP_SKB_CB(skb)->seq; |
|---|
| 4516 | 4775 | end_seq = TCP_SKB_CB(skb)->end_seq; |
|---|
| 4517 | | - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
|---|
| 4518 | | - tp->rcv_nxt, seq, end_seq); |
|---|
| 4519 | 4776 | |
|---|
| 4520 | 4777 | p = &tp->out_of_order_queue.rb_node; |
|---|
| 4521 | 4778 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
|---|
| .. | .. |
|---|
| 4541 | 4798 | * and trigger fast retransmit. |
|---|
| 4542 | 4799 | */ |
|---|
| 4543 | 4800 | if (tcp_is_sack(tp)) |
|---|
| 4544 | | - tcp_grow_window(sk, skb); |
|---|
| 4801 | + tcp_grow_window(sk, skb, true); |
|---|
| 4545 | 4802 | kfree_skb_partial(skb, fragstolen); |
|---|
| 4546 | 4803 | skb = NULL; |
|---|
| 4547 | 4804 | goto add_sack; |
|---|
| .. | .. |
|---|
| 4629 | 4886 | * and trigger fast retransmit. |
|---|
| 4630 | 4887 | */ |
|---|
| 4631 | 4888 | if (tcp_is_sack(tp)) |
|---|
| 4632 | | - tcp_grow_window(sk, skb); |
|---|
| 4889 | + tcp_grow_window(sk, skb, false); |
|---|
| 4633 | 4890 | skb_condense(skb); |
|---|
| 4634 | 4891 | skb_set_owner_r(skb, sk); |
|---|
| 4635 | 4892 | } |
|---|
| 4636 | 4893 | } |
|---|
| 4637 | 4894 | |
|---|
| 4638 | | -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, |
|---|
| 4639 | | - bool *fragstolen) |
|---|
| 4895 | +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, |
|---|
| 4896 | + bool *fragstolen) |
|---|
| 4640 | 4897 | { |
|---|
| 4641 | 4898 | int eaten; |
|---|
| 4642 | 4899 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); |
|---|
| 4643 | 4900 | |
|---|
| 4644 | | - __skb_pull(skb, hdrlen); |
|---|
| 4645 | 4901 | eaten = (tail && |
|---|
| 4646 | 4902 | tcp_try_coalesce(sk, tail, |
|---|
| 4647 | 4903 | skb, fragstolen)) ? 1 : 0; |
|---|
| .. | .. |
|---|
| 4692 | 4948 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
|---|
| 4693 | 4949 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
|---|
| 4694 | 4950 | |
|---|
| 4695 | | - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
|---|
| 4951 | + if (tcp_queue_rcv(sk, skb, &fragstolen)) { |
|---|
| 4696 | 4952 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
|---|
| 4697 | 4953 | __kfree_skb(skb); |
|---|
| 4698 | 4954 | } |
|---|
| .. | .. |
|---|
| 4724 | 4980 | bool fragstolen; |
|---|
| 4725 | 4981 | int eaten; |
|---|
| 4726 | 4982 | |
|---|
| 4983 | + if (sk_is_mptcp(sk)) |
|---|
| 4984 | + mptcp_incoming_options(sk, skb); |
|---|
| 4985 | + |
|---|
| 4727 | 4986 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { |
|---|
| 4728 | 4987 | __kfree_skb(skb); |
|---|
| 4729 | 4988 | return; |
|---|
| .. | .. |
|---|
| 4753 | 5012 | goto drop; |
|---|
| 4754 | 5013 | } |
|---|
| 4755 | 5014 | |
|---|
| 4756 | | - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); |
|---|
| 5015 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
|---|
| 4757 | 5016 | if (skb->len) |
|---|
| 4758 | 5017 | tcp_event_data_recv(sk, skb); |
|---|
| 4759 | 5018 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
|---|
| .. | .. |
|---|
| 4782 | 5041 | } |
|---|
| 4783 | 5042 | |
|---|
| 4784 | 5043 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
|---|
| 5044 | + tcp_rcv_spurious_retrans(sk, skb); |
|---|
| 4785 | 5045 | /* A retransmit, 2nd most common case. Force an immediate ack. */ |
|---|
| 4786 | 5046 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
|---|
| 4787 | 5047 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
|---|
| .. | .. |
|---|
| 4800 | 5060 | |
|---|
| 4801 | 5061 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
|---|
| 4802 | 5062 | /* Partial packet, seq < rcv_next < end_seq */ |
|---|
| 4803 | | - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", |
|---|
| 4804 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
|---|
| 4805 | | - TCP_SKB_CB(skb)->end_seq); |
|---|
| 4806 | | - |
|---|
| 4807 | 5063 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); |
|---|
| 4808 | 5064 | |
|---|
| 4809 | 5065 | /* If window is closed, drop tail of packet. But after |
|---|
| .. | .. |
|---|
| 4897 | 5153 | /* The first skb to collapse is: |
|---|
| 4898 | 5154 | * - not SYN/FIN and |
|---|
| 4899 | 5155 | * - bloated or contains data before "start" or |
|---|
| 4900 | | - * overlaps to the next one. |
|---|
| 5156 | + * overlaps with the next one and MPTCP allows collapsing.
|---|
| 4901 | 5157 | */ |
|---|
| 4902 | 5158 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
|---|
| 4903 | 5159 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
|---|
| .. | .. |
|---|
| 4906 | 5162 | break; |
|---|
| 4907 | 5163 | } |
|---|
| 4908 | 5164 | |
|---|
| 4909 | | - if (n && n != tail && |
|---|
| 5165 | + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && |
|---|
| 4910 | 5166 | TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { |
|---|
| 4911 | 5167 | end_of_skbs = false; |
|---|
| 4912 | 5168 | break; |
|---|
| .. | .. |
|---|
| 4939 | 5195 | else |
|---|
| 4940 | 5196 | __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ |
|---|
| 4941 | 5197 | skb_set_owner_r(nskb, sk); |
|---|
| 5198 | + mptcp_skb_ext_move(nskb, skb); |
|---|
| 4942 | 5199 | |
|---|
| 4943 | 5200 | /* Copy data, releasing collapsed skbs. */ |
|---|
| 4944 | 5201 | while (copy > 0) { |
|---|
| .. | .. |
|---|
| 4958 | 5215 | skb = tcp_collapse_one(sk, skb, list, root); |
|---|
| 4959 | 5216 | if (!skb || |
|---|
| 4960 | 5217 | skb == tail || |
|---|
| 5218 | + !mptcp_skb_can_collapse(nskb, skb) || |
|---|
| 4961 | 5219 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
|---|
| 4962 | 5220 | goto end; |
|---|
| 4963 | 5221 | #ifdef CONFIG_TLS_DEVICE |
|---|
| .. | .. |
|---|
| 5082 | 5340 | { |
|---|
| 5083 | 5341 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5084 | 5342 | |
|---|
| 5085 | | - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); |
|---|
| 5086 | | - |
|---|
| 5087 | 5343 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); |
|---|
| 5088 | 5344 | |
|---|
| 5089 | 5345 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
|---|
| .. | .. |
|---|
| 5149 | 5405 | return true; |
|---|
| 5150 | 5406 | } |
|---|
| 5151 | 5407 | |
|---|
| 5152 | | -/* When incoming ACK allowed to free some skb from write_queue, |
|---|
| 5153 | | - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket |
|---|
| 5154 | | - * on the exit from tcp input handler. |
|---|
| 5155 | | - * |
|---|
| 5156 | | - * PROBLEM: sndbuf expansion does not work well with largesend. |
|---|
| 5157 | | - */ |
|---|
| 5158 | 5408 | static void tcp_new_space(struct sock *sk) |
|---|
| 5159 | 5409 | { |
|---|
| 5160 | 5410 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 5167 | 5417 | sk->sk_write_space(sk); |
|---|
| 5168 | 5418 | } |
|---|
| 5169 | 5419 | |
|---|
| 5170 | | -static void tcp_check_space(struct sock *sk) |
|---|
| 5420 | +/* Caller made space either from: |
|---|
| 5421 | + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) |
|---|
| 5422 | + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) |
|---|
| 5423 | + * |
|---|
| 5424 | + * We might be able to generate EPOLLOUT to the application if: |
|---|
| 5425 | + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 |
|---|
| 5426 | + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became |
|---|
| 5427 | + * small enough that tcp_stream_memory_free() decides it |
|---|
| 5428 | + * is time to generate EPOLLOUT. |
|---|
| 5429 | + */ |
|---|
| 5430 | +void tcp_check_space(struct sock *sk) |
|---|
| 5171 | 5431 | { |
|---|
| 5172 | | - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { |
|---|
| 5173 | | - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); |
|---|
| 5174 | | - /* pairs with tcp_poll() */ |
|---|
| 5175 | | - smp_mb(); |
|---|
| 5176 | | - if (sk->sk_socket && |
|---|
| 5177 | | - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
|---|
| 5178 | | - tcp_new_space(sk); |
|---|
| 5179 | | - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
|---|
| 5180 | | - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
|---|
| 5181 | | - } |
|---|
| 5432 | + /* pairs with tcp_poll() */ |
|---|
| 5433 | + smp_mb(); |
|---|
| 5434 | + if (sk->sk_socket && |
|---|
| 5435 | + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
|---|
| 5436 | + tcp_new_space(sk); |
|---|
| 5437 | + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
|---|
| 5438 | + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
|---|
| 5182 | 5439 | } |
|---|
| 5183 | 5440 | } |
|---|
| 5184 | 5441 | |
|---|
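The rewritten tcp_check_space() comment lists the two conditions under which a blocked writer may get EPOLLOUT. A simplified model of that check, assuming the "stream memory free" part reduces to a not-sent-bytes threshold (the real tcp_stream_memory_free() is more involved):

```c
/* Simplified model of the writer-wakeup condition described above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_wake_writer(uint32_t wmem_queued, uint32_t sndbuf,
			       uint32_t write_seq, uint32_t snd_nxt,
			       uint32_t notsent_lowat)
{
	uint32_t notsent = write_seq - snd_nxt;	/* queued but never sent */

	if (wmem_queued >= sndbuf / 2)		/* output+rtx queues still big */
		return false;
	return notsent < notsent_lowat;		/* room for more user data */
}

int main(void)
{
	/* 16 KB queued of a 64 KB sndbuf, 4 KB not yet sent, lowat 16 KB */
	printf("%d\n", should_wake_writer(16384, 65536, 104096, 100000, 16384));
	/* more than half the sndbuf still consumed: no wakeup yet */
	printf("%d\n", should_wake_writer(40000, 65536, 104096, 100000, 16384));
	return 0;
}
```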
| .. | .. |
|---|
| 5220 | 5477 | } |
|---|
| 5221 | 5478 | |
|---|
| 5222 | 5479 | if (!tcp_is_sack(tp) || |
|---|
| 5223 | | - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) |
|---|
| 5480 | + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) |
|---|
| 5224 | 5481 | goto send_now; |
|---|
| 5225 | 5482 | |
|---|
| 5226 | 5483 | if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { |
|---|
| 5227 | 5484 | tp->compressed_ack_rcv_nxt = tp->rcv_nxt; |
|---|
| 5228 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
|---|
| 5229 | | - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
|---|
| 5230 | | - tp->compressed_ack - TCP_FASTRETRANS_THRESH); |
|---|
| 5231 | | - tp->compressed_ack = 0; |
|---|
| 5485 | + tp->dup_ack_counter = 0; |
|---|
| 5232 | 5486 | } |
|---|
| 5233 | | - |
|---|
| 5234 | | - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) |
|---|
| 5487 | + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { |
|---|
| 5488 | + tp->dup_ack_counter++; |
|---|
| 5235 | 5489 | goto send_now; |
|---|
| 5236 | | - |
|---|
| 5490 | + } |
|---|
| 5491 | + tp->compressed_ack++; |
|---|
| 5237 | 5492 | if (hrtimer_is_queued(&tp->compressed_ack_timer)) |
|---|
| 5238 | 5493 | return; |
|---|
| 5239 | 5494 | |
|---|
| .. | .. |
|---|
| 5243 | 5498 | if (tp->srtt_us && tp->srtt_us < rtt) |
|---|
| 5244 | 5499 | rtt = tp->srtt_us; |
|---|
| 5245 | 5500 | |
|---|
| 5246 | | - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, |
|---|
| 5501 | + delay = min_t(unsigned long, |
|---|
| 5502 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), |
|---|
| 5247 | 5503 | rtt * (NSEC_PER_USEC >> 3)/20); |
|---|
| 5248 | 5504 | sock_hold(sk); |
|---|
| 5249 | | - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), |
|---|
| 5250 | | - HRTIMER_MODE_REL_PINNED_SOFT); |
|---|
| 5505 | + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), |
|---|
| 5506 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), |
|---|
| 5507 | + HRTIMER_MODE_REL_PINNED_SOFT); |
|---|
| 5251 | 5508 | } |
|---|
| 5252 | 5509 | |
|---|
| 5253 | 5510 | static inline void tcp_ack_snd_check(struct sock *sk) |
|---|
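With dup_ack_counter split out above, the first TCP_FASTRETRANS_THRESH duplicate ACKs for a given rcv_nxt are still sent immediately, so the peer's fast retransmit is not delayed; only later ones are held behind the compressed-ACK hrtimer, armed for roughly min(sysctl_tcp_comp_sack_delay_ns, 5% of the RTT) with the new slack value. The send-or-compress decision in isolation:

```c
/* Sketch of the "send now or compress" decision for SACK-generated ACKs. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TCP_FASTRETRANS_THRESH 3

struct toy_tp {
	uint32_t rcv_nxt;
	uint32_t compressed_ack_rcv_nxt;
	uint8_t dup_ack_counter;
	uint8_t compressed_ack;
};

static bool ack_send_now(struct toy_tp *tp)
{
	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
		tp->compressed_ack_rcv_nxt = tp->rcv_nxt;	/* new hole */
		tp->dup_ack_counter = 0;
	}
	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
		tp->dup_ack_counter++;
		return true;		/* send immediately */
	}
	tp->compressed_ack++;		/* defer; a timer would be armed here */
	return false;
}

int main(void)
{
	struct toy_tp tp = { .rcv_nxt = 5000 };

	for (int i = 0; i < 6; i++)
		printf("dupack %d: %s\n", i + 1,
		       ack_send_now(&tp) ? "sent" : "compressed");
	return 0;
}
```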
| .. | .. |
|---|
| 5274 | 5531 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5275 | 5532 | u32 ptr = ntohs(th->urg_ptr); |
|---|
| 5276 | 5533 | |
|---|
| 5277 | | - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
|---|
| 5534 | + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) |
|---|
| 5278 | 5535 | ptr--; |
|---|
| 5279 | 5536 | ptr += ntohl(th->seq); |
|---|
| 5280 | 5537 | |
|---|
| .. | .. |
|---|
| 5328 | 5585 | } |
|---|
| 5329 | 5586 | |
|---|
| 5330 | 5587 | tp->urg_data = TCP_URG_NOTYET; |
|---|
| 5331 | | - tp->urg_seq = ptr; |
|---|
| 5588 | + WRITE_ONCE(tp->urg_seq, ptr); |
|---|
| 5332 | 5589 | |
|---|
| 5333 | 5590 | /* Disable header prediction. */ |
|---|
| 5334 | 5591 | tp->pred_flags = 0; |
|---|
| .. | .. |
|---|
| 5481 | 5738 | goto discard; |
|---|
| 5482 | 5739 | } |
|---|
| 5483 | 5740 | |
|---|
| 5741 | + bpf_skops_parse_hdr(sk, skb); |
|---|
| 5742 | + |
|---|
| 5484 | 5743 | return true; |
|---|
| 5485 | 5744 | |
|---|
| 5486 | 5745 | discard: |
|---|
| .. | .. |
|---|
| 5521 | 5780 | trace_tcp_probe(sk, skb); |
|---|
| 5522 | 5781 | |
|---|
| 5523 | 5782 | tcp_mstamp_refresh(tp); |
|---|
| 5524 | | - if (unlikely(!sk->sk_rx_dst)) |
|---|
| 5783 | + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) |
|---|
| 5525 | 5784 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); |
|---|
| 5526 | 5785 | /* |
|---|
| 5527 | 5786 | * Header prediction. |
|---|
| .. | .. |
|---|
| 5628 | 5887 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); |
|---|
| 5629 | 5888 | |
|---|
| 5630 | 5889 | /* Bulk data transfer: receiver */ |
|---|
| 5631 | | - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, |
|---|
| 5632 | | - &fragstolen); |
|---|
| 5890 | + __skb_pull(skb, tcp_header_len); |
|---|
| 5891 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
|---|
| 5633 | 5892 | |
|---|
| 5634 | 5893 | tcp_event_data_recv(sk, skb); |
|---|
| 5635 | 5894 | |
|---|
| .. | .. |
|---|
| 5691 | 5950 | } |
|---|
| 5692 | 5951 | EXPORT_SYMBOL(tcp_rcv_established); |
|---|
| 5693 | 5952 | |
|---|
| 5953 | +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) |
|---|
| 5954 | +{ |
|---|
| 5955 | + struct inet_connection_sock *icsk = inet_csk(sk); |
|---|
| 5956 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 5957 | + |
|---|
| 5958 | + tcp_mtup_init(sk); |
|---|
| 5959 | + icsk->icsk_af_ops->rebuild_header(sk); |
|---|
| 5960 | + tcp_init_metrics(sk); |
|---|
| 5961 | + |
|---|
| 5962 | + /* Initialize the congestion window to start the transfer. |
|---|
| 5963 | + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
|---|
| 5964 | + * retransmitted. In light of RFC6298 more aggressive 1sec |
|---|
| 5965 | + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
|---|
| 5966 | + * retransmission has occurred. |
|---|
| 5967 | + */ |
|---|
| 5968 | + if (tp->total_retrans > 1 && tp->undo_marker) |
|---|
| 5969 | + tp->snd_cwnd = 1; |
|---|
| 5970 | + else |
|---|
| 5971 | + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); |
|---|
| 5972 | + tp->snd_cwnd_stamp = tcp_jiffies32; |
|---|
| 5973 | + |
|---|
| 5974 | + bpf_skops_established(sk, bpf_op, skb); |
|---|
| 5975 | + /* Initialize congestion control unless BPF initialized it already: */ |
|---|
| 5976 | + if (!icsk->icsk_ca_initialized) |
|---|
| 5977 | + tcp_init_congestion_control(sk); |
|---|
| 5978 | + tcp_init_buffer_space(sk); |
|---|
| 5979 | +} |
|---|
| 5980 | + |
|---|
| 5694 | 5981 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) |
|---|
| 5695 | 5982 | { |
|---|
| 5696 | 5983 | struct tcp_sock *tp = tcp_sk(sk); |
|---|
| .. | .. |
|---|
| 5705 | 5992 | sk_mark_napi_id(sk, skb); |
|---|
| 5706 | 5993 | } |
|---|
| 5707 | 5994 | |
|---|
| 5708 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); |
|---|
| 5995 | + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); |
|---|
| 5709 | 5996 | |
|---|
| 5710 | 5997 | /* Prevent spurious tcp_cwnd_restart() on first data |
|---|
| 5711 | 5998 | * packet. |
|---|
| .. | .. |
|---|
| 5760 | 6047 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
|---|
| 5761 | 6048 | |
|---|
| 5762 | 6049 | if (data) { /* Retransmit unacked data in SYN */ |
|---|
| 6050 | + if (tp->total_retrans) |
|---|
| 6051 | + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; |
|---|
| 6052 | + else |
|---|
| 6053 | + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; |
|---|
| 5763 | 6054 | skb_rbtree_walk_from(data) { |
|---|
| 5764 | 6055 | if (__tcp_retransmit_skb(sk, data, 1)) |
|---|
| 5765 | 6056 | break; |
|---|
| .. | .. |
|---|
| 5792 | 6083 | #endif |
|---|
| 5793 | 6084 | } |
|---|
| 5794 | 6085 | |
|---|
| 6086 | +static void tcp_try_undo_spurious_syn(struct sock *sk) |
|---|
| 6087 | +{ |
|---|
| 6088 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 6089 | + u32 syn_stamp; |
|---|
| 6090 | + |
|---|
| 6091 | + /* undo_marker is set when SYN or SYNACK times out. The timeout is |
|---|
| 6092 | + * spurious if the ACK's timestamp option echo value matches the |
|---|
| 6093 | + * original SYN timestamp. |
|---|
| 6094 | + */ |
|---|
| 6095 | + syn_stamp = tp->retrans_stamp; |
|---|
| 6096 | + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && |
|---|
| 6097 | + syn_stamp == tp->rx_opt.rcv_tsecr) |
|---|
| 6098 | + tp->undo_marker = 0; |
|---|
| 6099 | +} |
|---|
| 6100 | + |
|---|
| 5795 | 6101 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
|---|
| 5796 | 6102 | const struct tcphdr *th) |
|---|
| 5797 | 6103 | { |
|---|
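tcp_try_undo_spurious_syn() clears undo_marker when the peer's timestamp echo matches the timestamp carried in the original SYN, i.e. the SYN (or SYNACK) retransmission timeout turned out to be spurious and the congestion state need not stay reduced. The test in isolation:

```c
/* Sketch of the spurious-SYN-timeout test: the peer echoing exactly the
 * original SYN's timestamp means the retransmission timeout was spurious.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool syn_timeout_was_spurious(uint32_t undo_marker, uint32_t syn_tsval,
				     bool saw_tstamp, uint32_t echoed_tsecr)
{
	return undo_marker && syn_tsval && saw_tstamp &&
	       echoed_tsecr == syn_tsval;
}

int main(void)
{
	/* peer echoes the original SYN's timestamp: timeout was spurious */
	printf("%d\n", syn_timeout_was_spurious(1, 123456, true, 123456));
	/* peer echoes a later (retransmitted) timestamp: real loss */
	printf("%d\n", syn_timeout_was_spurious(1, 123456, true, 123999));
	return 0;
}
```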
| .. | .. |
|---|
| 5815 | 6121 | * the segment and return)" |
|---|
| 5816 | 6122 | */ |
|---|
| 5817 | 6123 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
|---|
| 5818 | | - after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) |
|---|
| 6124 | + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
|---|
| 6125 | + /* Previous FIN/ACK or RST/ACK might be ignored. */ |
|---|
| 6126 | + if (icsk->icsk_retransmits == 0) |
|---|
| 6127 | + inet_csk_reset_xmit_timer(sk, |
|---|
| 6128 | + ICSK_TIME_RETRANS, |
|---|
| 6129 | + TCP_TIMEOUT_MIN, TCP_RTO_MAX); |
|---|
| 5819 | 6130 | goto reset_and_undo; |
|---|
| 6131 | + } |
|---|
| 5820 | 6132 | |
|---|
| 5821 | 6133 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
|---|
| 5822 | 6134 | !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, |
|---|
| .. | .. |
|---|
| 5859 | 6171 | tcp_ecn_rcv_synack(tp, th); |
|---|
| 5860 | 6172 | |
|---|
| 5861 | 6173 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
|---|
| 6174 | + tcp_try_undo_spurious_syn(sk); |
|---|
| 5862 | 6175 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
|---|
| 5863 | 6176 | |
|---|
| 5864 | 6177 | /* Ok.. it's good. Set up sequence numbers and |
|---|
| .. | .. |
|---|
| 5912 | 6225 | return -1; |
|---|
| 5913 | 6226 | if (sk->sk_write_pending || |
|---|
| 5914 | 6227 | icsk->icsk_accept_queue.rskq_defer_accept || |
|---|
| 5915 | | - icsk->icsk_ack.pingpong) { |
|---|
| 6228 | + inet_csk_in_pingpong_mode(sk)) { |
|---|
| 5916 | 6229 | /* Save one ACK. Data will be ready after |
|---|
| 5917 | 6230 | * several ticks, if write_pending is set. |
|---|
| 5918 | 6231 | * |
|---|
| .. | .. |
|---|
| 6017 | 6330 | return 1; |
|---|
| 6018 | 6331 | } |
|---|
| 6019 | 6332 | |
|---|
| 6333 | +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) |
|---|
| 6334 | +{ |
|---|
| 6335 | + struct request_sock *req; |
|---|
| 6336 | + |
|---|
| 6337 | + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows |
|---|
| 6338 | + * undo. If peer SACKs triggered fast recovery, we can't undo here. |
|---|
| 6339 | + */ |
|---|
| 6340 | + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
|---|
| 6341 | + tcp_try_undo_loss(sk, false); |
|---|
| 6342 | + |
|---|
| 6343 | + /* Reset rtx states to prevent spurious retransmits_timed_out() */ |
|---|
| 6344 | + tcp_sk(sk)->retrans_stamp = 0; |
|---|
| 6345 | + inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 6346 | + |
|---|
| 6347 | + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, |
|---|
| 6348 | + * we no longer need req so release it. |
|---|
| 6349 | + */ |
|---|
| 6350 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
|---|
| 6351 | + lockdep_sock_is_held(sk)); |
|---|
| 6352 | + reqsk_fastopen_remove(sk, req, false); |
|---|
| 6353 | + |
|---|
| 6354 | + /* Re-arm the timer because data may have been sent out. |
|---|
| 6355 | + * This is similar to the regular data transmission case |
|---|
| 6356 | + * when new data has just been ack'ed. |
|---|
| 6357 | + * |
|---|
| 6358 | + * (TFO) - we could try to be more aggressive and |
|---|
| 6359 | + * retransmit any data sooner based on when they
|---|
| 6360 | + * are sent out. |
|---|
| 6361 | + */ |
|---|
| 6362 | + tcp_rearm_rto(sk); |
|---|
| 6363 | +} |
|---|
| 6364 | + |
|---|
| 6020 | 6365 | /* |
|---|
| 6021 | 6366 | * This function implements the receiving procedure of RFC 793 for |
|---|
| 6022 | 6367 | * all states except ESTABLISHED and TIME_WAIT. |
|---|
| .. | .. |
|---|
| 6079 | 6424 | |
|---|
| 6080 | 6425 | tcp_mstamp_refresh(tp); |
|---|
| 6081 | 6426 | tp->rx_opt.saw_tstamp = 0; |
|---|
| 6082 | | - req = tp->fastopen_rsk; |
|---|
| 6427 | + req = rcu_dereference_protected(tp->fastopen_rsk, |
|---|
| 6428 | + lockdep_sock_is_held(sk)); |
|---|
| 6083 | 6429 | if (req) { |
|---|
| 6084 | 6430 | bool req_stolen; |
|---|
| 6085 | 6431 | |
|---|
| .. | .. |
|---|
| 6113 | 6459 | if (!tp->srtt_us) |
|---|
| 6114 | 6460 | tcp_synack_rtt_meas(sk, req); |
|---|
| 6115 | 6461 | |
|---|
| 6116 | | - /* Once we leave TCP_SYN_RECV, we no longer need req |
|---|
| 6117 | | - * so release it. |
|---|
| 6118 | | - */ |
|---|
| 6119 | 6462 | if (req) { |
|---|
| 6120 | | - inet_csk(sk)->icsk_retransmits = 0; |
|---|
| 6121 | | - reqsk_fastopen_remove(sk, req, false); |
|---|
| 6122 | | - /* Re-arm the timer because data may have been sent out. |
|---|
| 6123 | | - * This is similar to the regular data transmission case |
|---|
| 6124 | | - * when new data has just been ack'ed. |
|---|
| 6125 | | - * |
|---|
| 6126 | | - * (TFO) - we could try to be more aggressive and |
|---|
| 6127 | | - * retransmitting any data sooner based on when they |
|---|
| 6128 | | - * are sent out. |
|---|
| 6129 | | - */ |
|---|
| 6130 | | - tcp_rearm_rto(sk); |
|---|
| 6463 | + tcp_rcv_synrecv_state_fastopen(sk); |
|---|
| 6131 | 6464 | } else { |
|---|
| 6132 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); |
|---|
| 6465 | + tcp_try_undo_spurious_syn(sk); |
|---|
| 6466 | + tp->retrans_stamp = 0; |
|---|
| 6467 | + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, |
|---|
| 6468 | + skb); |
|---|
| 6133 | 6469 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
|---|
| 6134 | 6470 | } |
|---|
| 6135 | 6471 | smp_mb(); |
|---|
| .. | .. |
|---|
| 6163 | 6499 | case TCP_FIN_WAIT1: { |
|---|
| 6164 | 6500 | int tmo; |
|---|
| 6165 | 6501 | |
|---|
| 6166 | | - /* If we enter the TCP_FIN_WAIT1 state and we are a |
|---|
| 6167 | | - * Fast Open socket and this is the first acceptable |
|---|
| 6168 | | - * ACK we have received, this would have acknowledged |
|---|
| 6169 | | - * our SYNACK so stop the SYNACK timer. |
|---|
| 6170 | | - */ |
|---|
| 6171 | | - if (req) { |
|---|
| 6172 | | - /* We no longer need the request sock. */ |
|---|
| 6173 | | - reqsk_fastopen_remove(sk, req, false); |
|---|
| 6174 | | - tcp_rearm_rto(sk); |
|---|
| 6175 | | - } |
|---|
| 6502 | + if (req) |
|---|
| 6503 | + tcp_rcv_synrecv_state_fastopen(sk); |
|---|
| 6504 | + |
|---|
| 6176 | 6505 | if (tp->snd_una != tp->write_seq) |
|---|
| 6177 | 6506 | break; |
|---|
| 6178 | 6507 | |
|---|
| .. | .. |
|---|
| 6244 | 6573 | case TCP_CLOSE_WAIT: |
|---|
| 6245 | 6574 | case TCP_CLOSING: |
|---|
| 6246 | 6575 | case TCP_LAST_ACK: |
|---|
| 6247 | | - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
|---|
| 6576 | + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
|---|
| 6577 | + if (sk_is_mptcp(sk)) |
|---|
| 6578 | + mptcp_incoming_options(sk, skb); |
|---|
| 6248 | 6579 | break; |
|---|
| 6249 | | - /* fall through */ |
|---|
| 6580 | + } |
|---|
| 6581 | + fallthrough; |
|---|
| 6250 | 6582 | case TCP_FIN_WAIT1: |
|---|
| 6251 | 6583 | case TCP_FIN_WAIT2: |
|---|
| 6252 | 6584 | /* RFC 793 says to queue data in these states, |
|---|
| .. | .. |
|---|
| 6261 | 6593 | return 1; |
|---|
| 6262 | 6594 | } |
|---|
| 6263 | 6595 | } |
|---|
| 6264 | | - /* Fall through */ |
|---|
| 6596 | + fallthrough; |
|---|
| 6265 | 6597 | case TCP_ESTABLISHED: |
|---|
| 6266 | 6598 | tcp_data_queue(sk, skb); |
|---|
| 6267 | 6599 | queued = 1; |
|---|
| .. | .. |
|---|
| 6307 | 6639 | * congestion control: Linux DCTCP asserts ECT on all packets, |
|---|
| 6308 | 6640 | * including SYN, which is most optimal solution; however, |
|---|
| 6309 | 6641 | * others, such as FreeBSD do not. |
|---|
| 6642 | + * |
|---|
| 6643 | + * Exception: At least one of the reserved bits of the TCP header (th->res1) is |
|---|
| 6644 | + * set, indicating the use of a future TCP extension (such as AccECN). See |
|---|
| 6645 | + * RFC8311 §4.3 which updates RFC3168 to allow the development of such
|---|
| 6646 | + * extensions. |
|---|
| 6310 | 6647 | */ |
|---|
| 6311 | 6648 | static void tcp_ecn_create_request(struct request_sock *req, |
|---|
| 6312 | 6649 | const struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 6326 | 6663 | ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); |
|---|
| 6327 | 6664 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
|---|
| 6328 | 6665 | |
|---|
| 6329 | | - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
|---|
| 6666 | + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
|---|
| 6330 | 6667 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
|---|
| 6331 | 6668 | tcp_bpf_ca_needs_ecn((struct sock *)req)) |
|---|
| 6332 | 6669 | inet_rsk(req)->ecn_ok = 1; |
|---|
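The relaxed check above also accepts a SYN with any reserved TCP header bit set (th->res1), so AccECN-style handshakes still negotiate ECN where RFC 3168 style SYNs do. A simplified predicate (the route feature and BPF congestion-control hooks are folded into one flag here):

```c
/* Sketch of the relaxed ECN-setup test for incoming SYNs. */
#include <stdbool.h>
#include <stdio.h>

static bool ecn_ok_for_req(bool ect_on_syn, bool res1_bits_set,
			   bool ecn_allowed, bool ca_needs_ecn)
{
	if (ca_needs_ecn)			/* e.g. DCTCP on the listener */
		return true;
	return (!ect_on_syn || res1_bits_set) && ecn_allowed;
}

int main(void)
{
	printf("%d\n", ecn_ok_for_req(false, false, true, false)); /* RFC 3168 SYN: 1 */
	printf("%d\n", ecn_ok_for_req(true, true, true, false));   /* AccECN-style: 1 */
	printf("%d\n", ecn_ok_for_req(true, false, true, false));  /* ECT on plain SYN: 0 */
	return 0;
}
```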
| .. | .. |
|---|
| 6339 | 6676 | struct inet_request_sock *ireq = inet_rsk(req); |
|---|
| 6340 | 6677 | |
|---|
| 6341 | 6678 | req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ |
|---|
| 6342 | | - req->cookie_ts = 0; |
|---|
| 6343 | 6679 | tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; |
|---|
| 6344 | 6680 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; |
|---|
| 6345 | | - tcp_rsk(req)->snt_synack = tcp_clock_us(); |
|---|
| 6681 | + tcp_rsk(req)->snt_synack = 0; |
|---|
| 6346 | 6682 | tcp_rsk(req)->last_oow_ack_time = 0; |
|---|
| 6347 | 6683 | req->mss = rx_opt->mss_clamp; |
|---|
| 6348 | 6684 | req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; |
|---|
| .. | .. |
|---|
| 6387 | 6723 | /* |
|---|
| 6388 | 6724 | * Return true if a syncookie should be sent |
|---|
| 6389 | 6725 | */ |
|---|
| 6390 | | -static bool tcp_syn_flood_action(const struct sock *sk, |
|---|
| 6391 | | - const struct sk_buff *skb, |
|---|
| 6392 | | - const char *proto) |
|---|
| 6726 | +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) |
|---|
| 6393 | 6727 | { |
|---|
| 6394 | 6728 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; |
|---|
| 6395 | 6729 | const char *msg = "Dropping request"; |
|---|
| 6396 | | - bool want_cookie = false; |
|---|
| 6397 | 6730 | struct net *net = sock_net(sk); |
|---|
| 6731 | + bool want_cookie = false; |
|---|
| 6732 | + u8 syncookies; |
|---|
| 6733 | + |
|---|
| 6734 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
|---|
| 6398 | 6735 | |
|---|
| 6399 | 6736 | #ifdef CONFIG_SYN_COOKIES |
|---|
| 6400 | | - if (net->ipv4.sysctl_tcp_syncookies) { |
|---|
| 6737 | + if (syncookies) { |
|---|
| 6401 | 6738 | msg = "Sending cookies"; |
|---|
| 6402 | 6739 | want_cookie = true; |
|---|
| 6403 | 6740 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); |
|---|
| .. | .. |
|---|
| 6405 | 6742 | #endif |
|---|
| 6406 | 6743 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
|---|
| 6407 | 6744 | |
|---|
| 6408 | | - if (!queue->synflood_warned && |
|---|
| 6409 | | - net->ipv4.sysctl_tcp_syncookies != 2 && |
|---|
| 6745 | + if (!queue->synflood_warned && syncookies != 2 && |
|---|
| 6410 | 6746 | xchg(&queue->synflood_warned, 1) == 0) |
|---|
| 6411 | 6747 | net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
|---|
| 6412 | | - proto, ntohs(tcp_hdr(skb)->dest), msg); |
|---|
| 6748 | + proto, sk->sk_num, msg); |
|---|
| 6413 | 6749 | |
|---|
| 6414 | 6750 | return want_cookie; |
|---|
| 6415 | 6751 | } |
|---|
| .. | .. |
|---|
| 6420 | 6756 | { |
|---|
| 6421 | 6757 | if (tcp_sk(sk)->save_syn) { |
|---|
| 6422 | 6758 | u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); |
|---|
| 6423 | | - u32 *copy; |
|---|
| 6759 | + struct saved_syn *saved_syn; |
|---|
| 6760 | + u32 mac_hdrlen; |
|---|
| 6761 | + void *base; |
|---|
| 6424 | 6762 | |
|---|
| 6425 | | - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); |
|---|
| 6426 | | - if (copy) { |
|---|
| 6427 | | - copy[0] = len; |
|---|
| 6428 | | - memcpy(&copy[1], skb_network_header(skb), len);
|---|
| 6429 | | - req->saved_syn = copy; |
|---|
| 6763 | + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ |
|---|
| 6764 | + base = skb_mac_header(skb); |
|---|
| 6765 | + mac_hdrlen = skb_mac_header_len(skb); |
|---|
| 6766 | + len += mac_hdrlen; |
|---|
| 6767 | + } else { |
|---|
| 6768 | + base = skb_network_header(skb); |
|---|
| 6769 | + mac_hdrlen = 0; |
|---|
| 6770 | + } |
|---|
| 6771 | + |
|---|
| 6772 | + saved_syn = kmalloc(struct_size(saved_syn, data, len), |
|---|
| 6773 | + GFP_ATOMIC); |
|---|
| 6774 | + if (saved_syn) { |
|---|
| 6775 | + saved_syn->mac_hdrlen = mac_hdrlen; |
|---|
| 6776 | + saved_syn->network_hdrlen = skb_network_header_len(skb); |
|---|
| 6777 | + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); |
|---|
| 6778 | + memcpy(saved_syn->data, base, len); |
|---|
| 6779 | + req->saved_syn = saved_syn; |
|---|
| 6430 | 6780 | } |
|---|
| 6431 | 6781 | } |
|---|
| 6432 | 6782 | } |
|---|
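The reworked SYN snapshot above stores the individual header lengths next to a single flexible data array and sizes the allocation in one overflow-safe step with struct_size(). A userspace analogue of that layout (plain malloc() here; the struct and field names are illustrative, not the kernel's struct saved_syn):

```c
/* Userspace analogue of a variable-length header snapshot. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct syn_snapshot {
	uint32_t mac_hdrlen;
	uint32_t network_hdrlen;
	uint32_t tcp_hdrlen;
	uint8_t data[];			/* mac + network + tcp headers */
};

static struct syn_snapshot *snapshot_syn(const uint8_t *base, uint32_t mac_len,
					 uint32_t net_len, uint32_t tcp_len)
{
	uint32_t len = mac_len + net_len + tcp_len;
	struct syn_snapshot *s = malloc(sizeof(*s) + len);

	if (!s)
		return NULL;
	s->mac_hdrlen = mac_len;
	s->network_hdrlen = net_len;
	s->tcp_hdrlen = tcp_len;
	memcpy(s->data, base, len);
	return s;
}

int main(void)
{
	uint8_t fake_headers[14 + 20 + 32] = { 0 };
	struct syn_snapshot *s = snapshot_syn(fake_headers, 14, 20, 32);

	if (s) {
		printf("saved %u bytes of headers\n",
		       s->mac_hdrlen + s->network_hdrlen + s->tcp_hdrlen);
		free(s);
	}
	return 0;
}
```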
| 6783 | + |
|---|
| 6784 | +/* If a SYN cookie is required and supported, returns a clamped MSS value to be |
|---|
| 6785 | + * used for SYN cookie generation. |
|---|
| 6786 | + */ |
|---|
| 6787 | +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, |
|---|
| 6788 | + const struct tcp_request_sock_ops *af_ops, |
|---|
| 6789 | + struct sock *sk, struct tcphdr *th) |
|---|
| 6790 | +{ |
|---|
| 6791 | + struct tcp_sock *tp = tcp_sk(sk); |
|---|
| 6792 | + u16 mss; |
|---|
| 6793 | + |
|---|
| 6794 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && |
|---|
| 6795 | + !inet_csk_reqsk_queue_is_full(sk)) |
|---|
| 6796 | + return 0; |
|---|
| 6797 | + |
|---|
| 6798 | + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) |
|---|
| 6799 | + return 0; |
|---|
| 6800 | + |
|---|
| 6801 | + if (sk_acceptq_is_full(sk)) { |
|---|
| 6802 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
|---|
| 6803 | + return 0; |
|---|
| 6804 | + } |
|---|
| 6805 | + |
|---|
| 6806 | + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); |
|---|
| 6807 | + if (!mss) |
|---|
| 6808 | + mss = af_ops->mss_clamp; |
|---|
| 6809 | + |
|---|
| 6810 | + return mss; |
|---|
| 6811 | +} |
|---|
| 6812 | +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); |
|---|
| 6433 | 6813 | |
|---|
| 6434 | 6814 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
|---|
| 6435 | 6815 | const struct tcp_request_sock_ops *af_ops, |
|---|
| .. | .. |
|---|
| 6445 | 6825 | bool want_cookie = false; |
|---|
| 6446 | 6826 | struct dst_entry *dst; |
|---|
| 6447 | 6827 | struct flowi fl; |
|---|
| 6828 | + u8 syncookies; |
|---|
| 6829 | + |
|---|
| 6830 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
|---|
| 6448 | 6831 | |
|---|
| 6449 | 6832 | /* TW buckets are converted to open requests without |
|---|
| 6450 | 6833 | * limitations, they conserve resources and peer is |
|---|
| 6451 | 6834 | * evidently real one. |
|---|
| 6452 | 6835 | */ |
|---|
| 6453 | | - if ((net->ipv4.sysctl_tcp_syncookies == 2 || |
|---|
| 6454 | | - inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
|---|
| 6455 | | - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); |
|---|
| 6836 | + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
|---|
| 6837 | + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); |
|---|
| 6456 | 6838 | if (!want_cookie) |
|---|
| 6457 | 6839 | goto drop; |
|---|
| 6458 | 6840 | } |
|---|
| .. | .. |
|---|
| 6466 | 6848 | if (!req) |
|---|
| 6467 | 6849 | goto drop; |
|---|
| 6468 | 6850 | |
|---|
| 6851 | + req->syncookie = want_cookie; |
|---|
| 6469 | 6852 | tcp_rsk(req)->af_specific = af_ops; |
|---|
| 6470 | 6853 | tcp_rsk(req)->ts_off = 0; |
|---|
| 6854 | +#if IS_ENABLED(CONFIG_MPTCP) |
|---|
| 6855 | + tcp_rsk(req)->is_mptcp = 0; |
|---|
| 6856 | +#endif |
|---|
| 6471 | 6857 | |
|---|
| 6472 | 6858 | tcp_clear_options(&tmp_opt); |
|---|
| 6473 | 6859 | tmp_opt.mss_clamp = af_ops->mss_clamp; |
|---|
| .. | .. |
|---|
| 6501 | 6887 | goto drop_and_free; |
|---|
| 6502 | 6888 | |
|---|
| 6503 | 6889 | if (!want_cookie && !isn) { |
|---|
| 6890 | + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); |
|---|
| 6891 | + |
|---|
| 6504 | 6892 | /* Kill the following clause, if you dislike this way. */ |
|---|
| 6505 | | - if (!net->ipv4.sysctl_tcp_syncookies && |
|---|
| 6506 | | - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
|---|
| 6507 | | - (net->ipv4.sysctl_max_syn_backlog >> 2)) && |
|---|
| 6893 | + if (!syncookies && |
|---|
| 6894 | + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
|---|
| 6895 | + (max_syn_backlog >> 2)) && |
|---|
| 6508 | 6896 | !tcp_peer_is_proven(req, dst)) { |
|---|
| 6509 | 6897 | /* Without syncookies last quarter of |
|---|
| 6510 | 6898 | * backlog is filled with destinations, |
|---|
| .. | .. |
|---|
| 6525 | 6913 | |
|---|
| 6526 | 6914 | if (want_cookie) { |
|---|
| 6527 | 6915 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
|---|
| 6528 | | - req->cookie_ts = tmp_opt.tstamp_ok; |
|---|
| 6529 | 6916 | if (!tmp_opt.tstamp_ok) |
|---|
| 6530 | 6917 | inet_rsk(req)->ecn_ok = 0; |
|---|
| 6531 | 6918 | } |
|---|
| 6532 | 6919 | |
|---|
| 6533 | 6920 | tcp_rsk(req)->snt_isn = isn; |
|---|
| 6534 | 6921 | tcp_rsk(req)->txhash = net_tx_rndhash(); |
|---|
| 6922 | + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
|---|
| 6535 | 6923 | tcp_openreq_init_rwin(req, sk, dst); |
|---|
| 6536 | 6924 | sk_rx_queue_set(req_to_sk(req), skb); |
|---|
| 6537 | 6925 | if (!want_cookie) { |
|---|
| .. | .. |
|---|
| 6540 | 6928 | } |
|---|
| 6541 | 6929 | if (fastopen_sk) { |
|---|
| 6542 | 6930 | af_ops->send_synack(fastopen_sk, dst, &fl, req, |
|---|
| 6543 | | - &foc, TCP_SYNACK_FASTOPEN); |
|---|
| 6931 | + &foc, TCP_SYNACK_FASTOPEN, skb); |
|---|
| 6544 | 6932 | /* Add the child socket directly into the accept queue */ |
|---|
| 6545 | 6933 | if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { |
|---|
| 6546 | 6934 | reqsk_fastopen_remove(fastopen_sk, req, false); |
|---|
| 6547 | 6935 | bh_unlock_sock(fastopen_sk); |
|---|
| 6548 | 6936 | sock_put(fastopen_sk); |
|---|
| 6549 | | - reqsk_put(req); |
|---|
| 6550 | | - goto drop; |
|---|
| 6937 | + goto drop_and_free; |
|---|
| 6551 | 6938 | } |
|---|
| 6552 | 6939 | sk->sk_data_ready(sk); |
|---|
| 6553 | 6940 | bh_unlock_sock(fastopen_sk); |
|---|
| .. | .. |
|---|
| 6559 | 6946 | tcp_timeout_init((struct sock *)req)); |
|---|
| 6560 | 6947 | af_ops->send_synack(sk, dst, &fl, req, &foc, |
|---|
| 6561 | 6948 | !want_cookie ? TCP_SYNACK_NORMAL : |
|---|
| 6562 | | - TCP_SYNACK_COOKIE); |
|---|
| 6949 | + TCP_SYNACK_COOKIE, |
|---|
| 6950 | + skb); |
|---|
| 6563 | 6951 | if (want_cookie) { |
|---|
| 6564 | 6952 | reqsk_free(req); |
|---|
| 6565 | 6953 | return 0; |
|---|
| .. | .. |
|---|
| 6571 | 6959 | drop_and_release: |
|---|
| 6572 | 6960 | dst_release(dst); |
|---|
| 6573 | 6961 | drop_and_free: |
|---|
| 6574 | | - reqsk_free(req); |
|---|
| 6962 | + __reqsk_free(req); |
|---|
| 6575 | 6963 | drop: |
|---|
| 6576 | 6964 | tcp_listendrop(sk); |
|---|
| 6577 | 6965 | return 0; |
|---|