.. | .. |
---|
77 | 77 | #include <asm/unaligned.h> |
---|
78 | 78 | #include <linux/errqueue.h> |
---|
79 | 79 | #include <trace/events/tcp.h> |
---|
80 | | -#include <linux/static_key.h> |
---|
| 80 | +#include <linux/jump_label_ratelimit.h> |
---|
81 | 81 | #include <net/busy_poll.h> |
---|
| 82 | +#include <net/mptcp.h> |
---|
| 83 | +#include <trace/hooks/net.h> |
---|
82 | 84 | |
---|
83 | 85 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
---|
84 | 86 | |
---|
.. | .. |
---|
113 | 115 | #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ |
---|
114 | 116 | |
---|
115 | 117 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
---|
116 | | -static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); |
---|
| 118 | +static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); |
---|
117 | 119 | |
---|
118 | 120 | void clean_acked_data_enable(struct inet_connection_sock *icsk, |
---|
119 | 121 | void (*cad)(struct sock *sk, u32 ack_seq)) |
---|
120 | 122 | { |
---|
121 | 123 | icsk->icsk_clean_acked = cad; |
---|
122 | | - static_branch_inc(&clean_acked_data_enabled); |
---|
| 124 | + static_branch_deferred_inc(&clean_acked_data_enabled); |
---|
123 | 125 | } |
---|
124 | 126 | EXPORT_SYMBOL_GPL(clean_acked_data_enable); |
---|
125 | 127 | |
---|
126 | 128 | void clean_acked_data_disable(struct inet_connection_sock *icsk) |
---|
127 | 129 | { |
---|
128 | | - static_branch_dec(&clean_acked_data_enabled); |
---|
| 130 | + static_branch_slow_dec_deferred(&clean_acked_data_enabled); |
---|
129 | 131 | icsk->icsk_clean_acked = NULL; |
---|
130 | 132 | } |
---|
131 | 133 | EXPORT_SYMBOL_GPL(clean_acked_data_disable); |
---|
| 134 | + |
---|
| 135 | +void clean_acked_data_flush(void) |
---|
| 136 | +{ |
---|
| 137 | + static_key_deferred_flush(&clean_acked_data_enabled); |
---|
| 138 | +} |
---|
| 139 | +EXPORT_SYMBOL_GPL(clean_acked_data_flush); |
---|
| 140 | +#endif |
---|
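Note: the hunk above swaps the plain static key for a rate-limited ("deferred") one from `<linux/jump_label_ratelimit.h>`. Disabling a static key patches kernel text, which is expensive; the deferred variant holds back the slow-path disable for up to HZ jiffies so repeated enable/disable cycles cannot thrash it, and the new clean_acked_data_flush() forces any pending disable through. A minimal sketch of the same pattern, using a hypothetical feature flag (the my_feature_* names are illustrative, not from the patch):

```c
#include <linux/jump_label_ratelimit.h>

/* Rate-limit slow-path key disables to at most one per HZ jiffies. */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(my_feature_enabled, HZ);

static void my_feature_enable(void)
{
	static_branch_deferred_inc(&my_feature_enabled);	/* takes effect immediately */
}

static void my_feature_disable(void)
{
	static_branch_slow_dec_deferred(&my_feature_enabled);	/* may be deferred */
}

static bool my_feature_active(void)
{
	/* A deferred key embeds a plain key; branch on its .key member. */
	return static_branch_unlikely(&my_feature_enabled.key);
}
```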
| 141 | + |
---|
| 142 | +#ifdef CONFIG_CGROUP_BPF |
---|
| 143 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
---|
| 144 | +{ |
---|
| 145 | + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && |
---|
| 146 | + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
---|
| 147 | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); |
---|
| 148 | + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
---|
| 149 | + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); |
---|
| 150 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 151 | + |
---|
| 152 | + if (likely(!unknown_opt && !parse_all_opt)) |
---|
| 153 | + return; |
---|
| 154 | + |
---|
| 155 | + /* The skb will be handled in the |
---|
| 156 | + * bpf_skops_established() or |
---|
| 157 | + * bpf_skops_write_hdr_opt(). |
---|
| 158 | + */ |
---|
| 159 | + switch (sk->sk_state) { |
---|
| 160 | + case TCP_SYN_RECV: |
---|
| 161 | + case TCP_SYN_SENT: |
---|
| 162 | + case TCP_LISTEN: |
---|
| 163 | + return; |
---|
| 164 | + } |
---|
| 165 | + |
---|
| 166 | + sock_owned_by_me(sk); |
---|
| 167 | + |
---|
| 168 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 169 | + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; |
---|
| 170 | + sock_ops.is_fullsock = 1; |
---|
| 171 | + sock_ops.sk = sk; |
---|
| 172 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
---|
| 173 | + |
---|
| 174 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
---|
| 175 | +} |
---|
| 176 | + |
---|
| 177 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
---|
| 178 | + struct sk_buff *skb) |
---|
| 179 | +{ |
---|
| 180 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 181 | + |
---|
| 182 | + sock_owned_by_me(sk); |
---|
| 183 | + |
---|
| 184 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 185 | + sock_ops.op = bpf_op; |
---|
| 186 | + sock_ops.is_fullsock = 1; |
---|
| 187 | + sock_ops.sk = sk; |
---|
| 188 | + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ |
---|
| 189 | + if (skb) |
---|
| 190 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
---|
| 191 | + |
---|
| 192 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
---|
| 193 | +} |
---|
| 194 | +#else |
---|
| 195 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
---|
| 196 | +{ |
---|
| 197 | +} |
---|
| 198 | + |
---|
| 199 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
---|
| 200 | + struct sk_buff *skb) |
---|
| 201 | +{ |
---|
| 202 | +} |
---|
132 | 203 | #endif |
---|
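Note: bpf_skops_parse_hdr() and bpf_skops_established() hand the skb to a BPF sock_ops program attached to the socket's cgroup, so the program can parse unknown (or, with the ALL flag, every) TCP header option and observe connection establishment. The sketch below shows the rough shape of a program driven by these hooks; the experimental option kind 0xfd and the choice of callbacks are illustrative assumptions, not part of this patch:

```c
/* Sketch of a cgroup sock_ops BPF program (libbpf style). */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int parse_unknown_opt(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_TCP_CONNECT_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Ask the kernel to call back when an unknown option arrives. */
		bpf_sock_ops_cb_flags_set(skops,
				skops->bpf_sock_ops_cb_flags |
				BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: {
		__u8 opt[3] = { 0xfd, 0, 0 };	/* kind, len (0 = any), data */

		/* Copy the option, if present, out of the parsed header. */
		bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
		break;
	}
	}
	return 1;
}

char _license[] SEC("license") = "GPL";
```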
133 | 204 | |
---|
134 | 205 | static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, |
---|
.. | .. |
---|
221 | 292 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
222 | 293 | |
---|
223 | 294 | tcp_incr_quickack(sk, max_quickacks); |
---|
224 | | - icsk->icsk_ack.pingpong = 0; |
---|
| 295 | + inet_csk_exit_pingpong_mode(sk); |
---|
225 | 296 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
---|
226 | 297 | } |
---|
227 | 298 | EXPORT_SYMBOL(tcp_enter_quickack_mode); |
---|
.. | .. |
---|
236 | 307 | const struct dst_entry *dst = __sk_dst_get(sk); |
---|
237 | 308 | |
---|
238 | 309 | return (dst && dst_metric(dst, RTAX_QUICKACK)) || |
---|
239 | | - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); |
---|
| 310 | + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); |
---|
240 | 311 | } |
---|
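Note: the two hunks above stop touching icsk->icsk_ack.pingpong directly and go through the inet_csk_*_pingpong_mode() accessors, giving the delayed-ACK "pingpong" (interactive) bookkeeping a single point of definition. The real definitions live in include/net/inet_connection_sock.h; the sketch below, assuming the counter-versus-threshold scheme used by recent kernels, is illustrative only:

```c
/* Illustrative-only sketch of the accessors assumed by this diff. */
static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}
```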
241 | 312 | |
---|
242 | 313 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
---|
.. | .. |
---|
354 | 425 | sndmem *= nr_segs * per_mss; |
---|
355 | 426 | |
---|
356 | 427 | if (sk->sk_sndbuf < sndmem) |
---|
357 | | - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); |
---|
| 428 | + WRITE_ONCE(sk->sk_sndbuf, |
---|
| 429 | + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); |
---|
358 | 430 | } |
---|
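Note: this hunk is the first of many in the file that wrap sysctl reads in READ_ONCE() and updates of reader-visible socket fields (sk_sndbuf, and later sk_rcvbuf and challenge_timestamp) in WRITE_ONCE(). These values are read locklessly while another CPU may be storing to them, so the annotations prevent load/store tearing and document the intentional data race without changing any logic. Restated outside the diff (kernel context assumed, helper name illustrative):

```c
/* Snapshot a concurrently-writable limit once, then act on the snapshot. */
static void example_sndbuf_expand(struct sock *sk, int sndmem)
{
	int wmem2 = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);

	if (sk->sk_sndbuf < sndmem)
		/* Paired with lockless readers of sk->sk_sndbuf. */
		WRITE_ONCE(sk->sk_sndbuf, min(sndmem, wmem2));
}
```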
359 | 431 | |
---|
360 | 432 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
---|
.. | .. |
---|
383 | 455 | */ |
---|
384 | 456 | |
---|
385 | 457 | /* Slow part of check#2. */ |
---|
386 | | -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) |
---|
| 458 | +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, |
---|
| 459 | + unsigned int skbtruesize) |
---|
387 | 460 | { |
---|
388 | 461 | struct tcp_sock *tp = tcp_sk(sk); |
---|
389 | 462 | /* Optimize this! */ |
---|
390 | | - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; |
---|
391 | | - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
---|
| 463 | + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; |
---|
| 464 | + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; |
---|
392 | 465 | |
---|
393 | 466 | while (tp->rcv_ssthresh <= window) { |
---|
394 | 467 | if (truesize <= skb->len) |
---|
.. | .. |
---|
400 | 473 | return 0; |
---|
401 | 474 | } |
---|
402 | 475 | |
---|
403 | | -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) |
---|
| 476 | +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing |
---|
| 477 | + * can play nice with us, as sk_buff and skb->head might be either |
---|
| 478 | + * freed or shared with up to MAX_SKB_FRAGS segments. |
---|
| 479 | + * Only give a boost to drivers using page frag(s) to hold the frame(s), |
---|
| 480 | + * and if no payload was pulled in skb->head before reaching us. |
---|
| 481 | + */ |
---|
| 482 | +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) |
---|
| 483 | +{ |
---|
| 484 | + u32 truesize = skb->truesize; |
---|
| 485 | + |
---|
| 486 | + if (adjust && !skb_headlen(skb)) { |
---|
| 487 | + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); |
---|
| 488 | + /* paranoid check, some drivers might be buggy */ |
---|
| 489 | + if (unlikely((int)truesize < (int)skb->len)) |
---|
| 490 | + truesize = skb->truesize; |
---|
| 491 | + } |
---|
| 492 | + return truesize; |
---|
| 493 | +} |
---|
| 494 | + |
---|
| 495 | +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, |
---|
| 496 | + bool adjust) |
---|
404 | 497 | { |
---|
405 | 498 | struct tcp_sock *tp = tcp_sk(sk); |
---|
406 | 499 | int room; |
---|
.. | .. |
---|
409 | 502 | |
---|
410 | 503 | /* Check #1 */ |
---|
411 | 504 | if (room > 0 && !tcp_under_memory_pressure(sk)) { |
---|
| 505 | + unsigned int truesize = truesize_adjust(adjust, skb); |
---|
412 | 506 | int incr; |
---|
413 | 507 | |
---|
414 | 508 | /* Check #2. Increase window, if skb with such overhead |
---|
415 | 509 | * will fit to rcvbuf in future. |
---|
416 | 510 | */ |
---|
417 | | - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) |
---|
| 511 | + if (tcp_win_from_space(sk, truesize) <= skb->len) |
---|
418 | 512 | incr = 2 * tp->advmss; |
---|
419 | 513 | else |
---|
420 | | - incr = __tcp_grow_window(sk, skb); |
---|
| 514 | + incr = __tcp_grow_window(sk, skb, truesize); |
---|
421 | 515 | |
---|
422 | 516 | if (incr) { |
---|
423 | 517 | incr = max_t(int, incr, 2 * skb->len); |
---|
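Note: truesize_adjust() only discounts the skb head when the driver delivered the payload purely in page frags (skb_headlen() == 0), since in that case the head area can be freed or shared once the data is coalesced and should not count against the window-growth heuristic. With illustrative numbers:

```c
/*
 * Illustrative values only (real numbers are driver dependent): a
 * 1500-byte frame received entirely in a page fragment, with an unused
 * skb->head.
 *
 *   skb->len       = 1500
 *   skb->truesize  = frag contribution (~2048)
 *                    + SKB_TRUESIZE(skb_end_offset(skb))   (head area)
 *
 * Because skb_headlen(skb) == 0, truesize_adjust(true, skb) drops the
 * head-area term, so tcp_grow_window()/__tcp_grow_window() judge the
 * len/truesize ratio against the ~2048 bytes actually holding payload.
 * The paranoid check falls back to the raw truesize if a buggy driver
 * would make the adjusted value smaller than skb->len.
 */
```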
.. | .. |
---|
430 | 524 | /* 3. Try to fixup all. It is made immediately after connection enters |
---|
431 | 525 | * established state. |
---|
432 | 526 | */ |
---|
433 | | -void tcp_init_buffer_space(struct sock *sk) |
---|
| 527 | +static void tcp_init_buffer_space(struct sock *sk) |
---|
434 | 528 | { |
---|
435 | | - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; |
---|
| 529 | + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); |
---|
436 | 530 | struct tcp_sock *tp = tcp_sk(sk); |
---|
437 | 531 | int maxwin; |
---|
438 | 532 | |
---|
.. | .. |
---|
472 | 566 | struct tcp_sock *tp = tcp_sk(sk); |
---|
473 | 567 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
474 | 568 | struct net *net = sock_net(sk); |
---|
| 569 | + int rmem2; |
---|
475 | 570 | |
---|
476 | 571 | icsk->icsk_ack.quick = 0; |
---|
| 572 | + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); |
---|
477 | 573 | |
---|
478 | | - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && |
---|
| 574 | + if (sk->sk_rcvbuf < rmem2 && |
---|
479 | 575 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
---|
480 | 576 | !tcp_under_memory_pressure(sk) && |
---|
481 | 577 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { |
---|
482 | | - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
---|
483 | | - net->ipv4.sysctl_tcp_rmem[2]); |
---|
| 578 | + WRITE_ONCE(sk->sk_rcvbuf, |
---|
| 579 | + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); |
---|
484 | 580 | } |
---|
485 | 581 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
---|
486 | 582 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); |
---|
.. | .. |
---|
510 | 606 | * |
---|
511 | 607 | * The algorithm for RTT estimation w/o timestamps is based on |
---|
512 | 608 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. |
---|
513 | | - * <http://public.lanl.gov/radiant/pubs.html#DRS> |
---|
| 609 | + * <https://public.lanl.gov/radiant/pubs.html#DRS> |
---|
514 | 610 | * |
---|
515 | 611 | * More detail on this code can be found at |
---|
516 | 612 | * <http://staff.psc.edu/jheffner/>, |
---|
.. | .. |
---|
621 | 717 | * <prev RTT . ><current RTT .. ><next RTT .... > |
---|
622 | 718 | */ |
---|
623 | 719 | |
---|
624 | | - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && |
---|
| 720 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && |
---|
625 | 721 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { |
---|
626 | 722 | int rcvmem, rcvbuf; |
---|
627 | 723 | u64 rcvwin, grow; |
---|
.. | .. |
---|
642 | 738 | |
---|
643 | 739 | do_div(rcvwin, tp->advmss); |
---|
644 | 740 | rcvbuf = min_t(u64, rcvwin * rcvmem, |
---|
645 | | - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
---|
| 741 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); |
---|
646 | 742 | if (rcvbuf > sk->sk_rcvbuf) { |
---|
647 | | - sk->sk_rcvbuf = rcvbuf; |
---|
| 743 | + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); |
---|
648 | 744 | |
---|
649 | 745 | /* Make the window clamp follow along. */ |
---|
650 | 746 | tp->window_clamp = tcp_win_from_space(sk, rcvbuf); |
---|
.. | .. |
---|
710 | 806 | tcp_ecn_check_ce(sk, skb); |
---|
711 | 807 | |
---|
712 | 808 | if (skb->len >= 128) |
---|
713 | | - tcp_grow_window(sk, skb); |
---|
| 809 | + tcp_grow_window(sk, skb, true); |
---|
714 | 810 | } |
---|
715 | 811 | |
---|
716 | 812 | /* Called to compute a smoothed rtt estimate. The data fed to this |
---|
.. | .. |
---|
774 | 870 | tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
---|
775 | 871 | tp->rtt_seq = tp->snd_nxt; |
---|
776 | 872 | tp->mdev_max_us = tcp_rto_min_us(sk); |
---|
| 873 | + |
---|
| 874 | + tcp_bpf_rtt(sk); |
---|
777 | 875 | } |
---|
778 | 876 | } else { |
---|
779 | 877 | /* no previous measure. */ |
---|
.. | .. |
---|
782 | 880 | tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); |
---|
783 | 881 | tp->mdev_max_us = tp->rttvar_us; |
---|
784 | 882 | tp->rtt_seq = tp->snd_nxt; |
---|
| 883 | + |
---|
| 884 | + tcp_bpf_rtt(sk); |
---|
785 | 885 | } |
---|
786 | 886 | tp->srtt_us = max(1U, srtt); |
---|
787 | 887 | } |
---|
.. | .. |
---|
859 | 959 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
---|
860 | 960 | } |
---|
861 | 961 | |
---|
862 | | -/* Take a notice that peer is sending D-SACKs */ |
---|
863 | | -static void tcp_dsack_seen(struct tcp_sock *tp) |
---|
| 962 | +struct tcp_sacktag_state { |
---|
| 963 | + /* Timestamps for earliest and latest never-retransmitted segment |
---|
| 964 | + * that was SACKed. RTO needs the earliest RTT to stay conservative, |
---|
| 965 | + * but congestion control should still get an accurate delay signal. |
---|
| 966 | + */ |
---|
| 967 | + u64 first_sackt; |
---|
| 968 | + u64 last_sackt; |
---|
| 969 | + u32 reord; |
---|
| 970 | + u32 sack_delivered; |
---|
| 971 | + int flag; |
---|
| 972 | + unsigned int mss_now; |
---|
| 973 | + struct rate_sample *rate; |
---|
| 974 | +}; |
---|
| 975 | + |
---|
| 976 | +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery |
---|
| 977 | + * and spurious retransmission information if this DSACK is unlikely caused by |
---|
| 978 | + * sender's action: |
---|
| 979 | + * - DSACKed sequence range is larger than maximum receiver's window. |
---|
| 980 | + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. |
---|
| 981 | + */ |
---|
| 982 | +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, |
---|
| 983 | + u32 end_seq, struct tcp_sacktag_state *state) |
---|
864 | 984 | { |
---|
| 985 | + u32 seq_len, dup_segs = 1; |
---|
| 986 | + |
---|
| 987 | + if (!before(start_seq, end_seq)) |
---|
| 988 | + return 0; |
---|
| 989 | + |
---|
| 990 | + seq_len = end_seq - start_seq; |
---|
| 991 | + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ |
---|
| 992 | + if (seq_len > tp->max_window) |
---|
| 993 | + return 0; |
---|
| 994 | + if (seq_len > tp->mss_cache) |
---|
| 995 | + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); |
---|
| 996 | + |
---|
| 997 | + tp->dsack_dups += dup_segs; |
---|
| 998 | + /* Skip the DSACK if dup segs weren't retransmitted by sender */ |
---|
| 999 | + if (tp->dsack_dups > tp->total_retrans) |
---|
| 1000 | + return 0; |
---|
| 1001 | + |
---|
865 | 1002 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
---|
866 | 1003 | tp->rack.dsack_seen = 1; |
---|
867 | | - tp->dsack_dups++; |
---|
| 1004 | + |
---|
| 1005 | + state->flag |= FLAG_DSACKING_ACK; |
---|
| 1006 | + /* A spurious retransmission is delivered */ |
---|
| 1007 | + state->sack_delivered += dup_segs; |
---|
| 1008 | + |
---|
| 1009 | + return dup_segs; |
---|
868 | 1010 | } |
---|
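Note: tcp_dsack_seen() now returns how many duplicate segments the DSACK block spans and rejects blocks that cannot be genuine (wider than the peer's maximum advertised window, or more cumulative DSACKed segments than were ever retransmitted). A worked example of the arithmetic, with illustrative values:

```c
/*
 * Illustrative values: tp->mss_cache = 1448 and a DSACK block with
 * end_seq - start_seq = 2896 (and 2896 <= tp->max_window).
 *
 *   seq_len  = 2896
 *   dup_segs = DIV_ROUND_UP(2896, 1448) = 2
 *
 * Two duplicate segments are credited to tp->dsack_dups and to
 * state->sack_delivered, and FLAG_DSACKING_ACK is set.  Had seq_len
 * exceeded tp->max_window, or tp->dsack_dups overtaken tp->total_retrans,
 * the function would return 0 and the caller would count the event as
 * LINUX_MIB_TCPDSACKIGNOREDDUBIOUS.
 */
```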
869 | 1011 | |
---|
870 | 1012 | /* It's reordering when higher sequence was delivered (i.e. sacked) before |
---|
.. | .. |
---|
893 | 1035 | tp->undo_marker ? tp->undo_retrans : 0); |
---|
894 | 1036 | #endif |
---|
895 | 1037 | tp->reordering = min_t(u32, (metric + mss - 1) / mss, |
---|
896 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
---|
| 1038 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
---|
897 | 1039 | } |
---|
898 | 1040 | |
---|
899 | 1041 | /* This exciting event is worth to be remembered. 8) */ |
---|
.. | .. |
---|
902 | 1044 | ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); |
---|
903 | 1045 | } |
---|
904 | 1046 | |
---|
905 | | -/* This must be called before lost_out is incremented */ |
---|
| 1047 | +/* This must be called before lost_out or retrans_out are updated |
---|
| 1048 | + * on a new loss, because we want to know if all skbs previously |
---|
| 1049 | + * known to be lost have already been retransmitted, indicating |
---|
| 1050 | + * that this newly lost skb is our next skb to retransmit. |
---|
| 1051 | + */ |
---|
906 | 1052 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) |
---|
907 | 1053 | { |
---|
908 | 1054 | if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || |
---|
.. | .. |
---|
912 | 1058 | tp->retransmit_skb_hint = skb; |
---|
913 | 1059 | } |
---|
914 | 1060 | |
---|
915 | | -/* Sum the number of packets on the wire we have marked as lost. |
---|
916 | | - * There are two cases we care about here: |
---|
917 | | - * a) Packet hasn't been marked lost (nor retransmitted), |
---|
918 | | - * and this is the first loss. |
---|
919 | | - * b) Packet has been marked both lost and retransmitted, |
---|
920 | | - * and this means we think it was lost again. |
---|
| 1061 | +/* Sum the number of packets on the wire we have marked as lost, and |
---|
| 1062 | + * notify the congestion control module that the given skb was marked lost. |
---|
921 | 1063 | */ |
---|
922 | | -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) |
---|
| 1064 | +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) |
---|
| 1065 | +{ |
---|
| 1066 | + tp->lost += tcp_skb_pcount(skb); |
---|
| 1067 | +} |
---|
| 1068 | + |
---|
| 1069 | +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) |
---|
923 | 1070 | { |
---|
924 | 1071 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
---|
| 1072 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
925 | 1073 | |
---|
926 | | - if (!(sacked & TCPCB_LOST) || |
---|
927 | | - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) |
---|
928 | | - tp->lost += tcp_skb_pcount(skb); |
---|
929 | | -} |
---|
| 1074 | + if (sacked & TCPCB_SACKED_ACKED) |
---|
| 1075 | + return; |
---|
930 | 1076 | |
---|
931 | | -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) |
---|
932 | | -{ |
---|
933 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
---|
934 | | - tcp_verify_retransmit_hint(tp, skb); |
---|
935 | | - |
---|
936 | | - tp->lost_out += tcp_skb_pcount(skb); |
---|
937 | | - tcp_sum_lost(tp, skb); |
---|
938 | | - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
---|
939 | | - } |
---|
940 | | -} |
---|
941 | | - |
---|
942 | | -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) |
---|
943 | | -{ |
---|
944 | 1077 | tcp_verify_retransmit_hint(tp, skb); |
---|
945 | | - |
---|
946 | | - tcp_sum_lost(tp, skb); |
---|
947 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
---|
| 1078 | + if (sacked & TCPCB_LOST) { |
---|
| 1079 | + if (sacked & TCPCB_SACKED_RETRANS) { |
---|
| 1080 | + /* Account for retransmits that are lost again */ |
---|
| 1081 | + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
---|
| 1082 | + tp->retrans_out -= tcp_skb_pcount(skb); |
---|
| 1083 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, |
---|
| 1084 | + tcp_skb_pcount(skb)); |
---|
| 1085 | + tcp_notify_skb_loss_event(tp, skb); |
---|
| 1086 | + } |
---|
| 1087 | + } else { |
---|
948 | 1088 | tp->lost_out += tcp_skb_pcount(skb); |
---|
949 | 1089 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
---|
| 1090 | + tcp_notify_skb_loss_event(tp, skb); |
---|
950 | 1091 | } |
---|
| 1092 | +} |
---|
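Note: the rewritten tcp_mark_skb_lost() folds the old tcp_skb_mark_lost(), tcp_skb_mark_lost_uncond_verify() and tcp_sum_lost() into one helper keyed off the skb's sacked bits. The transitions it performs (S = TCPCB_SACKED_ACKED, L = TCPCB_LOST, R = TCPCB_SACKED_RETRANS):

```c
/*
 *   S set          -> return: the segment was already delivered.
 *   L set, R set   -> a retransmission was itself lost: clear R,
 *                     retrans_out -= pcount, bump TCPLOSTRETRANSMIT and
 *                     notify congestion control via tcp_notify_skb_loss_event().
 *   L set, R clear -> already marked lost, not yet retransmitted: no-op.
 *   L clear        -> first loss: set L, lost_out += pcount, notify CC.
 *
 * tcp_verify_retransmit_hint() runs first in every non-SACKed case so the
 * retransmit hint keeps pointing at the next segment to retransmit.
 */
```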
| 1093 | + |
---|
| 1094 | +/* Updates the delivered and delivered_ce counts */ |
---|
| 1095 | +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, |
---|
| 1096 | + bool ece_ack) |
---|
| 1097 | +{ |
---|
| 1098 | + tp->delivered += delivered; |
---|
| 1099 | + if (ece_ack) |
---|
| 1100 | + tp->delivered_ce += delivered; |
---|
951 | 1101 | } |
---|
952 | 1102 | |
---|
953 | 1103 | /* This procedure tags the retransmission queue when SACKs arrive. |
---|
.. | .. |
---|
1082 | 1232 | |
---|
1083 | 1233 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, |
---|
1084 | 1234 | struct tcp_sack_block_wire *sp, int num_sacks, |
---|
1085 | | - u32 prior_snd_una) |
---|
| 1235 | + u32 prior_snd_una, struct tcp_sacktag_state *state) |
---|
1086 | 1236 | { |
---|
1087 | 1237 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1088 | 1238 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
---|
1089 | 1239 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
---|
1090 | | - bool dup_sack = false; |
---|
| 1240 | + u32 dup_segs; |
---|
1091 | 1241 | |
---|
1092 | 1242 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
---|
1093 | | - dup_sack = true; |
---|
1094 | | - tcp_dsack_seen(tp); |
---|
1095 | 1243 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
---|
1096 | 1244 | } else if (num_sacks > 1) { |
---|
1097 | 1245 | u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); |
---|
1098 | 1246 | u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); |
---|
1099 | 1247 | |
---|
1100 | | - if (!after(end_seq_0, end_seq_1) && |
---|
1101 | | - !before(start_seq_0, start_seq_1)) { |
---|
1102 | | - dup_sack = true; |
---|
1103 | | - tcp_dsack_seen(tp); |
---|
1104 | | - NET_INC_STATS(sock_net(sk), |
---|
1105 | | - LINUX_MIB_TCPDSACKOFORECV); |
---|
1106 | | - } |
---|
| 1248 | + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) |
---|
| 1249 | + return false; |
---|
| 1250 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); |
---|
| 1251 | + } else { |
---|
| 1252 | + return false; |
---|
1107 | 1253 | } |
---|
1108 | 1254 | |
---|
| 1255 | + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); |
---|
| 1256 | + if (!dup_segs) { /* Skip dubious DSACK */ |
---|
| 1257 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); |
---|
| 1258 | + return false; |
---|
| 1259 | + } |
---|
| 1260 | + |
---|
| 1261 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); |
---|
| 1262 | + |
---|
1109 | 1263 | /* D-SACK for already forgotten data... Do dumb counting. */ |
---|
1110 | | - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && |
---|
| 1264 | + if (tp->undo_marker && tp->undo_retrans > 0 && |
---|
1111 | 1265 | !after(end_seq_0, prior_snd_una) && |
---|
1112 | 1266 | after(end_seq_0, tp->undo_marker)) |
---|
1113 | | - tp->undo_retrans--; |
---|
| 1267 | + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); |
---|
1114 | 1268 | |
---|
1115 | | - return dup_sack; |
---|
| 1269 | + return true; |
---|
1116 | 1270 | } |
---|
1117 | | - |
---|
1118 | | -struct tcp_sacktag_state { |
---|
1119 | | - u32 reord; |
---|
1120 | | - /* Timestamps for earliest and latest never-retransmitted segment |
---|
1121 | | - * that was SACKed. RTO needs the earliest RTT to stay conservative, |
---|
1122 | | - * but congestion control should still get an accurate delay signal. |
---|
1123 | | - */ |
---|
1124 | | - u64 first_sackt; |
---|
1125 | | - u64 last_sackt; |
---|
1126 | | - struct rate_sample *rate; |
---|
1127 | | - int flag; |
---|
1128 | | - unsigned int mss_now; |
---|
1129 | | -}; |
---|
1130 | 1271 | |
---|
1131 | 1272 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
---|
1132 | 1273 | * the incoming SACK may not exactly match but we can find smaller MSS |
---|
.. | .. |
---|
1246 | 1387 | sacked |= TCPCB_SACKED_ACKED; |
---|
1247 | 1388 | state->flag |= FLAG_DATA_SACKED; |
---|
1248 | 1389 | tp->sacked_out += pcount; |
---|
1249 | | - tp->delivered += pcount; /* Out-of-order packets delivered */ |
---|
| 1390 | + /* Out-of-order packets delivered */ |
---|
| 1391 | + state->sack_delivered += pcount; |
---|
1250 | 1392 | |
---|
1251 | 1393 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
---|
1252 | 1394 | if (tp->lost_skb_hint && |
---|
.. | .. |
---|
1289 | 1431 | */ |
---|
1290 | 1432 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
---|
1291 | 1433 | start_seq, end_seq, dup_sack, pcount, |
---|
1292 | | - skb->skb_mstamp); |
---|
| 1434 | + tcp_skb_timestamp_us(skb)); |
---|
1293 | 1435 | tcp_rate_skb_delivered(sk, skb, state->rate); |
---|
1294 | 1436 | |
---|
1295 | 1437 | if (skb == tp->lost_skb_hint) |
---|
.. | .. |
---|
1413 | 1555 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
---|
1414 | 1556 | goto fallback; |
---|
1415 | 1557 | |
---|
1416 | | - if (!tcp_skb_can_collapse_to(prev)) |
---|
| 1558 | + if (!tcp_skb_can_collapse(prev, skb)) |
---|
1417 | 1559 | goto fallback; |
---|
1418 | 1560 | |
---|
1419 | 1561 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
---|
.. | .. |
---|
1502 | 1644 | (mss != tcp_skb_seglen(skb))) |
---|
1503 | 1645 | goto out; |
---|
1504 | 1646 | |
---|
| 1647 | + if (!tcp_skb_can_collapse(prev, skb)) |
---|
| 1648 | + goto out; |
---|
1505 | 1649 | len = skb->len; |
---|
1506 | 1650 | pcount = tcp_skb_pcount(skb); |
---|
1507 | 1651 | if (tcp_skb_shift(prev, skb, pcount, len)) |
---|
.. | .. |
---|
1578 | 1722 | TCP_SKB_CB(skb)->end_seq, |
---|
1579 | 1723 | dup_sack, |
---|
1580 | 1724 | tcp_skb_pcount(skb), |
---|
1581 | | - skb->skb_mstamp); |
---|
| 1725 | + tcp_skb_timestamp_us(skb)); |
---|
1582 | 1726 | tcp_rate_skb_delivered(sk, skb, state->rate); |
---|
1583 | 1727 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
1584 | 1728 | list_del_init(&skb->tcp_tsorted_anchor); |
---|
.. | .. |
---|
1591 | 1735 | return skb; |
---|
1592 | 1736 | } |
---|
1593 | 1737 | |
---|
1594 | | -static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, |
---|
1595 | | - struct tcp_sacktag_state *state, |
---|
1596 | | - u32 seq) |
---|
| 1738 | +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) |
---|
1597 | 1739 | { |
---|
1598 | 1740 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; |
---|
1599 | 1741 | struct sk_buff *skb; |
---|
.. | .. |
---|
1615 | 1757 | } |
---|
1616 | 1758 | |
---|
1617 | 1759 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
---|
1618 | | - struct tcp_sacktag_state *state, |
---|
1619 | 1760 | u32 skip_to_seq) |
---|
1620 | 1761 | { |
---|
1621 | 1762 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
---|
1622 | 1763 | return skb; |
---|
1623 | 1764 | |
---|
1624 | | - return tcp_sacktag_bsearch(sk, state, skip_to_seq); |
---|
| 1765 | + return tcp_sacktag_bsearch(sk, skip_to_seq); |
---|
1625 | 1766 | } |
---|
1626 | 1767 | |
---|
1627 | 1768 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
---|
.. | .. |
---|
1634 | 1775 | return skb; |
---|
1635 | 1776 | |
---|
1636 | 1777 | if (before(next_dup->start_seq, skip_to_seq)) { |
---|
1637 | | - skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); |
---|
| 1778 | + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); |
---|
1638 | 1779 | skb = tcp_sacktag_walk(skb, sk, NULL, state, |
---|
1639 | 1780 | next_dup->start_seq, next_dup->end_seq, |
---|
1640 | 1781 | 1); |
---|
.. | .. |
---|
1672 | 1813 | tcp_highest_sack_reset(sk); |
---|
1673 | 1814 | |
---|
1674 | 1815 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
---|
1675 | | - num_sacks, prior_snd_una); |
---|
1676 | | - if (found_dup_sack) { |
---|
1677 | | - state->flag |= FLAG_DSACKING_ACK; |
---|
1678 | | - tp->delivered++; /* A spurious retransmission is delivered */ |
---|
1679 | | - } |
---|
| 1816 | + num_sacks, prior_snd_una, state); |
---|
1680 | 1817 | |
---|
1681 | 1818 | /* Eliminate too old ACKs, but take into |
---|
1682 | 1819 | * account more or less fresh ones, they can |
---|
.. | .. |
---|
1778 | 1915 | |
---|
1779 | 1916 | /* Head todo? */ |
---|
1780 | 1917 | if (before(start_seq, cache->start_seq)) { |
---|
1781 | | - skb = tcp_sacktag_skip(skb, sk, state, |
---|
1782 | | - start_seq); |
---|
| 1918 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
---|
1783 | 1919 | skb = tcp_sacktag_walk(skb, sk, next_dup, |
---|
1784 | 1920 | state, |
---|
1785 | 1921 | start_seq, |
---|
.. | .. |
---|
1805 | 1941 | goto walk; |
---|
1806 | 1942 | } |
---|
1807 | 1943 | |
---|
1808 | | - skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); |
---|
| 1944 | + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
---|
1809 | 1945 | /* Check overlap against next cached too (past this one already) */ |
---|
1810 | 1946 | cache++; |
---|
1811 | 1947 | continue; |
---|
.. | .. |
---|
1816 | 1952 | if (!skb) |
---|
1817 | 1953 | break; |
---|
1818 | 1954 | } |
---|
1819 | | - skb = tcp_sacktag_skip(skb, sk, state, start_seq); |
---|
| 1955 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
---|
1820 | 1956 | |
---|
1821 | 1957 | walk: |
---|
1822 | 1958 | skb = tcp_sacktag_walk(skb, sk, next_dup, state, |
---|
.. | .. |
---|
1878 | 2014 | return; |
---|
1879 | 2015 | |
---|
1880 | 2016 | tp->reordering = min_t(u32, tp->packets_out + addend, |
---|
1881 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
---|
| 2017 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
---|
1882 | 2018 | tp->reord_seen++; |
---|
1883 | 2019 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); |
---|
1884 | 2020 | } |
---|
1885 | 2021 | |
---|
1886 | 2022 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
---|
1887 | 2023 | |
---|
1888 | | -static void tcp_add_reno_sack(struct sock *sk) |
---|
| 2024 | +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) |
---|
1889 | 2025 | { |
---|
1890 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
1891 | | - u32 prior_sacked = tp->sacked_out; |
---|
| 2026 | + if (num_dupack) { |
---|
| 2027 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2028 | + u32 prior_sacked = tp->sacked_out; |
---|
| 2029 | + s32 delivered; |
---|
1892 | 2030 | |
---|
1893 | | - tp->sacked_out++; |
---|
1894 | | - tcp_check_reno_reordering(sk, 0); |
---|
1895 | | - if (tp->sacked_out > prior_sacked) |
---|
1896 | | - tp->delivered++; /* Some out-of-order packet is delivered */ |
---|
1897 | | - tcp_verify_left_out(tp); |
---|
| 2031 | + tp->sacked_out += num_dupack; |
---|
| 2032 | + tcp_check_reno_reordering(sk, 0); |
---|
| 2033 | + delivered = tp->sacked_out - prior_sacked; |
---|
| 2034 | + if (delivered > 0) |
---|
| 2035 | + tcp_count_delivered(tp, delivered, ece_ack); |
---|
| 2036 | + tcp_verify_left_out(tp); |
---|
| 2037 | + } |
---|
1898 | 2038 | } |
---|
1899 | 2039 | |
---|
1900 | 2040 | /* Account for ACK, ACKing some data in Reno Recovery phase. */ |
---|
1901 | 2041 | |
---|
1902 | | -static void tcp_remove_reno_sacks(struct sock *sk, int acked) |
---|
| 2042 | +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) |
---|
1903 | 2043 | { |
---|
1904 | 2044 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1905 | 2045 | |
---|
1906 | 2046 | if (acked > 0) { |
---|
1907 | 2047 | /* One ACK acked hole. The rest eat duplicate ACKs. */ |
---|
1908 | | - tp->delivered += max_t(int, acked - tp->sacked_out, 1); |
---|
| 2048 | + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), |
---|
| 2049 | + ece_ack); |
---|
1909 | 2050 | if (acked - 1 >= tp->sacked_out) |
---|
1910 | 2051 | tp->sacked_out = 0; |
---|
1911 | 2052 | else |
---|
.. | .. |
---|
1938 | 2079 | |
---|
1939 | 2080 | static bool tcp_is_rack(const struct sock *sk) |
---|
1940 | 2081 | { |
---|
1941 | | - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; |
---|
| 2082 | + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & |
---|
| 2083 | + TCP_RACK_LOSS_DETECTION; |
---|
1942 | 2084 | } |
---|
1943 | 2085 | |
---|
1944 | 2086 | /* If we detect SACK reneging, forget all SACK information |
---|
.. | .. |
---|
1982 | 2124 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1983 | 2125 | struct net *net = sock_net(sk); |
---|
1984 | 2126 | bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; |
---|
| 2127 | + u8 reordering; |
---|
1985 | 2128 | |
---|
1986 | 2129 | tcp_timeout_mark_lost(sk); |
---|
1987 | 2130 | |
---|
.. | .. |
---|
2002 | 2145 | /* Timeout in disordered state after receiving substantial DUPACKs |
---|
2003 | 2146 | * suggests that the degree of reordering is over-estimated. |
---|
2004 | 2147 | */ |
---|
| 2148 | + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); |
---|
2005 | 2149 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && |
---|
2006 | | - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) |
---|
| 2150 | + tp->sacked_out >= reordering) |
---|
2007 | 2151 | tp->reordering = min_t(unsigned int, tp->reordering, |
---|
2008 | | - net->ipv4.sysctl_tcp_reordering); |
---|
| 2152 | + reordering); |
---|
| 2153 | + |
---|
2009 | 2154 | tcp_set_ca_state(sk, TCP_CA_Loss); |
---|
2010 | 2155 | tp->high_seq = tp->snd_nxt; |
---|
2011 | 2156 | tcp_ecn_queue_cwr(tp); |
---|
.. | .. |
---|
2014 | 2159 | * loss recovery is underway except recurring timeout(s) on |
---|
2015 | 2160 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing |
---|
2016 | 2161 | */ |
---|
2017 | | - tp->frto = net->ipv4.sysctl_tcp_frto && |
---|
| 2162 | + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && |
---|
2018 | 2163 | (new_recovery || icsk->icsk_retransmits) && |
---|
2019 | 2164 | !inet_csk(sk)->icsk_mtup.probe_size; |
---|
2020 | 2165 | } |
---|
.. | .. |
---|
2031 | 2176 | */ |
---|
2032 | 2177 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) |
---|
2033 | 2178 | { |
---|
2034 | | - if (flag & FLAG_SACK_RENEGING) { |
---|
| 2179 | + if (flag & FLAG_SACK_RENEGING && |
---|
| 2180 | + flag & FLAG_SND_UNA_ADVANCED) { |
---|
2035 | 2181 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2036 | 2182 | unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), |
---|
2037 | 2183 | msecs_to_jiffies(10)); |
---|
.. | .. |
---|
2172 | 2318 | } |
---|
2173 | 2319 | |
---|
2174 | 2320 | /* Detect loss in event "A" above by marking head of queue up as lost. |
---|
2175 | | - * For non-SACK(Reno) senders, the first "packets" number of segments |
---|
2176 | | - * are considered lost. For RFC3517 SACK, a segment is considered lost if it |
---|
| 2321 | + * For RFC3517 SACK, a segment is considered lost if it |
---|
2177 | 2322 | * has at least tp->reordering SACKed seqments above it; "packets" refers to |
---|
2178 | 2323 | * the maximum SACKed segments to pass before reaching this limit. |
---|
2179 | 2324 | */ |
---|
.. | .. |
---|
2181 | 2326 | { |
---|
2182 | 2327 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2183 | 2328 | struct sk_buff *skb; |
---|
2184 | | - int cnt, oldcnt, lost; |
---|
2185 | | - unsigned int mss; |
---|
| 2329 | + int cnt; |
---|
2186 | 2330 | /* Use SACK to deduce losses of new sequences sent during recovery */ |
---|
2187 | | - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; |
---|
| 2331 | + const u32 loss_high = tp->snd_nxt; |
---|
2188 | 2332 | |
---|
2189 | 2333 | WARN_ON(packets > tp->packets_out); |
---|
2190 | 2334 | skb = tp->lost_skb_hint; |
---|
.. | .. |
---|
2207 | 2351 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) |
---|
2208 | 2352 | break; |
---|
2209 | 2353 | |
---|
2210 | | - oldcnt = cnt; |
---|
2211 | | - if (tcp_is_reno(tp) || |
---|
2212 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
---|
| 2354 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
2213 | 2355 | cnt += tcp_skb_pcount(skb); |
---|
2214 | 2356 | |
---|
2215 | | - if (cnt > packets) { |
---|
2216 | | - if (tcp_is_sack(tp) || |
---|
2217 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
---|
2218 | | - (oldcnt >= packets)) |
---|
2219 | | - break; |
---|
| 2357 | + if (cnt > packets) |
---|
| 2358 | + break; |
---|
2220 | 2359 | |
---|
2221 | | - mss = tcp_skb_mss(skb); |
---|
2222 | | - /* If needed, chop off the prefix to mark as lost. */ |
---|
2223 | | - lost = (packets - oldcnt) * mss; |
---|
2224 | | - if (lost < skb->len && |
---|
2225 | | - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
---|
2226 | | - lost, mss, GFP_ATOMIC) < 0) |
---|
2227 | | - break; |
---|
2228 | | - cnt = packets; |
---|
2229 | | - } |
---|
2230 | | - |
---|
2231 | | - tcp_skb_mark_lost(tp, skb); |
---|
| 2360 | + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) |
---|
| 2361 | + tcp_mark_skb_lost(sk, skb); |
---|
2232 | 2362 | |
---|
2233 | 2363 | if (mark_head) |
---|
2234 | 2364 | break; |
---|
.. | .. |
---|
2272 | 2402 | */ |
---|
2273 | 2403 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
---|
2274 | 2404 | { |
---|
2275 | | - return !tp->retrans_stamp || |
---|
| 2405 | + return tp->retrans_stamp && |
---|
2276 | 2406 | tcp_tsopt_ecr_before(tp, tp->retrans_stamp); |
---|
2277 | 2407 | } |
---|
2278 | 2408 | |
---|
.. | .. |
---|
2368 | 2498 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
---|
2369 | 2499 | } |
---|
2370 | 2500 | |
---|
| 2501 | +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) |
---|
| 2502 | +{ |
---|
| 2503 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2504 | + |
---|
| 2505 | + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
---|
| 2506 | + /* Hold old state until something *above* high_seq |
---|
| 2507 | + * is ACKed. For Reno it is MUST to prevent false |
---|
| 2508 | + * fast retransmits (RFC2582). SACK TCP is safe. */ |
---|
| 2509 | + if (!tcp_any_retrans_done(sk)) |
---|
| 2510 | + tp->retrans_stamp = 0; |
---|
| 2511 | + return true; |
---|
| 2512 | + } |
---|
| 2513 | + return false; |
---|
| 2514 | +} |
---|
| 2515 | + |
---|
2371 | 2516 | /* People celebrate: "We love our President!" */ |
---|
2372 | 2517 | static bool tcp_try_undo_recovery(struct sock *sk) |
---|
2373 | 2518 | { |
---|
.. | .. |
---|
2390 | 2535 | } else if (tp->rack.reo_wnd_persist) { |
---|
2391 | 2536 | tp->rack.reo_wnd_persist--; |
---|
2392 | 2537 | } |
---|
2393 | | - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
---|
2394 | | - /* Hold old state until something *above* high_seq |
---|
2395 | | - * is ACKed. For Reno it is MUST to prevent false |
---|
2396 | | - * fast retransmits (RFC2582). SACK TCP is safe. */ |
---|
2397 | | - if (!tcp_any_retrans_done(sk)) |
---|
2398 | | - tp->retrans_stamp = 0; |
---|
| 2538 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
---|
2399 | 2539 | return true; |
---|
2400 | | - } |
---|
2401 | 2540 | tcp_set_ca_state(sk, TCP_CA_Open); |
---|
2402 | 2541 | tp->is_sack_reneg = 0; |
---|
2403 | 2542 | return false; |
---|
.. | .. |
---|
2433 | 2572 | NET_INC_STATS(sock_net(sk), |
---|
2434 | 2573 | LINUX_MIB_TCPSPURIOUSRTOS); |
---|
2435 | 2574 | inet_csk(sk)->icsk_retransmits = 0; |
---|
| 2575 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
---|
| 2576 | + return true; |
---|
2436 | 2577 | if (frto_undo || tcp_is_sack(tp)) { |
---|
2437 | 2578 | tcp_set_ca_state(sk, TCP_CA_Open); |
---|
2438 | 2579 | tp->is_sack_reneg = 0; |
---|
.. | .. |
---|
2479 | 2620 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + |
---|
2480 | 2621 | tp->prior_cwnd - 1; |
---|
2481 | 2622 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; |
---|
2482 | | - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && |
---|
2483 | | - !(flag & FLAG_LOST_RETRANS)) { |
---|
| 2623 | + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == |
---|
| 2624 | + FLAG_RETRANS_DATA_ACKED) { |
---|
2484 | 2625 | sndcnt = min_t(int, delta, |
---|
2485 | 2626 | max_t(int, tp->prr_delivered - tp->prr_out, |
---|
2486 | 2627 | newly_acked_sacked) + 1); |
---|
.. | .. |
---|
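Note: the first branch above is the proportional rate reduction (RFC 6937) step, and the rewritten else-if condition is logically the same test as before (retransmitted data was cumulatively ACKed and no retransmission was flagged lost), just expressed as one mask compare. A worked example of the PRR arithmetic with illustrative values:

```c
/*
 * Illustrative values: tp->prior_cwnd = 10, tp->snd_ssthresh = 7,
 * tp->prr_delivered = 4, tp->prr_out = 2.
 *
 *   dividend = 7 * 4 + 10 - 1 = 37   (the "+ prior_cwnd - 1" rounds the
 *                                     division up)
 *   sndcnt   = 37 / 10 - 2    = 1
 *
 * One new (re)transmission is allowed, pacing the sender so that roughly
 * ssthresh segments have gone out by the time prior_cwnd segments worth
 * of deliveries have been reported.
 */
```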
2566 | 2707 | { |
---|
2567 | 2708 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2568 | 2709 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 2710 | + u64 val; |
---|
2569 | 2711 | |
---|
2570 | | - /* FIXME: breaks with very large cwnd */ |
---|
2571 | 2712 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
---|
2572 | | - tp->snd_cwnd = tp->snd_cwnd * |
---|
2573 | | - tcp_mss_to_mtu(sk, tp->mss_cache) / |
---|
2574 | | - icsk->icsk_mtup.probe_size; |
---|
| 2713 | + |
---|
| 2714 | + val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache); |
---|
| 2715 | + do_div(val, icsk->icsk_mtup.probe_size); |
---|
| 2716 | + WARN_ON_ONCE((u32)val != val); |
---|
| 2717 | + tp->snd_cwnd = max_t(u32, 1U, val); |
---|
| 2718 | + |
---|
2575 | 2719 | tp->snd_cwnd_cnt = 0; |
---|
2576 | 2720 | tp->snd_cwnd_stamp = tcp_jiffies32; |
---|
2577 | 2721 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
---|
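Note: tcp_mtup_probe_success() rescales cwnd by the ratio of the old and new packet sizes once an MTU probe is ACKed. Widening the multiply to 64 bits (and clamping the result to at least 1) removes the overflow the deleted FIXME warned about. Illustrative numbers:

```c
/*
 * Illustrative values: tp->snd_cwnd = 4,000,000 and
 * tcp_mss_to_mtu(sk, tp->mss_cache) = 1500.
 *
 *   32-bit: 4,000,000 * 1500 = 6,000,000,000, which wraps past UINT_MAX
 *           before the divide and leaves a bogus (possibly zero) cwnd.
 *   64-bit: (u64)4,000,000 * 1500 / probe_size stays exact, and
 *           max_t(u32, 1U, val) guarantees cwnd never collapses to zero.
 */
```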
.. | .. |
---|
2594 | 2738 | unsigned int mss = tcp_current_mss(sk); |
---|
2595 | 2739 | |
---|
2596 | 2740 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
---|
2597 | | - if (tcp_skb_seglen(skb) > mss && |
---|
2598 | | - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
---|
2599 | | - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
---|
2600 | | - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
---|
2601 | | - tp->retrans_out -= tcp_skb_pcount(skb); |
---|
2602 | | - } |
---|
2603 | | - tcp_skb_mark_lost_uncond_verify(tp, skb); |
---|
2604 | | - } |
---|
| 2741 | + if (tcp_skb_seglen(skb) > mss) |
---|
| 2742 | + tcp_mark_skb_lost(sk, skb); |
---|
2605 | 2743 | } |
---|
2606 | 2744 | |
---|
2607 | 2745 | tcp_clear_retrans_hints_partial(tp); |
---|
.. | .. |
---|
2656 | 2794 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
---|
2657 | 2795 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
---|
2658 | 2796 | */ |
---|
2659 | | -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, |
---|
| 2797 | +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, |
---|
2660 | 2798 | int *rexmit) |
---|
2661 | 2799 | { |
---|
2662 | 2800 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2663 | 2801 | bool recovered = !before(tp->snd_una, tp->high_seq); |
---|
2664 | 2802 | |
---|
2665 | | - if ((flag & FLAG_SND_UNA_ADVANCED) && |
---|
| 2803 | + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && |
---|
2666 | 2804 | tcp_try_undo_loss(sk, false)) |
---|
2667 | 2805 | return; |
---|
2668 | 2806 | |
---|
.. | .. |
---|
2675 | 2813 | return; |
---|
2676 | 2814 | |
---|
2677 | 2815 | if (after(tp->snd_nxt, tp->high_seq)) { |
---|
2678 | | - if (flag & FLAG_DATA_SACKED || is_dupack) |
---|
| 2816 | + if (flag & FLAG_DATA_SACKED || num_dupack) |
---|
2679 | 2817 | tp->frto = 0; /* Step 3.a. loss was real */ |
---|
2680 | 2818 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { |
---|
2681 | 2819 | tp->high_seq = tp->snd_nxt; |
---|
.. | .. |
---|
2701 | 2839 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
---|
2702 | 2840 | * delivered. Lower inflight to clock out (re)tranmissions. |
---|
2703 | 2841 | */ |
---|
2704 | | - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) |
---|
2705 | | - tcp_add_reno_sack(sk); |
---|
| 2842 | + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) |
---|
| 2843 | + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); |
---|
2706 | 2844 | else if (flag & FLAG_SND_UNA_ADVANCED) |
---|
2707 | 2845 | tcp_reset_reno_sack(tp); |
---|
2708 | 2846 | } |
---|
2709 | 2847 | *rexmit = REXMIT_LOST; |
---|
2710 | 2848 | } |
---|
2711 | 2849 | |
---|
| 2850 | +static bool tcp_force_fast_retransmit(struct sock *sk) |
---|
| 2851 | +{ |
---|
| 2852 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2853 | + |
---|
| 2854 | + return after(tcp_highest_sack_seq(tp), |
---|
| 2855 | + tp->snd_una + tp->reordering * tp->mss_cache); |
---|
| 2856 | +} |
---|
| 2857 | + |
---|
2712 | 2858 | /* Undo during fast recovery after partial ACK. */ |
---|
2713 | | -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) |
---|
| 2859 | +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, |
---|
| 2860 | + bool *do_lost) |
---|
2714 | 2861 | { |
---|
2715 | 2862 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2716 | 2863 | |
---|
.. | .. |
---|
2735 | 2882 | tcp_undo_cwnd_reduction(sk, true); |
---|
2736 | 2883 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
---|
2737 | 2884 | tcp_try_keep_open(sk); |
---|
2738 | | - return true; |
---|
| 2885 | + } else { |
---|
| 2886 | + /* Partial ACK arrived. Force fast retransmit. */ |
---|
| 2887 | + *do_lost = tcp_force_fast_retransmit(sk); |
---|
2739 | 2888 | } |
---|
2740 | 2889 | return false; |
---|
2741 | 2890 | } |
---|
.. | .. |
---|
2759 | 2908 | } |
---|
2760 | 2909 | } |
---|
2761 | 2910 | |
---|
2762 | | -static bool tcp_force_fast_retransmit(struct sock *sk) |
---|
2763 | | -{ |
---|
2764 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
2765 | | - |
---|
2766 | | - return after(tcp_highest_sack_seq(tp), |
---|
2767 | | - tp->snd_una + tp->reordering * tp->mss_cache); |
---|
2768 | | -} |
---|
2769 | | - |
---|
2770 | 2911 | /* Process an event, which can update packets-in-flight not trivially. |
---|
2771 | 2912 | * Main goal of this function is to calculate new estimate for left_out, |
---|
2772 | 2913 | * taking into account both packets sitting in receiver's buffer and |
---|
.. | .. |
---|
2780 | 2921 | * tcp_xmit_retransmit_queue(). |
---|
2781 | 2922 | */ |
---|
2782 | 2923 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
---|
2783 | | - bool is_dupack, int *ack_flag, int *rexmit) |
---|
| 2924 | + int num_dupack, int *ack_flag, int *rexmit) |
---|
2784 | 2925 | { |
---|
2785 | 2926 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
2786 | 2927 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2787 | 2928 | int fast_rexmit = 0, flag = *ack_flag; |
---|
2788 | | - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
---|
2789 | | - tcp_force_fast_retransmit(sk)); |
---|
| 2929 | + bool ece_ack = flag & FLAG_ECE; |
---|
| 2930 | + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && |
---|
| 2931 | + tcp_force_fast_retransmit(sk)); |
---|
2790 | 2932 | |
---|
2791 | 2933 | if (!tp->packets_out && tp->sacked_out) |
---|
2792 | 2934 | tp->sacked_out = 0; |
---|
2793 | 2935 | |
---|
2794 | 2936 | /* Now state machine starts. |
---|
2795 | 2937 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ |
---|
2796 | | - if (flag & FLAG_ECE) |
---|
| 2938 | + if (ece_ack) |
---|
2797 | 2939 | tp->prior_ssthresh = 0; |
---|
2798 | 2940 | |
---|
2799 | 2941 | /* B. In all the states check for reneging SACKs. */ |
---|
.. | .. |
---|
2833 | 2975 | switch (icsk->icsk_ca_state) { |
---|
2834 | 2976 | case TCP_CA_Recovery: |
---|
2835 | 2977 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
---|
2836 | | - if (tcp_is_reno(tp) && is_dupack) |
---|
2837 | | - tcp_add_reno_sack(sk); |
---|
2838 | | - } else { |
---|
2839 | | - if (tcp_try_undo_partial(sk, prior_snd_una)) |
---|
2840 | | - return; |
---|
2841 | | - /* Partial ACK arrived. Force fast retransmit. */ |
---|
2842 | | - do_lost = tcp_is_reno(tp) || |
---|
2843 | | - tcp_force_fast_retransmit(sk); |
---|
2844 | | - } |
---|
2845 | | - if (tcp_try_undo_dsack(sk)) { |
---|
2846 | | - tcp_try_keep_open(sk); |
---|
| 2978 | + if (tcp_is_reno(tp)) |
---|
| 2979 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
---|
| 2980 | + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) |
---|
2847 | 2981 | return; |
---|
2848 | | - } |
---|
| 2982 | + |
---|
| 2983 | + if (tcp_try_undo_dsack(sk)) |
---|
| 2984 | + tcp_try_keep_open(sk); |
---|
| 2985 | + |
---|
2849 | 2986 | tcp_identify_packet_loss(sk, ack_flag); |
---|
| 2987 | + if (icsk->icsk_ca_state != TCP_CA_Recovery) { |
---|
| 2988 | + if (!tcp_time_to_recover(sk, flag)) |
---|
| 2989 | + return; |
---|
| 2990 | + /* Undo reverts the recovery state. If loss is evident, |
---|
| 2991 | + * starts a new recovery (e.g. reordering then loss); |
---|
| 2992 | + */ |
---|
| 2993 | + tcp_enter_recovery(sk, ece_ack); |
---|
| 2994 | + } |
---|
2850 | 2995 | break; |
---|
2851 | 2996 | case TCP_CA_Loss: |
---|
2852 | | - tcp_process_loss(sk, flag, is_dupack, rexmit); |
---|
| 2997 | + tcp_process_loss(sk, flag, num_dupack, rexmit); |
---|
2853 | 2998 | tcp_identify_packet_loss(sk, ack_flag); |
---|
2854 | 2999 | if (!(icsk->icsk_ca_state == TCP_CA_Open || |
---|
2855 | 3000 | (*ack_flag & FLAG_LOST_RETRANS))) |
---|
2856 | 3001 | return; |
---|
2857 | 3002 | /* Change state if cwnd is undone or retransmits are lost */ |
---|
2858 | | - /* fall through */ |
---|
| 3003 | + fallthrough; |
---|
2859 | 3004 | default: |
---|
2860 | 3005 | if (tcp_is_reno(tp)) { |
---|
2861 | 3006 | if (flag & FLAG_SND_UNA_ADVANCED) |
---|
2862 | 3007 | tcp_reset_reno_sack(tp); |
---|
2863 | | - if (is_dupack) |
---|
2864 | | - tcp_add_reno_sack(sk); |
---|
| 3008 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
---|
2865 | 3009 | } |
---|
2866 | 3010 | |
---|
2867 | 3011 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
---|
.. | .. |
---|
2885 | 3029 | } |
---|
2886 | 3030 | |
---|
2887 | 3031 | /* Otherwise enter Recovery state */ |
---|
2888 | | - tcp_enter_recovery(sk, (flag & FLAG_ECE)); |
---|
| 3032 | + tcp_enter_recovery(sk, ece_ack); |
---|
2889 | 3033 | fast_rexmit = 1; |
---|
2890 | 3034 | } |
---|
2891 | 3035 | |
---|
.. | .. |
---|
2896 | 3040 | |
---|
2897 | 3041 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) |
---|
2898 | 3042 | { |
---|
2899 | | - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; |
---|
| 3043 | + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; |
---|
2900 | 3044 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2901 | 3045 | |
---|
2902 | 3046 | if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { |
---|
.. | .. |
---|
2935 | 3079 | u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; |
---|
2936 | 3080 | |
---|
2937 | 3081 | if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { |
---|
| 3082 | + if (!delta) |
---|
| 3083 | + delta = 1; |
---|
2938 | 3084 | seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); |
---|
2939 | 3085 | ca_rtt_us = seq_rtt_us; |
---|
2940 | 3086 | } |
---|
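Note: this hunk is in tcp_ack_update_rtt(), where an RTT sample is derived from the echoed timestamp. TCP timestamps tick at TCP_TS_HZ (1000 Hz in mainline), so delta is in milliseconds and a sub-millisecond RTT gives delta == 0; flooring it at 1 keeps the sample strictly positive so the RTT estimator and min-RTT tracking never ingest a 0 us sample. In numbers:

```c
/*
 * With USEC_PER_SEC / TCP_TS_HZ == 1000:
 *
 *   delta == 0 (echoed value equals the current timestamp, RTT < 1 ms)
 *     before: seq_rtt_us = 0 * 1000 = 0 us      (degenerate sample)
 *     after:  delta = 1, seq_rtt_us = 1000 us   (1 ms floor)
 */
```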
.. | .. |
---|
2988 | 3134 | /* If the retrans timer is currently being used by Fast Open |
---|
2989 | 3135 | * for SYN-ACK retrans purpose, stay put. |
---|
2990 | 3136 | */ |
---|
2991 | | - if (tp->fastopen_rsk) |
---|
| 3137 | + if (rcu_access_pointer(tp->fastopen_rsk)) |
---|
2992 | 3138 | return; |
---|
2993 | 3139 | |
---|
2994 | 3140 | if (!tp->packets_out) { |
---|
.. | .. |
---|
3004 | 3150 | */ |
---|
3005 | 3151 | rto = usecs_to_jiffies(max_t(int, delta_us, 1)); |
---|
3006 | 3152 | } |
---|
3007 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
---|
3008 | | - TCP_RTO_MAX); |
---|
| 3153 | + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
---|
| 3154 | + TCP_RTO_MAX); |
---|
3009 | 3155 | } |
---|
3010 | 3156 | } |
---|
3011 | 3157 | |
---|
.. | .. |
---|
3061 | 3207 | */ |
---|
3062 | 3208 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
---|
3063 | 3209 | u32 prior_snd_una, |
---|
3064 | | - struct tcp_sacktag_state *sack) |
---|
| 3210 | + struct tcp_sacktag_state *sack, bool ece_ack) |
---|
3065 | 3211 | { |
---|
3066 | 3212 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
3067 | 3213 | u64 first_ackt, last_ackt; |
---|
.. | .. |
---|
3086 | 3232 | u8 sacked = scb->sacked; |
---|
3087 | 3233 | u32 acked_pcount; |
---|
3088 | 3234 | |
---|
3089 | | - tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
3090 | | - |
---|
3091 | 3235 | /* Determine how many packets and what bytes were acked, tso and else */ |
---|
3092 | 3236 | if (after(scb->end_seq, tp->snd_una)) { |
---|
3093 | 3237 | if (tcp_skb_pcount(skb) == 1 || |
---|
.. | .. |
---|
3107 | 3251 | tp->retrans_out -= acked_pcount; |
---|
3108 | 3252 | flag |= FLAG_RETRANS_DATA_ACKED; |
---|
3109 | 3253 | } else if (!(sacked & TCPCB_SACKED_ACKED)) { |
---|
3110 | | - last_ackt = skb->skb_mstamp; |
---|
| 3254 | + last_ackt = tcp_skb_timestamp_us(skb); |
---|
3111 | 3255 | WARN_ON_ONCE(last_ackt == 0); |
---|
3112 | 3256 | if (!first_ackt) |
---|
3113 | 3257 | first_ackt = last_ackt; |
---|
.. | .. |
---|
3122 | 3266 | if (sacked & TCPCB_SACKED_ACKED) { |
---|
3123 | 3267 | tp->sacked_out -= acked_pcount; |
---|
3124 | 3268 | } else if (tcp_is_sack(tp)) { |
---|
3125 | | - tp->delivered += acked_pcount; |
---|
| 3269 | + tcp_count_delivered(tp, acked_pcount, ece_ack); |
---|
3126 | 3270 | if (!tcp_skb_spurious_retrans(tp, skb)) |
---|
3127 | 3271 | tcp_rack_advance(tp, sacked, scb->end_seq, |
---|
3128 | | - skb->skb_mstamp); |
---|
| 3272 | + tcp_skb_timestamp_us(skb)); |
---|
3129 | 3273 | } |
---|
3130 | 3274 | if (sacked & TCPCB_LOST) |
---|
3131 | 3275 | tp->lost_out -= acked_pcount; |
---|
.. | .. |
---|
3151 | 3295 | if (!fully_acked) |
---|
3152 | 3296 | break; |
---|
3153 | 3297 | |
---|
| 3298 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
| 3299 | + |
---|
3154 | 3300 | next = skb_rb_next(skb); |
---|
3155 | 3301 | if (unlikely(skb == tp->retransmit_skb_hint)) |
---|
3156 | 3302 | tp->retransmit_skb_hint = NULL; |
---|
.. | .. |
---|
3166 | 3312 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) |
---|
3167 | 3313 | tp->snd_up = tp->snd_una; |
---|
3168 | 3314 | |
---|
3169 | | - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
---|
3170 | | - flag |= FLAG_SACK_RENEGING; |
---|
| 3315 | + if (skb) { |
---|
| 3316 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
| 3317 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
| 3318 | + flag |= FLAG_SACK_RENEGING; |
---|
| 3319 | + } |
---|
3171 | 3320 | |
---|
3172 | 3321 | if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { |
---|
3173 | 3322 | seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); |
---|
.. | .. |
---|
3199 | 3348 | } |
---|
3200 | 3349 | |
---|
3201 | 3350 | if (tcp_is_reno(tp)) { |
---|
3202 | | - tcp_remove_reno_sacks(sk, pkts_acked); |
---|
| 3351 | + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); |
---|
3203 | 3352 | |
---|
3204 | 3353 | /* If any of the cumulatively ACKed segments was |
---|
3205 | 3354 | * retransmitted, non-SACK case cannot confirm that |
---|
.. | .. |
---|
3220 | 3369 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
---|
3221 | 3370 | } |
---|
3222 | 3371 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
---|
3223 | | - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { |
---|
| 3372 | + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, |
---|
| 3373 | + tcp_skb_timestamp_us(skb))) { |
---|
3224 | 3374 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
---|
3225 | 3375 | * after when the head was last (re)transmitted. Otherwise the |
---|
3226 | 3376 | * timeout may continue to extend in loss recovery. |
---|
.. | .. |
---|
3273 | 3423 | return; |
---|
3274 | 3424 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { |
---|
3275 | 3425 | icsk->icsk_backoff = 0; |
---|
| 3426 | + icsk->icsk_probes_tstamp = 0; |
---|
3276 | 3427 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
---|
3277 | 3428 | /* Socket must be waked up by subsequent tcp_data_snd_check(). |
---|
3278 | 3429 | * This function is not for random using! |
---|
.. | .. |
---|
3280 | 3431 | } else { |
---|
3281 | 3432 | unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); |
---|
3282 | 3433 | |
---|
3283 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
---|
3284 | | - when, TCP_RTO_MAX); |
---|
| 3434 | + when = tcp_clamp_probe0_to_user_timeout(sk, when); |
---|
| 3435 | + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); |
---|
3285 | 3436 | } |
---|
3286 | 3437 | } |
---|
3287 | 3438 | |
---|
.. | .. |
---|
3300 | 3451 | * new SACK or ECE mark may first advance cwnd here and later reduce |
---|
3301 | 3452 | * cwnd in tcp_fastretrans_alert() based on more states. |
---|
3302 | 3453 | */ |
---|
3303 | | - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) |
---|
| 3454 | + if (tcp_sk(sk)->reordering > |
---|
| 3455 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) |
---|
3304 | 3456 | return flag & FLAG_FORWARD_PROGRESS; |
---|
3305 | 3457 | |
---|
3306 | 3458 | return flag & FLAG_DATA_ACKED; |
---|
.. | .. |
---|
3412 | 3564 | if (*last_oow_ack_time) { |
---|
3413 | 3565 | s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
---|
3414 | 3566 | |
---|
3415 | | - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
---|
| 3567 | + if (0 <= elapsed && |
---|
| 3568 | + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { |
---|
3416 | 3569 | NET_INC_STATS(net, mib_idx); |
---|
3417 | 3570 | return true; /* rate-limited: don't send yet! */ |
---|
3418 | 3571 | } |
---|
.. | .. |
---|
3459 | 3612 | |
---|
3460 | 3613 | /* Then check host-wide RFC 5961 rate limit. */ |
---|
3461 | 3614 | now = jiffies / HZ; |
---|
3462 | | - if (now != challenge_timestamp) { |
---|
3463 | | - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
---|
| 3615 | + if (now != READ_ONCE(challenge_timestamp)) { |
---|
| 3616 | + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); |
---|
3464 | 3617 | u32 half = (ack_limit + 1) >> 1; |
---|
3465 | 3618 | |
---|
3466 | | - challenge_timestamp = now; |
---|
| 3619 | + WRITE_ONCE(challenge_timestamp, now); |
---|
3467 | 3620 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
---|
3468 | 3621 | } |
---|
3469 | 3622 | count = READ_ONCE(challenge_count); |
---|
.. | .. |
---|
3544 | 3697 | { |
---|
3545 | 3698 | struct tcp_sock *tp = tcp_sk(sk); |
---|
3546 | 3699 | |
---|
3547 | | - if (rexmit == REXMIT_NONE) |
---|
| 3700 | + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) |
---|
3548 | 3701 | return; |
---|
3549 | 3702 | |
---|
3550 | | - if (unlikely(rexmit == 2)) { |
---|
| 3703 | + if (unlikely(rexmit == REXMIT_NEW)) { |
---|
3551 | 3704 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), |
---|
3552 | 3705 | TCP_NAGLE_OFF); |
---|
3553 | 3706 | if (after(tp->snd_nxt, tp->high_seq)) |
---|
.. | .. |
---|
3566 | 3719 | |
---|
3567 | 3720 | delivered = tp->delivered - prior_delivered; |
---|
3568 | 3721 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); |
---|
3569 | | - if (flag & FLAG_ECE) { |
---|
3570 | | - tp->delivered_ce += delivered; |
---|
| 3722 | + if (flag & FLAG_ECE) |
---|
3571 | 3723 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); |
---|
3572 | | - } |
---|
| 3724 | + |
---|
3573 | 3725 | return delivered; |
---|
3574 | 3726 | } |
---|
3575 | 3727 | |
---|
.. | .. |
---|
3584 | 3736 | bool is_sack_reneg = tp->is_sack_reneg; |
---|
3585 | 3737 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
---|
3586 | 3738 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
---|
3587 | | - bool is_dupack = false; |
---|
| 3739 | + int num_dupack = 0; |
---|
3588 | 3740 | int prior_packets = tp->packets_out; |
---|
3589 | 3741 | u32 delivered = tp->delivered; |
---|
3590 | 3742 | u32 lost = tp->lost; |
---|
.. | .. |
---|
3593 | 3745 | |
---|
3594 | 3746 | sack_state.first_sackt = 0; |
---|
3595 | 3747 | sack_state.rate = &rs; |
---|
| 3748 | + sack_state.sack_delivered = 0; |
---|
3596 | 3749 | |
---|
3597 | 3750 | /* We very likely will need to access rtx queue. */ |
---|
3598 | 3751 | prefetch(sk->tcp_rtx_queue.rb_node); |
---|
.. | .. |
---|
3614 | 3767 | * this segment (RFC793 Section 3.9). |
---|
3615 | 3768 | */ |
---|
3616 | 3769 | if (after(ack, tp->snd_nxt)) |
---|
3617 | | - goto invalid_ack; |
---|
| 3770 | + return -1; |
---|
3618 | 3771 | |
---|
3619 | 3772 | if (after(ack, prior_snd_una)) { |
---|
3620 | 3773 | flag |= FLAG_SND_UNA_ADVANCED; |
---|
3621 | 3774 | icsk->icsk_retransmits = 0; |
---|
3622 | 3775 | |
---|
3623 | 3776 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
---|
3624 | | - if (static_branch_unlikely(&clean_acked_data_enabled)) |
---|
| 3777 | + if (static_branch_unlikely(&clean_acked_data_enabled.key)) |
---|
3625 | 3778 | if (icsk->icsk_clean_acked) |
---|
3626 | 3779 | icsk->icsk_clean_acked(sk, ack); |
---|
3627 | 3780 | #endif |
---|
.. | .. |
---|
3636 | 3789 | if (flag & FLAG_UPDATE_TS_RECENT) |
---|
3637 | 3790 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); |
---|
3638 | 3791 | |
---|
3639 | | - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
---|
| 3792 | + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == |
---|
| 3793 | + FLAG_SND_UNA_ADVANCED) { |
---|
3640 | 3794 | /* Window is constant, pure forward advance. |
---|
3641 | 3795 | * No more checks are required. |
---|
3642 | 3796 | * Note, we use the fact that SND.UNA>=SND.WL2. |
---|
.. | .. |
---|
3667 | 3821 | ack_ev_flags |= CA_ACK_ECE; |
---|
3668 | 3822 | } |
---|
3669 | 3823 | |
---|
| 3824 | + if (sack_state.sack_delivered) |
---|
| 3825 | + tcp_count_delivered(tp, sack_state.sack_delivered, |
---|
| 3826 | + flag & FLAG_ECE); |
---|
| 3827 | + |
---|
3670 | 3828 | if (flag & FLAG_WIN_UPDATE) |
---|
3671 | 3829 | ack_ev_flags |= CA_ACK_WIN_UPDATE; |
---|
3672 | 3830 | |
---|
.. | .. |
---|
3692 | 3850 | goto no_queue; |
---|
3693 | 3851 | |
---|
3694 | 3852 | /* See if we can take anything off of the retransmit queue. */ |
---|
3695 | | - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); |
---|
| 3853 | + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, |
---|
| 3854 | + flag & FLAG_ECE); |
---|
3696 | 3855 | |
---|
3697 | 3856 | tcp_rack_update_reo_wnd(sk, &rs); |
---|
3698 | 3857 | |
---|
.. | .. |
---|
3700 | 3859 | tcp_process_tlp_ack(sk, ack, flag); |
---|
3701 | 3860 | |
---|
3702 | 3861 | if (tcp_ack_is_dubious(sk, flag)) { |
---|
3703 | | - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
---|
3704 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3862 | + if (!(flag & (FLAG_SND_UNA_ADVANCED | |
---|
| 3863 | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { |
---|
| 3864 | + num_dupack = 1; |
---|
| 3865 | + /* Consider if pure acks were aggregated in tcp_add_backlog() */ |
---|
| 3866 | + if (!(flag & FLAG_DATA)) |
---|
| 3867 | + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
---|
| 3868 | + } |
---|
| 3869 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3705 | 3870 | &rexmit); |
---|
3706 | 3871 | } |
---|
3707 | 3872 | |
---|
.. | .. |
---|
3723 | 3888 | no_queue: |
---|
3724 | 3889 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
---|
3725 | 3890 | if (flag & FLAG_DSACKING_ACK) { |
---|
3726 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3891 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3727 | 3892 | &rexmit); |
---|
3728 | 3893 | tcp_newly_delivered(sk, delivered, flag); |
---|
3729 | 3894 | } |
---|
.. | .. |
---|
3737 | 3902 | tcp_process_tlp_ack(sk, ack, flag); |
---|
3738 | 3903 | return 1; |
---|
3739 | 3904 | |
---|
3740 | | -invalid_ack: |
---|
3741 | | - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
---|
3742 | | - return -1; |
---|
3743 | | - |
---|
3744 | 3905 | old_ack: |
---|
3745 | 3906 | /* If data was SACKed, tag it and see if we should send more data. |
---|
3746 | 3907 | * If data was DSACKed, see if we can undo a cwnd reduction. |
---|
.. | .. |
---|
3748 | 3909 | if (TCP_SKB_CB(skb)->sacked) { |
---|
3749 | 3910 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
---|
3750 | 3911 | &sack_state); |
---|
3751 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3912 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3752 | 3913 | &rexmit); |
---|
3753 | 3914 | tcp_newly_delivered(sk, delivered, flag); |
---|
3754 | 3915 | tcp_xmit_recovery(sk, rexmit); |
---|
3755 | 3916 | } |
---|
3756 | 3917 | |
---|
3757 | | - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
---|
3758 | 3918 | return 0; |
---|
3759 | 3919 | } |
---|
3760 | 3920 | |
---|
.. | .. |
---|
3775 | 3935 | foc->exp = exp_opt; |
---|
3776 | 3936 | } |
---|
3777 | 3937 | |
---|
3778 | | -static void smc_parse_options(const struct tcphdr *th, |
---|
| 3938 | +static bool smc_parse_options(const struct tcphdr *th, |
---|
3779 | 3939 | struct tcp_options_received *opt_rx, |
---|
3780 | 3940 | const unsigned char *ptr, |
---|
3781 | 3941 | int opsize) |
---|
.. | .. |
---|
3784 | 3944 | if (static_branch_unlikely(&tcp_have_smc)) { |
---|
3785 | 3945 | if (th->syn && !(opsize & 1) && |
---|
3786 | 3946 | opsize >= TCPOLEN_EXP_SMC_BASE && |
---|
3787 | | - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) |
---|
| 3947 | + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { |
---|
3788 | 3948 | opt_rx->smc_ok = 1; |
---|
| 3949 | + return true; |
---|
| 3950 | + } |
---|
3789 | 3951 | } |
---|
3790 | 3952 | #endif |
---|
| 3953 | + return false; |
---|
| 3954 | +} |
---|
| 3955 | + |
---|
| 3956 | +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped |
---|
| 3957 | + * value on success. |
---|
| 3958 | + */ |
---|
| 3959 | +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) |
---|
| 3960 | +{ |
---|
| 3961 | + const unsigned char *ptr = (const unsigned char *)(th + 1); |
---|
| 3962 | + int length = (th->doff * 4) - sizeof(struct tcphdr); |
---|
| 3963 | + u16 mss = 0; |
---|
| 3964 | + |
---|
| 3965 | + while (length > 0) { |
---|
| 3966 | + int opcode = *ptr++; |
---|
| 3967 | + int opsize; |
---|
| 3968 | + |
---|
| 3969 | + switch (opcode) { |
---|
| 3970 | + case TCPOPT_EOL: |
---|
| 3971 | + return mss; |
---|
| 3972 | + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
---|
| 3973 | + length--; |
---|
| 3974 | + continue; |
---|
| 3975 | + default: |
---|
| 3976 | + if (length < 2) |
---|
| 3977 | + return mss; |
---|
| 3978 | + opsize = *ptr++; |
---|
| 3979 | + if (opsize < 2) /* "silly options" */ |
---|
| 3980 | + return mss; |
---|
| 3981 | + if (opsize > length) |
---|
| 3982 | + return mss; /* fail on partial options */ |
---|
| 3983 | + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { |
---|
| 3984 | + u16 in_mss = get_unaligned_be16(ptr); |
---|
| 3985 | + |
---|
| 3986 | + if (in_mss) { |
---|
| 3987 | + if (user_mss && user_mss < in_mss) |
---|
| 3988 | + in_mss = user_mss; |
---|
| 3989 | + mss = in_mss; |
---|
| 3990 | + } |
---|
| 3991 | + } |
---|
| 3992 | + ptr += opsize - 2; |
---|
| 3993 | + length -= opsize; |
---|
| 3994 | + } |
---|
| 3995 | + } |
---|
| 3996 | + return mss; |
---|
3791 | 3997 | } |
---|
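Not part of the diff above: a minimal, self-contained userspace sketch of the same parse-and-clamp rule that tcp_parse_mss_option() applies. The raw bytes, the user_mss value and the helper are illustrative assumptions; only the option layout (kind 2, length 4, 16-bit value in network byte order) comes from RFC 793.

```c
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* MSS option as it appears on the wire: kind 2, length 4, value 1460. */
	const uint8_t opt[] = { 2, 4, 0x05, 0xb4 };
	uint16_t user_mss = 1400;	/* illustrative TCP_MAXSEG-style cap */
	uint16_t in_mss, mss = 0;

	if (opt[0] == 2 && opt[1] == 4) {		/* TCPOPT_MSS / TCPOLEN_MSS */
		memcpy(&in_mss, &opt[2], sizeof(in_mss));
		in_mss = ntohs(in_mss);			/* 1460 */
		if (user_mss && user_mss < in_mss)	/* clamp, as the parser above does */
			in_mss = user_mss;
		mss = in_mss;
	}
	printf("parsed mss = %u\n", mss);		/* prints 1400 */
	return 0;
}
```

A zero return keeps the caller on af_ops->mss_clamp, which is exactly how tcp_get_syncookie_mss() later in this diff uses the helper.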
3792 | 3998 | |
---|
3793 | 3999 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
---|
.. | .. |
---|
3805 | 4011 | |
---|
3806 | 4012 | ptr = (const unsigned char *)(th + 1); |
---|
3807 | 4013 | opt_rx->saw_tstamp = 0; |
---|
| 4014 | + opt_rx->saw_unknown = 0; |
---|
3808 | 4015 | |
---|
3809 | 4016 | while (length > 0) { |
---|
3810 | 4017 | int opcode = *ptr++; |
---|
.. | .. |
---|
3817 | 4024 | length--; |
---|
3818 | 4025 | continue; |
---|
3819 | 4026 | default: |
---|
| 4027 | + if (length < 2) |
---|
| 4028 | + return; |
---|
3820 | 4029 | opsize = *ptr++; |
---|
3821 | 4030 | if (opsize < 2) /* "silly options" */ |
---|
3822 | 4031 | return; |
---|
.. | .. |
---|
3836 | 4045 | break; |
---|
3837 | 4046 | case TCPOPT_WINDOW: |
---|
3838 | 4047 | if (opsize == TCPOLEN_WINDOW && th->syn && |
---|
3839 | | - !estab && net->ipv4.sysctl_tcp_window_scaling) { |
---|
| 4048 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { |
---|
3840 | 4049 | __u8 snd_wscale = *(__u8 *)ptr; |
---|
3841 | 4050 | opt_rx->wscale_ok = 1; |
---|
3842 | 4051 | if (snd_wscale > TCP_MAX_WSCALE) { |
---|
.. | .. |
---|
3852 | 4061 | case TCPOPT_TIMESTAMP: |
---|
3853 | 4062 | if ((opsize == TCPOLEN_TIMESTAMP) && |
---|
3854 | 4063 | ((estab && opt_rx->tstamp_ok) || |
---|
3855 | | - (!estab && net->ipv4.sysctl_tcp_timestamps))) { |
---|
| 4064 | + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { |
---|
3856 | 4065 | opt_rx->saw_tstamp = 1; |
---|
3857 | 4066 | opt_rx->rcv_tsval = get_unaligned_be32(ptr); |
---|
3858 | 4067 | opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); |
---|
.. | .. |
---|
3860 | 4069 | break; |
---|
3861 | 4070 | case TCPOPT_SACK_PERM: |
---|
3862 | 4071 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
---|
3863 | | - !estab && net->ipv4.sysctl_tcp_sack) { |
---|
| 4072 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { |
---|
3864 | 4073 | opt_rx->sack_ok = TCP_SACK_SEEN; |
---|
3865 | 4074 | tcp_sack_reset(opt_rx); |
---|
3866 | 4075 | } |
---|
.. | .. |
---|
3893 | 4102 | */ |
---|
3894 | 4103 | if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && |
---|
3895 | 4104 | get_unaligned_be16(ptr) == |
---|
3896 | | - TCPOPT_FASTOPEN_MAGIC) |
---|
| 4105 | + TCPOPT_FASTOPEN_MAGIC) { |
---|
3897 | 4106 | tcp_parse_fastopen_option(opsize - |
---|
3898 | 4107 | TCPOLEN_EXP_FASTOPEN_BASE, |
---|
3899 | 4108 | ptr + 2, th->syn, foc, true); |
---|
3900 | | - else |
---|
3901 | | - smc_parse_options(th, opt_rx, ptr, |
---|
3902 | | - opsize); |
---|
| 4109 | + break; |
---|
| 4110 | + } |
---|
| 4111 | + |
---|
| 4112 | + if (smc_parse_options(th, opt_rx, ptr, opsize)) |
---|
| 4113 | + break; |
---|
| 4114 | + |
---|
| 4115 | + opt_rx->saw_unknown = 1; |
---|
3903 | 4116 | break; |
---|
3904 | 4117 | |
---|
| 4118 | + default: |
---|
| 4119 | + opt_rx->saw_unknown = 1; |
---|
3905 | 4120 | } |
---|
3906 | 4121 | ptr += opsize-2; |
---|
3907 | 4122 | length -= opsize; |
---|
.. | .. |
---|
4117 | 4332 | case TCP_ESTABLISHED: |
---|
4118 | 4333 | /* Move to CLOSE_WAIT */ |
---|
4119 | 4334 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
---|
4120 | | - inet_csk(sk)->icsk_ack.pingpong = 1; |
---|
| 4335 | + inet_csk_enter_pingpong_mode(sk); |
---|
4121 | 4336 | break; |
---|
4122 | 4337 | |
---|
4123 | 4338 | case TCP_CLOSE_WAIT: |
---|
.. | .. |
---|
4189 | 4404 | { |
---|
4190 | 4405 | struct tcp_sock *tp = tcp_sk(sk); |
---|
4191 | 4406 | |
---|
4192 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
---|
| 4407 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
---|
4193 | 4408 | int mib_idx; |
---|
4194 | 4409 | |
---|
4195 | 4410 | if (before(seq, tp->rcv_nxt)) |
---|
.. | .. |
---|
4215 | 4430 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
---|
4216 | 4431 | } |
---|
4217 | 4432 | |
---|
| 4433 | +static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) |
---|
| 4434 | +{ |
---|
| 4435 | + /* When the ACK path fails or drops most ACKs, the sender would |
---|
| 4436 | + * timeout and spuriously retransmit the same segment repeatedly. |
---|
| 4437 | + * The receiver remembers and reflects via DSACKs. Leverage the |
---|
| 4438 | + * DSACK state and change the txhash to re-route speculatively. |
---|
| 4439 | + */ |
---|
| 4440 | + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && |
---|
| 4441 | + sk_rethink_txhash(sk)) |
---|
| 4442 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); |
---|
| 4443 | +} |
---|
| 4444 | + |
---|
4218 | 4445 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) |
---|
4219 | 4446 | { |
---|
4220 | 4447 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
4224 | 4451 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
---|
4225 | 4452 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
---|
4226 | 4453 | |
---|
4227 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
---|
| 4454 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
---|
4228 | 4455 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
---|
4229 | 4456 | |
---|
| 4457 | + tcp_rcv_spurious_retrans(sk, skb); |
---|
4230 | 4458 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
---|
4231 | 4459 | end_seq = tp->rcv_nxt; |
---|
4232 | 4460 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); |
---|
.. | .. |
---|
4260 | 4488 | sp[i] = sp[i + 1]; |
---|
4261 | 4489 | continue; |
---|
4262 | 4490 | } |
---|
4263 | | - this_sack++, swalk++; |
---|
| 4491 | + this_sack++; |
---|
| 4492 | + swalk++; |
---|
4264 | 4493 | } |
---|
4265 | 4494 | } |
---|
| 4495 | + |
---|
| 4496 | +static void tcp_sack_compress_send_ack(struct sock *sk) |
---|
| 4497 | +{ |
---|
| 4498 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 4499 | + |
---|
| 4500 | + if (!tp->compressed_ack) |
---|
| 4501 | + return; |
---|
| 4502 | + |
---|
| 4503 | + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
---|
| 4504 | + __sock_put(sk); |
---|
| 4505 | + |
---|
| 4506 | + /* Since we have to send one ack finally, |
---|
| 4507 | + * subtract one from tp->compressed_ack to keep
---|
| 4508 | + * LINUX_MIB_TCPACKCOMPRESSED accurate. |
---|
| 4509 | + */ |
---|
| 4510 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
---|
| 4511 | + tp->compressed_ack - 1); |
---|
| 4512 | + |
---|
| 4513 | + tp->compressed_ack = 0; |
---|
| 4514 | + tcp_send_ack(sk); |
---|
| 4515 | +} |
---|
| 4516 | + |
---|
| 4517 | +/* Reasonable amount of sack blocks included in TCP SACK option |
---|
| 4518 | + * The max is 4, but this becomes 3 if TCP timestamps are there. |
---|
| 4519 | + * Given that SACK packets might be lost, be conservative and use 2. |
---|
| 4520 | + */ |
---|
| 4521 | +#define TCP_SACK_BLOCKS_EXPECTED 2 |
---|
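Not part of the diff: a quick compile-time check of the "max is 4, becomes 3 with timestamps" arithmetic from the comment above. The constant names and values mirror the usual definitions in include/net/tcp.h (40 bytes of option space, 12 for the aligned timestamp option, 2 bytes of SACK option header, 8 bytes per block); the 2 NOP bytes typically used to align a bare SACK option are ignored here, which does not change the result.

```c
/* Standalone C11 snippet; names copied from include/net/tcp.h for clarity. */
#define MAX_TCP_OPTION_SPACE	40
#define TCPOLEN_TSTAMP_ALIGNED	12
#define TCPOLEN_SACK_BASE	2
#define TCPOLEN_SACK_PERBLOCK	8

_Static_assert((MAX_TCP_OPTION_SPACE - TCPOLEN_SACK_BASE) /
	       TCPOLEN_SACK_PERBLOCK == 4,
	       "without timestamps up to 4 SACK blocks fit");
_Static_assert((MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED - TCPOLEN_SACK_BASE) /
	       TCPOLEN_SACK_PERBLOCK == 3,
	       "with timestamps only 3 SACK blocks fit");
```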
4266 | 4522 | |
---|
4267 | 4523 | static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) |
---|
4268 | 4524 | { |
---|
.. | .. |
---|
4276 | 4532 | |
---|
4277 | 4533 | for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { |
---|
4278 | 4534 | if (tcp_sack_extend(sp, seq, end_seq)) { |
---|
| 4535 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
---|
| 4536 | + tcp_sack_compress_send_ack(sk); |
---|
4279 | 4537 | /* Rotate this_sack to the first one. */ |
---|
4280 | 4538 | for (; this_sack > 0; this_sack--, sp--) |
---|
4281 | 4539 | swap(*sp, *(sp - 1)); |
---|
.. | .. |
---|
4285 | 4543 | } |
---|
4286 | 4544 | } |
---|
4287 | 4545 | |
---|
| 4546 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
---|
| 4547 | + tcp_sack_compress_send_ack(sk); |
---|
| 4548 | + |
---|
4288 | 4549 | /* Could not find an adjacent existing SACK, build a new one, |
---|
4289 | 4550 | * put it at the front, and shift everyone else down. We |
---|
4290 | 4551 | * always know there is at least one SACK present already here. |
---|
.. | .. |
---|
4292 | 4553 | * If the sack array is full, forget about the last one. |
---|
4293 | 4554 | */ |
---|
4294 | 4555 | if (this_sack >= TCP_NUM_SACKS) { |
---|
4295 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
---|
4296 | | - tcp_send_ack(sk); |
---|
4297 | 4556 | this_sack--; |
---|
4298 | 4557 | tp->rx_opt.num_sacks--; |
---|
4299 | 4558 | sp--; |
---|
.. | .. |
---|
4345 | 4604 | /** |
---|
4346 | 4605 | * tcp_try_coalesce - try to merge skb to prior one |
---|
4347 | 4606 | * @sk: socket |
---|
4348 | | - * @dest: destination queue |
---|
4349 | 4607 | * @to: prior buffer |
---|
4350 | 4608 | * @from: buffer to add in queue |
---|
4351 | 4609 | * @fragstolen: pointer to boolean |
---|
.. | .. |
---|
4367 | 4625 | |
---|
4368 | 4626 | /* It's possible this segment overlaps with prior segment in queue */
---|
4369 | 4627 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) |
---|
| 4628 | + return false; |
---|
| 4629 | + |
---|
| 4630 | + if (!mptcp_skb_can_collapse(to, from)) |
---|
4370 | 4631 | return false; |
---|
4371 | 4632 | |
---|
4372 | 4633 | #ifdef CONFIG_TLS_DEVICE |
---|
.. | .. |
---|
4412 | 4673 | |
---|
4413 | 4674 | static void tcp_drop(struct sock *sk, struct sk_buff *skb) |
---|
4414 | 4675 | { |
---|
| 4676 | + trace_android_vh_kfree_skb(skb); |
---|
4415 | 4677 | sk_drops_add(sk, skb); |
---|
4416 | 4678 | __kfree_skb(skb); |
---|
4417 | 4679 | } |
---|
.. | .. |
---|
4443 | 4705 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
---|
4444 | 4706 | |
---|
4445 | 4707 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
---|
4446 | | - SOCK_DEBUG(sk, "ofo packet was already received\n"); |
---|
4447 | 4708 | tcp_drop(sk, skb); |
---|
4448 | 4709 | continue; |
---|
4449 | 4710 | } |
---|
4450 | | - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", |
---|
4451 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
---|
4452 | | - TCP_SKB_CB(skb)->end_seq); |
---|
4453 | 4711 | |
---|
4454 | 4712 | tail = skb_peek_tail(&sk->sk_receive_queue); |
---|
4455 | 4713 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
---|
.. | .. |
---|
4511 | 4769 | tp->pred_flags = 0; |
---|
4512 | 4770 | inet_csk_schedule_ack(sk); |
---|
4513 | 4771 | |
---|
| 4772 | + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
---|
4514 | 4773 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); |
---|
4515 | 4774 | seq = TCP_SKB_CB(skb)->seq; |
---|
4516 | 4775 | end_seq = TCP_SKB_CB(skb)->end_seq; |
---|
4517 | | - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
---|
4518 | | - tp->rcv_nxt, seq, end_seq); |
---|
4519 | 4776 | |
---|
4520 | 4777 | p = &tp->out_of_order_queue.rb_node; |
---|
4521 | 4778 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
---|
.. | .. |
---|
4541 | 4798 | * and trigger fast retransmit. |
---|
4542 | 4799 | */ |
---|
4543 | 4800 | if (tcp_is_sack(tp)) |
---|
4544 | | - tcp_grow_window(sk, skb); |
---|
| 4801 | + tcp_grow_window(sk, skb, true); |
---|
4545 | 4802 | kfree_skb_partial(skb, fragstolen); |
---|
4546 | 4803 | skb = NULL; |
---|
4547 | 4804 | goto add_sack; |
---|
.. | .. |
---|
4629 | 4886 | * and trigger fast retransmit. |
---|
4630 | 4887 | */ |
---|
4631 | 4888 | if (tcp_is_sack(tp)) |
---|
4632 | | - tcp_grow_window(sk, skb); |
---|
| 4889 | + tcp_grow_window(sk, skb, false); |
---|
4633 | 4890 | skb_condense(skb); |
---|
4634 | 4891 | skb_set_owner_r(skb, sk); |
---|
4635 | 4892 | } |
---|
4636 | 4893 | } |
---|
4637 | 4894 | |
---|
4638 | | -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, |
---|
4639 | | - bool *fragstolen) |
---|
| 4895 | +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, |
---|
| 4896 | + bool *fragstolen) |
---|
4640 | 4897 | { |
---|
4641 | 4898 | int eaten; |
---|
4642 | 4899 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); |
---|
4643 | 4900 | |
---|
4644 | | - __skb_pull(skb, hdrlen); |
---|
4645 | 4901 | eaten = (tail && |
---|
4646 | 4902 | tcp_try_coalesce(sk, tail, |
---|
4647 | 4903 | skb, fragstolen)) ? 1 : 0; |
---|
.. | .. |
---|
4692 | 4948 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
---|
4693 | 4949 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
---|
4694 | 4950 | |
---|
4695 | | - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
---|
| 4951 | + if (tcp_queue_rcv(sk, skb, &fragstolen)) { |
---|
4696 | 4952 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
---|
4697 | 4953 | __kfree_skb(skb); |
---|
4698 | 4954 | } |
---|
.. | .. |
---|
4724 | 4980 | bool fragstolen; |
---|
4725 | 4981 | int eaten; |
---|
4726 | 4982 | |
---|
| 4983 | + if (sk_is_mptcp(sk)) |
---|
| 4984 | + mptcp_incoming_options(sk, skb); |
---|
| 4985 | + |
---|
4727 | 4986 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { |
---|
4728 | 4987 | __kfree_skb(skb); |
---|
4729 | 4988 | return; |
---|
.. | .. |
---|
4753 | 5012 | goto drop; |
---|
4754 | 5013 | } |
---|
4755 | 5014 | |
---|
4756 | | - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); |
---|
| 5015 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
---|
4757 | 5016 | if (skb->len) |
---|
4758 | 5017 | tcp_event_data_recv(sk, skb); |
---|
4759 | 5018 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
---|
.. | .. |
---|
4782 | 5041 | } |
---|
4783 | 5042 | |
---|
4784 | 5043 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
---|
| 5044 | + tcp_rcv_spurious_retrans(sk, skb); |
---|
4785 | 5045 | /* A retransmit, 2nd most common case. Force an immediate ack. */ |
---|
4786 | 5046 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
---|
4787 | 5047 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
---|
.. | .. |
---|
4800 | 5060 | |
---|
4801 | 5061 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
---|
4802 | 5062 | /* Partial packet, seq < rcv_next < end_seq */ |
---|
4803 | | - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", |
---|
4804 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
---|
4805 | | - TCP_SKB_CB(skb)->end_seq); |
---|
4806 | | - |
---|
4807 | 5063 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); |
---|
4808 | 5064 | |
---|
4809 | 5065 | /* If window is closed, drop tail of packet. But after |
---|
.. | .. |
---|
4897 | 5153 | /* The first skb to collapse is: |
---|
4898 | 5154 | * - not SYN/FIN and |
---|
4899 | 5155 | * - bloated or contains data before "start" or |
---|
4900 | | - * overlaps to the next one. |
---|
| 5156 | + * overlaps the next one and MPTCP allows collapsing.
---|
4901 | 5157 | */ |
---|
4902 | 5158 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
---|
4903 | 5159 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
---|
.. | .. |
---|
4906 | 5162 | break; |
---|
4907 | 5163 | } |
---|
4908 | 5164 | |
---|
4909 | | - if (n && n != tail && |
---|
| 5165 | + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && |
---|
4910 | 5166 | TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { |
---|
4911 | 5167 | end_of_skbs = false; |
---|
4912 | 5168 | break; |
---|
.. | .. |
---|
4939 | 5195 | else |
---|
4940 | 5196 | __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ |
---|
4941 | 5197 | skb_set_owner_r(nskb, sk); |
---|
| 5198 | + mptcp_skb_ext_move(nskb, skb); |
---|
4942 | 5199 | |
---|
4943 | 5200 | /* Copy data, releasing collapsed skbs. */ |
---|
4944 | 5201 | while (copy > 0) { |
---|
.. | .. |
---|
4958 | 5215 | skb = tcp_collapse_one(sk, skb, list, root); |
---|
4959 | 5216 | if (!skb || |
---|
4960 | 5217 | skb == tail || |
---|
| 5218 | + !mptcp_skb_can_collapse(nskb, skb) || |
---|
4961 | 5219 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
---|
4962 | 5220 | goto end; |
---|
4963 | 5221 | #ifdef CONFIG_TLS_DEVICE |
---|
.. | .. |
---|
5082 | 5340 | { |
---|
5083 | 5341 | struct tcp_sock *tp = tcp_sk(sk); |
---|
5084 | 5342 | |
---|
5085 | | - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); |
---|
5086 | | - |
---|
5087 | 5343 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); |
---|
5088 | 5344 | |
---|
5089 | 5345 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
---|
.. | .. |
---|
5149 | 5405 | return true; |
---|
5150 | 5406 | } |
---|
5151 | 5407 | |
---|
5152 | | -/* When incoming ACK allowed to free some skb from write_queue, |
---|
5153 | | - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket |
---|
5154 | | - * on the exit from tcp input handler. |
---|
5155 | | - * |
---|
5156 | | - * PROBLEM: sndbuf expansion does not work well with largesend. |
---|
5157 | | - */ |
---|
5158 | 5408 | static void tcp_new_space(struct sock *sk) |
---|
5159 | 5409 | { |
---|
5160 | 5410 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
5167 | 5417 | sk->sk_write_space(sk); |
---|
5168 | 5418 | } |
---|
5169 | 5419 | |
---|
5170 | | -static void tcp_check_space(struct sock *sk) |
---|
| 5420 | +/* Caller made space either from: |
---|
| 5421 | + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) |
---|
| 5422 | + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) |
---|
| 5423 | + * |
---|
| 5424 | + * We might be able to generate EPOLLOUT to the application if: |
---|
| 5425 | + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 |
---|
| 5426 | + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became |
---|
| 5427 | + * small enough that tcp_stream_memory_free() decides it |
---|
| 5428 | + * is time to generate EPOLLOUT. |
---|
| 5429 | + */ |
---|
| 5430 | +void tcp_check_space(struct sock *sk) |
---|
5171 | 5431 | { |
---|
5172 | | - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { |
---|
5173 | | - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); |
---|
5174 | | - /* pairs with tcp_poll() */ |
---|
5175 | | - smp_mb(); |
---|
5176 | | - if (sk->sk_socket && |
---|
5177 | | - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
---|
5178 | | - tcp_new_space(sk); |
---|
5179 | | - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
---|
5180 | | - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
---|
5181 | | - } |
---|
| 5432 | + /* pairs with tcp_poll() */ |
---|
| 5433 | + smp_mb(); |
---|
| 5434 | + if (sk->sk_socket && |
---|
| 5435 | + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
---|
| 5436 | + tcp_new_space(sk); |
---|
| 5437 | + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
---|
| 5438 | + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
---|
5182 | 5439 | } |
---|
5183 | 5440 | } |
---|
5184 | 5441 | |
---|
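Not part of the diff: a hedged userspace sketch of the second EPOLLOUT condition described in the new tcp_check_space() comment. With TCP_NOTSENT_LOWAT (available since Linux 3.12), tcp_stream_memory_free() keeps the socket non-writable until the not-yet-sent backlog drops below the threshold; the epoll wiring, the helper name and the 128 KB value are illustration only.

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <sys/socket.h>

#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25		/* from include/uapi/linux/tcp.h */
#endif

/* Wait until the kernel considers @fd writable again, i.e. until
 * tcp_check_space()/tcp_new_space() report EPOLLOUT because the
 * unsent queue has shrunk below the configured low-water mark. */
static int wait_for_write_space(int epfd, int fd)
{
	struct epoll_event ev = { .events = EPOLLOUT, .data.fd = fd };
	int lowat = 128 * 1024;		/* arbitrary illustration value */

	if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat)) < 0)
		return -1;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
		return -1;
	return epoll_wait(epfd, &ev, 1, -1);
}
```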
.. | .. |
---|
5220 | 5477 | } |
---|
5221 | 5478 | |
---|
5222 | 5479 | if (!tcp_is_sack(tp) || |
---|
5223 | | - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) |
---|
| 5480 | + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) |
---|
5224 | 5481 | goto send_now; |
---|
5225 | 5482 | |
---|
5226 | 5483 | if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { |
---|
5227 | 5484 | tp->compressed_ack_rcv_nxt = tp->rcv_nxt; |
---|
5228 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
---|
5229 | | - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
---|
5230 | | - tp->compressed_ack - TCP_FASTRETRANS_THRESH); |
---|
5231 | | - tp->compressed_ack = 0; |
---|
| 5485 | + tp->dup_ack_counter = 0; |
---|
5232 | 5486 | } |
---|
5233 | | - |
---|
5234 | | - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) |
---|
| 5487 | + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { |
---|
| 5488 | + tp->dup_ack_counter++; |
---|
5235 | 5489 | goto send_now; |
---|
5236 | | - |
---|
| 5490 | + } |
---|
| 5491 | + tp->compressed_ack++; |
---|
5237 | 5492 | if (hrtimer_is_queued(&tp->compressed_ack_timer)) |
---|
5238 | 5493 | return; |
---|
5239 | 5494 | |
---|
.. | .. |
---|
5243 | 5498 | if (tp->srtt_us && tp->srtt_us < rtt) |
---|
5244 | 5499 | rtt = tp->srtt_us; |
---|
5245 | 5500 | |
---|
5246 | | - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, |
---|
| 5501 | + delay = min_t(unsigned long, |
---|
| 5502 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), |
---|
5247 | 5503 | rtt * (NSEC_PER_USEC >> 3)/20); |
---|
5248 | 5504 | sock_hold(sk); |
---|
5249 | | - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), |
---|
5250 | | - HRTIMER_MODE_REL_PINNED_SOFT); |
---|
| 5505 | + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), |
---|
| 5506 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), |
---|
| 5507 | + HRTIMER_MODE_REL_PINNED_SOFT); |
---|
5251 | 5508 | } |
---|
5252 | 5509 | |
---|
5253 | 5510 | static inline void tcp_ack_snd_check(struct sock *sk) |
---|
.. | .. |
---|
5274 | 5531 | struct tcp_sock *tp = tcp_sk(sk); |
---|
5275 | 5532 | u32 ptr = ntohs(th->urg_ptr); |
---|
5276 | 5533 | |
---|
5277 | | - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
---|
| 5534 | + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) |
---|
5278 | 5535 | ptr--; |
---|
5279 | 5536 | ptr += ntohl(th->seq); |
---|
5280 | 5537 | |
---|
.. | .. |
---|
5328 | 5585 | } |
---|
5329 | 5586 | |
---|
5330 | 5587 | tp->urg_data = TCP_URG_NOTYET; |
---|
5331 | | - tp->urg_seq = ptr; |
---|
| 5588 | + WRITE_ONCE(tp->urg_seq, ptr); |
---|
5332 | 5589 | |
---|
5333 | 5590 | /* Disable header prediction. */ |
---|
5334 | 5591 | tp->pred_flags = 0; |
---|
.. | .. |
---|
5481 | 5738 | goto discard; |
---|
5482 | 5739 | } |
---|
5483 | 5740 | |
---|
| 5741 | + bpf_skops_parse_hdr(sk, skb); |
---|
| 5742 | + |
---|
5484 | 5743 | return true; |
---|
5485 | 5744 | |
---|
5486 | 5745 | discard: |
---|
.. | .. |
---|
5521 | 5780 | trace_tcp_probe(sk, skb); |
---|
5522 | 5781 | |
---|
5523 | 5782 | tcp_mstamp_refresh(tp); |
---|
5524 | | - if (unlikely(!sk->sk_rx_dst)) |
---|
| 5783 | + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) |
---|
5525 | 5784 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); |
---|
5526 | 5785 | /* |
---|
5527 | 5786 | * Header prediction. |
---|
.. | .. |
---|
5628 | 5887 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); |
---|
5629 | 5888 | |
---|
5630 | 5889 | /* Bulk data transfer: receiver */ |
---|
5631 | | - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, |
---|
5632 | | - &fragstolen); |
---|
| 5890 | + __skb_pull(skb, tcp_header_len); |
---|
| 5891 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
---|
5633 | 5892 | |
---|
5634 | 5893 | tcp_event_data_recv(sk, skb); |
---|
5635 | 5894 | |
---|
.. | .. |
---|
5691 | 5950 | } |
---|
5692 | 5951 | EXPORT_SYMBOL(tcp_rcv_established); |
---|
5693 | 5952 | |
---|
| 5953 | +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) |
---|
| 5954 | +{ |
---|
| 5955 | + struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 5956 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 5957 | + |
---|
| 5958 | + tcp_mtup_init(sk); |
---|
| 5959 | + icsk->icsk_af_ops->rebuild_header(sk); |
---|
| 5960 | + tcp_init_metrics(sk); |
---|
| 5961 | + |
---|
| 5962 | + /* Initialize the congestion window to start the transfer. |
---|
| 5963 | + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
---|
| 5964 | + * retransmitted. In light of RFC6298 more aggressive 1sec |
---|
| 5965 | + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
---|
| 5966 | + * retransmission has occurred. |
---|
| 5967 | + */ |
---|
| 5968 | + if (tp->total_retrans > 1 && tp->undo_marker) |
---|
| 5969 | + tp->snd_cwnd = 1; |
---|
| 5970 | + else |
---|
| 5971 | + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); |
---|
| 5972 | + tp->snd_cwnd_stamp = tcp_jiffies32; |
---|
| 5973 | + |
---|
| 5974 | + bpf_skops_established(sk, bpf_op, skb); |
---|
| 5975 | + /* Initialize congestion control unless BPF initialized it already: */ |
---|
| 5976 | + if (!icsk->icsk_ca_initialized) |
---|
| 5977 | + tcp_init_congestion_control(sk); |
---|
| 5978 | + tcp_init_buffer_space(sk); |
---|
| 5979 | +} |
---|
| 5980 | + |
---|
5694 | 5981 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) |
---|
5695 | 5982 | { |
---|
5696 | 5983 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
5705 | 5992 | sk_mark_napi_id(sk, skb); |
---|
5706 | 5993 | } |
---|
5707 | 5994 | |
---|
5708 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); |
---|
| 5995 | + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); |
---|
5709 | 5996 | |
---|
5710 | 5997 | /* Prevent spurious tcp_cwnd_restart() on first data |
---|
5711 | 5998 | * packet. |
---|
.. | .. |
---|
5760 | 6047 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
---|
5761 | 6048 | |
---|
5762 | 6049 | if (data) { /* Retransmit unacked data in SYN */ |
---|
| 6050 | + if (tp->total_retrans) |
---|
| 6051 | + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; |
---|
| 6052 | + else |
---|
| 6053 | + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; |
---|
5763 | 6054 | skb_rbtree_walk_from(data) { |
---|
5764 | 6055 | if (__tcp_retransmit_skb(sk, data, 1)) |
---|
5765 | 6056 | break; |
---|
.. | .. |
---|
5792 | 6083 | #endif |
---|
5793 | 6084 | } |
---|
5794 | 6085 | |
---|
| 6086 | +static void tcp_try_undo_spurious_syn(struct sock *sk) |
---|
| 6087 | +{ |
---|
| 6088 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 6089 | + u32 syn_stamp; |
---|
| 6090 | + |
---|
| 6091 | + /* undo_marker is set when SYN or SYNACK times out. The timeout is |
---|
| 6092 | + * spurious if the ACK's timestamp option echo value matches the |
---|
| 6093 | + * original SYN timestamp. |
---|
| 6094 | + */ |
---|
| 6095 | + syn_stamp = tp->retrans_stamp; |
---|
| 6096 | + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && |
---|
| 6097 | + syn_stamp == tp->rx_opt.rcv_tsecr) |
---|
| 6098 | + tp->undo_marker = 0; |
---|
| 6099 | +} |
---|
| 6100 | + |
---|
5795 | 6101 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
---|
5796 | 6102 | const struct tcphdr *th) |
---|
5797 | 6103 | { |
---|
.. | .. |
---|
5815 | 6121 | * the segment and return)" |
---|
5816 | 6122 | */ |
---|
5817 | 6123 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
---|
5818 | | - after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) |
---|
| 6124 | + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
---|
| 6125 | + /* Previous FIN/ACK or RST/ACK might be ignored. */ |
---|
| 6126 | + if (icsk->icsk_retransmits == 0) |
---|
| 6127 | + inet_csk_reset_xmit_timer(sk, |
---|
| 6128 | + ICSK_TIME_RETRANS, |
---|
| 6129 | + TCP_TIMEOUT_MIN, TCP_RTO_MAX); |
---|
5819 | 6130 | goto reset_and_undo; |
---|
| 6131 | + } |
---|
5820 | 6132 | |
---|
5821 | 6133 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
---|
5822 | 6134 | !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, |
---|
.. | .. |
---|
5859 | 6171 | tcp_ecn_rcv_synack(tp, th); |
---|
5860 | 6172 | |
---|
5861 | 6173 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
---|
| 6174 | + tcp_try_undo_spurious_syn(sk); |
---|
5862 | 6175 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
---|
5863 | 6176 | |
---|
5864 | 6177 | /* Ok.. it's good. Set up sequence numbers and |
---|
.. | .. |
---|
5912 | 6225 | return -1; |
---|
5913 | 6226 | if (sk->sk_write_pending || |
---|
5914 | 6227 | icsk->icsk_accept_queue.rskq_defer_accept || |
---|
5915 | | - icsk->icsk_ack.pingpong) { |
---|
| 6228 | + inet_csk_in_pingpong_mode(sk)) { |
---|
5916 | 6229 | /* Save one ACK. Data will be ready after |
---|
5917 | 6230 | * several ticks, if write_pending is set. |
---|
5918 | 6231 | * |
---|
.. | .. |
---|
6017 | 6330 | return 1; |
---|
6018 | 6331 | } |
---|
6019 | 6332 | |
---|
| 6333 | +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) |
---|
| 6334 | +{ |
---|
| 6335 | + struct request_sock *req; |
---|
| 6336 | + |
---|
| 6337 | + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows |
---|
| 6338 | + * undo. If peer SACKs triggered fast recovery, we can't undo here. |
---|
| 6339 | + */ |
---|
| 6340 | + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
---|
| 6341 | + tcp_try_undo_loss(sk, false); |
---|
| 6342 | + |
---|
| 6343 | + /* Reset rtx states to prevent spurious retransmits_timed_out() */ |
---|
| 6344 | + tcp_sk(sk)->retrans_stamp = 0; |
---|
| 6345 | + inet_csk(sk)->icsk_retransmits = 0; |
---|
| 6346 | + |
---|
| 6347 | + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, |
---|
| 6348 | + * we no longer need req so release it. |
---|
| 6349 | + */ |
---|
| 6350 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
---|
| 6351 | + lockdep_sock_is_held(sk)); |
---|
| 6352 | + reqsk_fastopen_remove(sk, req, false); |
---|
| 6353 | + |
---|
| 6354 | + /* Re-arm the timer because data may have been sent out. |
---|
| 6355 | + * This is similar to the regular data transmission case |
---|
| 6356 | + * when new data has just been ack'ed. |
---|
| 6357 | + * |
---|
| 6358 | + * (TFO) - we could try to be more aggressive and |
---|
| 6359 | + * retransmitting any data sooner based on when they |
---|
| 6360 | + * are sent out. |
---|
| 6361 | + */ |
---|
| 6362 | + tcp_rearm_rto(sk); |
---|
| 6363 | +} |
---|
| 6364 | + |
---|
6020 | 6365 | /* |
---|
6021 | 6366 | * This function implements the receiving procedure of RFC 793 for |
---|
6022 | 6367 | * all states except ESTABLISHED and TIME_WAIT. |
---|
.. | .. |
---|
6079 | 6424 | |
---|
6080 | 6425 | tcp_mstamp_refresh(tp); |
---|
6081 | 6426 | tp->rx_opt.saw_tstamp = 0; |
---|
6082 | | - req = tp->fastopen_rsk; |
---|
| 6427 | + req = rcu_dereference_protected(tp->fastopen_rsk, |
---|
| 6428 | + lockdep_sock_is_held(sk)); |
---|
6083 | 6429 | if (req) { |
---|
6084 | 6430 | bool req_stolen; |
---|
6085 | 6431 | |
---|
.. | .. |
---|
6113 | 6459 | if (!tp->srtt_us) |
---|
6114 | 6460 | tcp_synack_rtt_meas(sk, req); |
---|
6115 | 6461 | |
---|
6116 | | - /* Once we leave TCP_SYN_RECV, we no longer need req |
---|
6117 | | - * so release it. |
---|
6118 | | - */ |
---|
6119 | 6462 | if (req) { |
---|
6120 | | - inet_csk(sk)->icsk_retransmits = 0; |
---|
6121 | | - reqsk_fastopen_remove(sk, req, false); |
---|
6122 | | - /* Re-arm the timer because data may have been sent out. |
---|
6123 | | - * This is similar to the regular data transmission case |
---|
6124 | | - * when new data has just been ack'ed. |
---|
6125 | | - * |
---|
6126 | | - * (TFO) - we could try to be more aggressive and |
---|
6127 | | - * retransmitting any data sooner based on when they |
---|
6128 | | - * are sent out. |
---|
6129 | | - */ |
---|
6130 | | - tcp_rearm_rto(sk); |
---|
| 6463 | + tcp_rcv_synrecv_state_fastopen(sk); |
---|
6131 | 6464 | } else { |
---|
6132 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); |
---|
| 6465 | + tcp_try_undo_spurious_syn(sk); |
---|
| 6466 | + tp->retrans_stamp = 0; |
---|
| 6467 | + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, |
---|
| 6468 | + skb); |
---|
6133 | 6469 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
---|
6134 | 6470 | } |
---|
6135 | 6471 | smp_mb(); |
---|
.. | .. |
---|
6163 | 6499 | case TCP_FIN_WAIT1: { |
---|
6164 | 6500 | int tmo; |
---|
6165 | 6501 | |
---|
6166 | | - /* If we enter the TCP_FIN_WAIT1 state and we are a |
---|
6167 | | - * Fast Open socket and this is the first acceptable |
---|
6168 | | - * ACK we have received, this would have acknowledged |
---|
6169 | | - * our SYNACK so stop the SYNACK timer. |
---|
6170 | | - */ |
---|
6171 | | - if (req) { |
---|
6172 | | - /* We no longer need the request sock. */ |
---|
6173 | | - reqsk_fastopen_remove(sk, req, false); |
---|
6174 | | - tcp_rearm_rto(sk); |
---|
6175 | | - } |
---|
| 6502 | + if (req) |
---|
| 6503 | + tcp_rcv_synrecv_state_fastopen(sk); |
---|
| 6504 | + |
---|
6176 | 6505 | if (tp->snd_una != tp->write_seq) |
---|
6177 | 6506 | break; |
---|
6178 | 6507 | |
---|
.. | .. |
---|
6244 | 6573 | case TCP_CLOSE_WAIT: |
---|
6245 | 6574 | case TCP_CLOSING: |
---|
6246 | 6575 | case TCP_LAST_ACK: |
---|
6247 | | - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
---|
| 6576 | + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
---|
| 6577 | + if (sk_is_mptcp(sk)) |
---|
| 6578 | + mptcp_incoming_options(sk, skb); |
---|
6248 | 6579 | break; |
---|
6249 | | - /* fall through */ |
---|
| 6580 | + } |
---|
| 6581 | + fallthrough; |
---|
6250 | 6582 | case TCP_FIN_WAIT1: |
---|
6251 | 6583 | case TCP_FIN_WAIT2: |
---|
6252 | 6584 | /* RFC 793 says to queue data in these states, |
---|
.. | .. |
---|
6261 | 6593 | return 1; |
---|
6262 | 6594 | } |
---|
6263 | 6595 | } |
---|
6264 | | - /* Fall through */ |
---|
| 6596 | + fallthrough; |
---|
6265 | 6597 | case TCP_ESTABLISHED: |
---|
6266 | 6598 | tcp_data_queue(sk, skb); |
---|
6267 | 6599 | queued = 1; |
---|
.. | .. |
---|
6307 | 6639 | * congestion control: Linux DCTCP asserts ECT on all packets, |
---|
6308 | 6640 | * including SYN, which is most optimal solution; however, |
---|
6309 | 6641 | * others, such as FreeBSD do not. |
---|
| 6642 | + * |
---|
| 6643 | + * Exception: At least one of the reserved bits of the TCP header (th->res1) is |
---|
| 6644 | + * set, indicating the use of a future TCP extension (such as AccECN). See |
---|
| 6645 | + * RFC8311 §4.3 which updates RFC3168 to allow the development of such
---|
| 6646 | + * extensions. |
---|
6310 | 6647 | */ |
---|
6311 | 6648 | static void tcp_ecn_create_request(struct request_sock *req, |
---|
6312 | 6649 | const struct sk_buff *skb, |
---|
.. | .. |
---|
6326 | 6663 | ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); |
---|
6327 | 6664 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
---|
6328 | 6665 | |
---|
6329 | | - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
---|
| 6666 | + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
---|
6330 | 6667 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
---|
6331 | 6668 | tcp_bpf_ca_needs_ecn((struct sock *)req)) |
---|
6332 | 6669 | inet_rsk(req)->ecn_ok = 1; |
---|
.. | .. |
---|
6339 | 6676 | struct inet_request_sock *ireq = inet_rsk(req); |
---|
6340 | 6677 | |
---|
6341 | 6678 | req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ |
---|
6342 | | - req->cookie_ts = 0; |
---|
6343 | 6679 | tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; |
---|
6344 | 6680 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; |
---|
6345 | | - tcp_rsk(req)->snt_synack = tcp_clock_us(); |
---|
| 6681 | + tcp_rsk(req)->snt_synack = 0; |
---|
6346 | 6682 | tcp_rsk(req)->last_oow_ack_time = 0; |
---|
6347 | 6683 | req->mss = rx_opt->mss_clamp; |
---|
6348 | 6684 | req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; |
---|
.. | .. |
---|
6387 | 6723 | /* |
---|
6388 | 6724 | * Return true if a syncookie should be sent |
---|
6389 | 6725 | */ |
---|
6390 | | -static bool tcp_syn_flood_action(const struct sock *sk, |
---|
6391 | | - const struct sk_buff *skb, |
---|
6392 | | - const char *proto) |
---|
| 6726 | +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) |
---|
6393 | 6727 | { |
---|
6394 | 6728 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; |
---|
6395 | 6729 | const char *msg = "Dropping request"; |
---|
6396 | | - bool want_cookie = false; |
---|
6397 | 6730 | struct net *net = sock_net(sk); |
---|
| 6731 | + bool want_cookie = false; |
---|
| 6732 | + u8 syncookies; |
---|
| 6733 | + |
---|
| 6734 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
---|
6398 | 6735 | |
---|
6399 | 6736 | #ifdef CONFIG_SYN_COOKIES |
---|
6400 | | - if (net->ipv4.sysctl_tcp_syncookies) { |
---|
| 6737 | + if (syncookies) { |
---|
6401 | 6738 | msg = "Sending cookies"; |
---|
6402 | 6739 | want_cookie = true; |
---|
6403 | 6740 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); |
---|
.. | .. |
---|
6405 | 6742 | #endif |
---|
6406 | 6743 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
---|
6407 | 6744 | |
---|
6408 | | - if (!queue->synflood_warned && |
---|
6409 | | - net->ipv4.sysctl_tcp_syncookies != 2 && |
---|
| 6745 | + if (!queue->synflood_warned && syncookies != 2 && |
---|
6410 | 6746 | xchg(&queue->synflood_warned, 1) == 0) |
---|
6411 | 6747 | net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
---|
6412 | | - proto, ntohs(tcp_hdr(skb)->dest), msg); |
---|
| 6748 | + proto, sk->sk_num, msg); |
---|
6413 | 6749 | |
---|
6414 | 6750 | return want_cookie; |
---|
6415 | 6751 | } |
---|
.. | .. |
---|
6420 | 6756 | { |
---|
6421 | 6757 | if (tcp_sk(sk)->save_syn) { |
---|
6422 | 6758 | u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); |
---|
6423 | | - u32 *copy; |
---|
| 6759 | + struct saved_syn *saved_syn; |
---|
| 6760 | + u32 mac_hdrlen; |
---|
| 6761 | + void *base; |
---|
6424 | 6762 | |
---|
6425 | | - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); |
---|
6426 | | - if (copy) { |
---|
6427 | | - copy[0] = len; |
---|
6428 | | - memcpy(&copy[1], skb_network_header(skb), len);
---|
6429 | | - req->saved_syn = copy; |
---|
| 6763 | + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ |
---|
| 6764 | + base = skb_mac_header(skb); |
---|
| 6765 | + mac_hdrlen = skb_mac_header_len(skb); |
---|
| 6766 | + len += mac_hdrlen; |
---|
| 6767 | + } else { |
---|
| 6768 | + base = skb_network_header(skb); |
---|
| 6769 | + mac_hdrlen = 0; |
---|
| 6770 | + } |
---|
| 6771 | + |
---|
| 6772 | + saved_syn = kmalloc(struct_size(saved_syn, data, len), |
---|
| 6773 | + GFP_ATOMIC); |
---|
| 6774 | + if (saved_syn) { |
---|
| 6775 | + saved_syn->mac_hdrlen = mac_hdrlen; |
---|
| 6776 | + saved_syn->network_hdrlen = skb_network_header_len(skb); |
---|
| 6777 | + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); |
---|
| 6778 | + memcpy(saved_syn->data, base, len); |
---|
| 6779 | + req->saved_syn = saved_syn; |
---|
6430 | 6780 | } |
---|
6431 | 6781 | } |
---|
6432 | 6782 | } |
---|
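Not part of the diff: a userspace sketch of the interface behind tcp_reqsk_record_syn(). TCP_SAVE_SYN and TCP_SAVED_SYN have existed since Linux 4.2; the value 2 ("also keep the MAC header") is an assumption that only holds on kernels carrying the struct saved_syn change shown above, and the helper names are made up for the example.

```c
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef TCP_SAVE_SYN
#define TCP_SAVE_SYN	27		/* from include/uapi/linux/tcp.h */
#endif
#ifndef TCP_SAVED_SYN
#define TCP_SAVED_SYN	28
#endif

/* Ask the listener to keep the headers of each incoming SYN.
 * val = 1: network + TCP headers; val = 2: MAC header as well (newer kernels). */
static int enable_save_syn(int listen_fd, int val)
{
	return setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN, &val, sizeof(val));
}

/* On a socket accepted from that listener, read the saved SYN headers once. */
static ssize_t read_saved_syn(int conn_fd, unsigned char *buf, socklen_t buflen)
{
	socklen_t len = buflen;

	if (getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, buf, &len) < 0)
		return -1;
	return (ssize_t)len;
}
```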
| 6783 | + |
---|
| 6784 | +/* If a SYN cookie is required and supported, returns a clamped MSS value to be |
---|
| 6785 | + * used for SYN cookie generation. |
---|
| 6786 | + */ |
---|
| 6787 | +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, |
---|
| 6788 | + const struct tcp_request_sock_ops *af_ops, |
---|
| 6789 | + struct sock *sk, struct tcphdr *th) |
---|
| 6790 | +{ |
---|
| 6791 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 6792 | + u16 mss; |
---|
| 6793 | + |
---|
| 6794 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && |
---|
| 6795 | + !inet_csk_reqsk_queue_is_full(sk)) |
---|
| 6796 | + return 0; |
---|
| 6797 | + |
---|
| 6798 | + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) |
---|
| 6799 | + return 0; |
---|
| 6800 | + |
---|
| 6801 | + if (sk_acceptq_is_full(sk)) { |
---|
| 6802 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
---|
| 6803 | + return 0; |
---|
| 6804 | + } |
---|
| 6805 | + |
---|
| 6806 | + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); |
---|
| 6807 | + if (!mss) |
---|
| 6808 | + mss = af_ops->mss_clamp; |
---|
| 6809 | + |
---|
| 6810 | + return mss; |
---|
| 6811 | +} |
---|
| 6812 | +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); |
---|
6433 | 6813 | |
---|
6434 | 6814 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
---|
6435 | 6815 | const struct tcp_request_sock_ops *af_ops, |
---|
.. | .. |
---|
6445 | 6825 | bool want_cookie = false; |
---|
6446 | 6826 | struct dst_entry *dst; |
---|
6447 | 6827 | struct flowi fl; |
---|
| 6828 | + u8 syncookies; |
---|
| 6829 | + |
---|
| 6830 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
---|
6448 | 6831 | |
---|
6449 | 6832 | /* TW buckets are converted to open requests without |
---|
6450 | 6833 | * limitations, they conserve resources and peer is |
---|
6451 | 6834 | * evidently real one. |
---|
6452 | 6835 | */ |
---|
6453 | | - if ((net->ipv4.sysctl_tcp_syncookies == 2 || |
---|
6454 | | - inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
---|
6455 | | - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); |
---|
| 6836 | + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
---|
| 6837 | + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); |
---|
6456 | 6838 | if (!want_cookie) |
---|
6457 | 6839 | goto drop; |
---|
6458 | 6840 | } |
---|
.. | .. |
---|
6466 | 6848 | if (!req) |
---|
6467 | 6849 | goto drop; |
---|
6468 | 6850 | |
---|
| 6851 | + req->syncookie = want_cookie; |
---|
6469 | 6852 | tcp_rsk(req)->af_specific = af_ops; |
---|
6470 | 6853 | tcp_rsk(req)->ts_off = 0; |
---|
| 6854 | +#if IS_ENABLED(CONFIG_MPTCP) |
---|
| 6855 | + tcp_rsk(req)->is_mptcp = 0; |
---|
| 6856 | +#endif |
---|
6471 | 6857 | |
---|
6472 | 6858 | tcp_clear_options(&tmp_opt); |
---|
6473 | 6859 | tmp_opt.mss_clamp = af_ops->mss_clamp; |
---|
.. | .. |
---|
6501 | 6887 | goto drop_and_free; |
---|
6502 | 6888 | |
---|
6503 | 6889 | if (!want_cookie && !isn) { |
---|
| 6890 | + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); |
---|
| 6891 | + |
---|
6504 | 6892 | /* Kill the following clause, if you dislike this way. */ |
---|
6505 | | - if (!net->ipv4.sysctl_tcp_syncookies && |
---|
6506 | | - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
---|
6507 | | - (net->ipv4.sysctl_max_syn_backlog >> 2)) && |
---|
| 6893 | + if (!syncookies && |
---|
| 6894 | + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
---|
| 6895 | + (max_syn_backlog >> 2)) && |
---|
6508 | 6896 | !tcp_peer_is_proven(req, dst)) { |
---|
6509 | 6897 | /* Without syncookies last quarter of |
---|
6510 | 6898 | * backlog is filled with destinations, |
---|
.. | .. |
---|
6525 | 6913 | |
---|
6526 | 6914 | if (want_cookie) { |
---|
6527 | 6915 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
---|
6528 | | - req->cookie_ts = tmp_opt.tstamp_ok; |
---|
6529 | 6916 | if (!tmp_opt.tstamp_ok) |
---|
6530 | 6917 | inet_rsk(req)->ecn_ok = 0; |
---|
6531 | 6918 | } |
---|
6532 | 6919 | |
---|
6533 | 6920 | tcp_rsk(req)->snt_isn = isn; |
---|
6534 | 6921 | tcp_rsk(req)->txhash = net_tx_rndhash(); |
---|
| 6922 | + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
---|
6535 | 6923 | tcp_openreq_init_rwin(req, sk, dst); |
---|
6536 | 6924 | sk_rx_queue_set(req_to_sk(req), skb); |
---|
6537 | 6925 | if (!want_cookie) { |
---|
.. | .. |
---|
6540 | 6928 | } |
---|
6541 | 6929 | if (fastopen_sk) { |
---|
6542 | 6930 | af_ops->send_synack(fastopen_sk, dst, &fl, req, |
---|
6543 | | - &foc, TCP_SYNACK_FASTOPEN); |
---|
| 6931 | + &foc, TCP_SYNACK_FASTOPEN, skb); |
---|
6544 | 6932 | /* Add the child socket directly into the accept queue */ |
---|
6545 | 6933 | if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { |
---|
6546 | 6934 | reqsk_fastopen_remove(fastopen_sk, req, false); |
---|
6547 | 6935 | bh_unlock_sock(fastopen_sk); |
---|
6548 | 6936 | sock_put(fastopen_sk); |
---|
6549 | | - reqsk_put(req); |
---|
6550 | | - goto drop; |
---|
| 6937 | + goto drop_and_free; |
---|
6551 | 6938 | } |
---|
6552 | 6939 | sk->sk_data_ready(sk); |
---|
6553 | 6940 | bh_unlock_sock(fastopen_sk); |
---|
.. | .. |
---|
6559 | 6946 | tcp_timeout_init((struct sock *)req)); |
---|
6560 | 6947 | af_ops->send_synack(sk, dst, &fl, req, &foc, |
---|
6561 | 6948 | !want_cookie ? TCP_SYNACK_NORMAL : |
---|
6562 | | - TCP_SYNACK_COOKIE); |
---|
| 6949 | + TCP_SYNACK_COOKIE, |
---|
| 6950 | + skb); |
---|
6563 | 6951 | if (want_cookie) { |
---|
6564 | 6952 | reqsk_free(req); |
---|
6565 | 6953 | return 0; |
---|
.. | .. |
---|
6571 | 6959 | drop_and_release: |
---|
6572 | 6960 | dst_release(dst); |
---|
6573 | 6961 | drop_and_free: |
---|
6574 | | - reqsk_free(req); |
---|
| 6962 | + __reqsk_free(req); |
---|
6575 | 6963 | drop: |
---|
6576 | 6964 | tcp_listendrop(sk); |
---|
6577 | 6965 | return 0; |
---|