.. | .. |
---|
77 | 77 | #include <asm/unaligned.h> |
---|
78 | 78 | #include <linux/errqueue.h> |
---|
79 | 79 | #include <trace/events/tcp.h> |
---|
80 | | -#include <linux/static_key.h> |
---|
| 80 | +#include <linux/jump_label_ratelimit.h> |
---|
81 | 81 | #include <net/busy_poll.h> |
---|
| 82 | +#include <net/mptcp.h> |
---|
| 83 | +#include <trace/hooks/net.h> |
---|
82 | 84 | |
---|
83 | 85 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
---|
84 | 86 | |
---|
.. | .. |
---|
113 | 115 | #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ |
---|
114 | 116 | |
---|
115 | 117 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
---|
116 | | -static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); |
---|
| 118 | +static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); |
---|
117 | 119 | |
---|
118 | 120 | void clean_acked_data_enable(struct inet_connection_sock *icsk, |
---|
119 | 121 | void (*cad)(struct sock *sk, u32 ack_seq)) |
---|
120 | 122 | { |
---|
121 | 123 | icsk->icsk_clean_acked = cad; |
---|
122 | | - static_branch_inc(&clean_acked_data_enabled); |
---|
| 124 | + static_branch_deferred_inc(&clean_acked_data_enabled); |
---|
123 | 125 | } |
---|
124 | 126 | EXPORT_SYMBOL_GPL(clean_acked_data_enable); |
---|
125 | 127 | |
---|
126 | 128 | void clean_acked_data_disable(struct inet_connection_sock *icsk) |
---|
127 | 129 | { |
---|
128 | | - static_branch_dec(&clean_acked_data_enabled); |
---|
| 130 | + static_branch_slow_dec_deferred(&clean_acked_data_enabled); |
---|
129 | 131 | icsk->icsk_clean_acked = NULL; |
---|
130 | 132 | } |
---|
131 | 133 | EXPORT_SYMBOL_GPL(clean_acked_data_disable); |
---|
| 134 | + |
---|
| 135 | +void clean_acked_data_flush(void) |
---|
| 136 | +{ |
---|
| 137 | + static_key_deferred_flush(&clean_acked_data_enabled); |
---|
| 138 | +} |
---|
| 139 | +EXPORT_SYMBOL_GPL(clean_acked_data_flush); |
---|
| 140 | +#endif |
---|
| 141 | + |
---|
| 142 | +#ifdef CONFIG_CGROUP_BPF |
---|
| 143 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
---|
| 144 | +{ |
---|
| 145 | + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && |
---|
| 146 | + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
---|
| 147 | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); |
---|
| 148 | + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), |
---|
| 149 | + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); |
---|
| 150 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 151 | + |
---|
| 152 | + if (likely(!unknown_opt && !parse_all_opt)) |
---|
| 153 | + return; |
---|
| 154 | + |
---|
| 155 | + /* The skb will be handled in the |
---|
| 156 | + * bpf_skops_established() or |
---|
| 157 | + * bpf_skops_write_hdr_opt(). |
---|
| 158 | + */ |
---|
| 159 | + switch (sk->sk_state) { |
---|
| 160 | + case TCP_SYN_RECV: |
---|
| 161 | + case TCP_SYN_SENT: |
---|
| 162 | + case TCP_LISTEN: |
---|
| 163 | + return; |
---|
| 164 | + } |
---|
| 165 | + |
---|
| 166 | + sock_owned_by_me(sk); |
---|
| 167 | + |
---|
| 168 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 169 | + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; |
---|
| 170 | + sock_ops.is_fullsock = 1; |
---|
| 171 | + sock_ops.sk = sk; |
---|
| 172 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
---|
| 173 | + |
---|
| 174 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
---|
| 175 | +} |
---|
| 176 | + |
---|
| 177 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
---|
| 178 | + struct sk_buff *skb) |
---|
| 179 | +{ |
---|
| 180 | + struct bpf_sock_ops_kern sock_ops; |
---|
| 181 | + |
---|
| 182 | + sock_owned_by_me(sk); |
---|
| 183 | + |
---|
| 184 | + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); |
---|
| 185 | + sock_ops.op = bpf_op; |
---|
| 186 | + sock_ops.is_fullsock = 1; |
---|
| 187 | + sock_ops.sk = sk; |
---|
| 188 | + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ |
---|
| 189 | + if (skb) |
---|
| 190 | + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); |
---|
| 191 | + |
---|
| 192 | + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); |
---|
| 193 | +} |
---|
| 194 | +#else |
---|
| 195 | +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) |
---|
| 196 | +{ |
---|
| 197 | +} |
---|
| 198 | + |
---|
| 199 | +static void bpf_skops_established(struct sock *sk, int bpf_op, |
---|
| 200 | + struct sk_buff *skb) |
---|
| 201 | +{ |
---|
| 202 | +} |
---|
132 | 203 | #endif |
---|
133 | 204 | |
---|
134 | 205 | static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, |
---|
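The bpf_skops_parse_hdr()/bpf_skops_established() helpers added above clear only the fields that precede the `temp` member, via `memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp))`. Below is a minimal, self-contained userspace sketch of that partial-initialization idiom; the struct and field names are hypothetical stand-ins, not the kernel's `struct bpf_sock_ops_kern`.

```c
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical layout: everything before 'temp' must start zeroed,
 * while 'temp' is scratch space that callers may leave dirty. */
struct ops_ctx {
	int op;
	int is_fullsock;
	void *sk;
	char temp[64];		/* scratch area, intentionally not cleared */
};

static void ctx_prepare(struct ops_ctx *ctx, int op, void *sk)
{
	/* Zero only [0, offsetof(temp)): cheaper than clearing the whole
	 * struct when the tail is large and will be overwritten anyway. */
	memset(ctx, 0, offsetof(struct ops_ctx, temp));
	ctx->op = op;
	ctx->sk = sk;
}

int main(void)
{
	struct ops_ctx ctx;

	memset(&ctx, 0xff, sizeof(ctx));	/* simulate stack garbage */
	ctx_prepare(&ctx, 1, NULL);
	printf("op=%d is_fullsock=%d temp[0]=0x%02x\n",
	       ctx.op, ctx.is_fullsock, (unsigned char)ctx.temp[0]);
	return 0;
}
```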
.. | .. |
---|
172 | 243 | if (unlikely(len > icsk->icsk_ack.rcv_mss + |
---|
173 | 244 | MAX_TCP_OPTION_SPACE)) |
---|
174 | 245 | tcp_gro_dev_warn(sk, skb, len); |
---|
| 246 | + /* If the skb has a len of exactly 1*MSS and has the PSH bit |
---|
| 247 | + * set then it is likely the end of an application write. So |
---|
| 248 | + * more data may not be arriving soon, and yet the data sender |
---|
| 249 | + * may be waiting for an ACK if cwnd-bound or using TX zero |
---|
| 250 | + * copy. So we set ICSK_ACK_PUSHED here so that |
---|
| 251 | + * tcp_cleanup_rbuf() will send an ACK immediately if the app |
---|
| 252 | + * reads all of the data and is not ping-pong. If len > MSS |
---|
| 253 | + * then this logic does not matter (and does not hurt) because |
---|
| 254 | + * tcp_cleanup_rbuf() will always ACK immediately if the app |
---|
| 255 | + * reads data and there is more than an MSS of unACKed data. |
---|
| 256 | + */ |
---|
| 257 | + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) |
---|
| 258 | + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
---|
175 | 259 | } else { |
---|
176 | 260 | /* Otherwise, we make more careful check taking into account, |
---|
177 | 261 | * that SACKs block is variable. |
---|
.. | .. |
---|
216 | 300 | icsk->icsk_ack.quick = quickacks; |
---|
217 | 301 | } |
---|
218 | 302 | |
---|
219 | | -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) |
---|
| 303 | +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) |
---|
220 | 304 | { |
---|
221 | 305 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
222 | 306 | |
---|
223 | 307 | tcp_incr_quickack(sk, max_quickacks); |
---|
224 | | - icsk->icsk_ack.pingpong = 0; |
---|
| 308 | + inet_csk_exit_pingpong_mode(sk); |
---|
225 | 309 | icsk->icsk_ack.ato = TCP_ATO_MIN; |
---|
226 | 310 | } |
---|
227 | | -EXPORT_SYMBOL(tcp_enter_quickack_mode); |
---|
228 | 311 | |
---|
229 | 312 | /* Send ACKs quickly, if "quick" count is not exhausted |
---|
230 | 313 | * and the session is not interactive. |
---|
.. | .. |
---|
236 | 319 | const struct dst_entry *dst = __sk_dst_get(sk); |
---|
237 | 320 | |
---|
238 | 321 | return (dst && dst_metric(dst, RTAX_QUICKACK)) || |
---|
239 | | - (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); |
---|
| 322 | + (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); |
---|
240 | 323 | } |
---|
241 | 324 | |
---|
242 | 325 | static void tcp_ecn_queue_cwr(struct tcp_sock *tp) |
---|
.. | .. |
---|
354 | 437 | sndmem *= nr_segs * per_mss; |
---|
355 | 438 | |
---|
356 | 439 | if (sk->sk_sndbuf < sndmem) |
---|
357 | | - sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]); |
---|
| 440 | + WRITE_ONCE(sk->sk_sndbuf, |
---|
| 441 | + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); |
---|
358 | 442 | } |
---|
359 | 443 | |
---|
360 | 444 | /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) |
---|
.. | .. |
---|
383 | 467 | */ |
---|
384 | 468 | |
---|
385 | 469 | /* Slow part of check#2. */ |
---|
386 | | -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) |
---|
| 470 | +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, |
---|
| 471 | + unsigned int skbtruesize) |
---|
387 | 472 | { |
---|
388 | 473 | struct tcp_sock *tp = tcp_sk(sk); |
---|
389 | 474 | /* Optimize this! */ |
---|
390 | | - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; |
---|
391 | | - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; |
---|
| 475 | + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; |
---|
| 476 | + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; |
---|
392 | 477 | |
---|
393 | 478 | while (tp->rcv_ssthresh <= window) { |
---|
394 | 479 | if (truesize <= skb->len) |
---|
.. | .. |
---|
400 | 485 | return 0; |
---|
401 | 486 | } |
---|
402 | 487 | |
---|
403 | | -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) |
---|
| 488 | +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing |
---|
| 489 | + * can play nice with us, as sk_buff and skb->head might be either |
---|
| 490 | + * freed or shared with up to MAX_SKB_FRAGS segments. |
---|
| 491 | + * Only give a boost to drivers using page frag(s) to hold the frame(s), |
---|
| 492 | + * and if no payload was pulled in skb->head before reaching us. |
---|
| 493 | + */ |
---|
| 494 | +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) |
---|
| 495 | +{ |
---|
| 496 | + u32 truesize = skb->truesize; |
---|
| 497 | + |
---|
| 498 | + if (adjust && !skb_headlen(skb)) { |
---|
| 499 | + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); |
---|
| 500 | + /* paranoid check, some drivers might be buggy */ |
---|
| 501 | + if (unlikely((int)truesize < (int)skb->len)) |
---|
| 502 | + truesize = skb->truesize; |
---|
| 503 | + } |
---|
| 504 | + return truesize; |
---|
| 505 | +} |
---|
| 506 | + |
---|
| 507 | +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, |
---|
| 508 | + bool adjust) |
---|
404 | 509 | { |
---|
405 | 510 | struct tcp_sock *tp = tcp_sk(sk); |
---|
406 | 511 | int room; |
---|
.. | .. |
---|
409 | 514 | |
---|
410 | 515 | /* Check #1 */ |
---|
411 | 516 | if (room > 0 && !tcp_under_memory_pressure(sk)) { |
---|
| 517 | + unsigned int truesize = truesize_adjust(adjust, skb); |
---|
412 | 518 | int incr; |
---|
413 | 519 | |
---|
414 | 520 | /* Check #2. Increase window, if skb with such overhead |
---|
415 | 521 | * will fit to rcvbuf in future. |
---|
416 | 522 | */ |
---|
417 | | - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) |
---|
| 523 | + if (tcp_win_from_space(sk, truesize) <= skb->len) |
---|
418 | 524 | incr = 2 * tp->advmss; |
---|
419 | 525 | else |
---|
420 | | - incr = __tcp_grow_window(sk, skb); |
---|
| 526 | + incr = __tcp_grow_window(sk, skb, truesize); |
---|
421 | 527 | |
---|
422 | 528 | if (incr) { |
---|
423 | 529 | incr = max_t(int, incr, 2 * skb->len); |
---|
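truesize_adjust() above discounts the skb head area when the driver put the whole frame into page frags (no payload in skb->head), and falls back to the raw truesize if the adjusted value would drop below skb->len. The sketch below is a rough userspace model of that guard; the buffer struct is invented and `HEAD_OVERHEAD()` is only a stand-in for the kernel's SKB_TRUESIZE()/skb_end_offset() arithmetic.

```c
#include <stdio.h>

/* Hypothetical per-buffer bookkeeping; struct sk_buff is more involved. */
struct buf {
	unsigned int len;	  /* payload bytes */
	unsigned int headlen;	  /* bytes stored linearly in the head */
	unsigned int truesize;	  /* memory charged to the socket */
	unsigned int head_size;	  /* allocated head area (skb_end_offset) */
};

#define HEAD_OVERHEAD(sz)	((sz) + 320u)	/* stand-in for SKB_TRUESIZE() */

static unsigned int truesize_adjust(int adjust, const struct buf *b)
{
	unsigned int truesize = b->truesize;

	if (adjust && !b->headlen) {
		truesize -= HEAD_OVERHEAD(b->head_size);
		/* paranoid check: a buggy driver could under-report */
		if ((int)truesize < (int)b->len)
			truesize = b->truesize;
	}
	return truesize;
}

int main(void)
{
	struct buf frag_only = { .len = 1400, .headlen = 0,
				 .truesize = 2432, .head_size = 256 };

	printf("adjusted truesize: %u (raw %u)\n",
	       truesize_adjust(1, &frag_only), frag_only.truesize);
	return 0;
}
```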
.. | .. |
---|
430 | 536 | /* 3. Try to fixup all. It is made immediately after connection enters |
---|
431 | 537 | * established state. |
---|
432 | 538 | */ |
---|
433 | | -void tcp_init_buffer_space(struct sock *sk) |
---|
| 539 | +static void tcp_init_buffer_space(struct sock *sk) |
---|
434 | 540 | { |
---|
435 | | - int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; |
---|
| 541 | + int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); |
---|
436 | 542 | struct tcp_sock *tp = tcp_sk(sk); |
---|
437 | 543 | int maxwin; |
---|
438 | 544 | |
---|
.. | .. |
---|
472 | 578 | struct tcp_sock *tp = tcp_sk(sk); |
---|
473 | 579 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
474 | 580 | struct net *net = sock_net(sk); |
---|
| 581 | + int rmem2; |
---|
475 | 582 | |
---|
476 | 583 | icsk->icsk_ack.quick = 0; |
---|
| 584 | + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); |
---|
477 | 585 | |
---|
478 | | - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && |
---|
| 586 | + if (sk->sk_rcvbuf < rmem2 && |
---|
479 | 587 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && |
---|
480 | 588 | !tcp_under_memory_pressure(sk) && |
---|
481 | 589 | sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { |
---|
482 | | - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), |
---|
483 | | - net->ipv4.sysctl_tcp_rmem[2]); |
---|
| 590 | + WRITE_ONCE(sk->sk_rcvbuf, |
---|
| 591 | + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); |
---|
484 | 592 | } |
---|
485 | 593 | if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) |
---|
486 | 594 | tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); |
---|
.. | .. |
---|
510 | 618 | * |
---|
511 | 619 | * The algorithm for RTT estimation w/o timestamps is based on |
---|
512 | 620 | * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. |
---|
513 | | - * <http://public.lanl.gov/radiant/pubs.html#DRS> |
---|
| 621 | + * <https://public.lanl.gov/radiant/pubs.html#DRS> |
---|
514 | 622 | * |
---|
515 | 623 | * More detail on this code can be found at |
---|
516 | 624 | * <http://staff.psc.edu/jheffner/>, |
---|
.. | .. |
---|
621 | 729 | * <prev RTT . ><current RTT .. ><next RTT .... > |
---|
622 | 730 | */ |
---|
623 | 731 | |
---|
624 | | - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && |
---|
| 732 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && |
---|
625 | 733 | !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { |
---|
626 | 734 | int rcvmem, rcvbuf; |
---|
627 | 735 | u64 rcvwin, grow; |
---|
.. | .. |
---|
642 | 750 | |
---|
643 | 751 | do_div(rcvwin, tp->advmss); |
---|
644 | 752 | rcvbuf = min_t(u64, rcvwin * rcvmem, |
---|
645 | | - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); |
---|
| 753 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); |
---|
646 | 754 | if (rcvbuf > sk->sk_rcvbuf) { |
---|
647 | | - sk->sk_rcvbuf = rcvbuf; |
---|
| 755 | + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); |
---|
648 | 756 | |
---|
649 | 757 | /* Make the window clamp follow along. */ |
---|
650 | 758 | tp->window_clamp = tcp_win_from_space(sk, rcvbuf); |
---|
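The receive-buffer autotuning block above divides a 64-bit window by the MSS with do_div(); unlike plain `/`, the kernel's do_div() modifies its first argument in place and returns the remainder. The sketch below approximates the rcvbuf sizing in plain userspace C with ordinary 64-bit division; the advmss, per-segment cost, and rmem cap are made-up example values.

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the kernel pattern:
 *   do_div(rcvwin, advmss);  rcvbuf = min(rcvwin * rcvmem, rmem_max); */
static uint64_t size_rcvbuf(uint64_t rcvwin_bytes, uint32_t advmss,
			    uint32_t rcvmem_per_seg, uint64_t rmem_max)
{
	uint64_t segs = rcvwin_bytes / advmss;	/* what do_div() leaves behind */
	uint64_t rcvbuf = segs * rcvmem_per_seg;

	return rcvbuf < rmem_max ? rcvbuf : rmem_max;
}

int main(void)
{
	/* Assumed numbers: 4 MB per RTT, 1448-byte MSS, ~2304 bytes of
	 * kernel memory charged per segment, 6 MB sysctl cap. */
	printf("rcvbuf = %llu\n",
	       (unsigned long long)size_rcvbuf(4ull << 20, 1448, 2304, 6ull << 20));
	return 0;
}
```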
.. | .. |
---|
710 | 818 | tcp_ecn_check_ce(sk, skb); |
---|
711 | 819 | |
---|
712 | 820 | if (skb->len >= 128) |
---|
713 | | - tcp_grow_window(sk, skb); |
---|
| 821 | + tcp_grow_window(sk, skb, true); |
---|
714 | 822 | } |
---|
715 | 823 | |
---|
716 | 824 | /* Called to compute a smoothed rtt estimate. The data fed to this |
---|
.. | .. |
---|
774 | 882 | tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
---|
775 | 883 | tp->rtt_seq = tp->snd_nxt; |
---|
776 | 884 | tp->mdev_max_us = tcp_rto_min_us(sk); |
---|
| 885 | + |
---|
| 886 | + tcp_bpf_rtt(sk); |
---|
777 | 887 | } |
---|
778 | 888 | } else { |
---|
779 | 889 | /* no previous measure. */ |
---|
.. | .. |
---|
782 | 892 | tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); |
---|
783 | 893 | tp->mdev_max_us = tp->rttvar_us; |
---|
784 | 894 | tp->rtt_seq = tp->snd_nxt; |
---|
| 895 | + |
---|
| 896 | + tcp_bpf_rtt(sk); |
---|
785 | 897 | } |
---|
786 | 898 | tp->srtt_us = max(1U, srtt); |
---|
787 | 899 | } |
---|
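The RTT estimator above (now with tcp_bpf_rtt() callouts) maintains Jacobson-style smoothed RTT state in fixed point. For reference, here is the textbook RFC 6298 estimator in plain C; it is a simplified floating-point model, not the kernel's exact code, and the 200 ms RTO floor in the demo is just an assumption.

```c
#include <stdio.h>

struct rtt_est {
	double srtt;	/* smoothed RTT (us) */
	double rttvar;	/* RTT variance (us) */
	int init;
};

/* RFC 6298: RTTVAR = 3/4*RTTVAR + 1/4*|SRTT - R'|, then
 *           SRTT   = 7/8*SRTT   + 1/8*R', and RTO = SRTT + 4*RTTVAR. */
static double rtt_sample(struct rtt_est *e, double r, double min_rto_us)
{
	if (!e->init) {
		e->srtt = r;
		e->rttvar = r / 2;
		e->init = 1;
	} else {
		double err = e->srtt - r;

		e->rttvar += (((err < 0) ? -err : err) - e->rttvar) / 4;
		e->srtt += (r - e->srtt) / 8;
	}
	double rto = e->srtt + 4 * e->rttvar;
	return rto > min_rto_us ? rto : min_rto_us;
}

int main(void)
{
	struct rtt_est e = { 0 };
	double samples[] = { 100000, 110000, 90000, 300000, 100000 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("rtt=%.0fus -> rto=%.0fus\n",
		       samples[i], rtt_sample(&e, samples[i], 200000));
	return 0;
}
```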
.. | .. |
---|
859 | 971 | return min_t(__u32, cwnd, tp->snd_cwnd_clamp); |
---|
860 | 972 | } |
---|
861 | 973 | |
---|
862 | | -/* Take a notice that peer is sending D-SACKs */ |
---|
863 | | -static void tcp_dsack_seen(struct tcp_sock *tp) |
---|
| 974 | +struct tcp_sacktag_state { |
---|
| 975 | + /* Timestamps for earliest and latest never-retransmitted segment |
---|
| 976 | + * that was SACKed. RTO needs the earliest RTT to stay conservative, |
---|
| 977 | + * but congestion control should still get an accurate delay signal. |
---|
| 978 | + */ |
---|
| 979 | + u64 first_sackt; |
---|
| 980 | + u64 last_sackt; |
---|
| 981 | + u32 reord; |
---|
| 982 | + u32 sack_delivered; |
---|
| 983 | + int flag; |
---|
| 984 | + unsigned int mss_now; |
---|
| 985 | + struct rate_sample *rate; |
---|
| 986 | +}; |
---|
| 987 | + |
---|
| 988 | +/* Take a notice that peer is sending D-SACKs. Skip update of data delivery |
---|
| 989 | + * and spurious retransmission information if this DSACK is unlikely caused by |
---|
| 990 | + * sender's action: |
---|
| 991 | + * - DSACKed sequence range is larger than maximum receiver's window. |
---|
| 992 | + * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. |
---|
| 993 | + */ |
---|
| 994 | +static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, |
---|
| 995 | + u32 end_seq, struct tcp_sacktag_state *state) |
---|
864 | 996 | { |
---|
| 997 | + u32 seq_len, dup_segs = 1; |
---|
| 998 | + |
---|
| 999 | + if (!before(start_seq, end_seq)) |
---|
| 1000 | + return 0; |
---|
| 1001 | + |
---|
| 1002 | + seq_len = end_seq - start_seq; |
---|
| 1003 | + /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ |
---|
| 1004 | + if (seq_len > tp->max_window) |
---|
| 1005 | + return 0; |
---|
| 1006 | + if (seq_len > tp->mss_cache) |
---|
| 1007 | + dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); |
---|
| 1008 | + |
---|
| 1009 | + tp->dsack_dups += dup_segs; |
---|
| 1010 | + /* Skip the DSACK if dup segs weren't retransmitted by sender */ |
---|
| 1011 | + if (tp->dsack_dups > tp->total_retrans) |
---|
| 1012 | + return 0; |
---|
| 1013 | + |
---|
865 | 1014 | tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; |
---|
866 | 1015 | tp->rack.dsack_seen = 1; |
---|
867 | | - tp->dsack_dups++; |
---|
| 1016 | + |
---|
| 1017 | + state->flag |= FLAG_DSACKING_ACK; |
---|
| 1018 | + /* A spurious retransmission is delivered */ |
---|
| 1019 | + state->sack_delivered += dup_segs; |
---|
| 1020 | + |
---|
| 1021 | + return dup_segs; |
---|
868 | 1022 | } |
---|
869 | 1023 | |
---|
870 | 1024 | /* It's reordering when higher sequence was delivered (i.e. sacked) before |
---|
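tcp_dsack_seen() above now validates the reported range and estimates how many segments a D-SACK covers before crediting spurious retransmissions. The standalone sketch below mirrors only the checks visible in the hunk, using the same wraparound-safe comparison idea as the kernel's before(); it is an illustration, not the full function.

```c
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe sequence compare, same idea as the kernel's before() */
static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Returns 0 for an empty or dubious D-SACK range, else the estimated
 * number of duplicate segments it covers. */
static uint32_t dsack_dup_segs(uint32_t start_seq, uint32_t end_seq,
			       uint32_t max_window, uint32_t mss)
{
	uint32_t seq_len;

	if (!seq_before(start_seq, end_seq))
		return 0;
	seq_len = end_seq - start_seq;
	if (seq_len > max_window)	/* larger than the peer's window: dubious */
		return 0;
	return seq_len > mss ? DIV_ROUND_UP(seq_len, mss) : 1;
}

int main(void)
{
	printf("%u\n", dsack_dup_segs(1000, 5000, 65535, 1448));    /* -> 3 */
	printf("%u\n", dsack_dup_segs(1000, 1000000, 65535, 1448)); /* -> 0 */
	return 0;
}
```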
.. | .. |
---|
893 | 1047 | tp->undo_marker ? tp->undo_retrans : 0); |
---|
894 | 1048 | #endif |
---|
895 | 1049 | tp->reordering = min_t(u32, (metric + mss - 1) / mss, |
---|
896 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
---|
| 1050 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
---|
897 | 1051 | } |
---|
898 | 1052 | |
---|
899 | 1053 | /* This exciting event is worth to be remembered. 8) */ |
---|
.. | .. |
---|
902 | 1056 | ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); |
---|
903 | 1057 | } |
---|
904 | 1058 | |
---|
905 | | -/* This must be called before lost_out is incremented */ |
---|
| 1059 | +/* This must be called before lost_out or retrans_out are updated |
---|
| 1060 | + * on a new loss, because we want to know if all skbs previously |
---|
| 1061 | + * known to be lost have already been retransmitted, indicating |
---|
| 1062 | + * that this newly lost skb is our next skb to retransmit. |
---|
| 1063 | + */ |
---|
906 | 1064 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) |
---|
907 | 1065 | { |
---|
908 | 1066 | if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || |
---|
.. | .. |
---|
912 | 1070 | tp->retransmit_skb_hint = skb; |
---|
913 | 1071 | } |
---|
914 | 1072 | |
---|
915 | | -/* Sum the number of packets on the wire we have marked as lost. |
---|
916 | | - * There are two cases we care about here: |
---|
917 | | - * a) Packet hasn't been marked lost (nor retransmitted), |
---|
918 | | - * and this is the first loss. |
---|
919 | | - * b) Packet has been marked both lost and retransmitted, |
---|
920 | | - * and this means we think it was lost again. |
---|
| 1073 | +/* Sum the number of packets on the wire we have marked as lost, and |
---|
| 1074 | + * notify the congestion control module that the given skb was marked lost. |
---|
921 | 1075 | */ |
---|
922 | | -static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) |
---|
| 1076 | +static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) |
---|
| 1077 | +{ |
---|
| 1078 | + tp->lost += tcp_skb_pcount(skb); |
---|
| 1079 | +} |
---|
| 1080 | + |
---|
| 1081 | +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) |
---|
923 | 1082 | { |
---|
924 | 1083 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
---|
| 1084 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
925 | 1085 | |
---|
926 | | - if (!(sacked & TCPCB_LOST) || |
---|
927 | | - ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) |
---|
928 | | - tp->lost += tcp_skb_pcount(skb); |
---|
929 | | -} |
---|
| 1086 | + if (sacked & TCPCB_SACKED_ACKED) |
---|
| 1087 | + return; |
---|
930 | 1088 | |
---|
931 | | -static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) |
---|
932 | | -{ |
---|
933 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
---|
934 | | - tcp_verify_retransmit_hint(tp, skb); |
---|
935 | | - |
---|
936 | | - tp->lost_out += tcp_skb_pcount(skb); |
---|
937 | | - tcp_sum_lost(tp, skb); |
---|
938 | | - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
---|
939 | | - } |
---|
940 | | -} |
---|
941 | | - |
---|
942 | | -void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) |
---|
943 | | -{ |
---|
944 | 1089 | tcp_verify_retransmit_hint(tp, skb); |
---|
945 | | - |
---|
946 | | - tcp_sum_lost(tp, skb); |
---|
947 | | - if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { |
---|
| 1090 | + if (sacked & TCPCB_LOST) { |
---|
| 1091 | + if (sacked & TCPCB_SACKED_RETRANS) { |
---|
| 1092 | + /* Account for retransmits that are lost again */ |
---|
| 1093 | + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
---|
| 1094 | + tp->retrans_out -= tcp_skb_pcount(skb); |
---|
| 1095 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, |
---|
| 1096 | + tcp_skb_pcount(skb)); |
---|
| 1097 | + tcp_notify_skb_loss_event(tp, skb); |
---|
| 1098 | + } |
---|
| 1099 | + } else { |
---|
948 | 1100 | tp->lost_out += tcp_skb_pcount(skb); |
---|
949 | 1101 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
---|
| 1102 | + tcp_notify_skb_loss_event(tp, skb); |
---|
950 | 1103 | } |
---|
| 1104 | +} |
---|
| 1105 | + |
---|
| 1106 | +/* Updates the delivered and delivered_ce counts */ |
---|
| 1107 | +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, |
---|
| 1108 | + bool ece_ack) |
---|
| 1109 | +{ |
---|
| 1110 | + tp->delivered += delivered; |
---|
| 1111 | + if (ece_ack) |
---|
| 1112 | + tp->delivered_ce += delivered; |
---|
951 | 1113 | } |
---|
952 | 1114 | |
---|
953 | 1115 | /* This procedure tags the retransmission queue when SACKs arrive. |
---|
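tcp_mark_skb_lost() above is a small state machine over the per-skb sacked bits: SACKed skbs are ignored, an already-LOST skb only needs accounting when its retransmission was lost again, and a fresh loss sets TCPCB_LOST. The sketch below walks the same decision logic over a plain bitmask; the flag values and counter names are illustrative, not the kernel's definitions.

```c
#include <stdio.h>

/* Illustrative flag values; the kernel defines its own TCPCB_* bits. */
#define CB_SACKED_ACKED		0x01
#define CB_SACKED_RETRANS	0x02
#define CB_LOST			0x04

struct counters {
	unsigned int lost_out, retrans_out, lost_retransmits, newly_lost;
};

static unsigned char mark_lost(unsigned char sacked, unsigned int pcount,
			       struct counters *c)
{
	if (sacked & CB_SACKED_ACKED)		/* already delivered via SACK */
		return sacked;

	if (sacked & CB_LOST) {
		if (sacked & CB_SACKED_RETRANS) {
			/* the retransmit itself was lost again */
			sacked &= ~CB_SACKED_RETRANS;
			c->retrans_out -= pcount;
			c->lost_retransmits += pcount;
		}
	} else {
		c->lost_out += pcount;
		c->newly_lost += pcount;
		sacked |= CB_LOST;
	}
	return sacked;
}

int main(void)
{
	struct counters c = { .retrans_out = 2 };
	unsigned char s = CB_LOST | CB_SACKED_RETRANS;

	s = mark_lost(s, 1, &c);
	printf("sacked=0x%x retrans_out=%u lost_retransmits=%u\n",
	       s, c.retrans_out, c.lost_retransmits);
	return 0;
}
```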
.. | .. |
---|
1082 | 1244 | |
---|
1083 | 1245 | static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, |
---|
1084 | 1246 | struct tcp_sack_block_wire *sp, int num_sacks, |
---|
1085 | | - u32 prior_snd_una) |
---|
| 1247 | + u32 prior_snd_una, struct tcp_sacktag_state *state) |
---|
1086 | 1248 | { |
---|
1087 | 1249 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1088 | 1250 | u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); |
---|
1089 | 1251 | u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); |
---|
1090 | | - bool dup_sack = false; |
---|
| 1252 | + u32 dup_segs; |
---|
1091 | 1253 | |
---|
1092 | 1254 | if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { |
---|
1093 | | - dup_sack = true; |
---|
1094 | | - tcp_dsack_seen(tp); |
---|
1095 | 1255 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); |
---|
1096 | 1256 | } else if (num_sacks > 1) { |
---|
1097 | 1257 | u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); |
---|
1098 | 1258 | u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); |
---|
1099 | 1259 | |
---|
1100 | | - if (!after(end_seq_0, end_seq_1) && |
---|
1101 | | - !before(start_seq_0, start_seq_1)) { |
---|
1102 | | - dup_sack = true; |
---|
1103 | | - tcp_dsack_seen(tp); |
---|
1104 | | - NET_INC_STATS(sock_net(sk), |
---|
1105 | | - LINUX_MIB_TCPDSACKOFORECV); |
---|
1106 | | - } |
---|
| 1260 | + if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) |
---|
| 1261 | + return false; |
---|
| 1262 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); |
---|
| 1263 | + } else { |
---|
| 1264 | + return false; |
---|
1107 | 1265 | } |
---|
1108 | 1266 | |
---|
| 1267 | + dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); |
---|
| 1268 | + if (!dup_segs) { /* Skip dubious DSACK */ |
---|
| 1269 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); |
---|
| 1270 | + return false; |
---|
| 1271 | + } |
---|
| 1272 | + |
---|
| 1273 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); |
---|
| 1274 | + |
---|
1109 | 1275 | /* D-SACK for already forgotten data... Do dumb counting. */ |
---|
1110 | | - if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && |
---|
| 1276 | + if (tp->undo_marker && tp->undo_retrans > 0 && |
---|
1111 | 1277 | !after(end_seq_0, prior_snd_una) && |
---|
1112 | 1278 | after(end_seq_0, tp->undo_marker)) |
---|
1113 | | - tp->undo_retrans--; |
---|
| 1279 | + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); |
---|
1114 | 1280 | |
---|
1115 | | - return dup_sack; |
---|
| 1281 | + return true; |
---|
1116 | 1282 | } |
---|
1117 | | - |
---|
1118 | | -struct tcp_sacktag_state { |
---|
1119 | | - u32 reord; |
---|
1120 | | - /* Timestamps for earliest and latest never-retransmitted segment |
---|
1121 | | - * that was SACKed. RTO needs the earliest RTT to stay conservative, |
---|
1122 | | - * but congestion control should still get an accurate delay signal. |
---|
1123 | | - */ |
---|
1124 | | - u64 first_sackt; |
---|
1125 | | - u64 last_sackt; |
---|
1126 | | - struct rate_sample *rate; |
---|
1127 | | - int flag; |
---|
1128 | | - unsigned int mss_now; |
---|
1129 | | -}; |
---|
1130 | 1283 | |
---|
1131 | 1284 | /* Check if skb is fully within the SACK block. In presence of GSO skbs, |
---|
1132 | 1285 | * the incoming SACK may not exactly match but we can find smaller MSS |
---|
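tcp_check_dsack() above classifies the first SACK block as a D-SACK either when it lies below the cumulative ACK or when it is fully contained in the second block, per RFC 2883. Here is a compact sketch of just that classification with wraparound-safe comparisons; it leaves out the dup_segs accounting and the undo_retrans bookkeeping.

```c
#include <stdint.h>
#include <stdio.h>

static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

/* RFC 2883: the first block reports duplicate data if it is below the
 * cumulative ACK, or (with >= 2 blocks) a subset of the second block. */
static int is_dsack(uint32_t start0, uint32_t end0, uint32_t ack_seq,
		    int num_sacks, uint32_t start1, uint32_t end1)
{
	if (before(start0, ack_seq))
		return 1;
	if (num_sacks > 1 &&
	    !after(end0, end1) && !before(start0, start1))
		return 1;
	return 0;
}

int main(void)
{
	/* block below the cumulative ACK -> D-SACK */
	printf("%d\n", is_dsack(900, 1000, 2000, 1, 0, 0));
	/* block contained in the second block -> D-SACK */
	printf("%d\n", is_dsack(3000, 3500, 2000, 2, 3000, 4000));
	/* ordinary SACK block above the ACK -> not a D-SACK */
	printf("%d\n", is_dsack(3000, 3500, 2000, 1, 0, 0));
	return 0;
}
```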
.. | .. |
---|
1246 | 1399 | sacked |= TCPCB_SACKED_ACKED; |
---|
1247 | 1400 | state->flag |= FLAG_DATA_SACKED; |
---|
1248 | 1401 | tp->sacked_out += pcount; |
---|
1249 | | - tp->delivered += pcount; /* Out-of-order packets delivered */ |
---|
| 1402 | + /* Out-of-order packets delivered */ |
---|
| 1403 | + state->sack_delivered += pcount; |
---|
1250 | 1404 | |
---|
1251 | 1405 | /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ |
---|
1252 | 1406 | if (tp->lost_skb_hint && |
---|
.. | .. |
---|
1289 | 1443 | */ |
---|
1290 | 1444 | tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, |
---|
1291 | 1445 | start_seq, end_seq, dup_sack, pcount, |
---|
1292 | | - skb->skb_mstamp); |
---|
| 1446 | + tcp_skb_timestamp_us(skb)); |
---|
1293 | 1447 | tcp_rate_skb_delivered(sk, skb, state->rate); |
---|
1294 | 1448 | |
---|
1295 | 1449 | if (skb == tp->lost_skb_hint) |
---|
.. | .. |
---|
1413 | 1567 | if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) |
---|
1414 | 1568 | goto fallback; |
---|
1415 | 1569 | |
---|
1416 | | - if (!tcp_skb_can_collapse_to(prev)) |
---|
| 1570 | + if (!tcp_skb_can_collapse(prev, skb)) |
---|
1417 | 1571 | goto fallback; |
---|
1418 | 1572 | |
---|
1419 | 1573 | in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && |
---|
.. | .. |
---|
1502 | 1656 | (mss != tcp_skb_seglen(skb))) |
---|
1503 | 1657 | goto out; |
---|
1504 | 1658 | |
---|
| 1659 | + if (!tcp_skb_can_collapse(prev, skb)) |
---|
| 1660 | + goto out; |
---|
1505 | 1661 | len = skb->len; |
---|
1506 | 1662 | pcount = tcp_skb_pcount(skb); |
---|
1507 | 1663 | if (tcp_skb_shift(prev, skb, pcount, len)) |
---|
.. | .. |
---|
1578 | 1734 | TCP_SKB_CB(skb)->end_seq, |
---|
1579 | 1735 | dup_sack, |
---|
1580 | 1736 | tcp_skb_pcount(skb), |
---|
1581 | | - skb->skb_mstamp); |
---|
| 1737 | + tcp_skb_timestamp_us(skb)); |
---|
1582 | 1738 | tcp_rate_skb_delivered(sk, skb, state->rate); |
---|
1583 | 1739 | if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
1584 | 1740 | list_del_init(&skb->tcp_tsorted_anchor); |
---|
.. | .. |
---|
1591 | 1747 | return skb; |
---|
1592 | 1748 | } |
---|
1593 | 1749 | |
---|
1594 | | -static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, |
---|
1595 | | - struct tcp_sacktag_state *state, |
---|
1596 | | - u32 seq) |
---|
| 1750 | +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) |
---|
1597 | 1751 | { |
---|
1598 | 1752 | struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; |
---|
1599 | 1753 | struct sk_buff *skb; |
---|
.. | .. |
---|
1615 | 1769 | } |
---|
1616 | 1770 | |
---|
1617 | 1771 | static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, |
---|
1618 | | - struct tcp_sacktag_state *state, |
---|
1619 | 1772 | u32 skip_to_seq) |
---|
1620 | 1773 | { |
---|
1621 | 1774 | if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) |
---|
1622 | 1775 | return skb; |
---|
1623 | 1776 | |
---|
1624 | | - return tcp_sacktag_bsearch(sk, state, skip_to_seq); |
---|
| 1777 | + return tcp_sacktag_bsearch(sk, skip_to_seq); |
---|
1625 | 1778 | } |
---|
1626 | 1779 | |
---|
1627 | 1780 | static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, |
---|
.. | .. |
---|
1634 | 1787 | return skb; |
---|
1635 | 1788 | |
---|
1636 | 1789 | if (before(next_dup->start_seq, skip_to_seq)) { |
---|
1637 | | - skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq); |
---|
| 1790 | + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); |
---|
1638 | 1791 | skb = tcp_sacktag_walk(skb, sk, NULL, state, |
---|
1639 | 1792 | next_dup->start_seq, next_dup->end_seq, |
---|
1640 | 1793 | 1); |
---|
.. | .. |
---|
1672 | 1825 | tcp_highest_sack_reset(sk); |
---|
1673 | 1826 | |
---|
1674 | 1827 | found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, |
---|
1675 | | - num_sacks, prior_snd_una); |
---|
1676 | | - if (found_dup_sack) { |
---|
1677 | | - state->flag |= FLAG_DSACKING_ACK; |
---|
1678 | | - tp->delivered++; /* A spurious retransmission is delivered */ |
---|
1679 | | - } |
---|
| 1828 | + num_sacks, prior_snd_una, state); |
---|
1680 | 1829 | |
---|
1681 | 1830 | /* Eliminate too old ACKs, but take into |
---|
1682 | 1831 | * account more or less fresh ones, they can |
---|
.. | .. |
---|
1778 | 1927 | |
---|
1779 | 1928 | /* Head todo? */ |
---|
1780 | 1929 | if (before(start_seq, cache->start_seq)) { |
---|
1781 | | - skb = tcp_sacktag_skip(skb, sk, state, |
---|
1782 | | - start_seq); |
---|
| 1930 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
---|
1783 | 1931 | skb = tcp_sacktag_walk(skb, sk, next_dup, |
---|
1784 | 1932 | state, |
---|
1785 | 1933 | start_seq, |
---|
.. | .. |
---|
1805 | 1953 | goto walk; |
---|
1806 | 1954 | } |
---|
1807 | 1955 | |
---|
1808 | | - skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq); |
---|
| 1956 | + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); |
---|
1809 | 1957 | /* Check overlap against next cached too (past this one already) */ |
---|
1810 | 1958 | cache++; |
---|
1811 | 1959 | continue; |
---|
.. | .. |
---|
1816 | 1964 | if (!skb) |
---|
1817 | 1965 | break; |
---|
1818 | 1966 | } |
---|
1819 | | - skb = tcp_sacktag_skip(skb, sk, state, start_seq); |
---|
| 1967 | + skb = tcp_sacktag_skip(skb, sk, start_seq); |
---|
1820 | 1968 | |
---|
1821 | 1969 | walk: |
---|
1822 | 1970 | skb = tcp_sacktag_walk(skb, sk, next_dup, state, |
---|
.. | .. |
---|
1878 | 2026 | return; |
---|
1879 | 2027 | |
---|
1880 | 2028 | tp->reordering = min_t(u32, tp->packets_out + addend, |
---|
1881 | | - sock_net(sk)->ipv4.sysctl_tcp_max_reordering); |
---|
| 2029 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); |
---|
1882 | 2030 | tp->reord_seen++; |
---|
1883 | 2031 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); |
---|
1884 | 2032 | } |
---|
1885 | 2033 | |
---|
1886 | 2034 | /* Emulate SACKs for SACKless connection: account for a new dupack. */ |
---|
1887 | 2035 | |
---|
1888 | | -static void tcp_add_reno_sack(struct sock *sk) |
---|
| 2036 | +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) |
---|
1889 | 2037 | { |
---|
1890 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
1891 | | - u32 prior_sacked = tp->sacked_out; |
---|
| 2038 | + if (num_dupack) { |
---|
| 2039 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2040 | + u32 prior_sacked = tp->sacked_out; |
---|
| 2041 | + s32 delivered; |
---|
1892 | 2042 | |
---|
1893 | | - tp->sacked_out++; |
---|
1894 | | - tcp_check_reno_reordering(sk, 0); |
---|
1895 | | - if (tp->sacked_out > prior_sacked) |
---|
1896 | | - tp->delivered++; /* Some out-of-order packet is delivered */ |
---|
1897 | | - tcp_verify_left_out(tp); |
---|
| 2043 | + tp->sacked_out += num_dupack; |
---|
| 2044 | + tcp_check_reno_reordering(sk, 0); |
---|
| 2045 | + delivered = tp->sacked_out - prior_sacked; |
---|
| 2046 | + if (delivered > 0) |
---|
| 2047 | + tcp_count_delivered(tp, delivered, ece_ack); |
---|
| 2048 | + tcp_verify_left_out(tp); |
---|
| 2049 | + } |
---|
1898 | 2050 | } |
---|
1899 | 2051 | |
---|
1900 | 2052 | /* Account for ACK, ACKing some data in Reno Recovery phase. */ |
---|
1901 | 2053 | |
---|
1902 | | -static void tcp_remove_reno_sacks(struct sock *sk, int acked) |
---|
| 2054 | +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) |
---|
1903 | 2055 | { |
---|
1904 | 2056 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1905 | 2057 | |
---|
1906 | 2058 | if (acked > 0) { |
---|
1907 | 2059 | /* One ACK acked hole. The rest eat duplicate ACKs. */ |
---|
1908 | | - tp->delivered += max_t(int, acked - tp->sacked_out, 1); |
---|
| 2060 | + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), |
---|
| 2061 | + ece_ack); |
---|
1909 | 2062 | if (acked - 1 >= tp->sacked_out) |
---|
1910 | 2063 | tp->sacked_out = 0; |
---|
1911 | 2064 | else |
---|
.. | .. |
---|
1938 | 2091 | |
---|
1939 | 2092 | static bool tcp_is_rack(const struct sock *sk) |
---|
1940 | 2093 | { |
---|
1941 | | - return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; |
---|
| 2094 | + return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & |
---|
| 2095 | + TCP_RACK_LOSS_DETECTION; |
---|
1942 | 2096 | } |
---|
1943 | 2097 | |
---|
1944 | 2098 | /* If we detect SACK reneging, forget all SACK information |
---|
.. | .. |
---|
1982 | 2136 | struct tcp_sock *tp = tcp_sk(sk); |
---|
1983 | 2137 | struct net *net = sock_net(sk); |
---|
1984 | 2138 | bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; |
---|
| 2139 | + u8 reordering; |
---|
1985 | 2140 | |
---|
1986 | 2141 | tcp_timeout_mark_lost(sk); |
---|
1987 | 2142 | |
---|
.. | .. |
---|
2002 | 2157 | /* Timeout in disordered state after receiving substantial DUPACKs |
---|
2003 | 2158 | * suggests that the degree of reordering is over-estimated. |
---|
2004 | 2159 | */ |
---|
| 2160 | + reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering); |
---|
2005 | 2161 | if (icsk->icsk_ca_state <= TCP_CA_Disorder && |
---|
2006 | | - tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) |
---|
| 2162 | + tp->sacked_out >= reordering) |
---|
2007 | 2163 | tp->reordering = min_t(unsigned int, tp->reordering, |
---|
2008 | | - net->ipv4.sysctl_tcp_reordering); |
---|
| 2164 | + reordering); |
---|
| 2165 | + |
---|
2009 | 2166 | tcp_set_ca_state(sk, TCP_CA_Loss); |
---|
2010 | 2167 | tp->high_seq = tp->snd_nxt; |
---|
2011 | 2168 | tcp_ecn_queue_cwr(tp); |
---|
.. | .. |
---|
2014 | 2171 | * loss recovery is underway except recurring timeout(s) on |
---|
2015 | 2172 | * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing |
---|
2016 | 2173 | */ |
---|
2017 | | - tp->frto = net->ipv4.sysctl_tcp_frto && |
---|
| 2174 | + tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) && |
---|
2018 | 2175 | (new_recovery || icsk->icsk_retransmits) && |
---|
2019 | 2176 | !inet_csk(sk)->icsk_mtup.probe_size; |
---|
2020 | 2177 | } |
---|
.. | .. |
---|
2031 | 2188 | */ |
---|
2032 | 2189 | static bool tcp_check_sack_reneging(struct sock *sk, int flag) |
---|
2033 | 2190 | { |
---|
2034 | | - if (flag & FLAG_SACK_RENEGING) { |
---|
| 2191 | + if (flag & FLAG_SACK_RENEGING && |
---|
| 2192 | + flag & FLAG_SND_UNA_ADVANCED) { |
---|
2035 | 2193 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2036 | 2194 | unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), |
---|
2037 | 2195 | msecs_to_jiffies(10)); |
---|
.. | .. |
---|
2172 | 2330 | } |
---|
2173 | 2331 | |
---|
2174 | 2332 | /* Detect loss in event "A" above by marking head of queue up as lost. |
---|
2175 | | - * For non-SACK(Reno) senders, the first "packets" number of segments |
---|
2176 | | - * are considered lost. For RFC3517 SACK, a segment is considered lost if it |
---|
| 2333 | + * For RFC3517 SACK, a segment is considered lost if it |
---|
2177 | 2334 | * has at least tp->reordering SACKed seqments above it; "packets" refers to |
---|
2178 | 2335 | * the maximum SACKed segments to pass before reaching this limit. |
---|
2179 | 2336 | */ |
---|
.. | .. |
---|
2181 | 2338 | { |
---|
2182 | 2339 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2183 | 2340 | struct sk_buff *skb; |
---|
2184 | | - int cnt, oldcnt, lost; |
---|
2185 | | - unsigned int mss; |
---|
| 2341 | + int cnt; |
---|
2186 | 2342 | /* Use SACK to deduce losses of new sequences sent during recovery */ |
---|
2187 | | - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; |
---|
| 2343 | + const u32 loss_high = tp->snd_nxt; |
---|
2188 | 2344 | |
---|
2189 | 2345 | WARN_ON(packets > tp->packets_out); |
---|
2190 | 2346 | skb = tp->lost_skb_hint; |
---|
.. | .. |
---|
2207 | 2363 | if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) |
---|
2208 | 2364 | break; |
---|
2209 | 2365 | |
---|
2210 | | - oldcnt = cnt; |
---|
2211 | | - if (tcp_is_reno(tp) || |
---|
2212 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
---|
| 2366 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
2213 | 2367 | cnt += tcp_skb_pcount(skb); |
---|
2214 | 2368 | |
---|
2215 | | - if (cnt > packets) { |
---|
2216 | | - if (tcp_is_sack(tp) || |
---|
2217 | | - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || |
---|
2218 | | - (oldcnt >= packets)) |
---|
2219 | | - break; |
---|
| 2369 | + if (cnt > packets) |
---|
| 2370 | + break; |
---|
2220 | 2371 | |
---|
2221 | | - mss = tcp_skb_mss(skb); |
---|
2222 | | - /* If needed, chop off the prefix to mark as lost. */ |
---|
2223 | | - lost = (packets - oldcnt) * mss; |
---|
2224 | | - if (lost < skb->len && |
---|
2225 | | - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, |
---|
2226 | | - lost, mss, GFP_ATOMIC) < 0) |
---|
2227 | | - break; |
---|
2228 | | - cnt = packets; |
---|
2229 | | - } |
---|
2230 | | - |
---|
2231 | | - tcp_skb_mark_lost(tp, skb); |
---|
| 2372 | + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) |
---|
| 2373 | + tcp_mark_skb_lost(sk, skb); |
---|
2232 | 2374 | |
---|
2233 | 2375 | if (mark_head) |
---|
2234 | 2376 | break; |
---|
.. | .. |
---|
2272 | 2414 | */ |
---|
2273 | 2415 | static inline bool tcp_packet_delayed(const struct tcp_sock *tp) |
---|
2274 | 2416 | { |
---|
2275 | | - return !tp->retrans_stamp || |
---|
| 2417 | + return tp->retrans_stamp && |
---|
2276 | 2418 | tcp_tsopt_ecr_before(tp, tp->retrans_stamp); |
---|
2277 | 2419 | } |
---|
2278 | 2420 | |
---|
.. | .. |
---|
2368 | 2510 | return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); |
---|
2369 | 2511 | } |
---|
2370 | 2512 | |
---|
| 2513 | +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) |
---|
| 2514 | +{ |
---|
| 2515 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2516 | + |
---|
| 2517 | + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
---|
| 2518 | + /* Hold old state until something *above* high_seq |
---|
| 2519 | + * is ACKed. For Reno it is MUST to prevent false |
---|
| 2520 | + * fast retransmits (RFC2582). SACK TCP is safe. */ |
---|
| 2521 | + if (!tcp_any_retrans_done(sk)) |
---|
| 2522 | + tp->retrans_stamp = 0; |
---|
| 2523 | + return true; |
---|
| 2524 | + } |
---|
| 2525 | + return false; |
---|
| 2526 | +} |
---|
| 2527 | + |
---|
2371 | 2528 | /* People celebrate: "We love our President!" */ |
---|
2372 | 2529 | static bool tcp_try_undo_recovery(struct sock *sk) |
---|
2373 | 2530 | { |
---|
.. | .. |
---|
2390 | 2547 | } else if (tp->rack.reo_wnd_persist) { |
---|
2391 | 2548 | tp->rack.reo_wnd_persist--; |
---|
2392 | 2549 | } |
---|
2393 | | - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { |
---|
2394 | | - /* Hold old state until something *above* high_seq |
---|
2395 | | - * is ACKed. For Reno it is MUST to prevent false |
---|
2396 | | - * fast retransmits (RFC2582). SACK TCP is safe. */ |
---|
2397 | | - if (!tcp_any_retrans_done(sk)) |
---|
2398 | | - tp->retrans_stamp = 0; |
---|
| 2550 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
---|
2399 | 2551 | return true; |
---|
2400 | | - } |
---|
2401 | 2552 | tcp_set_ca_state(sk, TCP_CA_Open); |
---|
2402 | 2553 | tp->is_sack_reneg = 0; |
---|
2403 | 2554 | return false; |
---|
.. | .. |
---|
2433 | 2584 | NET_INC_STATS(sock_net(sk), |
---|
2434 | 2585 | LINUX_MIB_TCPSPURIOUSRTOS); |
---|
2435 | 2586 | inet_csk(sk)->icsk_retransmits = 0; |
---|
| 2587 | + if (tcp_is_non_sack_preventing_reopen(sk)) |
---|
| 2588 | + return true; |
---|
2436 | 2589 | if (frto_undo || tcp_is_sack(tp)) { |
---|
2437 | 2590 | tcp_set_ca_state(sk, TCP_CA_Open); |
---|
2438 | 2591 | tp->is_sack_reneg = 0; |
---|
.. | .. |
---|
2479 | 2632 | u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + |
---|
2480 | 2633 | tp->prior_cwnd - 1; |
---|
2481 | 2634 | sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; |
---|
2482 | | - } else if ((flag & FLAG_RETRANS_DATA_ACKED) && |
---|
2483 | | - !(flag & FLAG_LOST_RETRANS)) { |
---|
| 2635 | + } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == |
---|
| 2636 | + FLAG_RETRANS_DATA_ACKED) { |
---|
2484 | 2637 | sndcnt = min_t(int, delta, |
---|
2485 | 2638 | max_t(int, tp->prr_delivered - tp->prr_out, |
---|
2486 | 2639 | newly_acked_sacked) + 1); |
---|
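The arithmetic above is Proportional Rate Reduction (RFC 6937): while recovery is in progress, the sender paces out segments in proportion to newly delivered data so cwnd converges smoothly towards ssthresh instead of collapsing at once. A hedged sketch of the proportional branch visible in the hunk:

```c
#include <stdint.h>
#include <stdio.h>

/* RFC 6937 proportional branch, as in the hunk above:
 *   sndcnt = ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
static int prr_sndcnt(uint32_t ssthresh, uint32_t prr_delivered,
		      uint32_t prr_out, uint32_t prior_cwnd)
{
	uint64_t dividend = (uint64_t)ssthresh * prr_delivered + prior_cwnd - 1;

	return (int)(dividend / prior_cwnd) - (int)prr_out;
}

int main(void)
{
	/* Example: cwnd was 20, ssthresh 10; after 8 packets delivered and
	 * 3 already (re)sent this epoch, we may send ceil(10*8/20) - 3 = 1. */
	printf("sndcnt = %d\n", prr_sndcnt(10, 8, 3, 20));
	return 0;
}
```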
.. | .. |
---|
2566 | 2719 | { |
---|
2567 | 2720 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2568 | 2721 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 2722 | + u64 val; |
---|
2569 | 2723 | |
---|
2570 | | - /* FIXME: breaks with very large cwnd */ |
---|
2571 | 2724 | tp->prior_ssthresh = tcp_current_ssthresh(sk); |
---|
2572 | | - tp->snd_cwnd = tp->snd_cwnd * |
---|
2573 | | - tcp_mss_to_mtu(sk, tp->mss_cache) / |
---|
2574 | | - icsk->icsk_mtup.probe_size; |
---|
| 2725 | + |
---|
| 2726 | + val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache); |
---|
| 2727 | + do_div(val, icsk->icsk_mtup.probe_size); |
---|
| 2728 | + WARN_ON_ONCE((u32)val != val); |
---|
| 2729 | + tp->snd_cwnd = max_t(u32, 1U, val); |
---|
| 2730 | + |
---|
2575 | 2731 | tp->snd_cwnd_cnt = 0; |
---|
2576 | 2732 | tp->snd_cwnd_stamp = tcp_jiffies32; |
---|
2577 | 2733 | tp->snd_ssthresh = tcp_current_ssthresh(sk); |
---|
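The MTU-probing success path above now does the cwnd rescale in 64 bits and clamps the result to at least 1, replacing a 32-bit multiply that could overflow with a very large cwnd. A small sketch of the same scaling, with made-up MTU and probe-size numbers:

```c
#include <stdint.h>
#include <stdio.h>

/* Scale cwnd by the old/new packet-size ratio without 32-bit overflow,
 * mirroring: val = cwnd * mss_to_mtu(old_mss); do_div(val, probe_size). */
static uint32_t rescale_cwnd(uint32_t cwnd, uint32_t old_mtu, uint32_t probe_size)
{
	uint64_t val = (uint64_t)cwnd * old_mtu;

	val /= probe_size;
	if (val < 1)
		val = 1;	/* never let cwnd collapse to zero */
	return (uint32_t)val;
}

int main(void)
{
	/* e.g. cwnd of 10000 segments, 1500-byte MTU, successful 4352-byte probe */
	printf("new cwnd = %u\n", rescale_cwnd(10000, 1500, 4352));
	return 0;
}
```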
.. | .. |
---|
2594 | 2750 | unsigned int mss = tcp_current_mss(sk); |
---|
2595 | 2751 | |
---|
2596 | 2752 | skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { |
---|
2597 | | - if (tcp_skb_seglen(skb) > mss && |
---|
2598 | | - !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
---|
2599 | | - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { |
---|
2600 | | - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
---|
2601 | | - tp->retrans_out -= tcp_skb_pcount(skb); |
---|
2602 | | - } |
---|
2603 | | - tcp_skb_mark_lost_uncond_verify(tp, skb); |
---|
2604 | | - } |
---|
| 2753 | + if (tcp_skb_seglen(skb) > mss) |
---|
| 2754 | + tcp_mark_skb_lost(sk, skb); |
---|
2605 | 2755 | } |
---|
2606 | 2756 | |
---|
2607 | 2757 | tcp_clear_retrans_hints_partial(tp); |
---|
.. | .. |
---|
2656 | 2806 | /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are |
---|
2657 | 2807 | * recovered or spurious. Otherwise retransmits more on partial ACKs. |
---|
2658 | 2808 | */ |
---|
2659 | | -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, |
---|
| 2809 | +static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, |
---|
2660 | 2810 | int *rexmit) |
---|
2661 | 2811 | { |
---|
2662 | 2812 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2663 | 2813 | bool recovered = !before(tp->snd_una, tp->high_seq); |
---|
2664 | 2814 | |
---|
2665 | | - if ((flag & FLAG_SND_UNA_ADVANCED) && |
---|
| 2815 | + if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && |
---|
2666 | 2816 | tcp_try_undo_loss(sk, false)) |
---|
2667 | 2817 | return; |
---|
2668 | 2818 | |
---|
.. | .. |
---|
2675 | 2825 | return; |
---|
2676 | 2826 | |
---|
2677 | 2827 | if (after(tp->snd_nxt, tp->high_seq)) { |
---|
2678 | | - if (flag & FLAG_DATA_SACKED || is_dupack) |
---|
| 2828 | + if (flag & FLAG_DATA_SACKED || num_dupack) |
---|
2679 | 2829 | tp->frto = 0; /* Step 3.a. loss was real */ |
---|
2680 | 2830 | } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { |
---|
2681 | 2831 | tp->high_seq = tp->snd_nxt; |
---|
.. | .. |
---|
2701 | 2851 | /* A Reno DUPACK means new data in F-RTO step 2.b above are |
---|
2702 | 2852 | * delivered. Lower inflight to clock out (re)tranmissions. |
---|
2703 | 2853 | */ |
---|
2704 | | - if (after(tp->snd_nxt, tp->high_seq) && is_dupack) |
---|
2705 | | - tcp_add_reno_sack(sk); |
---|
| 2854 | + if (after(tp->snd_nxt, tp->high_seq) && num_dupack) |
---|
| 2855 | + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); |
---|
2706 | 2856 | else if (flag & FLAG_SND_UNA_ADVANCED) |
---|
2707 | 2857 | tcp_reset_reno_sack(tp); |
---|
2708 | 2858 | } |
---|
2709 | 2859 | *rexmit = REXMIT_LOST; |
---|
2710 | 2860 | } |
---|
2711 | 2861 | |
---|
| 2862 | +static bool tcp_force_fast_retransmit(struct sock *sk) |
---|
| 2863 | +{ |
---|
| 2864 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 2865 | + |
---|
| 2866 | + return after(tcp_highest_sack_seq(tp), |
---|
| 2867 | + tp->snd_una + tp->reordering * tp->mss_cache); |
---|
| 2868 | +} |
---|
| 2869 | + |
---|
2712 | 2870 | /* Undo during fast recovery after partial ACK. */ |
---|
2713 | | -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) |
---|
| 2871 | +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, |
---|
| 2872 | + bool *do_lost) |
---|
2714 | 2873 | { |
---|
2715 | 2874 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2716 | 2875 | |
---|
.. | .. |
---|
2735 | 2894 | tcp_undo_cwnd_reduction(sk, true); |
---|
2736 | 2895 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); |
---|
2737 | 2896 | tcp_try_keep_open(sk); |
---|
2738 | | - return true; |
---|
| 2897 | + } else { |
---|
| 2898 | + /* Partial ACK arrived. Force fast retransmit. */ |
---|
| 2899 | + *do_lost = tcp_force_fast_retransmit(sk); |
---|
2739 | 2900 | } |
---|
2740 | 2901 | return false; |
---|
2741 | 2902 | } |
---|
.. | .. |
---|
2759 | 2920 | } |
---|
2760 | 2921 | } |
---|
2761 | 2922 | |
---|
2762 | | -static bool tcp_force_fast_retransmit(struct sock *sk) |
---|
2763 | | -{ |
---|
2764 | | - struct tcp_sock *tp = tcp_sk(sk); |
---|
2765 | | - |
---|
2766 | | - return after(tcp_highest_sack_seq(tp), |
---|
2767 | | - tp->snd_una + tp->reordering * tp->mss_cache); |
---|
2768 | | -} |
---|
2769 | | - |
---|
2770 | 2923 | /* Process an event, which can update packets-in-flight not trivially. |
---|
2771 | 2924 | * Main goal of this function is to calculate new estimate for left_out, |
---|
2772 | 2925 | * taking into account both packets sitting in receiver's buffer and |
---|
.. | .. |
---|
2780 | 2933 | * tcp_xmit_retransmit_queue(). |
---|
2781 | 2934 | */ |
---|
2782 | 2935 | static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, |
---|
2783 | | - bool is_dupack, int *ack_flag, int *rexmit) |
---|
| 2936 | + int num_dupack, int *ack_flag, int *rexmit) |
---|
2784 | 2937 | { |
---|
2785 | 2938 | struct inet_connection_sock *icsk = inet_csk(sk); |
---|
2786 | 2939 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2787 | 2940 | int fast_rexmit = 0, flag = *ack_flag; |
---|
2788 | | - bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
---|
2789 | | - tcp_force_fast_retransmit(sk)); |
---|
| 2941 | + bool ece_ack = flag & FLAG_ECE; |
---|
| 2942 | + bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && |
---|
| 2943 | + tcp_force_fast_retransmit(sk)); |
---|
2790 | 2944 | |
---|
2791 | 2945 | if (!tp->packets_out && tp->sacked_out) |
---|
2792 | 2946 | tp->sacked_out = 0; |
---|
2793 | 2947 | |
---|
2794 | 2948 | /* Now state machine starts. |
---|
2795 | 2949 | * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ |
---|
2796 | | - if (flag & FLAG_ECE) |
---|
| 2950 | + if (ece_ack) |
---|
2797 | 2951 | tp->prior_ssthresh = 0; |
---|
2798 | 2952 | |
---|
2799 | 2953 | /* B. In all the states check for reneging SACKs. */ |
---|
.. | .. |
---|
2833 | 2987 | switch (icsk->icsk_ca_state) { |
---|
2834 | 2988 | case TCP_CA_Recovery: |
---|
2835 | 2989 | if (!(flag & FLAG_SND_UNA_ADVANCED)) { |
---|
2836 | | - if (tcp_is_reno(tp) && is_dupack) |
---|
2837 | | - tcp_add_reno_sack(sk); |
---|
2838 | | - } else { |
---|
2839 | | - if (tcp_try_undo_partial(sk, prior_snd_una)) |
---|
2840 | | - return; |
---|
2841 | | - /* Partial ACK arrived. Force fast retransmit. */ |
---|
2842 | | - do_lost = tcp_is_reno(tp) || |
---|
2843 | | - tcp_force_fast_retransmit(sk); |
---|
2844 | | - } |
---|
2845 | | - if (tcp_try_undo_dsack(sk)) { |
---|
2846 | | - tcp_try_keep_open(sk); |
---|
| 2990 | + if (tcp_is_reno(tp)) |
---|
| 2991 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
---|
| 2992 | + } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) |
---|
2847 | 2993 | return; |
---|
2848 | | - } |
---|
| 2994 | + |
---|
| 2995 | + if (tcp_try_undo_dsack(sk)) |
---|
| 2996 | + tcp_try_keep_open(sk); |
---|
| 2997 | + |
---|
2849 | 2998 | tcp_identify_packet_loss(sk, ack_flag); |
---|
| 2999 | + if (icsk->icsk_ca_state != TCP_CA_Recovery) { |
---|
| 3000 | + if (!tcp_time_to_recover(sk, flag)) |
---|
| 3001 | + return; |
---|
| 3002 | + /* Undo reverts the recovery state. If loss is evident, |
---|
| 3003 | + * starts a new recovery (e.g. reordering then loss); |
---|
| 3004 | + */ |
---|
| 3005 | + tcp_enter_recovery(sk, ece_ack); |
---|
| 3006 | + } |
---|
2850 | 3007 | break; |
---|
2851 | 3008 | case TCP_CA_Loss: |
---|
2852 | | - tcp_process_loss(sk, flag, is_dupack, rexmit); |
---|
| 3009 | + tcp_process_loss(sk, flag, num_dupack, rexmit); |
---|
2853 | 3010 | tcp_identify_packet_loss(sk, ack_flag); |
---|
2854 | 3011 | if (!(icsk->icsk_ca_state == TCP_CA_Open || |
---|
2855 | 3012 | (*ack_flag & FLAG_LOST_RETRANS))) |
---|
2856 | 3013 | return; |
---|
2857 | 3014 | /* Change state if cwnd is undone or retransmits are lost */ |
---|
2858 | | - /* fall through */ |
---|
| 3015 | + fallthrough; |
---|
2859 | 3016 | default: |
---|
2860 | 3017 | if (tcp_is_reno(tp)) { |
---|
2861 | 3018 | if (flag & FLAG_SND_UNA_ADVANCED) |
---|
2862 | 3019 | tcp_reset_reno_sack(tp); |
---|
2863 | | - if (is_dupack) |
---|
2864 | | - tcp_add_reno_sack(sk); |
---|
| 3020 | + tcp_add_reno_sack(sk, num_dupack, ece_ack); |
---|
2865 | 3021 | } |
---|
2866 | 3022 | |
---|
2867 | 3023 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
---|
.. | .. |
---|
2885 | 3041 | } |
---|
2886 | 3042 | |
---|
2887 | 3043 | /* Otherwise enter Recovery state */ |
---|
2888 | | - tcp_enter_recovery(sk, (flag & FLAG_ECE)); |
---|
| 3044 | + tcp_enter_recovery(sk, ece_ack); |
---|
2889 | 3045 | fast_rexmit = 1; |
---|
2890 | 3046 | } |
---|
2891 | 3047 | |
---|
.. | .. |
---|
2896 | 3052 | |
---|
2897 | 3053 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) |
---|
2898 | 3054 | { |
---|
2899 | | - u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; |
---|
| 3055 | + u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ; |
---|
2900 | 3056 | struct tcp_sock *tp = tcp_sk(sk); |
---|
2901 | 3057 | |
---|
2902 | 3058 | if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { |
---|
.. | .. |
---|
2935 | 3091 | u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; |
---|
2936 | 3092 | |
---|
2937 | 3093 | if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { |
---|
| 3094 | + if (!delta) |
---|
| 3095 | + delta = 1; |
---|
2938 | 3096 | seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); |
---|
2939 | 3097 | ca_rtt_us = seq_rtt_us; |
---|
2940 | 3098 | } |
---|
.. | .. |
---|
2988 | 3146 | /* If the retrans timer is currently being used by Fast Open |
---|
2989 | 3147 | * for SYN-ACK retrans purpose, stay put. |
---|
2990 | 3148 | */ |
---|
2991 | | - if (tp->fastopen_rsk) |
---|
| 3149 | + if (rcu_access_pointer(tp->fastopen_rsk)) |
---|
2992 | 3150 | return; |
---|
2993 | 3151 | |
---|
2994 | 3152 | if (!tp->packets_out) { |
---|
.. | .. |
---|
3004 | 3162 | */ |
---|
3005 | 3163 | rto = usecs_to_jiffies(max_t(int, delta_us, 1)); |
---|
3006 | 3164 | } |
---|
3007 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
---|
3008 | | - TCP_RTO_MAX); |
---|
| 3165 | + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
---|
| 3166 | + TCP_RTO_MAX); |
---|
3009 | 3167 | } |
---|
3010 | 3168 | } |
---|
3011 | 3169 | |
---|
.. | .. |
---|
3061 | 3219 | */ |
---|
3062 | 3220 | static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, |
---|
3063 | 3221 | u32 prior_snd_una, |
---|
3064 | | - struct tcp_sacktag_state *sack) |
---|
| 3222 | + struct tcp_sacktag_state *sack, bool ece_ack) |
---|
3065 | 3223 | { |
---|
3066 | 3224 | const struct inet_connection_sock *icsk = inet_csk(sk); |
---|
3067 | 3225 | u64 first_ackt, last_ackt; |
---|
.. | .. |
---|
3086 | 3244 | u8 sacked = scb->sacked; |
---|
3087 | 3245 | u32 acked_pcount; |
---|
3088 | 3246 | |
---|
3089 | | - tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
3090 | | - |
---|
3091 | 3247 | /* Determine how many packets and what bytes were acked, tso and else */ |
---|
3092 | 3248 | if (after(scb->end_seq, tp->snd_una)) { |
---|
3093 | 3249 | if (tcp_skb_pcount(skb) == 1 || |
---|
.. | .. |
---|
3107 | 3263 | tp->retrans_out -= acked_pcount; |
---|
3108 | 3264 | flag |= FLAG_RETRANS_DATA_ACKED; |
---|
3109 | 3265 | } else if (!(sacked & TCPCB_SACKED_ACKED)) { |
---|
3110 | | - last_ackt = skb->skb_mstamp; |
---|
| 3266 | + last_ackt = tcp_skb_timestamp_us(skb); |
---|
3111 | 3267 | WARN_ON_ONCE(last_ackt == 0); |
---|
3112 | 3268 | if (!first_ackt) |
---|
3113 | 3269 | first_ackt = last_ackt; |
---|
.. | .. |
---|
3122 | 3278 | if (sacked & TCPCB_SACKED_ACKED) { |
---|
3123 | 3279 | tp->sacked_out -= acked_pcount; |
---|
3124 | 3280 | } else if (tcp_is_sack(tp)) { |
---|
3125 | | - tp->delivered += acked_pcount; |
---|
| 3281 | + tcp_count_delivered(tp, acked_pcount, ece_ack); |
---|
3126 | 3282 | if (!tcp_skb_spurious_retrans(tp, skb)) |
---|
3127 | 3283 | tcp_rack_advance(tp, sacked, scb->end_seq, |
---|
3128 | | - skb->skb_mstamp); |
---|
| 3284 | + tcp_skb_timestamp_us(skb)); |
---|
3129 | 3285 | } |
---|
3130 | 3286 | if (sacked & TCPCB_LOST) |
---|
3131 | 3287 | tp->lost_out -= acked_pcount; |
---|
.. | .. |
---|
3151 | 3307 | if (!fully_acked) |
---|
3152 | 3308 | break; |
---|
3153 | 3309 | |
---|
| 3310 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
| 3311 | + |
---|
3154 | 3312 | next = skb_rb_next(skb); |
---|
3155 | 3313 | if (unlikely(skb == tp->retransmit_skb_hint)) |
---|
3156 | 3314 | tp->retransmit_skb_hint = NULL; |
---|
.. | .. |
---|
3166 | 3324 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) |
---|
3167 | 3325 | tp->snd_up = tp->snd_una; |
---|
3168 | 3326 | |
---|
3169 | | - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
---|
3170 | | - flag |= FLAG_SACK_RENEGING; |
---|
| 3327 | + if (skb) { |
---|
| 3328 | + tcp_ack_tstamp(sk, skb, prior_snd_una); |
---|
| 3329 | + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) |
---|
| 3330 | + flag |= FLAG_SACK_RENEGING; |
---|
| 3331 | + } |
---|
3171 | 3332 | |
---|
3172 | 3333 | if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { |
---|
3173 | 3334 | seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); |
---|
.. | .. |
---|
3199 | 3360 | } |
---|
3200 | 3361 | |
---|
3201 | 3362 | if (tcp_is_reno(tp)) { |
---|
3202 | | - tcp_remove_reno_sacks(sk, pkts_acked); |
---|
| 3363 | + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); |
---|
3203 | 3364 | |
---|
3204 | 3365 | /* If any of the cumulatively ACKed segments was |
---|
3205 | 3366 | * retransmitted, non-SACK case cannot confirm that |
---|
.. | .. |
---|
3220 | 3381 | tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); |
---|
3221 | 3382 | } |
---|
3222 | 3383 | } else if (skb && rtt_update && sack_rtt_us >= 0 && |
---|
3223 | | - sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) { |
---|
| 3384 | + sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, |
---|
| 3385 | + tcp_skb_timestamp_us(skb))) { |
---|
3224 | 3386 | /* Do not re-arm RTO if the sack RTT is measured from data sent |
---|
3225 | 3387 | * after when the head was last (re)transmitted. Otherwise the |
---|
3226 | 3388 | * timeout may continue to extend in loss recovery. |
---|
.. | .. |
---|
3273 | 3435 | return; |
---|
3274 | 3436 | if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { |
---|
3275 | 3437 | icsk->icsk_backoff = 0; |
---|
| 3438 | + icsk->icsk_probes_tstamp = 0; |
---|
3276 | 3439 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); |
---|
3277 | 3440 | /* Socket must be woken up by subsequent tcp_data_snd_check(). 
---|
3278 | 3441 | * This function is not for random use! 
---|
.. | .. |
---|
3280 | 3443 | } else { |
---|
3281 | 3444 | unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); |
---|
3282 | 3445 | |
---|
3283 | | - inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
---|
3284 | | - when, TCP_RTO_MAX); |
---|
| 3446 | + when = tcp_clamp_probe0_to_user_timeout(sk, when); |
---|
| 3447 | + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); |
---|
3285 | 3448 | } |
---|
3286 | 3449 | } |
---|
3287 | 3450 | |
---|
.. | .. |
---|
3300 | 3463 | * new SACK or ECE mark may first advance cwnd here and later reduce |
---|
3301 | 3464 | * cwnd in tcp_fastretrans_alert() based on more states. |
---|
3302 | 3465 | */ |
---|
3303 | | - if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) |
---|
| 3466 | + if (tcp_sk(sk)->reordering > |
---|
| 3467 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering)) |
---|
3304 | 3468 | return flag & FLAG_FORWARD_PROGRESS; |
---|
3305 | 3469 | |
---|
3306 | 3470 | return flag & FLAG_DATA_ACKED; |
---|
.. | .. |
---|
3409 | 3573 | static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, |
---|
3410 | 3574 | u32 *last_oow_ack_time) |
---|
3411 | 3575 | { |
---|
3412 | | - if (*last_oow_ack_time) { |
---|
3413 | | - s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); |
---|
| 3576 | + /* Paired with the WRITE_ONCE() in this function. */ |
---|
| 3577 | + u32 val = READ_ONCE(*last_oow_ack_time); |
---|
3414 | 3578 | |
---|
3415 | | - if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { |
---|
| 3579 | + if (val) { |
---|
| 3580 | + s32 elapsed = (s32)(tcp_jiffies32 - val); |
---|
| 3581 | + |
---|
| 3582 | + if (0 <= elapsed && |
---|
| 3583 | + elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) { |
---|
3416 | 3584 | NET_INC_STATS(net, mib_idx); |
---|
3417 | 3585 | return true; /* rate-limited: don't send yet! */ |
---|
3418 | 3586 | } |
---|
3419 | 3587 | } |
---|
3420 | 3588 | |
---|
3421 | | - *last_oow_ack_time = tcp_jiffies32; |
---|
| 3589 | + /* Paired with the prior READ_ONCE() and with itself, |
---|
| 3590 | + * as we might be lockless. |
---|
| 3591 | + */ |
---|
| 3592 | + WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32); |
---|
3422 | 3593 | |
---|
3423 | 3594 | return false; /* not rate-limited: go ahead, send dupack now! */ |
---|
3424 | 3595 | } |
---|
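Note: __tcp_oow_rate_limited() above allows at most one out-of-window response per sysctl_tcp_invalid_ratelimit interval, reading and writing the shared timestamp with READ_ONCE()/WRITE_ONCE() because callers may race locklessly. The following standalone userspace sketch mirrors just that elapsed-time check; the millisecond tick, the 500 ms limit, and the C11 atomics standing in for the kernel accessors are assumptions for illustration, not the kernel code itself.

/* Userspace sketch of the out-of-window rate limiter: allow at most one
 * response per "ratelimit_ms" milliseconds. C11 atomics stand in for the
 * READ_ONCE()/WRITE_ONCE() pairing in the hunk above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool oow_rate_limited(_Atomic uint32_t *last_ms, uint32_t now_ms,
			     uint32_t ratelimit_ms)
{
	uint32_t val = atomic_load_explicit(last_ms, memory_order_relaxed);

	if (val) {
		int32_t elapsed = (int32_t)(now_ms - val);

		if (elapsed >= 0 && (uint32_t)elapsed < ratelimit_ms)
			return true;	/* rate-limited: don't respond yet */
	}
	atomic_store_explicit(last_ms, now_ms, memory_order_relaxed);
	return false;			/* not limited: respond now */
}

int main(void)
{
	_Atomic uint32_t last = 0;
	uint32_t times[] = { 1000, 1200, 1600 };	/* ms */

	for (unsigned i = 0; i < 3; i++)
		printf("t=%u limited=%d\n", times[i],
		       oow_rate_limited(&last, times[i], 500));
	return 0;
}

With a 500 ms limit, the second attempt at t=1200 is suppressed and the third at t=1600 goes through, which is the behaviour the kernel helper gives duplicate ACKs and challenge ACKs.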
.. | .. |
---|
3459 | 3630 | |
---|
3460 | 3631 | /* Then check host-wide RFC 5961 rate limit. */ |
---|
3461 | 3632 | now = jiffies / HZ; |
---|
3462 | | - if (now != challenge_timestamp) { |
---|
3463 | | - u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; |
---|
| 3633 | + if (now != READ_ONCE(challenge_timestamp)) { |
---|
| 3634 | + u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); |
---|
3464 | 3635 | u32 half = (ack_limit + 1) >> 1; |
---|
3465 | 3636 | |
---|
3466 | | - challenge_timestamp = now; |
---|
| 3637 | + WRITE_ONCE(challenge_timestamp, now); |
---|
3467 | 3638 | WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); |
---|
3468 | 3639 | } |
---|
3469 | 3640 | count = READ_ONCE(challenge_count); |
---|
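Note: the RFC 5961 limiter shown above refreshes, once per second, a host-wide challenge-ACK budget chosen at random in roughly [limit/2, 3*limit/2) so an attacker cannot probe the exact count. A minimal sketch of that refresh-and-consume logic follows; the rand() source, the single-threaded bookkeeping, and the 1000-packet limit are illustrative assumptions, not the kernel's implementation.

/* Sketch of the per-second challenge-ACK budget: each new second the budget
 * is reset to a random value around the configured limit, and every
 * challenge ACK consumes one unit until the budget runs out.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t challenge_ts;		/* last refresh, in seconds */
static uint32_t challenge_count;	/* remaining budget */

static bool challenge_ack_allowed(uint32_t now_sec, uint32_t ack_limit)
{
	if (now_sec != challenge_ts) {
		uint32_t half = (ack_limit + 1) / 2;

		challenge_ts = now_sec;
		challenge_count = half + (uint32_t)rand() % ack_limit;
	}
	if (challenge_count > 0) {
		challenge_count--;
		return true;		/* send the challenge ACK */
	}
	return false;			/* over budget for this second */
}

int main(void)
{
	unsigned int sent = 0;

	srand(1);
	for (int i = 0; i < 2000; i++)	/* 2000 attempts within one second */
		if (challenge_ack_allowed(42, 1000))
			sent++;
	printf("sent %u challenge ACKs this second\n", sent);
	return 0;
}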
.. | .. |
---|
3544 | 3715 | { |
---|
3545 | 3716 | struct tcp_sock *tp = tcp_sk(sk); |
---|
3546 | 3717 | |
---|
3547 | | - if (rexmit == REXMIT_NONE) |
---|
| 3718 | + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) |
---|
3548 | 3719 | return; |
---|
3549 | 3720 | |
---|
3550 | | - if (unlikely(rexmit == 2)) { |
---|
| 3721 | + if (unlikely(rexmit == REXMIT_NEW)) { |
---|
3551 | 3722 | __tcp_push_pending_frames(sk, tcp_current_mss(sk), |
---|
3552 | 3723 | TCP_NAGLE_OFF); |
---|
3553 | 3724 | if (after(tp->snd_nxt, tp->high_seq)) |
---|
.. | .. |
---|
3566 | 3737 | |
---|
3567 | 3738 | delivered = tp->delivered - prior_delivered; |
---|
3568 | 3739 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); |
---|
3569 | | - if (flag & FLAG_ECE) { |
---|
3570 | | - tp->delivered_ce += delivered; |
---|
| 3740 | + if (flag & FLAG_ECE) |
---|
3571 | 3741 | NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); |
---|
3572 | | - } |
---|
| 3742 | + |
---|
3573 | 3743 | return delivered; |
---|
3574 | 3744 | } |
---|
3575 | 3745 | |
---|
.. | .. |
---|
3584 | 3754 | bool is_sack_reneg = tp->is_sack_reneg; |
---|
3585 | 3755 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
---|
3586 | 3756 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
---|
3587 | | - bool is_dupack = false; |
---|
| 3757 | + int num_dupack = 0; |
---|
3588 | 3758 | int prior_packets = tp->packets_out; |
---|
3589 | 3759 | u32 delivered = tp->delivered; |
---|
3590 | 3760 | u32 lost = tp->lost; |
---|
.. | .. |
---|
3593 | 3763 | |
---|
3594 | 3764 | sack_state.first_sackt = 0; |
---|
3595 | 3765 | sack_state.rate = &rs; |
---|
| 3766 | + sack_state.sack_delivered = 0; |
---|
3596 | 3767 | |
---|
3597 | 3768 | /* We very likely will need to access rtx queue. */ |
---|
3598 | 3769 | prefetch(sk->tcp_rtx_queue.rb_node); |
---|
.. | .. |
---|
3614 | 3785 | * this segment (RFC793 Section 3.9). |
---|
3615 | 3786 | */ |
---|
3616 | 3787 | if (after(ack, tp->snd_nxt)) |
---|
3617 | | - goto invalid_ack; |
---|
| 3788 | + return -1; |
---|
3618 | 3789 | |
---|
3619 | 3790 | if (after(ack, prior_snd_una)) { |
---|
3620 | 3791 | flag |= FLAG_SND_UNA_ADVANCED; |
---|
3621 | 3792 | icsk->icsk_retransmits = 0; |
---|
3622 | 3793 | |
---|
3623 | 3794 | #if IS_ENABLED(CONFIG_TLS_DEVICE) |
---|
3624 | | - if (static_branch_unlikely(&clean_acked_data_enabled)) |
---|
| 3795 | + if (static_branch_unlikely(&clean_acked_data_enabled.key)) |
---|
3625 | 3796 | if (icsk->icsk_clean_acked) |
---|
3626 | 3797 | icsk->icsk_clean_acked(sk, ack); |
---|
3627 | 3798 | #endif |
---|
.. | .. |
---|
3636 | 3807 | if (flag & FLAG_UPDATE_TS_RECENT) |
---|
3637 | 3808 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); |
---|
3638 | 3809 | |
---|
3639 | | - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
---|
| 3810 | + if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == |
---|
| 3811 | + FLAG_SND_UNA_ADVANCED) { |
---|
3640 | 3812 | /* Window is constant, pure forward advance. |
---|
3641 | 3813 | * No more checks are required. |
---|
3642 | 3814 | * Note, we use the fact that SND.UNA>=SND.WL2. |
---|
.. | .. |
---|
3667 | 3839 | ack_ev_flags |= CA_ACK_ECE; |
---|
3668 | 3840 | } |
---|
3669 | 3841 | |
---|
| 3842 | + if (sack_state.sack_delivered) |
---|
| 3843 | + tcp_count_delivered(tp, sack_state.sack_delivered, |
---|
| 3844 | + flag & FLAG_ECE); |
---|
| 3845 | + |
---|
3670 | 3846 | if (flag & FLAG_WIN_UPDATE) |
---|
3671 | 3847 | ack_ev_flags |= CA_ACK_WIN_UPDATE; |
---|
3672 | 3848 | |
---|
.. | .. |
---|
3692 | 3868 | goto no_queue; |
---|
3693 | 3869 | |
---|
3694 | 3870 | /* See if we can take anything off of the retransmit queue. */ |
---|
3695 | | - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); |
---|
| 3871 | + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, |
---|
| 3872 | + flag & FLAG_ECE); |
---|
3696 | 3873 | |
---|
3697 | 3874 | tcp_rack_update_reo_wnd(sk, &rs); |
---|
3698 | 3875 | |
---|
.. | .. |
---|
3700 | 3877 | tcp_process_tlp_ack(sk, ack, flag); |
---|
3701 | 3878 | |
---|
3702 | 3879 | if (tcp_ack_is_dubious(sk, flag)) { |
---|
3703 | | - is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
---|
3704 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3880 | + if (!(flag & (FLAG_SND_UNA_ADVANCED | |
---|
| 3881 | + FLAG_NOT_DUP | FLAG_DSACKING_ACK))) { |
---|
| 3882 | + num_dupack = 1; |
---|
| 3883 | + /* Consider if pure acks were aggregated in tcp_add_backlog() */ |
---|
| 3884 | + if (!(flag & FLAG_DATA)) |
---|
| 3885 | + num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
---|
| 3886 | + } |
---|
| 3887 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3705 | 3888 | &rexmit); |
---|
3706 | 3889 | } |
---|
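Note: the hunk above replaces the old is_dupack boolean with num_dupack so that pure ACKs coalesced by tcp_add_backlog() (visible through gso_segs) still count as the right number of duplicate ACKs toward fast retransmit. A tiny sketch of that counting rule, with the flag inputs reduced to plain booleans for illustration:

/* How many duplicate ACKs does one received skb represent?
 * Pure ACKs may have been aggregated, so gso_segs says how many.
 */
#include <stdbool.h>
#include <stdio.h>

static int count_dupacks(bool snd_una_advanced, bool not_dup,
			 bool dsacking, bool carries_data, int gso_segs)
{
	if (snd_una_advanced || not_dup || dsacking)
		return 0;			/* not a duplicate ACK at all */
	if (carries_data)
		return 1;			/* data-bearing segments are not aggregated here */
	return gso_segs > 1 ? gso_segs : 1;	/* coalesced pure ACKs */
}

int main(void)
{
	printf("%d\n", count_dupacks(false, false, false, false, 4)); /* 4 */
	printf("%d\n", count_dupacks(false, false, false, true, 4));  /* 1 */
	printf("%d\n", count_dupacks(true, false, false, false, 4));  /* 0 */
	return 0;
}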
3707 | 3890 | |
---|
.. | .. |
---|
3723 | 3906 | no_queue: |
---|
3724 | 3907 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
---|
3725 | 3908 | if (flag & FLAG_DSACKING_ACK) { |
---|
3726 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3909 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3727 | 3910 | &rexmit); |
---|
3728 | 3911 | tcp_newly_delivered(sk, delivered, flag); |
---|
3729 | 3912 | } |
---|
.. | .. |
---|
3737 | 3920 | tcp_process_tlp_ack(sk, ack, flag); |
---|
3738 | 3921 | return 1; |
---|
3739 | 3922 | |
---|
3740 | | -invalid_ack: |
---|
3741 | | - SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
---|
3742 | | - return -1; |
---|
3743 | | - |
---|
3744 | 3923 | old_ack: |
---|
3745 | 3924 | /* If data was SACKed, tag it and see if we should send more data. |
---|
3746 | 3925 | * If data was DSACKed, see if we can undo a cwnd reduction. |
---|
.. | .. |
---|
3748 | 3927 | if (TCP_SKB_CB(skb)->sacked) { |
---|
3749 | 3928 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
---|
3750 | 3929 | &sack_state); |
---|
3751 | | - tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, |
---|
| 3930 | + tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, |
---|
3752 | 3931 | &rexmit); |
---|
3753 | 3932 | tcp_newly_delivered(sk, delivered, flag); |
---|
3754 | 3933 | tcp_xmit_recovery(sk, rexmit); |
---|
3755 | 3934 | } |
---|
3756 | 3935 | |
---|
3757 | | - SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); |
---|
3758 | 3936 | return 0; |
---|
3759 | 3937 | } |
---|
3760 | 3938 | |
---|
.. | .. |
---|
3775 | 3953 | foc->exp = exp_opt; |
---|
3776 | 3954 | } |
---|
3777 | 3955 | |
---|
3778 | | -static void smc_parse_options(const struct tcphdr *th, |
---|
| 3956 | +static bool smc_parse_options(const struct tcphdr *th, |
---|
3779 | 3957 | struct tcp_options_received *opt_rx, |
---|
3780 | 3958 | const unsigned char *ptr, |
---|
3781 | 3959 | int opsize) |
---|
.. | .. |
---|
3784 | 3962 | if (static_branch_unlikely(&tcp_have_smc)) { |
---|
3785 | 3963 | if (th->syn && !(opsize & 1) && |
---|
3786 | 3964 | opsize >= TCPOLEN_EXP_SMC_BASE && |
---|
3787 | | - get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) |
---|
| 3965 | + get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) { |
---|
3788 | 3966 | opt_rx->smc_ok = 1; |
---|
| 3967 | + return true; |
---|
| 3968 | + } |
---|
3789 | 3969 | } |
---|
3790 | 3970 | #endif |
---|
| 3971 | + return false; |
---|
| 3972 | +} |
---|
| 3973 | + |
---|
| 3974 | +/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped |
---|
| 3975 | + * value on success. |
---|
| 3976 | + */ |
---|
| 3977 | +static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) |
---|
| 3978 | +{ |
---|
| 3979 | + const unsigned char *ptr = (const unsigned char *)(th + 1); |
---|
| 3980 | + int length = (th->doff * 4) - sizeof(struct tcphdr); |
---|
| 3981 | + u16 mss = 0; |
---|
| 3982 | + |
---|
| 3983 | + while (length > 0) { |
---|
| 3984 | + int opcode = *ptr++; |
---|
| 3985 | + int opsize; |
---|
| 3986 | + |
---|
| 3987 | + switch (opcode) { |
---|
| 3988 | + case TCPOPT_EOL: |
---|
| 3989 | + return mss; |
---|
| 3990 | + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
---|
| 3991 | + length--; |
---|
| 3992 | + continue; |
---|
| 3993 | + default: |
---|
| 3994 | + if (length < 2) |
---|
| 3995 | + return mss; |
---|
| 3996 | + opsize = *ptr++; |
---|
| 3997 | + if (opsize < 2) /* "silly options" */ |
---|
| 3998 | + return mss; |
---|
| 3999 | + if (opsize > length) |
---|
| 4000 | + return mss; /* fail on partial options */ |
---|
| 4001 | + if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { |
---|
| 4002 | + u16 in_mss = get_unaligned_be16(ptr); |
---|
| 4003 | + |
---|
| 4004 | + if (in_mss) { |
---|
| 4005 | + if (user_mss && user_mss < in_mss) |
---|
| 4006 | + in_mss = user_mss; |
---|
| 4007 | + mss = in_mss; |
---|
| 4008 | + } |
---|
| 4009 | + } |
---|
| 4010 | + ptr += opsize - 2; |
---|
| 4011 | + length -= opsize; |
---|
| 4012 | + } |
---|
| 4013 | + } |
---|
| 4014 | + return mss; |
---|
3791 | 4015 | } |
---|
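Note: the new tcp_parse_mss_option() above walks the TCP option space looking only for an MSS option and clamps it against a user-supplied ceiling, bailing out on truncated or "silly" options. The standalone userspace sketch below mirrors that walk over a raw options buffer; the buffer contents, constants, and main() are illustrative assumptions, not the kernel function itself.

/* Minimal userspace sketch of the MSS option walk shown above.
 * "opt" points at the bytes following the fixed 20-byte TCP header.
 */
#include <stdint.h>
#include <stdio.h>

#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPTLEN_MSS	4

static uint16_t parse_mss(const uint8_t *opt, int length, uint16_t user_mss)
{
	uint16_t mss = 0;

	while (length > 0) {
		int opcode = *opt++;
		int opsize;

		switch (opcode) {
		case OPT_EOL:
			return mss;
		case OPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return mss;
			opsize = *opt++;
			if (opsize < 2 || opsize > length)
				return mss;	/* silly or truncated option */
			if (opcode == OPT_MSS && opsize == OPTLEN_MSS) {
				uint16_t in_mss = (uint16_t)(opt[0] << 8 | opt[1]);

				if (in_mss) {
					if (user_mss && user_mss < in_mss)
						in_mss = user_mss;
					mss = in_mss;
				}
			}
			opt += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

int main(void)
{
	/* NOP, NOP, MSS = 1460 (0x05b4) */
	const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

	/* Clamped by the 1400-byte user_mss, so this prints mss=1400. */
	printf("mss=%u\n", (unsigned)parse_mss(opts, sizeof(opts), 1400));
	return 0;
}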
3792 | 4016 | |
---|
3793 | 4017 | /* Look for tcp options. Normally only called on SYN and SYNACK packets. |
---|
.. | .. |
---|
3805 | 4029 | |
---|
3806 | 4030 | ptr = (const unsigned char *)(th + 1); |
---|
3807 | 4031 | opt_rx->saw_tstamp = 0; |
---|
| 4032 | + opt_rx->saw_unknown = 0; |
---|
3808 | 4033 | |
---|
3809 | 4034 | while (length > 0) { |
---|
3810 | 4035 | int opcode = *ptr++; |
---|
.. | .. |
---|
3817 | 4042 | length--; |
---|
3818 | 4043 | continue; |
---|
3819 | 4044 | default: |
---|
| 4045 | + if (length < 2) |
---|
| 4046 | + return; |
---|
3820 | 4047 | opsize = *ptr++; |
---|
3821 | 4048 | if (opsize < 2) /* "silly options" */ |
---|
3822 | 4049 | return; |
---|
.. | .. |
---|
3836 | 4063 | break; |
---|
3837 | 4064 | case TCPOPT_WINDOW: |
---|
3838 | 4065 | if (opsize == TCPOLEN_WINDOW && th->syn && |
---|
3839 | | - !estab && net->ipv4.sysctl_tcp_window_scaling) { |
---|
| 4066 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) { |
---|
3840 | 4067 | __u8 snd_wscale = *(__u8 *)ptr; |
---|
3841 | 4068 | opt_rx->wscale_ok = 1; |
---|
3842 | 4069 | if (snd_wscale > TCP_MAX_WSCALE) { |
---|
.. | .. |
---|
3852 | 4079 | case TCPOPT_TIMESTAMP: |
---|
3853 | 4080 | if ((opsize == TCPOLEN_TIMESTAMP) && |
---|
3854 | 4081 | ((estab && opt_rx->tstamp_ok) || |
---|
3855 | | - (!estab && net->ipv4.sysctl_tcp_timestamps))) { |
---|
| 4082 | + (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) { |
---|
3856 | 4083 | opt_rx->saw_tstamp = 1; |
---|
3857 | 4084 | opt_rx->rcv_tsval = get_unaligned_be32(ptr); |
---|
3858 | 4085 | opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); |
---|
.. | .. |
---|
3860 | 4087 | break; |
---|
3861 | 4088 | case TCPOPT_SACK_PERM: |
---|
3862 | 4089 | if (opsize == TCPOLEN_SACK_PERM && th->syn && |
---|
3863 | | - !estab && net->ipv4.sysctl_tcp_sack) { |
---|
| 4090 | + !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) { |
---|
3864 | 4091 | opt_rx->sack_ok = TCP_SACK_SEEN; |
---|
3865 | 4092 | tcp_sack_reset(opt_rx); |
---|
3866 | 4093 | } |
---|
.. | .. |
---|
3893 | 4120 | */ |
---|
3894 | 4121 | if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && |
---|
3895 | 4122 | get_unaligned_be16(ptr) == |
---|
3896 | | - TCPOPT_FASTOPEN_MAGIC) |
---|
| 4123 | + TCPOPT_FASTOPEN_MAGIC) { |
---|
3897 | 4124 | tcp_parse_fastopen_option(opsize - |
---|
3898 | 4125 | TCPOLEN_EXP_FASTOPEN_BASE, |
---|
3899 | 4126 | ptr + 2, th->syn, foc, true); |
---|
3900 | | - else |
---|
3901 | | - smc_parse_options(th, opt_rx, ptr, |
---|
3902 | | - opsize); |
---|
| 4127 | + break; |
---|
| 4128 | + } |
---|
| 4129 | + |
---|
| 4130 | + if (smc_parse_options(th, opt_rx, ptr, opsize)) |
---|
| 4131 | + break; |
---|
| 4132 | + |
---|
| 4133 | + opt_rx->saw_unknown = 1; |
---|
3903 | 4134 | break; |
---|
3904 | 4135 | |
---|
| 4136 | + default: |
---|
| 4137 | + opt_rx->saw_unknown = 1; |
---|
3905 | 4138 | } |
---|
3906 | 4139 | ptr += opsize-2; |
---|
3907 | 4140 | length -= opsize; |
---|
.. | .. |
---|
4109 | 4342 | |
---|
4110 | 4343 | inet_csk_schedule_ack(sk); |
---|
4111 | 4344 | |
---|
4112 | | - sk->sk_shutdown |= RCV_SHUTDOWN; |
---|
| 4345 | + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); |
---|
4113 | 4346 | sock_set_flag(sk, SOCK_DONE); |
---|
4114 | 4347 | |
---|
4115 | 4348 | switch (sk->sk_state) { |
---|
.. | .. |
---|
4117 | 4350 | case TCP_ESTABLISHED: |
---|
4118 | 4351 | /* Move to CLOSE_WAIT */ |
---|
4119 | 4352 | tcp_set_state(sk, TCP_CLOSE_WAIT); |
---|
4120 | | - inet_csk(sk)->icsk_ack.pingpong = 1; |
---|
| 4353 | + inet_csk_enter_pingpong_mode(sk); |
---|
4121 | 4354 | break; |
---|
4122 | 4355 | |
---|
4123 | 4356 | case TCP_CLOSE_WAIT: |
---|
.. | .. |
---|
4189 | 4422 | { |
---|
4190 | 4423 | struct tcp_sock *tp = tcp_sk(sk); |
---|
4191 | 4424 | |
---|
4192 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
---|
| 4425 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
---|
4193 | 4426 | int mib_idx; |
---|
4194 | 4427 | |
---|
4195 | 4428 | if (before(seq, tp->rcv_nxt)) |
---|
.. | .. |
---|
4215 | 4448 | tcp_sack_extend(tp->duplicate_sack, seq, end_seq); |
---|
4216 | 4449 | } |
---|
4217 | 4450 | |
---|
| 4451 | +static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) |
---|
| 4452 | +{ |
---|
| 4453 | + /* When the ACK path fails or drops most ACKs, the sender would 
---|
| 4454 | + * time out and spuriously retransmit the same segment repeatedly. 
---|
| 4455 | + * The receiver remembers and reflects via DSACKs. Leverage the |
---|
| 4456 | + * DSACK state and change the txhash to re-route speculatively. |
---|
| 4457 | + */ |
---|
| 4458 | + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && |
---|
| 4459 | + sk_rethink_txhash(sk)) |
---|
| 4460 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); |
---|
| 4461 | +} |
---|
| 4462 | + |
---|
4218 | 4463 | static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) |
---|
4219 | 4464 | { |
---|
4220 | 4465 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
4224 | 4469 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
---|
4225 | 4470 | tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
---|
4226 | 4471 | |
---|
4227 | | - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { |
---|
| 4472 | + if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) { |
---|
4228 | 4473 | u32 end_seq = TCP_SKB_CB(skb)->end_seq; |
---|
4229 | 4474 | |
---|
| 4475 | + tcp_rcv_spurious_retrans(sk, skb); |
---|
4230 | 4476 | if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) |
---|
4231 | 4477 | end_seq = tp->rcv_nxt; |
---|
4232 | 4478 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); |
---|
.. | .. |
---|
4260 | 4506 | sp[i] = sp[i + 1]; |
---|
4261 | 4507 | continue; |
---|
4262 | 4508 | } |
---|
4263 | | - this_sack++, swalk++; |
---|
| 4509 | + this_sack++; |
---|
| 4510 | + swalk++; |
---|
4264 | 4511 | } |
---|
4265 | 4512 | } |
---|
| 4513 | + |
---|
| 4514 | +static void tcp_sack_compress_send_ack(struct sock *sk) |
---|
| 4515 | +{ |
---|
| 4516 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 4517 | + |
---|
| 4518 | + if (!tp->compressed_ack) |
---|
| 4519 | + return; |
---|
| 4520 | + |
---|
| 4521 | + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) |
---|
| 4522 | + __sock_put(sk); |
---|
| 4523 | + |
---|
| 4524 | + /* Since we have to send one ACK eventually, 
---|
| 4525 | + * subtract one from tp->compressed_ack to keep 
---|
| 4526 | + * LINUX_MIB_TCPACKCOMPRESSED accurate. |
---|
| 4527 | + */ |
---|
| 4528 | + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
---|
| 4529 | + tp->compressed_ack - 1); |
---|
| 4530 | + |
---|
| 4531 | + tp->compressed_ack = 0; |
---|
| 4532 | + tcp_send_ack(sk); |
---|
| 4533 | +} |
---|
| 4534 | + |
---|
| 4535 | +/* Reasonable number of SACK blocks included in the TCP SACK option. 
---|
| 4536 | + * The max is 4, but this becomes 3 if TCP timestamps are in use. 
---|
| 4537 | + * Given that SACK packets might be lost, be conservative and use 2. 
---|
| 4538 | + */ |
---|
| 4539 | +#define TCP_SACK_BLOCKS_EXPECTED 2 |
---|
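Note: TCP_SACK_BLOCKS_EXPECTED comes from straightforward option-space arithmetic: 40 bytes of TCP options minus 12 for a padded timestamp option leaves room for 3 SACK blocks (2 bytes of SACK kind/length plus 8 bytes per block), or 4 without timestamps, and the code plans conservatively for 2. A quick check of that arithmetic:

/* Option-space arithmetic behind TCP_SACK_BLOCKS_EXPECTED. */
#include <stdio.h>

int main(void)
{
	const int tcp_opt_space = 40;	/* max TCP header option bytes */
	const int tstamp_aligned = 12;	/* timestamp option padded to 12 bytes */
	const int sack_base = 2;	/* kind + length of the SACK option */
	const int sack_perblock = 8;	/* two 32-bit sequence numbers */

	int with_ts = (tcp_opt_space - tstamp_aligned - sack_base) / sack_perblock;
	int without_ts = (tcp_opt_space - sack_base) / sack_perblock;

	printf("SACK blocks: %d with timestamps, %d without\n", with_ts, without_ts);
	return 0;
}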
4266 | 4540 | |
---|
4267 | 4541 | static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) |
---|
4268 | 4542 | { |
---|
.. | .. |
---|
4276 | 4550 | |
---|
4277 | 4551 | for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { |
---|
4278 | 4552 | if (tcp_sack_extend(sp, seq, end_seq)) { |
---|
| 4553 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
---|
| 4554 | + tcp_sack_compress_send_ack(sk); |
---|
4279 | 4555 | /* Rotate this_sack to the first one. */ |
---|
4280 | 4556 | for (; this_sack > 0; this_sack--, sp--) |
---|
4281 | 4557 | swap(*sp, *(sp - 1)); |
---|
.. | .. |
---|
4285 | 4561 | } |
---|
4286 | 4562 | } |
---|
4287 | 4563 | |
---|
| 4564 | + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) |
---|
| 4565 | + tcp_sack_compress_send_ack(sk); |
---|
| 4566 | + |
---|
4288 | 4567 | /* Could not find an adjacent existing SACK, build a new one, |
---|
4289 | 4568 | * put it at the front, and shift everyone else down. We |
---|
4290 | 4569 | * always know there is at least one SACK present already here. |
---|
.. | .. |
---|
4292 | 4571 | * If the sack array is full, forget about the last one. |
---|
4293 | 4572 | */ |
---|
4294 | 4573 | if (this_sack >= TCP_NUM_SACKS) { |
---|
4295 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
---|
4296 | | - tcp_send_ack(sk); |
---|
4297 | 4574 | this_sack--; |
---|
4298 | 4575 | tp->rx_opt.num_sacks--; |
---|
4299 | 4576 | sp--; |
---|
.. | .. |
---|
4345 | 4622 | /** |
---|
4346 | 4623 | * tcp_try_coalesce - try to merge skb to prior one |
---|
4347 | 4624 | * @sk: socket |
---|
4348 | | - * @dest: destination queue |
---|
4349 | 4625 | * @to: prior buffer |
---|
4350 | 4626 | * @from: buffer to add in queue |
---|
4351 | 4627 | * @fragstolen: pointer to boolean |
---|
.. | .. |
---|
4367 | 4643 | |
---|
4368 | 4644 | /* It's possible this segment overlaps with the prior segment in the queue */ 
---|
4369 | 4645 | if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) |
---|
| 4646 | + return false; |
---|
| 4647 | + |
---|
| 4648 | + if (!mptcp_skb_can_collapse(to, from)) |
---|
4370 | 4649 | return false; |
---|
4371 | 4650 | |
---|
4372 | 4651 | #ifdef CONFIG_TLS_DEVICE |
---|
.. | .. |
---|
4412 | 4691 | |
---|
4413 | 4692 | static void tcp_drop(struct sock *sk, struct sk_buff *skb) |
---|
4414 | 4693 | { |
---|
| 4694 | + trace_android_vh_kfree_skb(skb); |
---|
4415 | 4695 | sk_drops_add(sk, skb); |
---|
4416 | 4696 | __kfree_skb(skb); |
---|
4417 | 4697 | } |
---|
.. | .. |
---|
4443 | 4723 | rb_erase(&skb->rbnode, &tp->out_of_order_queue); |
---|
4444 | 4724 | |
---|
4445 | 4725 | if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { |
---|
4446 | | - SOCK_DEBUG(sk, "ofo packet was already received\n"); |
---|
4447 | 4726 | tcp_drop(sk, skb); |
---|
4448 | 4727 | continue; |
---|
4449 | 4728 | } |
---|
4450 | | - SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", |
---|
4451 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
---|
4452 | | - TCP_SKB_CB(skb)->end_seq); |
---|
4453 | 4729 | |
---|
4454 | 4730 | tail = skb_peek_tail(&sk->sk_receive_queue); |
---|
4455 | 4731 | eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
---|
.. | .. |
---|
4511 | 4787 | tp->pred_flags = 0; |
---|
4512 | 4788 | inet_csk_schedule_ack(sk); |
---|
4513 | 4789 | |
---|
| 4790 | + tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); |
---|
4514 | 4791 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); |
---|
4515 | 4792 | seq = TCP_SKB_CB(skb)->seq; |
---|
4516 | 4793 | end_seq = TCP_SKB_CB(skb)->end_seq; |
---|
4517 | | - SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", |
---|
4518 | | - tp->rcv_nxt, seq, end_seq); |
---|
4519 | 4794 | |
---|
4520 | 4795 | p = &tp->out_of_order_queue.rb_node; |
---|
4521 | 4796 | if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { |
---|
.. | .. |
---|
4541 | 4816 | * and trigger fast retransmit. |
---|
4542 | 4817 | */ |
---|
4543 | 4818 | if (tcp_is_sack(tp)) |
---|
4544 | | - tcp_grow_window(sk, skb); |
---|
| 4819 | + tcp_grow_window(sk, skb, true); |
---|
4545 | 4820 | kfree_skb_partial(skb, fragstolen); |
---|
4546 | 4821 | skb = NULL; |
---|
4547 | 4822 | goto add_sack; |
---|
.. | .. |
---|
4629 | 4904 | * and trigger fast retransmit. |
---|
4630 | 4905 | */ |
---|
4631 | 4906 | if (tcp_is_sack(tp)) |
---|
4632 | | - tcp_grow_window(sk, skb); |
---|
| 4907 | + tcp_grow_window(sk, skb, false); |
---|
4633 | 4908 | skb_condense(skb); |
---|
4634 | 4909 | skb_set_owner_r(skb, sk); |
---|
4635 | 4910 | } |
---|
4636 | 4911 | } |
---|
4637 | 4912 | |
---|
4638 | | -static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen, |
---|
4639 | | - bool *fragstolen) |
---|
| 4913 | +static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, |
---|
| 4914 | + bool *fragstolen) |
---|
4640 | 4915 | { |
---|
4641 | 4916 | int eaten; |
---|
4642 | 4917 | struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); |
---|
4643 | 4918 | |
---|
4644 | | - __skb_pull(skb, hdrlen); |
---|
4645 | 4919 | eaten = (tail && |
---|
4646 | 4920 | tcp_try_coalesce(sk, tail, |
---|
4647 | 4921 | skb, fragstolen)) ? 1 : 0; |
---|
.. | .. |
---|
4692 | 4966 | TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; |
---|
4693 | 4967 | TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; |
---|
4694 | 4968 | |
---|
4695 | | - if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
---|
| 4969 | + if (tcp_queue_rcv(sk, skb, &fragstolen)) { |
---|
4696 | 4970 | WARN_ON_ONCE(fragstolen); /* should not happen */ |
---|
4697 | 4971 | __kfree_skb(skb); |
---|
4698 | 4972 | } |
---|
.. | .. |
---|
4724 | 4998 | bool fragstolen; |
---|
4725 | 4999 | int eaten; |
---|
4726 | 5000 | |
---|
| 5001 | + if (sk_is_mptcp(sk)) |
---|
| 5002 | + mptcp_incoming_options(sk, skb); |
---|
| 5003 | + |
---|
4727 | 5004 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { |
---|
4728 | 5005 | __kfree_skb(skb); |
---|
4729 | 5006 | return; |
---|
.. | .. |
---|
4753 | 5030 | goto drop; |
---|
4754 | 5031 | } |
---|
4755 | 5032 | |
---|
4756 | | - eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); |
---|
| 5033 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
---|
4757 | 5034 | if (skb->len) |
---|
4758 | 5035 | tcp_event_data_recv(sk, skb); |
---|
4759 | 5036 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
---|
.. | .. |
---|
4782 | 5059 | } |
---|
4783 | 5060 | |
---|
4784 | 5061 | if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { |
---|
| 5062 | + tcp_rcv_spurious_retrans(sk, skb); |
---|
4785 | 5063 | /* A retransmit, 2nd most common case. Force an immediate ack. */ |
---|
4786 | 5064 | NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); |
---|
4787 | 5065 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); |
---|
.. | .. |
---|
4800 | 5078 | |
---|
4801 | 5079 | if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
---|
4802 | 5080 | /* Partial packet, seq < rcv_next < end_seq */ |
---|
4803 | | - SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", |
---|
4804 | | - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, |
---|
4805 | | - TCP_SKB_CB(skb)->end_seq); |
---|
4806 | | - |
---|
4807 | 5081 | tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); |
---|
4808 | 5082 | |
---|
4809 | 5083 | /* If window is closed, drop tail of packet. But after |
---|
.. | .. |
---|
4897 | 5171 | /* The first skb to collapse is: |
---|
4898 | 5172 | * - not SYN/FIN and |
---|
4899 | 5173 | * - bloated or contains data before "start" or |
---|
4900 | | - * overlaps to the next one. |
---|
| 5174 | + * overlaps the next one and MPTCP allows collapsing. 
---|
4901 | 5175 | */ |
---|
4902 | 5176 | if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
---|
4903 | 5177 | (tcp_win_from_space(sk, skb->truesize) > skb->len || |
---|
.. | .. |
---|
4906 | 5180 | break; |
---|
4907 | 5181 | } |
---|
4908 | 5182 | |
---|
4909 | | - if (n && n != tail && |
---|
| 5183 | + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && |
---|
4910 | 5184 | TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { |
---|
4911 | 5185 | end_of_skbs = false; |
---|
4912 | 5186 | break; |
---|
.. | .. |
---|
4939 | 5213 | else |
---|
4940 | 5214 | __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ |
---|
4941 | 5215 | skb_set_owner_r(nskb, sk); |
---|
| 5216 | + mptcp_skb_ext_move(nskb, skb); |
---|
4942 | 5217 | |
---|
4943 | 5218 | /* Copy data, releasing collapsed skbs. */ |
---|
4944 | 5219 | while (copy > 0) { |
---|
.. | .. |
---|
4958 | 5233 | skb = tcp_collapse_one(sk, skb, list, root); |
---|
4959 | 5234 | if (!skb || |
---|
4960 | 5235 | skb == tail || |
---|
| 5236 | + !mptcp_skb_can_collapse(nskb, skb) || |
---|
4961 | 5237 | (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
---|
4962 | 5238 | goto end; |
---|
4963 | 5239 | #ifdef CONFIG_TLS_DEVICE |
---|
.. | .. |
---|
5082 | 5358 | { |
---|
5083 | 5359 | struct tcp_sock *tp = tcp_sk(sk); |
---|
5084 | 5360 | |
---|
5085 | | - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); |
---|
5086 | | - |
---|
5087 | 5361 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); |
---|
5088 | 5362 | |
---|
5089 | 5363 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
---|
.. | .. |
---|
5149 | 5423 | return true; |
---|
5150 | 5424 | } |
---|
5151 | 5425 | |
---|
5152 | | -/* When incoming ACK allowed to free some skb from write_queue, |
---|
5153 | | - * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket |
---|
5154 | | - * on the exit from tcp input handler. |
---|
5155 | | - * |
---|
5156 | | - * PROBLEM: sndbuf expansion does not work well with largesend. |
---|
5157 | | - */ |
---|
5158 | 5426 | static void tcp_new_space(struct sock *sk) |
---|
5159 | 5427 | { |
---|
5160 | 5428 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
5167 | 5435 | sk->sk_write_space(sk); |
---|
5168 | 5436 | } |
---|
5169 | 5437 | |
---|
5170 | | -static void tcp_check_space(struct sock *sk) |
---|
| 5438 | +/* Caller made space either from: |
---|
| 5439 | + * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced) |
---|
| 5440 | + * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt) |
---|
| 5441 | + * |
---|
| 5442 | + * We might be able to generate EPOLLOUT to the application if: |
---|
| 5443 | + * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2 |
---|
| 5444 | + * 2) notsent amount (tp->write_seq - tp->snd_nxt) became |
---|
| 5445 | + * small enough that tcp_stream_memory_free() decides it |
---|
| 5446 | + * is time to generate EPOLLOUT. |
---|
| 5447 | + */ |
---|
| 5448 | +void tcp_check_space(struct sock *sk) |
---|
5171 | 5449 | { |
---|
5172 | | - if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { |
---|
5173 | | - sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); |
---|
5174 | | - /* pairs with tcp_poll() */ |
---|
5175 | | - smp_mb(); |
---|
5176 | | - if (sk->sk_socket && |
---|
5177 | | - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
---|
5178 | | - tcp_new_space(sk); |
---|
5179 | | - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
---|
5180 | | - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
---|
5181 | | - } |
---|
| 5450 | + /* pairs with tcp_poll() */ |
---|
| 5451 | + smp_mb(); |
---|
| 5452 | + if (sk->sk_socket && |
---|
| 5453 | + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { |
---|
| 5454 | + tcp_new_space(sk); |
---|
| 5455 | + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) |
---|
| 5456 | + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); |
---|
5182 | 5457 | } |
---|
5183 | 5458 | } |
---|
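Note: the new comment block above spells out when tcp_check_space() may generate EPOLLOUT: once space consumed in the output/rtx queues drops below half of sk_sndbuf, or the unsent backlog shrinks enough for tcp_stream_memory_free(). From the application side this is the usual "fill until EAGAIN, then wait for EPOLLOUT" pattern; the self-contained sketch below uses a local socketpair instead of a real TCP connection purely so it runs anywhere, which is an assumption of the example, not of the kernel code.

/* Application-side view of tcp_check_space(): after filling the socket's
 * send buffer, wait for EPOLLOUT, which the kernel raises once enough
 * queued data has been consumed/acked and space is back under the limit.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	char buf[4096];

	memset(buf, 'x', sizeof(buf));
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	fcntl(sv[0], F_SETFL, O_NONBLOCK);
	fcntl(sv[1], F_SETFL, O_NONBLOCK);

	/* Fill the send buffer until the kernel reports no space (EAGAIN). */
	while (write(sv[0], buf, sizeof(buf)) > 0)
		;

	int ep = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLOUT, .data.fd = sv[0] };

	epoll_ctl(ep, EPOLL_CTL_ADD, sv[0], &ev);

	/* Drain the peer in chunks; EPOLLOUT fires once enough room is back. */
	for (;;) {
		if (epoll_wait(ep, &ev, 1, 0) == 1) {
			printf("EPOLLOUT: socket is writable again\n");
			break;
		}
		if (read(sv[1], buf, sizeof(buf)) <= 0)
			break;	/* fully drained without an event */
	}
	close(sv[0]);
	close(sv[1]);
	close(ep);
	return 0;
}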
5184 | 5459 | |
---|
.. | .. |
---|
5220 | 5495 | } |
---|
5221 | 5496 | |
---|
5222 | 5497 | if (!tcp_is_sack(tp) || |
---|
5223 | | - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) |
---|
| 5498 | + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) |
---|
5224 | 5499 | goto send_now; |
---|
5225 | 5500 | |
---|
5226 | 5501 | if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { |
---|
5227 | 5502 | tp->compressed_ack_rcv_nxt = tp->rcv_nxt; |
---|
5228 | | - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) |
---|
5229 | | - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, |
---|
5230 | | - tp->compressed_ack - TCP_FASTRETRANS_THRESH); |
---|
5231 | | - tp->compressed_ack = 0; |
---|
| 5503 | + tp->dup_ack_counter = 0; |
---|
5232 | 5504 | } |
---|
5233 | | - |
---|
5234 | | - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) |
---|
| 5505 | + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { |
---|
| 5506 | + tp->dup_ack_counter++; |
---|
5235 | 5507 | goto send_now; |
---|
5236 | | - |
---|
| 5508 | + } |
---|
| 5509 | + tp->compressed_ack++; |
---|
5237 | 5510 | if (hrtimer_is_queued(&tp->compressed_ack_timer)) |
---|
5238 | 5511 | return; |
---|
5239 | 5512 | |
---|
.. | .. |
---|
5243 | 5516 | if (tp->srtt_us && tp->srtt_us < rtt) |
---|
5244 | 5517 | rtt = tp->srtt_us; |
---|
5245 | 5518 | |
---|
5246 | | - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, |
---|
| 5519 | + delay = min_t(unsigned long, |
---|
| 5520 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), |
---|
5247 | 5521 | rtt * (NSEC_PER_USEC >> 3)/20); |
---|
5248 | 5522 | sock_hold(sk); |
---|
5249 | | - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), |
---|
5250 | | - HRTIMER_MODE_REL_PINNED_SOFT); |
---|
| 5523 | + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), |
---|
| 5524 | + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), |
---|
| 5525 | + HRTIMER_MODE_REL_PINNED_SOFT); |
---|
5251 | 5526 | } |
---|
5252 | 5527 | |
---|
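Note: the hrtimer arming above delays a compressed ACK by min(tcp_comp_sack_delay_ns, ~5% of the RTT). The rtt value here is the RTT estimate in microseconds scaled by 8, so multiplying by (NSEC_PER_USEC >> 3) converts it back to nanoseconds before the divide by 20. A quick worked check of that conversion; the sample RTTs and the 1 ms default delay are assumptions for illustration.

/* Delay used for ACK compression: min(default delay, srtt/20). */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	uint64_t comp_sack_delay_ns = 1000000;	/* 1 ms, a common default */
	uint64_t srtt_us_samples[] = { 100, 2000, 50000 };

	for (int i = 0; i < 3; i++) {
		uint64_t rtt = srtt_us_samples[i] << 3;		/* stored <<3 */
		uint64_t rtt_ns = rtt * (NSEC_PER_USEC >> 3);	/* back to ns */
		uint64_t delay = rtt_ns / 20;			/* ~5% of RTT */

		if (delay > comp_sack_delay_ns)
			delay = comp_sack_delay_ns;
		printf("srtt=%lluus -> compressed-ACK delay=%lluns\n",
		       (unsigned long long)srtt_us_samples[i],
		       (unsigned long long)delay);
	}
	return 0;
}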
5253 | 5528 | static inline void tcp_ack_snd_check(struct sock *sk) |
---|
.. | .. |
---|
5274 | 5549 | struct tcp_sock *tp = tcp_sk(sk); |
---|
5275 | 5550 | u32 ptr = ntohs(th->urg_ptr); |
---|
5276 | 5551 | |
---|
5277 | | - if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) |
---|
| 5552 | + if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg)) |
---|
5278 | 5553 | ptr--; |
---|
5279 | 5554 | ptr += ntohl(th->seq); |
---|
5280 | 5555 | |
---|
.. | .. |
---|
5328 | 5603 | } |
---|
5329 | 5604 | |
---|
5330 | 5605 | tp->urg_data = TCP_URG_NOTYET; |
---|
5331 | | - tp->urg_seq = ptr; |
---|
| 5606 | + WRITE_ONCE(tp->urg_seq, ptr); |
---|
5332 | 5607 | |
---|
5333 | 5608 | /* Disable header prediction. */ |
---|
5334 | 5609 | tp->pred_flags = 0; |
---|
.. | .. |
---|
5481 | 5756 | goto discard; |
---|
5482 | 5757 | } |
---|
5483 | 5758 | |
---|
| 5759 | + bpf_skops_parse_hdr(sk, skb); |
---|
| 5760 | + |
---|
5484 | 5761 | return true; |
---|
5485 | 5762 | |
---|
5486 | 5763 | discard: |
---|
.. | .. |
---|
5521 | 5798 | trace_tcp_probe(sk, skb); |
---|
5522 | 5799 | |
---|
5523 | 5800 | tcp_mstamp_refresh(tp); |
---|
5524 | | - if (unlikely(!sk->sk_rx_dst)) |
---|
| 5801 | + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) |
---|
5525 | 5802 | inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); |
---|
5526 | 5803 | /* |
---|
5527 | 5804 | * Header prediction. |
---|
.. | .. |
---|
5628 | 5905 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); |
---|
5629 | 5906 | |
---|
5630 | 5907 | /* Bulk data transfer: receiver */ |
---|
5631 | | - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, |
---|
5632 | | - &fragstolen); |
---|
| 5908 | + __skb_pull(skb, tcp_header_len); |
---|
| 5909 | + eaten = tcp_queue_rcv(sk, skb, &fragstolen); |
---|
5633 | 5910 | |
---|
5634 | 5911 | tcp_event_data_recv(sk, skb); |
---|
5635 | 5912 | |
---|
.. | .. |
---|
5691 | 5968 | } |
---|
5692 | 5969 | EXPORT_SYMBOL(tcp_rcv_established); |
---|
5693 | 5970 | |
---|
| 5971 | +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) |
---|
| 5972 | +{ |
---|
| 5973 | + struct inet_connection_sock *icsk = inet_csk(sk); |
---|
| 5974 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 5975 | + |
---|
| 5976 | + tcp_mtup_init(sk); |
---|
| 5977 | + icsk->icsk_af_ops->rebuild_header(sk); |
---|
| 5978 | + tcp_init_metrics(sk); |
---|
| 5979 | + |
---|
| 5980 | + /* Initialize the congestion window to start the transfer. |
---|
| 5981 | + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 
---|
| 5982 | + * retransmitted. In light of RFC6298's more aggressive 1-second 
---|
| 5983 | + * initRTO, we only reset cwnd when more than one SYN/SYN-ACK 
---|
| 5984 | + * retransmission has occurred. 
---|
| 5985 | + */ |
---|
| 5986 | + if (tp->total_retrans > 1 && tp->undo_marker) |
---|
| 5987 | + tp->snd_cwnd = 1; |
---|
| 5988 | + else |
---|
| 5989 | + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); |
---|
| 5990 | + tp->snd_cwnd_stamp = tcp_jiffies32; |
---|
| 5991 | + |
---|
| 5992 | + bpf_skops_established(sk, bpf_op, skb); |
---|
| 5993 | + /* Initialize congestion control unless BPF initialized it already: */ |
---|
| 5994 | + if (!icsk->icsk_ca_initialized) |
---|
| 5995 | + tcp_init_congestion_control(sk); |
---|
| 5996 | + tcp_init_buffer_space(sk); |
---|
| 5997 | +} |
---|
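Note: the new tcp_init_transfer() above picks the initial congestion window when a connection enters ESTABLISHED: the normal initial cwnd unless more than one SYN/SYN-ACK retransmission occurred and was not proven spurious (undo_marker still set), in which case it falls back to 1 segment per RFC 5681. A toy encoding of just that decision; the function name, parameters, and the IW10 value are illustrative, not kernel API.

/* The initial-cwnd decision made when a connection enters ESTABLISHED. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int initial_cwnd(unsigned int total_retrans, bool undo_marker,
				 unsigned int init_cwnd)
{
	/* More than one SYN/SYN-ACK retransmission that was not proven
	 * spurious: be conservative and restart from 1 segment (RFC 5681).
	 */
	if (total_retrans > 1 && undo_marker)
		return 1;
	return init_cwnd;	/* e.g. IW10 from tcp_init_cwnd() */
}

int main(void)
{
	printf("%u\n", initial_cwnd(0, false, 10));	/* 10 */
	printf("%u\n", initial_cwnd(2, true, 10));	/* 1  */
	printf("%u\n", initial_cwnd(2, false, 10));	/* 10: retransmit proven spurious */
	return 0;
}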
| 5998 | + |
---|
5694 | 5999 | void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) |
---|
5695 | 6000 | { |
---|
5696 | 6001 | struct tcp_sock *tp = tcp_sk(sk); |
---|
.. | .. |
---|
5705 | 6010 | sk_mark_napi_id(sk, skb); |
---|
5706 | 6011 | } |
---|
5707 | 6012 | |
---|
5708 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); |
---|
| 6013 | + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb); |
---|
5709 | 6014 | |
---|
5710 | 6015 | /* Prevent spurious tcp_cwnd_restart() on first data |
---|
5711 | 6016 | * packet. |
---|
.. | .. |
---|
5760 | 6065 | tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); |
---|
5761 | 6066 | |
---|
5762 | 6067 | if (data) { /* Retransmit unacked data in SYN */ |
---|
| 6068 | + if (tp->total_retrans) |
---|
| 6069 | + tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; |
---|
| 6070 | + else |
---|
| 6071 | + tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; |
---|
5763 | 6072 | skb_rbtree_walk_from(data) { |
---|
5764 | 6073 | if (__tcp_retransmit_skb(sk, data, 1)) |
---|
5765 | 6074 | break; |
---|
.. | .. |
---|
5792 | 6101 | #endif |
---|
5793 | 6102 | } |
---|
5794 | 6103 | |
---|
| 6104 | +static void tcp_try_undo_spurious_syn(struct sock *sk) |
---|
| 6105 | +{ |
---|
| 6106 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 6107 | + u32 syn_stamp; |
---|
| 6108 | + |
---|
| 6109 | + /* undo_marker is set when SYN or SYNACK times out. The timeout is |
---|
| 6110 | + * spurious if the ACK's timestamp option echo value matches the |
---|
| 6111 | + * original SYN timestamp. |
---|
| 6112 | + */ |
---|
| 6113 | + syn_stamp = tp->retrans_stamp; |
---|
| 6114 | + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && |
---|
| 6115 | + syn_stamp == tp->rx_opt.rcv_tsecr) |
---|
| 6116 | + tp->undo_marker = 0; |
---|
| 6117 | +} |
---|
| 6118 | + |
---|
5795 | 6119 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
---|
5796 | 6120 | const struct tcphdr *th) |
---|
5797 | 6121 | { |
---|
.. | .. |
---|
5815 | 6139 | * the segment and return)" |
---|
5816 | 6140 | */ |
---|
5817 | 6141 | if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || |
---|
5818 | | - after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) |
---|
| 6142 | + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { |
---|
| 6143 | + /* Previous FIN/ACK or RST/ACK might be ignored. */ |
---|
| 6144 | + if (icsk->icsk_retransmits == 0) |
---|
| 6145 | + inet_csk_reset_xmit_timer(sk, |
---|
| 6146 | + ICSK_TIME_RETRANS, |
---|
| 6147 | + TCP_TIMEOUT_MIN, TCP_RTO_MAX); |
---|
5819 | 6148 | goto reset_and_undo; |
---|
| 6149 | + } |
---|
5820 | 6150 | |
---|
5821 | 6151 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
---|
5822 | 6152 | !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, |
---|
.. | .. |
---|
5859 | 6189 | tcp_ecn_rcv_synack(tp, th); |
---|
5860 | 6190 | |
---|
5861 | 6191 | tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); |
---|
| 6192 | + tcp_try_undo_spurious_syn(sk); |
---|
5862 | 6193 | tcp_ack(sk, skb, FLAG_SLOWPATH); |
---|
5863 | 6194 | |
---|
5864 | 6195 | /* Ok.. it's good. Set up sequence numbers and |
---|
.. | .. |
---|
5912 | 6243 | return -1; |
---|
5913 | 6244 | if (sk->sk_write_pending || |
---|
5914 | 6245 | icsk->icsk_accept_queue.rskq_defer_accept || |
---|
5915 | | - icsk->icsk_ack.pingpong) { |
---|
| 6246 | + inet_csk_in_pingpong_mode(sk)) { |
---|
5916 | 6247 | /* Save one ACK. Data will be ready after |
---|
5917 | 6248 | * several ticks, if write_pending is set. |
---|
5918 | 6249 | * |
---|
.. | .. |
---|
6017 | 6348 | return 1; |
---|
6018 | 6349 | } |
---|
6019 | 6350 | |
---|
| 6351 | +static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) |
---|
| 6352 | +{ |
---|
| 6353 | + struct request_sock *req; |
---|
| 6354 | + |
---|
| 6355 | + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows |
---|
| 6356 | + * undo. If peer SACKs triggered fast recovery, we can't undo here. |
---|
| 6357 | + */ |
---|
| 6358 | + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) |
---|
| 6359 | + tcp_try_undo_loss(sk, false); |
---|
| 6360 | + |
---|
| 6361 | + /* Reset rtx states to prevent spurious retransmits_timed_out() */ |
---|
| 6362 | + tcp_sk(sk)->retrans_stamp = 0; |
---|
| 6363 | + inet_csk(sk)->icsk_retransmits = 0; |
---|
| 6364 | + |
---|
| 6365 | + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, |
---|
| 6366 | + * we no longer need req so release it. |
---|
| 6367 | + */ |
---|
| 6368 | + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, |
---|
| 6369 | + lockdep_sock_is_held(sk)); |
---|
| 6370 | + reqsk_fastopen_remove(sk, req, false); |
---|
| 6371 | + |
---|
| 6372 | + /* Re-arm the timer because data may have been sent out. |
---|
| 6373 | + * This is similar to the regular data transmission case |
---|
| 6374 | + * when new data has just been ack'ed. |
---|
| 6375 | + * |
---|
| 6376 | + * (TFO) - we could try to be more aggressive and 
---|
| 6377 | + * retransmit any data sooner based on when it 
---|
| 6378 | + * was sent out. 
---|
| 6379 | + */ |
---|
| 6380 | + tcp_rearm_rto(sk); |
---|
| 6381 | +} |
---|
| 6382 | + |
---|
6020 | 6383 | /* |
---|
6021 | 6384 | * This function implements the receiving procedure of RFC 793 for |
---|
6022 | 6385 | * all states except ESTABLISHED and TIME_WAIT. |
---|
.. | .. |
---|
6079 | 6442 | |
---|
6080 | 6443 | tcp_mstamp_refresh(tp); |
---|
6081 | 6444 | tp->rx_opt.saw_tstamp = 0; |
---|
6082 | | - req = tp->fastopen_rsk; |
---|
| 6445 | + req = rcu_dereference_protected(tp->fastopen_rsk, |
---|
| 6446 | + lockdep_sock_is_held(sk)); |
---|
6083 | 6447 | if (req) { |
---|
6084 | 6448 | bool req_stolen; |
---|
6085 | 6449 | |
---|
.. | .. |
---|
6113 | 6477 | if (!tp->srtt_us) |
---|
6114 | 6478 | tcp_synack_rtt_meas(sk, req); |
---|
6115 | 6479 | |
---|
6116 | | - /* Once we leave TCP_SYN_RECV, we no longer need req |
---|
6117 | | - * so release it. |
---|
6118 | | - */ |
---|
6119 | 6480 | if (req) { |
---|
6120 | | - inet_csk(sk)->icsk_retransmits = 0; |
---|
6121 | | - reqsk_fastopen_remove(sk, req, false); |
---|
6122 | | - /* Re-arm the timer because data may have been sent out. |
---|
6123 | | - * This is similar to the regular data transmission case |
---|
6124 | | - * when new data has just been ack'ed. |
---|
6125 | | - * |
---|
6126 | | - * (TFO) - we could try to be more aggressive and |
---|
6127 | | - * retransmitting any data sooner based on when they |
---|
6128 | | - * are sent out. |
---|
6129 | | - */ |
---|
6130 | | - tcp_rearm_rto(sk); |
---|
| 6481 | + tcp_rcv_synrecv_state_fastopen(sk); |
---|
6131 | 6482 | } else { |
---|
6132 | | - tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); |
---|
| 6483 | + tcp_try_undo_spurious_syn(sk); |
---|
| 6484 | + tp->retrans_stamp = 0; |
---|
| 6485 | + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, |
---|
| 6486 | + skb); |
---|
6133 | 6487 | WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); |
---|
6134 | 6488 | } |
---|
6135 | 6489 | smp_mb(); |
---|
.. | .. |
---|
6163 | 6517 | case TCP_FIN_WAIT1: { |
---|
6164 | 6518 | int tmo; |
---|
6165 | 6519 | |
---|
6166 | | - /* If we enter the TCP_FIN_WAIT1 state and we are a |
---|
6167 | | - * Fast Open socket and this is the first acceptable |
---|
6168 | | - * ACK we have received, this would have acknowledged |
---|
6169 | | - * our SYNACK so stop the SYNACK timer. |
---|
6170 | | - */ |
---|
6171 | | - if (req) { |
---|
6172 | | - /* We no longer need the request sock. */ |
---|
6173 | | - reqsk_fastopen_remove(sk, req, false); |
---|
6174 | | - tcp_rearm_rto(sk); |
---|
6175 | | - } |
---|
| 6520 | + if (req) |
---|
| 6521 | + tcp_rcv_synrecv_state_fastopen(sk); |
---|
| 6522 | + |
---|
6176 | 6523 | if (tp->snd_una != tp->write_seq) |
---|
6177 | 6524 | break; |
---|
6178 | 6525 | |
---|
6179 | 6526 | tcp_set_state(sk, TCP_FIN_WAIT2); |
---|
6180 | | - sk->sk_shutdown |= SEND_SHUTDOWN; |
---|
| 6527 | + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); |
---|
6181 | 6528 | |
---|
6182 | 6529 | sk_dst_confirm(sk); |
---|
6183 | 6530 | |
---|
.. | .. |
---|
6244 | 6591 | case TCP_CLOSE_WAIT: |
---|
6245 | 6592 | case TCP_CLOSING: |
---|
6246 | 6593 | case TCP_LAST_ACK: |
---|
6247 | | - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) |
---|
| 6594 | + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { |
---|
| 6595 | + if (sk_is_mptcp(sk)) |
---|
| 6596 | + mptcp_incoming_options(sk, skb); |
---|
6248 | 6597 | break; |
---|
6249 | | - /* fall through */ |
---|
| 6598 | + } |
---|
| 6599 | + fallthrough; |
---|
6250 | 6600 | case TCP_FIN_WAIT1: |
---|
6251 | 6601 | case TCP_FIN_WAIT2: |
---|
6252 | 6602 | /* RFC 793 says to queue data in these states, |
---|
.. | .. |
---|
6261 | 6611 | return 1; |
---|
6262 | 6612 | } |
---|
6263 | 6613 | } |
---|
6264 | | - /* Fall through */ |
---|
| 6614 | + fallthrough; |
---|
6265 | 6615 | case TCP_ESTABLISHED: |
---|
6266 | 6616 | tcp_data_queue(sk, skb); |
---|
6267 | 6617 | queued = 1; |
---|
.. | .. |
---|
6307 | 6657 | * congestion control: Linux DCTCP asserts ECT on all packets, |
---|
6308 | 6658 | * including SYN, which is most optimal solution; however, |
---|
6309 | 6659 | * others, such as FreeBSD do not. |
---|
| 6660 | + * |
---|
| 6661 | + * Exception: At least one of the reserved bits of the TCP header (th->res1) is |
---|
| 6662 | + * set, indicating the use of a future TCP extension (such as AccECN). See |
---|
| 6663 | + * RFC8311 ยง4.3 which updates RFC3168 to allow the development of such |
---|
| 6664 | + * extensions. |
---|
6310 | 6665 | */ |
---|
6311 | 6666 | static void tcp_ecn_create_request(struct request_sock *req, |
---|
6312 | 6667 | const struct sk_buff *skb, |
---|
.. | .. |
---|
6326 | 6681 | ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); |
---|
6327 | 6682 | ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; |
---|
6328 | 6683 | |
---|
6329 | | - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
---|
| 6684 | + if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || |
---|
6330 | 6685 | (ecn_ok_dst & DST_FEATURE_ECN_CA) || |
---|
6331 | 6686 | tcp_bpf_ca_needs_ecn((struct sock *)req)) |
---|
6332 | 6687 | inet_rsk(req)->ecn_ok = 1; |
---|
.. | .. |
---|
6339 | 6694 | struct inet_request_sock *ireq = inet_rsk(req); |
---|
6340 | 6695 | |
---|
6341 | 6696 | req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */ |
---|
6342 | | - req->cookie_ts = 0; |
---|
6343 | 6697 | tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; |
---|
6344 | 6698 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; |
---|
6345 | | - tcp_rsk(req)->snt_synack = tcp_clock_us(); |
---|
| 6699 | + tcp_rsk(req)->snt_synack = 0; |
---|
6346 | 6700 | tcp_rsk(req)->last_oow_ack_time = 0; |
---|
6347 | 6701 | req->mss = rx_opt->mss_clamp; |
---|
6348 | 6702 | req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; |
---|
.. | .. |
---|
6387 | 6741 | /* |
---|
6388 | 6742 | * Return true if a syncookie should be sent |
---|
6389 | 6743 | */ |
---|
6390 | | -static bool tcp_syn_flood_action(const struct sock *sk, |
---|
6391 | | - const struct sk_buff *skb, |
---|
6392 | | - const char *proto) |
---|
| 6744 | +static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) |
---|
6393 | 6745 | { |
---|
6394 | 6746 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; |
---|
6395 | 6747 | const char *msg = "Dropping request"; |
---|
6396 | | - bool want_cookie = false; |
---|
6397 | 6748 | struct net *net = sock_net(sk); |
---|
| 6749 | + bool want_cookie = false; |
---|
| 6750 | + u8 syncookies; |
---|
| 6751 | + |
---|
| 6752 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
---|
6398 | 6753 | |
---|
6399 | 6754 | #ifdef CONFIG_SYN_COOKIES |
---|
6400 | | - if (net->ipv4.sysctl_tcp_syncookies) { |
---|
| 6755 | + if (syncookies) { |
---|
6401 | 6756 | msg = "Sending cookies"; |
---|
6402 | 6757 | want_cookie = true; |
---|
6403 | 6758 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); |
---|
.. | .. |
---|
6405 | 6760 | #endif |
---|
6406 | 6761 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); |
---|
6407 | 6762 | |
---|
6408 | | - if (!queue->synflood_warned && |
---|
6409 | | - net->ipv4.sysctl_tcp_syncookies != 2 && |
---|
| 6763 | + if (!queue->synflood_warned && syncookies != 2 && |
---|
6410 | 6764 | xchg(&queue->synflood_warned, 1) == 0) |
---|
6411 | 6765 | net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", |
---|
6412 | | - proto, ntohs(tcp_hdr(skb)->dest), msg); |
---|
| 6766 | + proto, sk->sk_num, msg); |
---|
6413 | 6767 | |
---|
6414 | 6768 | return want_cookie; |
---|
6415 | 6769 | } |
---|
.. | .. |
---|
6420 | 6774 | { |
---|
6421 | 6775 | if (tcp_sk(sk)->save_syn) { |
---|
6422 | 6776 | u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); |
---|
6423 | | - u32 *copy; |
---|
| 6777 | + struct saved_syn *saved_syn; |
---|
| 6778 | + u32 mac_hdrlen; |
---|
| 6779 | + void *base; |
---|
6424 | 6780 | |
---|
6425 | | - copy = kmalloc(len + sizeof(u32), GFP_ATOMIC); |
---|
6426 | | - if (copy) { |
---|
6427 | | - copy[0] = len; |
---|
6428 | | - memcpy(©[1], skb_network_header(skb), len); |
---|
6429 | | - req->saved_syn = copy; |
---|
| 6781 | + if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */ |
---|
| 6782 | + base = skb_mac_header(skb); |
---|
| 6783 | + mac_hdrlen = skb_mac_header_len(skb); |
---|
| 6784 | + len += mac_hdrlen; |
---|
| 6785 | + } else { |
---|
| 6786 | + base = skb_network_header(skb); |
---|
| 6787 | + mac_hdrlen = 0; |
---|
| 6788 | + } |
---|
| 6789 | + |
---|
| 6790 | + saved_syn = kmalloc(struct_size(saved_syn, data, len), |
---|
| 6791 | + GFP_ATOMIC); |
---|
| 6792 | + if (saved_syn) { |
---|
| 6793 | + saved_syn->mac_hdrlen = mac_hdrlen; |
---|
| 6794 | + saved_syn->network_hdrlen = skb_network_header_len(skb); |
---|
| 6795 | + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); |
---|
| 6796 | + memcpy(saved_syn->data, base, len); |
---|
| 6797 | + req->saved_syn = saved_syn; |
---|
6430 | 6798 | } |
---|
6431 | 6799 | } |
---|
6432 | 6800 | } |
---|
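Note: the saved-SYN change above replaces the bare u32 length-prefixed copy with a struct saved_syn that records the mac/network/tcp header lengths ahead of a flexible data[] array, allocated with struct_size() so the length arithmetic cannot overflow silently. The userspace sketch below shows the same flexible-array pattern; the struct and field names are illustrative, and plain sizeof arithmetic stands in for struct_size().

/* Flexible-array "saved headers" blob, sized the way struct_size() would. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct saved_hdrs {
	uint32_t mac_hdrlen;
	uint32_t network_hdrlen;
	uint32_t tcp_hdrlen;
	uint8_t  data[];	/* all saved header bytes follow */
};

static struct saved_hdrs *save_headers(const uint8_t *base, uint32_t mac_len,
				       uint32_t net_len, uint32_t tcp_len)
{
	size_t len = (size_t)mac_len + net_len + tcp_len;
	struct saved_hdrs *s = malloc(sizeof(*s) + len); /* ~ struct_size(s, data, len) */

	if (!s)
		return NULL;
	s->mac_hdrlen = mac_len;
	s->network_hdrlen = net_len;
	s->tcp_hdrlen = tcp_len;
	memcpy(s->data, base, len);
	return s;
}

int main(void)
{
	uint8_t pkt[14 + 20 + 20] = { 0 };	/* fake eth + ip + tcp headers */
	struct saved_hdrs *s = save_headers(pkt, 14, 20, 20);

	if (s)
		printf("saved %u+%u+%u header bytes\n",
		       s->mac_hdrlen, s->network_hdrlen, s->tcp_hdrlen);
	free(s);
	return 0;
}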
| 6801 | + |
---|
| 6802 | +/* If a SYN cookie is required and supported, returns a clamped MSS value to be |
---|
| 6803 | + * used for SYN cookie generation. |
---|
| 6804 | + */ |
---|
| 6805 | +u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, |
---|
| 6806 | + const struct tcp_request_sock_ops *af_ops, |
---|
| 6807 | + struct sock *sk, struct tcphdr *th) |
---|
| 6808 | +{ |
---|
| 6809 | + struct tcp_sock *tp = tcp_sk(sk); |
---|
| 6810 | + u16 mss; |
---|
| 6811 | + |
---|
| 6812 | + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 && |
---|
| 6813 | + !inet_csk_reqsk_queue_is_full(sk)) |
---|
| 6814 | + return 0; |
---|
| 6815 | + |
---|
| 6816 | + if (!tcp_syn_flood_action(sk, rsk_ops->slab_name)) |
---|
| 6817 | + return 0; |
---|
| 6818 | + |
---|
| 6819 | + if (sk_acceptq_is_full(sk)) { |
---|
| 6820 | + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); |
---|
| 6821 | + return 0; |
---|
| 6822 | + } |
---|
| 6823 | + |
---|
| 6824 | + mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); |
---|
| 6825 | + if (!mss) |
---|
| 6826 | + mss = af_ops->mss_clamp; |
---|
| 6827 | + |
---|
| 6828 | + return mss; |
---|
| 6829 | +} |
---|
| 6830 | +EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); |
---|
6433 | 6831 | |
---|
6434 | 6832 | int tcp_conn_request(struct request_sock_ops *rsk_ops, |
---|
6435 | 6833 | const struct tcp_request_sock_ops *af_ops, |
---|
.. | .. |
---|
6445 | 6843 | bool want_cookie = false; |
---|
6446 | 6844 | struct dst_entry *dst; |
---|
6447 | 6845 | struct flowi fl; |
---|
| 6846 | + u8 syncookies; |
---|
| 6847 | + |
---|
| 6848 | + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); |
---|
6448 | 6849 | |
---|
6449 | 6850 | /* TW buckets are converted to open requests without |
---|
6450 | 6851 | * limitations, they conserve resources and peer is |
---|
6451 | 6852 | * evidently real one. |
---|
6452 | 6853 | */ |
---|
6453 | | - if ((net->ipv4.sysctl_tcp_syncookies == 2 || |
---|
6454 | | - inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
---|
6455 | | - want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); |
---|
| 6854 | + if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { |
---|
| 6855 | + want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name); |
---|
6456 | 6856 | if (!want_cookie) |
---|
6457 | 6857 | goto drop; |
---|
6458 | 6858 | } |
---|
.. | .. |
---|
6466 | 6866 | if (!req) |
---|
6467 | 6867 | goto drop; |
---|
6468 | 6868 | |
---|
| 6869 | + req->syncookie = want_cookie; |
---|
6469 | 6870 | tcp_rsk(req)->af_specific = af_ops; |
---|
6470 | 6871 | tcp_rsk(req)->ts_off = 0; |
---|
| 6872 | +#if IS_ENABLED(CONFIG_MPTCP) |
---|
| 6873 | + tcp_rsk(req)->is_mptcp = 0; |
---|
| 6874 | +#endif |
---|
6471 | 6875 | |
---|
6472 | 6876 | tcp_clear_options(&tmp_opt); |
---|
6473 | 6877 | tmp_opt.mss_clamp = af_ops->mss_clamp; |
---|
.. | .. |
---|
6501 | 6905 | goto drop_and_free; |
---|
6502 | 6906 | |
---|
6503 | 6907 | if (!want_cookie && !isn) { |
---|
| 6908 | + int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); |
---|
| 6909 | + |
---|
6504 | 6910 | /* Kill the following clause, if you dislike this way. */ |
---|
6505 | | - if (!net->ipv4.sysctl_tcp_syncookies && |
---|
6506 | | - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
---|
6507 | | - (net->ipv4.sysctl_max_syn_backlog >> 2)) && |
---|
| 6911 | + if (!syncookies && |
---|
| 6912 | + (max_syn_backlog - inet_csk_reqsk_queue_len(sk) < |
---|
| 6913 | + (max_syn_backlog >> 2)) && |
---|
6508 | 6914 | !tcp_peer_is_proven(req, dst)) { |
---|
6509 | 6915 | /* Without syncookies last quarter of |
---|
6510 | 6916 | * backlog is filled with destinations, |
---|
.. | .. |
---|
6525 | 6931 | |
---|
6526 | 6932 | if (want_cookie) { |
---|
6527 | 6933 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
---|
6528 | | - req->cookie_ts = tmp_opt.tstamp_ok; |
---|
6529 | 6934 | if (!tmp_opt.tstamp_ok) |
---|
6530 | 6935 | inet_rsk(req)->ecn_ok = 0; |
---|
6531 | 6936 | } |
---|
6532 | 6937 | |
---|
6533 | 6938 | tcp_rsk(req)->snt_isn = isn; |
---|
6534 | 6939 | tcp_rsk(req)->txhash = net_tx_rndhash(); |
---|
| 6940 | + tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
---|
6535 | 6941 | tcp_openreq_init_rwin(req, sk, dst); |
---|
6536 | 6942 | sk_rx_queue_set(req_to_sk(req), skb); |
---|
6537 | 6943 | if (!want_cookie) { |
---|
.. | .. |
---|
6540 | 6946 | } |
---|
6541 | 6947 | if (fastopen_sk) { |
---|
6542 | 6948 | af_ops->send_synack(fastopen_sk, dst, &fl, req, |
---|
6543 | | - &foc, TCP_SYNACK_FASTOPEN); |
---|
| 6949 | + &foc, TCP_SYNACK_FASTOPEN, skb); |
---|
6544 | 6950 | /* Add the child socket directly into the accept queue */ |
---|
6545 | 6951 | if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { |
---|
6546 | 6952 | reqsk_fastopen_remove(fastopen_sk, req, false); |
---|
6547 | 6953 | bh_unlock_sock(fastopen_sk); |
---|
6548 | 6954 | sock_put(fastopen_sk); |
---|
6549 | | - reqsk_put(req); |
---|
6550 | | - goto drop; |
---|
| 6955 | + goto drop_and_free; |
---|
6551 | 6956 | } |
---|
6552 | 6957 | sk->sk_data_ready(sk); |
---|
6553 | 6958 | bh_unlock_sock(fastopen_sk); |
---|
.. | .. |
---|
6559 | 6964 | tcp_timeout_init((struct sock *)req)); |
---|
6560 | 6965 | af_ops->send_synack(sk, dst, &fl, req, &foc, |
---|
6561 | 6966 | !want_cookie ? TCP_SYNACK_NORMAL : |
---|
6562 | | - TCP_SYNACK_COOKIE); |
---|
| 6967 | + TCP_SYNACK_COOKIE, |
---|
| 6968 | + skb); |
---|
6563 | 6969 | if (want_cookie) { |
---|
6564 | 6970 | reqsk_free(req); |
---|
6565 | 6971 | return 0; |
---|
.. | .. |
---|
6571 | 6977 | drop_and_release: |
---|
6572 | 6978 | dst_release(dst); |
---|
6573 | 6979 | drop_and_free: |
---|
6574 | | - reqsk_free(req); |
---|
| 6980 | + __reqsk_free(req); |
---|
6575 | 6981 | drop: |
---|
6576 | 6982 | tcp_listendrop(sk); |
---|
6577 | 6983 | return 0; |
---|