2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_input.c
....@@ -77,8 +77,10 @@
7777 #include <asm/unaligned.h>
7878 #include <linux/errqueue.h>
7979 #include <trace/events/tcp.h>
80
-#include <linux/static_key.h>
80
+#include <linux/jump_label_ratelimit.h>
8181 #include <net/busy_poll.h>
82
+#include <net/mptcp.h>
83
+#include <trace/hooks/net.h>
8284
8385 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
8486
....@@ -113,22 +115,91 @@
113115 #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
114116
115117 #if IS_ENABLED(CONFIG_TLS_DEVICE)
116
-static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
118
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
117119
118120 void clean_acked_data_enable(struct inet_connection_sock *icsk,
119121 void (*cad)(struct sock *sk, u32 ack_seq))
120122 {
121123 icsk->icsk_clean_acked = cad;
122
- static_branch_inc(&clean_acked_data_enabled);
124
+ static_branch_deferred_inc(&clean_acked_data_enabled);
123125 }
124126 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
125127
126128 void clean_acked_data_disable(struct inet_connection_sock *icsk)
127129 {
128
- static_branch_dec(&clean_acked_data_enabled);
130
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
129131 icsk->icsk_clean_acked = NULL;
130132 }
131133 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134
+
135
+void clean_acked_data_flush(void)
136
+{
137
+ static_key_deferred_flush(&clean_acked_data_enabled);
138
+}
139
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
140
+#endif
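
Background sketch (editorial, not part of the patch above): the switch from DEFINE_STATIC_KEY_FALSE to DEFINE_STATIC_KEY_DEFERRED_FALSE(..., HZ) rate-limits branch patching on the disable side; decrements are deferred for roughly HZ jiffies so churny enable/disable cycles do not repeatedly rewrite kernel text, and clean_acked_data_flush() forces any pending decrement. A minimal kernel-context illustration of the same API, with a made-up key name:

#include <linux/jump_label_ratelimit.h>
#include <linux/printk.h>

/* Key starts disabled; slow-path decrements are batched for ~1 second (HZ). */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(demo_feature_enabled, HZ);

static void demo_feature_register(void)
{
	static_branch_deferred_inc(&demo_feature_enabled);	/* enable fast path */
}

static void demo_feature_unregister(void)
{
	static_branch_slow_dec_deferred(&demo_feature_enabled);	/* deferred disable */
}

static void demo_feature_shutdown(void)
{
	static_key_deferred_flush(&demo_feature_enabled);	/* apply pending dec now */
}

static void demo_hot_path(void)
{
	/* Deferred keys embed a plain static_key; test the .key member,
	 * exactly as tcp_ack() does with clean_acked_data_enabled.key. */
	if (static_branch_unlikely(&demo_feature_enabled.key))
		pr_debug("feature fast path active\n");
}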
141
+
142
+#ifdef CONFIG_CGROUP_BPF
143
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
144
+{
145
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
146
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
147
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
148
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
149
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
150
+ struct bpf_sock_ops_kern sock_ops;
151
+
152
+ if (likely(!unknown_opt && !parse_all_opt))
153
+ return;
154
+
155
+ /* The skb will be handled in the
156
+ * bpf_skops_established() or
157
+ * bpf_skops_write_hdr_opt().
158
+ */
159
+ switch (sk->sk_state) {
160
+ case TCP_SYN_RECV:
161
+ case TCP_SYN_SENT:
162
+ case TCP_LISTEN:
163
+ return;
164
+ }
165
+
166
+ sock_owned_by_me(sk);
167
+
168
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
169
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
170
+ sock_ops.is_fullsock = 1;
171
+ sock_ops.sk = sk;
172
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
173
+
174
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
175
+}
176
+
177
+static void bpf_skops_established(struct sock *sk, int bpf_op,
178
+ struct sk_buff *skb)
179
+{
180
+ struct bpf_sock_ops_kern sock_ops;
181
+
182
+ sock_owned_by_me(sk);
183
+
184
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
185
+ sock_ops.op = bpf_op;
186
+ sock_ops.is_fullsock = 1;
187
+ sock_ops.sk = sk;
188
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
189
+ if (skb)
190
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
191
+
192
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
193
+}
194
+#else
195
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
196
+{
197
+}
198
+
199
+static void bpf_skops_established(struct sock *sk, int bpf_op,
200
+ struct sk_buff *skb)
201
+{
202
+}
132203 #endif
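
Illustrative BPF counterpart (editorial sketch, not from this patch): bpf_skops_parse_hdr() above only runs when a cgroup sock_ops program has opted in via the PARSE_UNKNOWN/PARSE_ALL header-option flags. A rough libbpf-style program showing that handshake follows; the option kind 200, the function name, and the omission of error handling are all assumptions of the sketch.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int demo_parse_hdr_opt(struct bpf_sock_ops *skops)
{
	__u8 opt[4] = { 200 };	/* look up a (made-up) option by its kind */

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Opt in, so tcp_input.c will call bpf_skops_parse_hdr() for
		 * segments carrying options the stack itself does not know. */
		bpf_sock_ops_cb_flags_set(skops,
				skops->bpf_sock_ops_cb_flags |
				BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
		/* Invoked from the kernel hook above for each such skb. */
		bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";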
133204
134205 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
....@@ -172,6 +243,19 @@
172243 if (unlikely(len > icsk->icsk_ack.rcv_mss +
173244 MAX_TCP_OPTION_SPACE))
174245 tcp_gro_dev_warn(sk, skb, len);
246
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
247
+ * set then it is likely the end of an application write. So
248
+ * more data may not be arriving soon, and yet the data sender
249
+ * may be waiting for an ACK if cwnd-bound or using TX zero
250
+ * copy. So we set ICSK_ACK_PUSHED here so that
251
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
252
+ * reads all of the data and is not ping-pong. If len > MSS
253
+ * then this logic does not matter (and does not hurt) because
254
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
255
+ * reads data and there is more than an MSS of unACKed data.
256
+ */
257
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
258
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
175259 } else {
176260 /* Otherwise, we make more careful check taking into account,
177261 * that SACKs block is variable.
....@@ -216,15 +300,14 @@
216300 icsk->icsk_ack.quick = quickacks;
217301 }
218302
219
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
303
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
220304 {
221305 struct inet_connection_sock *icsk = inet_csk(sk);
222306
223307 tcp_incr_quickack(sk, max_quickacks);
224
- icsk->icsk_ack.pingpong = 0;
308
+ inet_csk_exit_pingpong_mode(sk);
225309 icsk->icsk_ack.ato = TCP_ATO_MIN;
226310 }
227
-EXPORT_SYMBOL(tcp_enter_quickack_mode);
228311
229312 /* Send ACKs quickly, if "quick" count is not exhausted
230313 * and the session is not interactive.
....@@ -236,7 +319,7 @@
236319 const struct dst_entry *dst = __sk_dst_get(sk);
237320
238321 return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
239
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
322
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
240323 }
241324
242325 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
....@@ -354,7 +437,8 @@
354437 sndmem *= nr_segs * per_mss;
355438
356439 if (sk->sk_sndbuf < sndmem)
357
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
440
+ WRITE_ONCE(sk->sk_sndbuf,
441
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
358442 }
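
The WRITE_ONCE()/READ_ONCE() conversions in this and many later hunks share one motive: sk_sndbuf, sk_rcvbuf and the tcp_* sysctls may be read locklessly while another context updates them, so both sides are annotated against load/store tearing, and a racy value is read once into a local before being used more than once. A self-contained userspace rendition of the idiom, with stand-in macros and made-up values:

#include <stdio.h>

/* Minimal stand-ins for the kernel annotations, enough to show the shape. */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

static int sysctl_tcp_rmem2 = 6291456;	/* pretend net.ipv4.tcp_rmem[2] */
static int sk_rcvbuf = 131072;

int main(void)
{
	/* Read the racy limit exactly once and reuse the local copy, so the
	 * comparison and the clamp cannot observe two different values. */
	int rmem2 = READ_ONCE(sysctl_tcp_rmem2);

	if (sk_rcvbuf < rmem2)
		WRITE_ONCE(sk_rcvbuf, rmem2 / 2);

	printf("sk_rcvbuf = %d\n", sk_rcvbuf);
	return 0;
}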
359443
360444 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
....@@ -383,12 +467,13 @@
383467 */
384468
385469 /* Slow part of check#2. */
386
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
470
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
471
+ unsigned int skbtruesize)
387472 {
388473 struct tcp_sock *tp = tcp_sk(sk);
389474 /* Optimize this! */
390
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
391
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
475
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
476
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
392477
393478 while (tp->rcv_ssthresh <= window) {
394479 if (truesize <= skb->len)
....@@ -400,7 +485,27 @@
400485 return 0;
401486 }
402487
403
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
488
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
489
+ * can play nice with us, as sk_buff and skb->head might be either
490
+ * freed or shared with up to MAX_SKB_FRAGS segments.
491
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
492
+ * and if no payload was pulled in skb->head before reaching us.
493
+ */
494
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
495
+{
496
+ u32 truesize = skb->truesize;
497
+
498
+ if (adjust && !skb_headlen(skb)) {
499
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
500
+ /* paranoid check, some drivers might be buggy */
501
+ if (unlikely((int)truesize < (int)skb->len))
502
+ truesize = skb->truesize;
503
+ }
504
+ return truesize;
505
+}
506
+
507
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
508
+ bool adjust)
404509 {
405510 struct tcp_sock *tp = tcp_sk(sk);
406511 int room;
....@@ -409,15 +514,16 @@
409514
410515 /* Check #1 */
411516 if (room > 0 && !tcp_under_memory_pressure(sk)) {
517
+ unsigned int truesize = truesize_adjust(adjust, skb);
412518 int incr;
413519
414520 /* Check #2. Increase window, if skb with such overhead
415521 * will fit to rcvbuf in future.
416522 */
417
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
523
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
418524 incr = 2 * tp->advmss;
419525 else
420
- incr = __tcp_grow_window(sk, skb);
526
+ incr = __tcp_grow_window(sk, skb, truesize);
421527
422528 if (incr) {
423529 incr = max_t(int, incr, 2 * skb->len);
....@@ -430,9 +536,9 @@
430536 /* 3. Try to fixup all. It is made immediately after connection enters
431537 * established state.
432538 */
433
-void tcp_init_buffer_space(struct sock *sk)
539
+static void tcp_init_buffer_space(struct sock *sk)
434540 {
435
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
541
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
436542 struct tcp_sock *tp = tcp_sk(sk);
437543 int maxwin;
438544
....@@ -472,15 +578,17 @@
472578 struct tcp_sock *tp = tcp_sk(sk);
473579 struct inet_connection_sock *icsk = inet_csk(sk);
474580 struct net *net = sock_net(sk);
581
+ int rmem2;
475582
476583 icsk->icsk_ack.quick = 0;
584
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
477585
478
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
586
+ if (sk->sk_rcvbuf < rmem2 &&
479587 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
480588 !tcp_under_memory_pressure(sk) &&
481589 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
482
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
483
- net->ipv4.sysctl_tcp_rmem[2]);
590
+ WRITE_ONCE(sk->sk_rcvbuf,
591
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
484592 }
485593 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
486594 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
....@@ -510,7 +618,7 @@
510618 *
511619 * The algorithm for RTT estimation w/o timestamps is based on
512620 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
513
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
621
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
514622 *
515623 * More detail on this code can be found at
516624 * <http://staff.psc.edu/jheffner/>,
....@@ -621,7 +729,7 @@
621729 * <prev RTT . ><current RTT .. ><next RTT .... >
622730 */
623731
624
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
732
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
625733 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
626734 int rcvmem, rcvbuf;
627735 u64 rcvwin, grow;
....@@ -642,9 +750,9 @@
642750
643751 do_div(rcvwin, tp->advmss);
644752 rcvbuf = min_t(u64, rcvwin * rcvmem,
645
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
753
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
646754 if (rcvbuf > sk->sk_rcvbuf) {
647
- sk->sk_rcvbuf = rcvbuf;
755
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
648756
649757 /* Make the window clamp follow along. */
650758 tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
....@@ -710,7 +818,7 @@
710818 tcp_ecn_check_ce(sk, skb);
711819
712820 if (skb->len >= 128)
713
- tcp_grow_window(sk, skb);
821
+ tcp_grow_window(sk, skb, true);
714822 }
715823
716824 /* Called to compute a smoothed rtt estimate. The data fed to this
....@@ -774,6 +882,8 @@
774882 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
775883 tp->rtt_seq = tp->snd_nxt;
776884 tp->mdev_max_us = tcp_rto_min_us(sk);
885
+
886
+ tcp_bpf_rtt(sk);
777887 }
778888 } else {
779889 /* no previous measure. */
....@@ -782,6 +892,8 @@
782892 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
783893 tp->mdev_max_us = tp->rttvar_us;
784894 tp->rtt_seq = tp->snd_nxt;
895
+
896
+ tcp_bpf_rtt(sk);
785897 }
786898 tp->srtt_us = max(1U, srtt);
787899 }
....@@ -859,12 +971,54 @@
859971 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
860972 }
861973
862
-/* Take a notice that peer is sending D-SACKs */
863
-static void tcp_dsack_seen(struct tcp_sock *tp)
974
+struct tcp_sacktag_state {
975
+ /* Timestamps for earliest and latest never-retransmitted segment
976
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
977
+ * but congestion control should still get an accurate delay signal.
978
+ */
979
+ u64 first_sackt;
980
+ u64 last_sackt;
981
+ u32 reord;
982
+ u32 sack_delivered;
983
+ int flag;
984
+ unsigned int mss_now;
985
+ struct rate_sample *rate;
986
+};
987
+
988
+/* Note that the peer is sending D-SACKs. Skip update of data delivery
989
+ * and spurious retransmission information if this DSACK is unlikely caused by
990
+ * sender's action:
991
+ * - DSACKed sequence range is larger than maximum receiver's window.
992
+ * - Total no. of DSACKed segments exceeds the total no. of retransmitted segs.
993
+ */
994
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
995
+ u32 end_seq, struct tcp_sacktag_state *state)
864996 {
997
+ u32 seq_len, dup_segs = 1;
998
+
999
+ if (!before(start_seq, end_seq))
1000
+ return 0;
1001
+
1002
+ seq_len = end_seq - start_seq;
1003
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
1004
+ if (seq_len > tp->max_window)
1005
+ return 0;
1006
+ if (seq_len > tp->mss_cache)
1007
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1008
+
1009
+ tp->dsack_dups += dup_segs;
1010
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
1011
+ if (tp->dsack_dups > tp->total_retrans)
1012
+ return 0;
1013
+
8651014 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
8661015 tp->rack.dsack_seen = 1;
867
- tp->dsack_dups++;
1016
+
1017
+ state->flag |= FLAG_DSACKING_ACK;
1018
+ /* A spurious retransmission is delivered */
1019
+ state->sack_delivered += dup_segs;
1020
+
1021
+ return dup_segs;
8681022 }
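
Worked example for the heuristic above (editorial sketch in plain C; sequence-number wraparound in before() is ignored): a DSACK block spanning 4000 bytes against an mss_cache of 1448 counts as DIV_ROUND_UP(4000, 1448) = 3 duplicate segments, while a block wider than the peer's largest advertised window is treated as dubious and dropped.

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Mirror of the dup_segs estimate in tcp_dsack_seen(); returns 0 for a
 * dubious DSACK (empty range or wider than the max advertised window). */
static uint32_t dsack_dup_segs(uint32_t start_seq, uint32_t end_seq,
			       uint32_t max_window, uint32_t mss_cache)
{
	uint32_t seq_len;

	if (!(start_seq < end_seq))
		return 0;
	seq_len = end_seq - start_seq;
	if (seq_len > max_window)
		return 0;
	return seq_len > mss_cache ? DIV_ROUND_UP(seq_len, mss_cache) : 1;
}

int main(void)
{
	printf("%u\n", dsack_dup_segs(1000, 5000, 65535, 1448));	/* 3 */
	printf("%u\n", dsack_dup_segs(1000, 200000, 65535, 1448));	/* 0: dubious */
	return 0;
}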
8691023
8701024 /* It's reordering when higher sequence was delivered (i.e. sacked) before
....@@ -893,7 +1047,7 @@
8931047 tp->undo_marker ? tp->undo_retrans : 0);
8941048 #endif
8951049 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
896
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1050
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
8971051 }
8981052
8991053 /* This exciting event is worth to be remembered. 8) */
....@@ -902,7 +1056,11 @@
9021056 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
9031057 }
9041058
905
-/* This must be called before lost_out is incremented */
1059
+ /* This must be called before lost_out or retrans_out are updated
1060
+ * on a new loss, because we want to know if all skbs previously
1061
+ * known to be lost have already been retransmitted, indicating
1062
+ * that this newly lost skb is our next skb to retransmit.
1063
+ */
9061064 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
9071065 {
9081066 if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
....@@ -912,42 +1070,46 @@
9121070 tp->retransmit_skb_hint = skb;
9131071 }
9141072
915
-/* Sum the number of packets on the wire we have marked as lost.
916
- * There are two cases we care about here:
917
- * a) Packet hasn't been marked lost (nor retransmitted),
918
- * and this is the first loss.
919
- * b) Packet has been marked both lost and retransmitted,
920
- * and this means we think it was lost again.
1073
+/* Sum the number of packets on the wire we have marked as lost, and
1074
+ * notify the congestion control module that the given skb was marked lost.
9211075 */
922
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
1076
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
1077
+{
1078
+ tp->lost += tcp_skb_pcount(skb);
1079
+}
1080
+
1081
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
9231082 {
9241083 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1084
+ struct tcp_sock *tp = tcp_sk(sk);
9251085
926
- if (!(sacked & TCPCB_LOST) ||
927
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
928
- tp->lost += tcp_skb_pcount(skb);
929
-}
1086
+ if (sacked & TCPCB_SACKED_ACKED)
1087
+ return;
9301088
931
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
932
-{
933
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
934
- tcp_verify_retransmit_hint(tp, skb);
935
-
936
- tp->lost_out += tcp_skb_pcount(skb);
937
- tcp_sum_lost(tp, skb);
938
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
939
- }
940
-}
941
-
942
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
943
-{
9441089 tcp_verify_retransmit_hint(tp, skb);
945
-
946
- tcp_sum_lost(tp, skb);
947
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1090
+ if (sacked & TCPCB_LOST) {
1091
+ if (sacked & TCPCB_SACKED_RETRANS) {
1092
+ /* Account for retransmits that are lost again */
1093
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1094
+ tp->retrans_out -= tcp_skb_pcount(skb);
1095
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1096
+ tcp_skb_pcount(skb));
1097
+ tcp_notify_skb_loss_event(tp, skb);
1098
+ }
1099
+ } else {
9481100 tp->lost_out += tcp_skb_pcount(skb);
9491101 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1102
+ tcp_notify_skb_loss_event(tp, skb);
9501103 }
1104
+}
1105
+
1106
+/* Updates the delivered and delivered_ce counts */
1107
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1108
+ bool ece_ack)
1109
+{
1110
+ tp->delivered += delivered;
1111
+ if (ece_ack)
1112
+ tp->delivered_ce += delivered;
9511113 }
9521114
9531115 /* This procedure tags the retransmission queue when SACKs arrive.
....@@ -1082,51 +1244,42 @@
10821244
10831245 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
10841246 struct tcp_sack_block_wire *sp, int num_sacks,
1085
- u32 prior_snd_una)
1247
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
10861248 {
10871249 struct tcp_sock *tp = tcp_sk(sk);
10881250 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
10891251 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1090
- bool dup_sack = false;
1252
+ u32 dup_segs;
10911253
10921254 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1093
- dup_sack = true;
1094
- tcp_dsack_seen(tp);
10951255 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
10961256 } else if (num_sacks > 1) {
10971257 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
10981258 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
10991259
1100
- if (!after(end_seq_0, end_seq_1) &&
1101
- !before(start_seq_0, start_seq_1)) {
1102
- dup_sack = true;
1103
- tcp_dsack_seen(tp);
1104
- NET_INC_STATS(sock_net(sk),
1105
- LINUX_MIB_TCPDSACKOFORECV);
1106
- }
1260
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
1261
+ return false;
1262
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1263
+ } else {
1264
+ return false;
11071265 }
11081266
1267
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1268
+ if (!dup_segs) { /* Skip dubious DSACK */
1269
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1270
+ return false;
1271
+ }
1272
+
1273
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1274
+
11091275 /* D-SACK for already forgotten data... Do dumb counting. */
1110
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1276
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
11111277 !after(end_seq_0, prior_snd_una) &&
11121278 after(end_seq_0, tp->undo_marker))
1113
- tp->undo_retrans--;
1279
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
11141280
1115
- return dup_sack;
1281
+ return true;
11161282 }
1117
-
1118
-struct tcp_sacktag_state {
1119
- u32 reord;
1120
- /* Timestamps for earliest and latest never-retransmitted segment
1121
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
1122
- * but congestion control should still get an accurate delay signal.
1123
- */
1124
- u64 first_sackt;
1125
- u64 last_sackt;
1126
- struct rate_sample *rate;
1127
- int flag;
1128
- unsigned int mss_now;
1129
-};
11301283
11311284 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
11321285 * the incoming SACK may not exactly match but we can find smaller MSS
....@@ -1246,7 +1399,8 @@
12461399 sacked |= TCPCB_SACKED_ACKED;
12471400 state->flag |= FLAG_DATA_SACKED;
12481401 tp->sacked_out += pcount;
1249
- tp->delivered += pcount; /* Out-of-order packets delivered */
1402
+ /* Out-of-order packets delivered */
1403
+ state->sack_delivered += pcount;
12501404
12511405 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
12521406 if (tp->lost_skb_hint &&
....@@ -1289,7 +1443,7 @@
12891443 */
12901444 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
12911445 start_seq, end_seq, dup_sack, pcount,
1292
- skb->skb_mstamp);
1446
+ tcp_skb_timestamp_us(skb));
12931447 tcp_rate_skb_delivered(sk, skb, state->rate);
12941448
12951449 if (skb == tp->lost_skb_hint)
....@@ -1413,7 +1567,7 @@
14131567 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
14141568 goto fallback;
14151569
1416
- if (!tcp_skb_can_collapse_to(prev))
1570
+ if (!tcp_skb_can_collapse(prev, skb))
14171571 goto fallback;
14181572
14191573 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
....@@ -1502,6 +1656,8 @@
15021656 (mss != tcp_skb_seglen(skb)))
15031657 goto out;
15041658
1659
+ if (!tcp_skb_can_collapse(prev, skb))
1660
+ goto out;
15051661 len = skb->len;
15061662 pcount = tcp_skb_pcount(skb);
15071663 if (tcp_skb_shift(prev, skb, pcount, len))
....@@ -1578,7 +1734,7 @@
15781734 TCP_SKB_CB(skb)->end_seq,
15791735 dup_sack,
15801736 tcp_skb_pcount(skb),
1581
- skb->skb_mstamp);
1737
+ tcp_skb_timestamp_us(skb));
15821738 tcp_rate_skb_delivered(sk, skb, state->rate);
15831739 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
15841740 list_del_init(&skb->tcp_tsorted_anchor);
....@@ -1591,9 +1747,7 @@
15911747 return skb;
15921748 }
15931749
1594
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1595
- struct tcp_sacktag_state *state,
1596
- u32 seq)
1750
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
15971751 {
15981752 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
15991753 struct sk_buff *skb;
....@@ -1615,13 +1769,12 @@
16151769 }
16161770
16171771 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1618
- struct tcp_sacktag_state *state,
16191772 u32 skip_to_seq)
16201773 {
16211774 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
16221775 return skb;
16231776
1624
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1777
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
16251778 }
16261779
16271780 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
....@@ -1634,7 +1787,7 @@
16341787 return skb;
16351788
16361789 if (before(next_dup->start_seq, skip_to_seq)) {
1637
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1790
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
16381791 skb = tcp_sacktag_walk(skb, sk, NULL, state,
16391792 next_dup->start_seq, next_dup->end_seq,
16401793 1);
....@@ -1672,11 +1825,7 @@
16721825 tcp_highest_sack_reset(sk);
16731826
16741827 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1675
- num_sacks, prior_snd_una);
1676
- if (found_dup_sack) {
1677
- state->flag |= FLAG_DSACKING_ACK;
1678
- tp->delivered++; /* A spurious retransmission is delivered */
1679
- }
1828
+ num_sacks, prior_snd_una, state);
16801829
16811830 /* Eliminate too old ACKs, but take into
16821831 * account more or less fresh ones, they can
....@@ -1778,8 +1927,7 @@
17781927
17791928 /* Head todo? */
17801929 if (before(start_seq, cache->start_seq)) {
1781
- skb = tcp_sacktag_skip(skb, sk, state,
1782
- start_seq);
1930
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
17831931 skb = tcp_sacktag_walk(skb, sk, next_dup,
17841932 state,
17851933 start_seq,
....@@ -1805,7 +1953,7 @@
18051953 goto walk;
18061954 }
18071955
1808
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1956
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
18091957 /* Check overlap against next cached too (past this one already) */
18101958 cache++;
18111959 continue;
....@@ -1816,7 +1964,7 @@
18161964 if (!skb)
18171965 break;
18181966 }
1819
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1967
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
18201968
18211969 walk:
18221970 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
....@@ -1878,34 +2026,39 @@
18782026 return;
18792027
18802028 tp->reordering = min_t(u32, tp->packets_out + addend,
1881
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
2029
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
18822030 tp->reord_seen++;
18832031 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
18842032 }
18852033
18862034 /* Emulate SACKs for SACKless connection: account for a new dupack. */
18872035
1888
-static void tcp_add_reno_sack(struct sock *sk)
2036
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
18892037 {
1890
- struct tcp_sock *tp = tcp_sk(sk);
1891
- u32 prior_sacked = tp->sacked_out;
2038
+ if (num_dupack) {
2039
+ struct tcp_sock *tp = tcp_sk(sk);
2040
+ u32 prior_sacked = tp->sacked_out;
2041
+ s32 delivered;
18922042
1893
- tp->sacked_out++;
1894
- tcp_check_reno_reordering(sk, 0);
1895
- if (tp->sacked_out > prior_sacked)
1896
- tp->delivered++; /* Some out-of-order packet is delivered */
1897
- tcp_verify_left_out(tp);
2043
+ tp->sacked_out += num_dupack;
2044
+ tcp_check_reno_reordering(sk, 0);
2045
+ delivered = tp->sacked_out - prior_sacked;
2046
+ if (delivered > 0)
2047
+ tcp_count_delivered(tp, delivered, ece_ack);
2048
+ tcp_verify_left_out(tp);
2049
+ }
18982050 }
18992051
19002052 /* Account for ACK, ACKing some data in Reno Recovery phase. */
19012053
1902
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
2054
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
19032055 {
19042056 struct tcp_sock *tp = tcp_sk(sk);
19052057
19062058 if (acked > 0) {
19072059 /* One ACK acked hole. The rest eat duplicate ACKs. */
1908
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
2060
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
2061
+ ece_ack);
19092062 if (acked - 1 >= tp->sacked_out)
19102063 tp->sacked_out = 0;
19112064 else
....@@ -1938,7 +2091,8 @@
19382091
19392092 static bool tcp_is_rack(const struct sock *sk)
19402093 {
1941
- return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
2094
+ return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2095
+ TCP_RACK_LOSS_DETECTION;
19422096 }
19432097
19442098 /* If we detect SACK reneging, forget all SACK information
....@@ -1982,6 +2136,7 @@
19822136 struct tcp_sock *tp = tcp_sk(sk);
19832137 struct net *net = sock_net(sk);
19842138 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2139
+ u8 reordering;
19852140
19862141 tcp_timeout_mark_lost(sk);
19872142
....@@ -2002,10 +2157,12 @@
20022157 /* Timeout in disordered state after receiving substantial DUPACKs
20032158 * suggests that the degree of reordering is over-estimated.
20042159 */
2160
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
20052161 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2006
- tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
2162
+ tp->sacked_out >= reordering)
20072163 tp->reordering = min_t(unsigned int, tp->reordering,
2008
- net->ipv4.sysctl_tcp_reordering);
2164
+ reordering);
2165
+
20092166 tcp_set_ca_state(sk, TCP_CA_Loss);
20102167 tp->high_seq = tp->snd_nxt;
20112168 tcp_ecn_queue_cwr(tp);
....@@ -2014,7 +2171,7 @@
20142171 * loss recovery is underway except recurring timeout(s) on
20152172 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
20162173 */
2017
- tp->frto = net->ipv4.sysctl_tcp_frto &&
2174
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
20182175 (new_recovery || icsk->icsk_retransmits) &&
20192176 !inet_csk(sk)->icsk_mtup.probe_size;
20202177 }
....@@ -2031,7 +2188,8 @@
20312188 */
20322189 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
20332190 {
2034
- if (flag & FLAG_SACK_RENEGING) {
2191
+ if (flag & FLAG_SACK_RENEGING &&
2192
+ flag & FLAG_SND_UNA_ADVANCED) {
20352193 struct tcp_sock *tp = tcp_sk(sk);
20362194 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
20372195 msecs_to_jiffies(10));
....@@ -2172,8 +2330,7 @@
21722330 }
21732331
21742332 /* Detect loss in event "A" above by marking head of queue up as lost.
2175
- * For non-SACK(Reno) senders, the first "packets" number of segments
2176
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2333
+ * For RFC3517 SACK, a segment is considered lost if it
21772334 * has at least tp->reordering SACKed segments above it; "packets" refers to
21782335 * the maximum SACKed segments to pass before reaching this limit.
21792336 */
....@@ -2181,10 +2338,9 @@
21812338 {
21822339 struct tcp_sock *tp = tcp_sk(sk);
21832340 struct sk_buff *skb;
2184
- int cnt, oldcnt, lost;
2185
- unsigned int mss;
2341
+ int cnt;
21862342 /* Use SACK to deduce losses of new sequences sent during recovery */
2187
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2343
+ const u32 loss_high = tp->snd_nxt;
21882344
21892345 WARN_ON(packets > tp->packets_out);
21902346 skb = tp->lost_skb_hint;
....@@ -2207,28 +2363,14 @@
22072363 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
22082364 break;
22092365
2210
- oldcnt = cnt;
2211
- if (tcp_is_reno(tp) ||
2212
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2366
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
22132367 cnt += tcp_skb_pcount(skb);
22142368
2215
- if (cnt > packets) {
2216
- if (tcp_is_sack(tp) ||
2217
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2218
- (oldcnt >= packets))
2219
- break;
2369
+ if (cnt > packets)
2370
+ break;
22202371
2221
- mss = tcp_skb_mss(skb);
2222
- /* If needed, chop off the prefix to mark as lost. */
2223
- lost = (packets - oldcnt) * mss;
2224
- if (lost < skb->len &&
2225
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2226
- lost, mss, GFP_ATOMIC) < 0)
2227
- break;
2228
- cnt = packets;
2229
- }
2230
-
2231
- tcp_skb_mark_lost(tp, skb);
2372
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2373
+ tcp_mark_skb_lost(sk, skb);
22322374
22332375 if (mark_head)
22342376 break;
....@@ -2272,7 +2414,7 @@
22722414 */
22732415 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
22742416 {
2275
- return !tp->retrans_stamp ||
2417
+ return tp->retrans_stamp &&
22762418 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
22772419 }
22782420
....@@ -2368,6 +2510,21 @@
23682510 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
23692511 }
23702512
2513
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2514
+{
2515
+ struct tcp_sock *tp = tcp_sk(sk);
2516
+
2517
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2518
+ /* Hold old state until something *above* high_seq
2519
+ * is ACKed. For Reno it is MUST to prevent false
2520
+ * fast retransmits (RFC2582). SACK TCP is safe. */
2521
+ if (!tcp_any_retrans_done(sk))
2522
+ tp->retrans_stamp = 0;
2523
+ return true;
2524
+ }
2525
+ return false;
2526
+}
2527
+
23712528 /* People celebrate: "We love our President!" */
23722529 static bool tcp_try_undo_recovery(struct sock *sk)
23732530 {
....@@ -2390,14 +2547,8 @@
23902547 } else if (tp->rack.reo_wnd_persist) {
23912548 tp->rack.reo_wnd_persist--;
23922549 }
2393
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2394
- /* Hold old state until something *above* high_seq
2395
- * is ACKed. For Reno it is MUST to prevent false
2396
- * fast retransmits (RFC2582). SACK TCP is safe. */
2397
- if (!tcp_any_retrans_done(sk))
2398
- tp->retrans_stamp = 0;
2550
+ if (tcp_is_non_sack_preventing_reopen(sk))
23992551 return true;
2400
- }
24012552 tcp_set_ca_state(sk, TCP_CA_Open);
24022553 tp->is_sack_reneg = 0;
24032554 return false;
....@@ -2433,6 +2584,8 @@
24332584 NET_INC_STATS(sock_net(sk),
24342585 LINUX_MIB_TCPSPURIOUSRTOS);
24352586 inet_csk(sk)->icsk_retransmits = 0;
2587
+ if (tcp_is_non_sack_preventing_reopen(sk))
2588
+ return true;
24362589 if (frto_undo || tcp_is_sack(tp)) {
24372590 tcp_set_ca_state(sk, TCP_CA_Open);
24382591 tp->is_sack_reneg = 0;
....@@ -2479,8 +2632,8 @@
24792632 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
24802633 tp->prior_cwnd - 1;
24812634 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2482
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2483
- !(flag & FLAG_LOST_RETRANS)) {
2635
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
2636
+ FLAG_RETRANS_DATA_ACKED) {
24842637 sndcnt = min_t(int, delta,
24852638 max_t(int, tp->prr_delivered - tp->prr_out,
24862639 newly_acked_sacked) + 1);
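
As a refresher on the proportional part visible just above (unchanged by this hunk), PRR scales sending to the delivery rate: with ssthresh = 5, prior_cwnd = 10, prr_delivered = 4 and prr_out = 1, the dividend is 5*4 + 10 - 1 = 29, so sndcnt = 29/10 - 1 = 1 segment. A throwaway userspace check with those illustrative numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ssthresh = 5, prior_cwnd = 10, prr_delivered = 4, prr_out = 1;

	/* sndcnt = ceil(prr_delivered * ssthresh / prior_cwnd) - prr_out */
	uint64_t dividend = ssthresh * prr_delivered + prior_cwnd - 1;

	printf("sndcnt = %llu\n",
	       (unsigned long long)(dividend / prior_cwnd - prr_out));	/* 1 */
	return 0;
}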
....@@ -2566,12 +2719,15 @@
25662719 {
25672720 struct tcp_sock *tp = tcp_sk(sk);
25682721 struct inet_connection_sock *icsk = inet_csk(sk);
2722
+ u64 val;
25692723
2570
- /* FIXME: breaks with very large cwnd */
25712724 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2572
- tp->snd_cwnd = tp->snd_cwnd *
2573
- tcp_mss_to_mtu(sk, tp->mss_cache) /
2574
- icsk->icsk_mtup.probe_size;
2725
+
2726
+ val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
2727
+ do_div(val, icsk->icsk_mtup.probe_size);
2728
+ WARN_ON_ONCE((u32)val != val);
2729
+ tp->snd_cwnd = max_t(u32, 1U, val);
2730
+
25752731 tp->snd_cwnd_cnt = 0;
25762732 tp->snd_cwnd_stamp = tcp_jiffies32;
25772733 tp->snd_ssthresh = tcp_current_ssthresh(sk);
....@@ -2594,14 +2750,8 @@
25942750 unsigned int mss = tcp_current_mss(sk);
25952751
25962752 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2597
- if (tcp_skb_seglen(skb) > mss &&
2598
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2599
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2600
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2601
- tp->retrans_out -= tcp_skb_pcount(skb);
2602
- }
2603
- tcp_skb_mark_lost_uncond_verify(tp, skb);
2604
- }
2753
+ if (tcp_skb_seglen(skb) > mss)
2754
+ tcp_mark_skb_lost(sk, skb);
26052755 }
26062756
26072757 tcp_clear_retrans_hints_partial(tp);
....@@ -2656,13 +2806,13 @@
26562806 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
26572807 * recovered or spurious. Otherwise retransmits more on partial ACKs.
26582808 */
2659
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2809
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
26602810 int *rexmit)
26612811 {
26622812 struct tcp_sock *tp = tcp_sk(sk);
26632813 bool recovered = !before(tp->snd_una, tp->high_seq);
26642814
2665
- if ((flag & FLAG_SND_UNA_ADVANCED) &&
2815
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
26662816 tcp_try_undo_loss(sk, false))
26672817 return;
26682818
....@@ -2675,7 +2825,7 @@
26752825 return;
26762826
26772827 if (after(tp->snd_nxt, tp->high_seq)) {
2678
- if (flag & FLAG_DATA_SACKED || is_dupack)
2828
+ if (flag & FLAG_DATA_SACKED || num_dupack)
26792829 tp->frto = 0; /* Step 3.a. loss was real */
26802830 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
26812831 tp->high_seq = tp->snd_nxt;
....@@ -2701,16 +2851,25 @@
27012851 /* A Reno DUPACK means new data in F-RTO step 2.b above are
27022852 * delivered. Lower inflight to clock out (re)transmissions.
27032853 */
2704
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2705
- tcp_add_reno_sack(sk);
2854
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2855
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
27062856 else if (flag & FLAG_SND_UNA_ADVANCED)
27072857 tcp_reset_reno_sack(tp);
27082858 }
27092859 *rexmit = REXMIT_LOST;
27102860 }
27112861
2862
+static bool tcp_force_fast_retransmit(struct sock *sk)
2863
+{
2864
+ struct tcp_sock *tp = tcp_sk(sk);
2865
+
2866
+ return after(tcp_highest_sack_seq(tp),
2867
+ tp->snd_una + tp->reordering * tp->mss_cache);
2868
+}
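
Numeric sketch of the threshold above (editorial, illustrative values): with SND.UNA at 1000, reordering = 3 and an MSS cache of 1000 bytes, fast retransmit is forced as soon as SACKed data reaches beyond sequence 4000, i.e. more than reordering * MSS past the unacknowledged hole.

#include <stdio.h>
#include <stdint.h>

/* Sequence-space "after", as in include/net/tcp.h */
static int after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq2 - seq1) < 0;
}

int main(void)
{
	uint32_t snd_una = 1000, reordering = 3, mss_cache = 1000;
	uint32_t highest_sack = 4500;

	/* SACKed data more than reordering * MSS beyond SND.UNA: the hole is
	 * presumed lost, so fast retransmit is forced. Prints 1 here. */
	printf("%d\n", after(highest_sack, snd_una + reordering * mss_cache));
	return 0;
}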
2869
+
27122870 /* Undo during fast recovery after partial ACK. */
2713
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2871
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2872
+ bool *do_lost)
27142873 {
27152874 struct tcp_sock *tp = tcp_sk(sk);
27162875
....@@ -2735,7 +2894,9 @@
27352894 tcp_undo_cwnd_reduction(sk, true);
27362895 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
27372896 tcp_try_keep_open(sk);
2738
- return true;
2897
+ } else {
2898
+ /* Partial ACK arrived. Force fast retransmit. */
2899
+ *do_lost = tcp_force_fast_retransmit(sk);
27392900 }
27402901 return false;
27412902 }
....@@ -2759,14 +2920,6 @@
27592920 }
27602921 }
27612922
2762
-static bool tcp_force_fast_retransmit(struct sock *sk)
2763
-{
2764
- struct tcp_sock *tp = tcp_sk(sk);
2765
-
2766
- return after(tcp_highest_sack_seq(tp),
2767
- tp->snd_una + tp->reordering * tp->mss_cache);
2768
-}
2769
-
27702923 /* Process an event, which can update packets-in-flight not trivially.
27712924 * Main goal of this function is to calculate new estimate for left_out,
27722925 * taking into account both packets sitting in receiver's buffer and
....@@ -2780,20 +2933,21 @@
27802933 * tcp_xmit_retransmit_queue().
27812934 */
27822935 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2783
- bool is_dupack, int *ack_flag, int *rexmit)
2936
+ int num_dupack, int *ack_flag, int *rexmit)
27842937 {
27852938 struct inet_connection_sock *icsk = inet_csk(sk);
27862939 struct tcp_sock *tp = tcp_sk(sk);
27872940 int fast_rexmit = 0, flag = *ack_flag;
2788
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2789
- tcp_force_fast_retransmit(sk));
2941
+ bool ece_ack = flag & FLAG_ECE;
2942
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
2943
+ tcp_force_fast_retransmit(sk));
27902944
27912945 if (!tp->packets_out && tp->sacked_out)
27922946 tp->sacked_out = 0;
27932947
27942948 /* Now state machine starts.
27952949 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2796
- if (flag & FLAG_ECE)
2950
+ if (ece_ack)
27972951 tp->prior_ssthresh = 0;
27982952
27992953 /* B. In all the states check for reneging SACKs. */
....@@ -2833,35 +2987,37 @@
28332987 switch (icsk->icsk_ca_state) {
28342988 case TCP_CA_Recovery:
28352989 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2836
- if (tcp_is_reno(tp) && is_dupack)
2837
- tcp_add_reno_sack(sk);
2838
- } else {
2839
- if (tcp_try_undo_partial(sk, prior_snd_una))
2840
- return;
2841
- /* Partial ACK arrived. Force fast retransmit. */
2842
- do_lost = tcp_is_reno(tp) ||
2843
- tcp_force_fast_retransmit(sk);
2844
- }
2845
- if (tcp_try_undo_dsack(sk)) {
2846
- tcp_try_keep_open(sk);
2990
+ if (tcp_is_reno(tp))
2991
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
2992
+ } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
28472993 return;
2848
- }
2994
+
2995
+ if (tcp_try_undo_dsack(sk))
2996
+ tcp_try_keep_open(sk);
2997
+
28492998 tcp_identify_packet_loss(sk, ack_flag);
2999
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
3000
+ if (!tcp_time_to_recover(sk, flag))
3001
+ return;
3002
+ /* Undo reverts the recovery state. If loss is evident,
3003
+ * starts a new recovery (e.g. reordering then loss);
3004
+ */
3005
+ tcp_enter_recovery(sk, ece_ack);
3006
+ }
28503007 break;
28513008 case TCP_CA_Loss:
2852
- tcp_process_loss(sk, flag, is_dupack, rexmit);
3009
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
28533010 tcp_identify_packet_loss(sk, ack_flag);
28543011 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
28553012 (*ack_flag & FLAG_LOST_RETRANS)))
28563013 return;
28573014 /* Change state if cwnd is undone or retransmits are lost */
2858
- /* fall through */
3015
+ fallthrough;
28593016 default:
28603017 if (tcp_is_reno(tp)) {
28613018 if (flag & FLAG_SND_UNA_ADVANCED)
28623019 tcp_reset_reno_sack(tp);
2863
- if (is_dupack)
2864
- tcp_add_reno_sack(sk);
3020
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
28653021 }
28663022
28673023 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
....@@ -2885,7 +3041,7 @@
28853041 }
28863042
28873043 /* Otherwise enter Recovery state */
2888
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
3044
+ tcp_enter_recovery(sk, ece_ack);
28893045 fast_rexmit = 1;
28903046 }
28913047
....@@ -2896,7 +3052,7 @@
28963052
28973053 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
28983054 {
2899
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
3055
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
29003056 struct tcp_sock *tp = tcp_sk(sk);
29013057
29023058 if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
....@@ -2935,6 +3091,8 @@
29353091 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
29363092
29373093 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
3094
+ if (!delta)
3095
+ delta = 1;
29383096 seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
29393097 ca_rtt_us = seq_rtt_us;
29403098 }
....@@ -2988,7 +3146,7 @@
29883146 /* If the retrans timer is currently being used by Fast Open
29893147 * for SYN-ACK retrans purpose, stay put.
29903148 */
2991
- if (tp->fastopen_rsk)
3149
+ if (rcu_access_pointer(tp->fastopen_rsk))
29923150 return;
29933151
29943152 if (!tp->packets_out) {
....@@ -3004,8 +3162,8 @@
30043162 */
30053163 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
30063164 }
3007
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3008
- TCP_RTO_MAX);
3165
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3166
+ TCP_RTO_MAX);
30093167 }
30103168 }
30113169
....@@ -3061,7 +3219,7 @@
30613219 */
30623220 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
30633221 u32 prior_snd_una,
3064
- struct tcp_sacktag_state *sack)
3222
+ struct tcp_sacktag_state *sack, bool ece_ack)
30653223 {
30663224 const struct inet_connection_sock *icsk = inet_csk(sk);
30673225 u64 first_ackt, last_ackt;
....@@ -3086,8 +3244,6 @@
30863244 u8 sacked = scb->sacked;
30873245 u32 acked_pcount;
30883246
3089
- tcp_ack_tstamp(sk, skb, prior_snd_una);
3090
-
30913247 /* Determine how many packets and what bytes were acked, tso and else */
30923248 if (after(scb->end_seq, tp->snd_una)) {
30933249 if (tcp_skb_pcount(skb) == 1 ||
....@@ -3107,7 +3263,7 @@
31073263 tp->retrans_out -= acked_pcount;
31083264 flag |= FLAG_RETRANS_DATA_ACKED;
31093265 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3110
- last_ackt = skb->skb_mstamp;
3266
+ last_ackt = tcp_skb_timestamp_us(skb);
31113267 WARN_ON_ONCE(last_ackt == 0);
31123268 if (!first_ackt)
31133269 first_ackt = last_ackt;
....@@ -3122,10 +3278,10 @@
31223278 if (sacked & TCPCB_SACKED_ACKED) {
31233279 tp->sacked_out -= acked_pcount;
31243280 } else if (tcp_is_sack(tp)) {
3125
- tp->delivered += acked_pcount;
3281
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
31263282 if (!tcp_skb_spurious_retrans(tp, skb))
31273283 tcp_rack_advance(tp, sacked, scb->end_seq,
3128
- skb->skb_mstamp);
3284
+ tcp_skb_timestamp_us(skb));
31293285 }
31303286 if (sacked & TCPCB_LOST)
31313287 tp->lost_out -= acked_pcount;
....@@ -3151,6 +3307,8 @@
31513307 if (!fully_acked)
31523308 break;
31533309
3310
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3311
+
31543312 next = skb_rb_next(skb);
31553313 if (unlikely(skb == tp->retransmit_skb_hint))
31563314 tp->retransmit_skb_hint = NULL;
....@@ -3166,8 +3324,11 @@
31663324 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
31673325 tp->snd_up = tp->snd_una;
31683326
3169
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3170
- flag |= FLAG_SACK_RENEGING;
3327
+ if (skb) {
3328
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3329
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3330
+ flag |= FLAG_SACK_RENEGING;
3331
+ }
31713332
31723333 if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
31733334 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
....@@ -3199,7 +3360,7 @@
31993360 }
32003361
32013362 if (tcp_is_reno(tp)) {
3202
- tcp_remove_reno_sacks(sk, pkts_acked);
3363
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
32033364
32043365 /* If any of the cumulatively ACKed segments was
32053366 * retransmitted, non-SACK case cannot confirm that
....@@ -3220,7 +3381,8 @@
32203381 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
32213382 }
32223383 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3223
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3384
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3385
+ tcp_skb_timestamp_us(skb))) {
32243386 /* Do not re-arm RTO if the sack RTT is measured from data sent
32253387 * after when the head was last (re)transmitted. Otherwise the
32263388 * timeout may continue to extend in loss recovery.
....@@ -3273,6 +3435,7 @@
32733435 return;
32743436 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
32753437 icsk->icsk_backoff = 0;
3438
+ icsk->icsk_probes_tstamp = 0;
32763439 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
32773440 /* Socket must be waked up by subsequent tcp_data_snd_check().
32783441 * This function is not for random using!
....@@ -3280,8 +3443,8 @@
32803443 } else {
32813444 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
32823445
3283
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3284
- when, TCP_RTO_MAX);
3446
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
3447
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
32853448 }
32863449 }
32873450
....@@ -3300,7 +3463,8 @@
33003463 * new SACK or ECE mark may first advance cwnd here and later reduce
33013464 * cwnd in tcp_fastretrans_alert() based on more states.
33023465 */
3303
- if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3466
+ if (tcp_sk(sk)->reordering >
3467
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
33043468 return flag & FLAG_FORWARD_PROGRESS;
33053469
33063470 return flag & FLAG_DATA_ACKED;
....@@ -3409,16 +3573,23 @@
34093573 static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
34103574 u32 *last_oow_ack_time)
34113575 {
3412
- if (*last_oow_ack_time) {
3413
- s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
3576
+ /* Paired with the WRITE_ONCE() in this function. */
3577
+ u32 val = READ_ONCE(*last_oow_ack_time);
34143578
3415
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
3579
+ if (val) {
3580
+ s32 elapsed = (s32)(tcp_jiffies32 - val);
3581
+
3582
+ if (0 <= elapsed &&
3583
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
34163584 NET_INC_STATS(net, mib_idx);
34173585 return true; /* rate-limited: don't send yet! */
34183586 }
34193587 }
34203588
3421
- *last_oow_ack_time = tcp_jiffies32;
3589
+ /* Paired with the prior READ_ONCE() and with itself,
3590
+ * as we might be lockless.
3591
+ */
3592
+ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
34223593
34233594 return false; /* not rate-limited: go ahead, send dupack now! */
34243595 }
....@@ -3459,11 +3630,11 @@
34593630
34603631 /* Then check host-wide RFC 5961 rate limit. */
34613632 now = jiffies / HZ;
3462
- if (now != challenge_timestamp) {
3463
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
3633
+ if (now != READ_ONCE(challenge_timestamp)) {
3634
+ u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
34643635 u32 half = (ack_limit + 1) >> 1;
34653636
3466
- challenge_timestamp = now;
3637
+ WRITE_ONCE(challenge_timestamp, now);
34673638 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
34683639 }
34693640 count = READ_ONCE(challenge_count);
....@@ -3544,10 +3715,10 @@
35443715 {
35453716 struct tcp_sock *tp = tcp_sk(sk);
35463717
3547
- if (rexmit == REXMIT_NONE)
3718
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
35483719 return;
35493720
3550
- if (unlikely(rexmit == 2)) {
3721
+ if (unlikely(rexmit == REXMIT_NEW)) {
35513722 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
35523723 TCP_NAGLE_OFF);
35533724 if (after(tp->snd_nxt, tp->high_seq))
....@@ -3566,10 +3737,9 @@
35663737
35673738 delivered = tp->delivered - prior_delivered;
35683739 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3569
- if (flag & FLAG_ECE) {
3570
- tp->delivered_ce += delivered;
3740
+ if (flag & FLAG_ECE)
35713741 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3572
- }
3742
+
35733743 return delivered;
35743744 }
35753745
....@@ -3584,7 +3754,7 @@
35843754 bool is_sack_reneg = tp->is_sack_reneg;
35853755 u32 ack_seq = TCP_SKB_CB(skb)->seq;
35863756 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3587
- bool is_dupack = false;
3757
+ int num_dupack = 0;
35883758 int prior_packets = tp->packets_out;
35893759 u32 delivered = tp->delivered;
35903760 u32 lost = tp->lost;
....@@ -3593,6 +3763,7 @@
35933763
35943764 sack_state.first_sackt = 0;
35953765 sack_state.rate = &rs;
3766
+ sack_state.sack_delivered = 0;
35963767
35973768 /* We very likely will need to access rtx queue. */
35983769 prefetch(sk->tcp_rtx_queue.rb_node);
....@@ -3614,14 +3785,14 @@
36143785 * this segment (RFC793 Section 3.9).
36153786 */
36163787 if (after(ack, tp->snd_nxt))
3617
- goto invalid_ack;
3788
+ return -1;
36183789
36193790 if (after(ack, prior_snd_una)) {
36203791 flag |= FLAG_SND_UNA_ADVANCED;
36213792 icsk->icsk_retransmits = 0;
36223793
36233794 #if IS_ENABLED(CONFIG_TLS_DEVICE)
3624
- if (static_branch_unlikely(&clean_acked_data_enabled))
3795
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
36253796 if (icsk->icsk_clean_acked)
36263797 icsk->icsk_clean_acked(sk, ack);
36273798 #endif
....@@ -3636,7 +3807,8 @@
36363807 if (flag & FLAG_UPDATE_TS_RECENT)
36373808 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
36383809
3639
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3810
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
3811
+ FLAG_SND_UNA_ADVANCED) {
36403812 /* Window is constant, pure forward advance.
36413813 * No more checks are required.
36423814 * Note, we use the fact that SND.UNA>=SND.WL2.
....@@ -3667,6 +3839,10 @@
36673839 ack_ev_flags |= CA_ACK_ECE;
36683840 }
36693841
3842
+ if (sack_state.sack_delivered)
3843
+ tcp_count_delivered(tp, sack_state.sack_delivered,
3844
+ flag & FLAG_ECE);
3845
+
36703846 if (flag & FLAG_WIN_UPDATE)
36713847 ack_ev_flags |= CA_ACK_WIN_UPDATE;
36723848
....@@ -3692,7 +3868,8 @@
36923868 goto no_queue;
36933869
36943870 /* See if we can take anything off of the retransmit queue. */
3695
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3871
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
3872
+ flag & FLAG_ECE);
36963873
36973874 tcp_rack_update_reo_wnd(sk, &rs);
36983875
....@@ -3700,8 +3877,14 @@
37003877 tcp_process_tlp_ack(sk, ack, flag);
37013878
37023879 if (tcp_ack_is_dubious(sk, flag)) {
3703
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3704
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3880
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
3881
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
3882
+ num_dupack = 1;
3883
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
3884
+ if (!(flag & FLAG_DATA))
3885
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
3886
+ }
3887
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37053888 &rexmit);
37063889 }
37073890
....@@ -3723,7 +3906,7 @@
37233906 no_queue:
37243907 /* If data was DSACKed, see if we can undo a cwnd reduction. */
37253908 if (flag & FLAG_DSACKING_ACK) {
3726
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3909
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37273910 &rexmit);
37283911 tcp_newly_delivered(sk, delivered, flag);
37293912 }
....@@ -3737,10 +3920,6 @@
37373920 tcp_process_tlp_ack(sk, ack, flag);
37383921 return 1;
37393922
3740
-invalid_ack:
3741
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3742
- return -1;
3743
-
37443923 old_ack:
37453924 /* If data was SACKed, tag it and see if we should send more data.
37463925 * If data was DSACKed, see if we can undo a cwnd reduction.
....@@ -3748,13 +3927,12 @@
37483927 if (TCP_SKB_CB(skb)->sacked) {
37493928 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
37503929 &sack_state);
3751
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3930
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37523931 &rexmit);
37533932 tcp_newly_delivered(sk, delivered, flag);
37543933 tcp_xmit_recovery(sk, rexmit);
37553934 }
37563935
3757
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
37583936 return 0;
37593937 }
37603938
....@@ -3775,7 +3953,7 @@
37753953 foc->exp = exp_opt;
37763954 }
37773955
3778
-static void smc_parse_options(const struct tcphdr *th,
3956
+static bool smc_parse_options(const struct tcphdr *th,
37793957 struct tcp_options_received *opt_rx,
37803958 const unsigned char *ptr,
37813959 int opsize)
....@@ -3784,10 +3962,56 @@
37843962 if (static_branch_unlikely(&tcp_have_smc)) {
37853963 if (th->syn && !(opsize & 1) &&
37863964 opsize >= TCPOLEN_EXP_SMC_BASE &&
3787
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3965
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
37883966 opt_rx->smc_ok = 1;
3967
+ return true;
3968
+ }
37893969 }
37903970 #endif
3971
+ return false;
3972
+}
3973
+
3974
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
3975
+ * value on success.
3976
+ */
3977
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
3978
+{
3979
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
3980
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
3981
+ u16 mss = 0;
3982
+
3983
+ while (length > 0) {
3984
+ int opcode = *ptr++;
3985
+ int opsize;
3986
+
3987
+ switch (opcode) {
3988
+ case TCPOPT_EOL:
3989
+ return mss;
3990
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3991
+ length--;
3992
+ continue;
3993
+ default:
3994
+ if (length < 2)
3995
+ return mss;
3996
+ opsize = *ptr++;
3997
+ if (opsize < 2) /* "silly options" */
3998
+ return mss;
3999
+ if (opsize > length)
4000
+ return mss; /* fail on partial options */
4001
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
4002
+ u16 in_mss = get_unaligned_be16(ptr);
4003
+
4004
+ if (in_mss) {
4005
+ if (user_mss && user_mss < in_mss)
4006
+ in_mss = user_mss;
4007
+ mss = in_mss;
4008
+ }
4009
+ }
4010
+ ptr += opsize - 2;
4011
+ length -= opsize;
4012
+ }
4013
+ }
4014
+ return mss;
37914015 }
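
Concrete illustration of the option walk above (editorial sketch): a SYN carrying the 4-byte MSS option 0x02 0x04 0x05 0xb4 advertises MSS 1460, and a non-zero user_mss only ever clamps the result downward. A self-contained userspace rendition of the same loop over a plain option buffer, with the helper name and sample bytes made up for the example:

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOLEN_MSS	4

/* Same walk as tcp_parse_mss_option(), but over a bare option buffer. */
static uint16_t parse_mss(const uint8_t *ptr, int length, uint16_t user_mss)
{
	uint16_t mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return mss;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return mss;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return mss;	/* silly or truncated option */
			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				uint16_t in_mss = (ptr[0] << 8) | ptr[1];

				if (in_mss) {
					if (user_mss && user_mss < in_mss)
						in_mss = user_mss;
					mss = in_mss;
				}
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

int main(void)
{
	/* MSS=1460, then NOP, NOP, SACK-permitted (kind 4, len 2) */
	const uint8_t opts[] = { 0x02, 0x04, 0x05, 0xb4, 0x01, 0x01, 0x04, 0x02 };

	printf("%u\n", parse_mss(opts, sizeof(opts), 0));	/* 1460 */
	printf("%u\n", parse_mss(opts, sizeof(opts), 1200));	/* 1200 */
	return 0;
}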
37924016
37934017 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
....@@ -3805,6 +4029,7 @@
38054029
38064030 ptr = (const unsigned char *)(th + 1);
38074031 opt_rx->saw_tstamp = 0;
4032
+ opt_rx->saw_unknown = 0;
38084033
38094034 while (length > 0) {
38104035 int opcode = *ptr++;
....@@ -3817,6 +4042,8 @@
38174042 length--;
38184043 continue;
38194044 default:
4045
+ if (length < 2)
4046
+ return;
38204047 opsize = *ptr++;
38214048 if (opsize < 2) /* "silly options" */
38224049 return;
....@@ -3836,7 +4063,7 @@
38364063 break;
38374064 case TCPOPT_WINDOW:
38384065 if (opsize == TCPOLEN_WINDOW && th->syn &&
3839
- !estab && net->ipv4.sysctl_tcp_window_scaling) {
4066
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
38404067 __u8 snd_wscale = *(__u8 *)ptr;
38414068 opt_rx->wscale_ok = 1;
38424069 if (snd_wscale > TCP_MAX_WSCALE) {
....@@ -3852,7 +4079,7 @@
38524079 case TCPOPT_TIMESTAMP:
38534080 if ((opsize == TCPOLEN_TIMESTAMP) &&
38544081 ((estab && opt_rx->tstamp_ok) ||
3855
- (!estab && net->ipv4.sysctl_tcp_timestamps))) {
4082
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
38564083 opt_rx->saw_tstamp = 1;
38574084 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
38584085 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
....@@ -3860,7 +4087,7 @@
38604087 break;
38614088 case TCPOPT_SACK_PERM:
38624089 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3863
- !estab && net->ipv4.sysctl_tcp_sack) {
4090
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
38644091 opt_rx->sack_ok = TCP_SACK_SEEN;
38654092 tcp_sack_reset(opt_rx);
38664093 }
....@@ -3893,15 +4120,21 @@
38934120 */
38944121 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
38954122 get_unaligned_be16(ptr) ==
3896
- TCPOPT_FASTOPEN_MAGIC)
4123
+ TCPOPT_FASTOPEN_MAGIC) {
38974124 tcp_parse_fastopen_option(opsize -
38984125 TCPOLEN_EXP_FASTOPEN_BASE,
38994126 ptr + 2, th->syn, foc, true);
3900
- else
3901
- smc_parse_options(th, opt_rx, ptr,
3902
- opsize);
4127
+ break;
4128
+ }
4129
+
4130
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
4131
+ break;
4132
+
4133
+ opt_rx->saw_unknown = 1;
39034134 break;
39044135
4136
+ default:
4137
+ opt_rx->saw_unknown = 1;
39054138 }
39064139 ptr += opsize-2;
39074140 length -= opsize;
....@@ -4109,7 +4342,7 @@
41094342
41104343 inet_csk_schedule_ack(sk);
41114344
4112
- sk->sk_shutdown |= RCV_SHUTDOWN;
4345
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
41134346 sock_set_flag(sk, SOCK_DONE);
41144347
41154348 switch (sk->sk_state) {
....@@ -4117,7 +4350,7 @@
41174350 case TCP_ESTABLISHED:
41184351 /* Move to CLOSE_WAIT */
41194352 tcp_set_state(sk, TCP_CLOSE_WAIT);
4120
- inet_csk(sk)->icsk_ack.pingpong = 1;
4353
+ inet_csk_enter_pingpong_mode(sk);
41214354 break;
41224355
41234356 case TCP_CLOSE_WAIT:
....@@ -4189,7 +4422,7 @@
41894422 {
41904423 struct tcp_sock *tp = tcp_sk(sk);
41914424
4192
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4425
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
41934426 int mib_idx;
41944427
41954428 if (before(seq, tp->rcv_nxt))
....@@ -4215,6 +4448,18 @@
42154448 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
42164449 }
42174450
4451
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4452
+{
4453
+ /* When the ACK path fails or drops most ACKs, the sender would
4454
+ * time out and spuriously retransmit the same segment repeatedly.
4455
+ * The receiver remembers and reflects via DSACKs. Leverage the
4456
+ * DSACK state and change the txhash to re-route speculatively.
4457
+ */
4458
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
4459
+ sk_rethink_txhash(sk))
4460
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4461
+}
4462
+
42184463 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
42194464 {
42204465 struct tcp_sock *tp = tcp_sk(sk);
....@@ -4224,9 +4469,10 @@
42244469 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
42254470 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
42264471
4227
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4472
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
42284473 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
42294474
4475
+ tcp_rcv_spurious_retrans(sk, skb);
42304476 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
42314477 end_seq = tp->rcv_nxt;
42324478 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
....@@ -4260,9 +4506,37 @@
42604506 sp[i] = sp[i + 1];
42614507 continue;
42624508 }
4263
- this_sack++, swalk++;
4509
+ this_sack++;
4510
+ swalk++;
42644511 }
42654512 }
4513
+
4514
+static void tcp_sack_compress_send_ack(struct sock *sk)
4515
+{
4516
+ struct tcp_sock *tp = tcp_sk(sk);
4517
+
4518
+ if (!tp->compressed_ack)
4519
+ return;
4520
+
4521
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
4522
+ __sock_put(sk);
4523
+
4524
+ /* Since we finally have to send one ack,
4525
+ * subtract one from tp->compressed_ack to keep
4526
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
4527
+ */
4528
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4529
+ tp->compressed_ack - 1);
4530
+
4531
+ tp->compressed_ack = 0;
4532
+ tcp_send_ack(sk);
4533
+}
4534
+
4535
+/* Reasonable number of SACK blocks included in TCP SACK option
4536
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
4537
+ * Given that SACK packets might be lost, be conservative and use 2.
4538
+ */
4539
+#define TCP_SACK_BLOCKS_EXPECTED 2
42664540
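The "max is 4, or 3 with timestamps" figure behind TCP_SACK_BLOCKS_EXPECTED follows from the 40-byte TCP option budget: a SACK block costs 8 bytes on top of a 2-byte option header, and the timestamp option typically occupies 12 aligned bytes. A standalone check of that arithmetic, assuming those standard encodings:

/* Illustration only: how many SACK blocks fit in the TCP option space. */
#include <stdio.h>

int main(void)
{
        const int opt_space = 40;       /* maximum TCP option bytes */
        const int sack_hdr = 2;         /* kind + length */
        const int per_block = 8;        /* two 32-bit sequence numbers */
        const int tstamp_aligned = 12;  /* 10-byte timestamp option + 2 NOPs */

        printf("blocks without timestamps: %d\n",
               (opt_space - sack_hdr) / per_block);                    /* 4 */
        printf("blocks with timestamps:    %d\n",
               (opt_space - tstamp_aligned - sack_hdr) / per_block);   /* 3 */
        return 0;
}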
42674541 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
42684542 {
....@@ -4276,6 +4550,8 @@
42764550
42774551 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
42784552 if (tcp_sack_extend(sp, seq, end_seq)) {
4553
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4554
+ tcp_sack_compress_send_ack(sk);
42794555 /* Rotate this_sack to the first one. */
42804556 for (; this_sack > 0; this_sack--, sp--)
42814557 swap(*sp, *(sp - 1));
....@@ -4285,6 +4561,9 @@
42854561 }
42864562 }
42874563
4564
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4565
+ tcp_sack_compress_send_ack(sk);
4566
+
42884567 /* Could not find an adjacent existing SACK, build a new one,
42894568 * put it at the front, and shift everyone else down. We
42904569 * always know there is at least one SACK present already here.
....@@ -4292,8 +4571,6 @@
42924571 * If the sack array is full, forget about the last one.
42934572 */
42944573 if (this_sack >= TCP_NUM_SACKS) {
4295
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
4296
- tcp_send_ack(sk);
42974574 this_sack--;
42984575 tp->rx_opt.num_sacks--;
42994576 sp--;
....@@ -4345,7 +4622,6 @@
43454622 /**
43464623 * tcp_try_coalesce - try to merge skb to prior one
43474624 * @sk: socket
4348
- * @dest: destination queue
43494625 * @to: prior buffer
43504626 * @from: buffer to add in queue
43514627 * @fragstolen: pointer to boolean
....@@ -4367,6 +4643,9 @@
43674643
43684644 /* It's possible this segment overlaps with prior segment in queue */
43694645 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4646
+ return false;
4647
+
4648
+ if (!mptcp_skb_can_collapse(to, from))
43704649 return false;
43714650
43724651 #ifdef CONFIG_TLS_DEVICE
....@@ -4412,6 +4691,7 @@
44124691
44134692 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
44144693 {
4694
+ trace_android_vh_kfree_skb(skb);
44154695 sk_drops_add(sk, skb);
44164696 __kfree_skb(skb);
44174697 }
....@@ -4443,13 +4723,9 @@
44434723 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
44444724
44454725 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4446
- SOCK_DEBUG(sk, "ofo packet was already received\n");
44474726 tcp_drop(sk, skb);
44484727 continue;
44494728 }
4450
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4451
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4452
- TCP_SKB_CB(skb)->end_seq);
44534729
44544730 tail = skb_peek_tail(&sk->sk_receive_queue);
44554731 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
....@@ -4511,11 +4787,10 @@
45114787 tp->pred_flags = 0;
45124788 inet_csk_schedule_ack(sk);
45134789
4790
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
45144791 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
45154792 seq = TCP_SKB_CB(skb)->seq;
45164793 end_seq = TCP_SKB_CB(skb)->end_seq;
4517
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4518
- tp->rcv_nxt, seq, end_seq);
45194794
45204795 p = &tp->out_of_order_queue.rb_node;
45214796 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
....@@ -4541,7 +4816,7 @@
45414816 * and trigger fast retransmit.
45424817 */
45434818 if (tcp_is_sack(tp))
4544
- tcp_grow_window(sk, skb);
4819
+ tcp_grow_window(sk, skb, true);
45454820 kfree_skb_partial(skb, fragstolen);
45464821 skb = NULL;
45474822 goto add_sack;
....@@ -4629,19 +4904,18 @@
46294904 * and trigger fast retransmit.
46304905 */
46314906 if (tcp_is_sack(tp))
4632
- tcp_grow_window(sk, skb);
4907
+ tcp_grow_window(sk, skb, false);
46334908 skb_condense(skb);
46344909 skb_set_owner_r(skb, sk);
46354910 }
46364911 }
46374912
4638
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4639
- bool *fragstolen)
4913
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
4914
+ bool *fragstolen)
46404915 {
46414916 int eaten;
46424917 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
46434918
4644
- __skb_pull(skb, hdrlen);
46454919 eaten = (tail &&
46464920 tcp_try_coalesce(sk, tail,
46474921 skb, fragstolen)) ? 1 : 0;
....@@ -4692,7 +4966,7 @@
46924966 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
46934967 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
46944968
4695
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4969
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
46964970 WARN_ON_ONCE(fragstolen); /* should not happen */
46974971 __kfree_skb(skb);
46984972 }
....@@ -4724,6 +4998,9 @@
47244998 bool fragstolen;
47254999 int eaten;
47265000
5001
+ if (sk_is_mptcp(sk))
5002
+ mptcp_incoming_options(sk, skb);
5003
+
47275004 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
47285005 __kfree_skb(skb);
47295006 return;
....@@ -4753,7 +5030,7 @@
47535030 goto drop;
47545031 }
47555032
4756
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
5033
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
47575034 if (skb->len)
47585035 tcp_event_data_recv(sk, skb);
47595036 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
....@@ -4782,6 +5059,7 @@
47825059 }
47835060
47845061 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5062
+ tcp_rcv_spurious_retrans(sk, skb);
47855063 /* A retransmit, 2nd most common case. Force an immediate ack. */
47865064 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
47875065 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
....@@ -4800,10 +5078,6 @@
48005078
48015079 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
48025080 /* Partial packet, seq < rcv_next < end_seq */
4803
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4804
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4805
- TCP_SKB_CB(skb)->end_seq);
4806
-
48075081 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
48085082
48095083 /* If window is closed, drop tail of packet. But after
....@@ -4897,7 +5171,7 @@
48975171 /* The first skb to collapse is:
48985172 * - not SYN/FIN and
48995173 * - bloated or contains data before "start" or
4900
- * overlaps to the next one.
5174
+ * overlaps to the next one and mptcp allows collapsing.
49015175 */
49025176 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
49035177 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
....@@ -4906,7 +5180,7 @@
49065180 break;
49075181 }
49085182
4909
- if (n && n != tail &&
5183
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
49105184 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
49115185 end_of_skbs = false;
49125186 break;
....@@ -4939,6 +5213,7 @@
49395213 else
49405214 __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
49415215 skb_set_owner_r(nskb, sk);
5216
+ mptcp_skb_ext_move(nskb, skb);
49425217
49435218 /* Copy data, releasing collapsed skbs. */
49445219 while (copy > 0) {
....@@ -4958,6 +5233,7 @@
49585233 skb = tcp_collapse_one(sk, skb, list, root);
49595234 if (!skb ||
49605235 skb == tail ||
5236
+ !mptcp_skb_can_collapse(nskb, skb) ||
49615237 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
49625238 goto end;
49635239 #ifdef CONFIG_TLS_DEVICE
....@@ -5082,8 +5358,6 @@
50825358 {
50835359 struct tcp_sock *tp = tcp_sk(sk);
50845360
5085
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
5086
-
50875361 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
50885362
50895363 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
....@@ -5149,12 +5423,6 @@
51495423 return true;
51505424 }
51515425
5152
-/* When incoming ACK allowed to free some skb from write_queue,
5153
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
5154
- * on the exit from tcp input handler.
5155
- *
5156
- * PROBLEM: sndbuf expansion does not work well with largesend.
5157
- */
51585426 static void tcp_new_space(struct sock *sk)
51595427 {
51605428 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5167,18 +5435,25 @@
51675435 sk->sk_write_space(sk);
51685436 }
51695437
5170
-static void tcp_check_space(struct sock *sk)
5438
+/* Caller made space either from:
5439
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5440
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5441
+ *
5442
+ * We might be able to generate EPOLLOUT to the application if:
5443
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5444
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5445
+ * small enough that tcp_stream_memory_free() decides it
5446
+ * is time to generate EPOLLOUT.
5447
+ */
5448
+void tcp_check_space(struct sock *sk)
51715449 {
5172
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5173
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5174
- /* pairs with tcp_poll() */
5175
- smp_mb();
5176
- if (sk->sk_socket &&
5177
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5178
- tcp_new_space(sk);
5179
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5180
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5181
- }
5450
+ /* pairs with tcp_poll() */
5451
+ smp_mb();
5452
+ if (sk->sk_socket &&
5453
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5454
+ tcp_new_space(sk);
5455
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5456
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
51825457 }
51835458 }
51845459
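The two conditions listed in the comment above can be restated as a single predicate. The sketch below is an illustration with simplified stand-ins for the fields the kernel consults (sk_wmem_queued, sk_sndbuf, tp->write_seq, tp->snd_nxt and the notsent low-water mark); tcp_stream_memory_free() checks more than this:

/* Illustration only: when, per the comment above, raising EPOLLOUT may be
 * worthwhile. Plain integers stand in for the socket fields. */
#include <stdbool.h>
#include <stdio.h>

static bool may_signal_write_space(unsigned int wmem_queued, unsigned int sndbuf,
                                   unsigned int write_seq, unsigned int snd_nxt,
                                   unsigned int notsent_lowat)
{
        unsigned int notsent = write_seq - snd_nxt;     /* queued but not yet sent */

        return wmem_queued < sndbuf / 2 && notsent < notsent_lowat;
}

int main(void)
{
        printf("%d\n", may_signal_write_space(16384, 87380, 1000, 900, 16384));
        return 0;
}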
....@@ -5220,20 +5495,18 @@
52205495 }
52215496
52225497 if (!tcp_is_sack(tp) ||
5223
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5498
+ tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
52245499 goto send_now;
52255500
52265501 if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
52275502 tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5228
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
5229
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
5230
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
5231
- tp->compressed_ack = 0;
5503
+ tp->dup_ack_counter = 0;
52325504 }
5233
-
5234
- if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
5505
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5506
+ tp->dup_ack_counter++;
52355507 goto send_now;
5236
-
5508
+ }
5509
+ tp->compressed_ack++;
52375510 if (hrtimer_is_queued(&tp->compressed_ack_timer))
52385511 return;
52395512
....@@ -5243,11 +5516,13 @@
52435516 if (tp->srtt_us && tp->srtt_us < rtt)
52445517 rtt = tp->srtt_us;
52455518
5246
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5519
+ delay = min_t(unsigned long,
5520
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
52475521 rtt * (NSEC_PER_USEC >> 3)/20);
52485522 sock_hold(sk);
5249
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5250
- HRTIMER_MODE_REL_PINNED_SOFT);
5523
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
5524
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5525
+ HRTIMER_MODE_REL_PINNED_SOFT);
52515526 }
52525527
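In the delay computation above, tp->srtt_us is kept left-shifted by 3, so rtt * (NSEC_PER_USEC >> 3) / 20 works out to roughly 5% of the smoothed RTT in nanoseconds, capped by the tcp_comp_sack_delay_ns sysctl. A quick standalone check of that arithmetic, with made-up sample values:

/* Illustration only: the compressed-ACK timer is the smaller of the sysctl
 * ceiling and about 5% of the smoothed RTT (srtt stored as usec << 3). */
#include <stdio.h>

#define NSEC_PER_USEC 1000UL

static unsigned long comp_sack_delay_ns(unsigned long srtt_shifted,
                                        unsigned long sysctl_delay_ns)
{
        unsigned long delay = srtt_shifted * (NSEC_PER_USEC >> 3) / 20;

        return delay < sysctl_delay_ns ? delay : sysctl_delay_ns;
}

int main(void)
{
        /* 10 ms smoothed RTT (10000 us << 3), 1 ms sysctl ceiling */
        printf("%lu ns\n", comp_sack_delay_ns(10000UL << 3, 1000000UL));        /* 500000 */
        return 0;
}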
52535528 static inline void tcp_ack_snd_check(struct sock *sk)
....@@ -5274,7 +5549,7 @@
52745549 struct tcp_sock *tp = tcp_sk(sk);
52755550 u32 ptr = ntohs(th->urg_ptr);
52765551
5277
- if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5552
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
52785553 ptr--;
52795554 ptr += ntohl(th->seq);
52805555
....@@ -5328,7 +5603,7 @@
53285603 }
53295604
53305605 tp->urg_data = TCP_URG_NOTYET;
5331
- tp->urg_seq = ptr;
5606
+ WRITE_ONCE(tp->urg_seq, ptr);
53325607
53335608 /* Disable header prediction. */
53345609 tp->pred_flags = 0;
....@@ -5481,6 +5756,8 @@
54815756 goto discard;
54825757 }
54835758
5759
+ bpf_skops_parse_hdr(sk, skb);
5760
+
54845761 return true;
54855762
54865763 discard:
....@@ -5521,7 +5798,7 @@
55215798 trace_tcp_probe(sk, skb);
55225799
55235800 tcp_mstamp_refresh(tp);
5524
- if (unlikely(!sk->sk_rx_dst))
5801
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
55255802 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
55265803 /*
55275804 * Header prediction.
....@@ -5628,8 +5905,8 @@
56285905 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
56295906
56305907 /* Bulk data transfer: receiver */
5631
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5632
- &fragstolen);
5908
+ __skb_pull(skb, tcp_header_len);
5909
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
56335910
56345911 tcp_event_data_recv(sk, skb);
56355912
....@@ -5691,6 +5968,34 @@
56915968 }
56925969 EXPORT_SYMBOL(tcp_rcv_established);
56935970
5971
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
5972
+{
5973
+ struct inet_connection_sock *icsk = inet_csk(sk);
5974
+ struct tcp_sock *tp = tcp_sk(sk);
5975
+
5976
+ tcp_mtup_init(sk);
5977
+ icsk->icsk_af_ops->rebuild_header(sk);
5978
+ tcp_init_metrics(sk);
5979
+
5980
+ /* Initialize the congestion window to start the transfer.
5981
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
5982
+ * retransmitted. In light of RFC6298 more aggressive 1sec
5983
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
5984
+ * retransmission has occurred.
5985
+ */
5986
+ if (tp->total_retrans > 1 && tp->undo_marker)
5987
+ tp->snd_cwnd = 1;
5988
+ else
5989
+ tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
5990
+ tp->snd_cwnd_stamp = tcp_jiffies32;
5991
+
5992
+ bpf_skops_established(sk, bpf_op, skb);
5993
+ /* Initialize congestion control unless BPF initialized it already: */
5994
+ if (!icsk->icsk_ca_initialized)
5995
+ tcp_init_congestion_control(sk);
5996
+ tcp_init_buffer_space(sk);
5997
+}
5998
+
56945999 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
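The cwnd choice in tcp_init_transfer() above reduces to a small decision: fall back to one segment only when more than one SYN/SYN-ACK retransmission happened while the undo marker is still set, otherwise keep the normal initial window. A sketch of that decision, with a fixed stand-in for whatever tcp_init_cwnd() would derive from route metrics:

/* Illustration only: the initial-cwnd decision described in the comment in
 * tcp_init_transfer(). INIT_CWND_SEGS stands in for tcp_init_cwnd(). */
#include <stdio.h>

#define INIT_CWND_SEGS 10       /* the usual initial window, in segments */

static unsigned int initial_cwnd(unsigned int total_retrans, int undo_marker)
{
        if (total_retrans > 1 && undo_marker)
                return 1;
        return INIT_CWND_SEGS;
}

int main(void)
{
        printf("clean handshake:     %u\n", initial_cwnd(0, 0));
        printf("2 SYN retransmits:   %u\n", initial_cwnd(2, 1));
        return 0;
}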
56956000 {
56966001 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5705,7 +6010,7 @@
57056010 sk_mark_napi_id(sk, skb);
57066011 }
57076012
5708
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
6013
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
57096014
57106015 /* Prevent spurious tcp_cwnd_restart() on first data
57116016 * packet.
....@@ -5760,6 +6065,10 @@
57606065 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
57616066
57626067 if (data) { /* Retransmit unacked data in SYN */
6068
+ if (tp->total_retrans)
6069
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6070
+ else
6071
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
57636072 skb_rbtree_walk_from(data) {
57646073 if (__tcp_retransmit_skb(sk, data, 1))
57656074 break;
....@@ -5792,6 +6101,21 @@
57926101 #endif
57936102 }
57946103
6104
+static void tcp_try_undo_spurious_syn(struct sock *sk)
6105
+{
6106
+ struct tcp_sock *tp = tcp_sk(sk);
6107
+ u32 syn_stamp;
6108
+
6109
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
6110
+ * spurious if the ACK's timestamp option echo value matches the
6111
+ * original SYN timestamp.
6112
+ */
6113
+ syn_stamp = tp->retrans_stamp;
6114
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6115
+ syn_stamp == tp->rx_opt.rcv_tsecr)
6116
+ tp->undo_marker = 0;
6117
+}
6118
+
57956119 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
57966120 const struct tcphdr *th)
57976121 {
....@@ -5815,8 +6139,14 @@
58156139 * the segment and return)"
58166140 */
58176141 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5818
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
6142
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6143
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
6144
+ if (icsk->icsk_retransmits == 0)
6145
+ inet_csk_reset_xmit_timer(sk,
6146
+ ICSK_TIME_RETRANS,
6147
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
58196148 goto reset_and_undo;
6149
+ }
58206150
58216151 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
58226152 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
....@@ -5859,6 +6189,7 @@
58596189 tcp_ecn_rcv_synack(tp, th);
58606190
58616191 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6192
+ tcp_try_undo_spurious_syn(sk);
58626193 tcp_ack(sk, skb, FLAG_SLOWPATH);
58636194
58646195 /* Ok.. it's good. Set up sequence numbers and
....@@ -5912,7 +6243,7 @@
59126243 return -1;
59136244 if (sk->sk_write_pending ||
59146245 icsk->icsk_accept_queue.rskq_defer_accept ||
5915
- icsk->icsk_ack.pingpong) {
6246
+ inet_csk_in_pingpong_mode(sk)) {
59166247 /* Save one ACK. Data will be ready after
59176248 * several ticks, if write_pending is set.
59186249 *
....@@ -6017,6 +6348,38 @@
60176348 return 1;
60186349 }
60196350
6351
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6352
+{
6353
+ struct request_sock *req;
6354
+
6355
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
6356
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
6357
+ */
6358
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
6359
+ tcp_try_undo_loss(sk, false);
6360
+
6361
+ /* Reset rtx states to prevent spurious retransmits_timed_out() */
6362
+ tcp_sk(sk)->retrans_stamp = 0;
6363
+ inet_csk(sk)->icsk_retransmits = 0;
6364
+
6365
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
6366
+ * we no longer need req so release it.
6367
+ */
6368
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
6369
+ lockdep_sock_is_held(sk));
6370
+ reqsk_fastopen_remove(sk, req, false);
6371
+
6372
+ /* Re-arm the timer because data may have been sent out.
6373
+ * This is similar to the regular data transmission case
6374
+ * when new data has just been ack'ed.
6375
+ *
6376
+ * (TFO) - we could try to be more aggressive and
6377
+ * retransmit any data sooner based on when they
6378
+ * are sent out.
6379
+ */
6380
+ tcp_rearm_rto(sk);
6381
+}
6382
+
60206383 /*
60216384 * This function implements the receiving procedure of RFC 793 for
60226385 * all states except ESTABLISHED and TIME_WAIT.
....@@ -6079,7 +6442,8 @@
60796442
60806443 tcp_mstamp_refresh(tp);
60816444 tp->rx_opt.saw_tstamp = 0;
6082
- req = tp->fastopen_rsk;
6445
+ req = rcu_dereference_protected(tp->fastopen_rsk,
6446
+ lockdep_sock_is_held(sk));
60836447 if (req) {
60846448 bool req_stolen;
60856449
....@@ -6113,23 +6477,13 @@
61136477 if (!tp->srtt_us)
61146478 tcp_synack_rtt_meas(sk, req);
61156479
6116
- /* Once we leave TCP_SYN_RECV, we no longer need req
6117
- * so release it.
6118
- */
61196480 if (req) {
6120
- inet_csk(sk)->icsk_retransmits = 0;
6121
- reqsk_fastopen_remove(sk, req, false);
6122
- /* Re-arm the timer because data may have been sent out.
6123
- * This is similar to the regular data transmission case
6124
- * when new data has just been ack'ed.
6125
- *
6126
- * (TFO) - we could try to be more aggressive and
6127
- * retransmitting any data sooner based on when they
6128
- * are sent out.
6129
- */
6130
- tcp_rearm_rto(sk);
6481
+ tcp_rcv_synrecv_state_fastopen(sk);
61316482 } else {
6132
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
6483
+ tcp_try_undo_spurious_syn(sk);
6484
+ tp->retrans_stamp = 0;
6485
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6486
+ skb);
61336487 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
61346488 }
61356489 smp_mb();
....@@ -6163,21 +6517,14 @@
61636517 case TCP_FIN_WAIT1: {
61646518 int tmo;
61656519
6166
- /* If we enter the TCP_FIN_WAIT1 state and we are a
6167
- * Fast Open socket and this is the first acceptable
6168
- * ACK we have received, this would have acknowledged
6169
- * our SYNACK so stop the SYNACK timer.
6170
- */
6171
- if (req) {
6172
- /* We no longer need the request sock. */
6173
- reqsk_fastopen_remove(sk, req, false);
6174
- tcp_rearm_rto(sk);
6175
- }
6520
+ if (req)
6521
+ tcp_rcv_synrecv_state_fastopen(sk);
6522
+
61766523 if (tp->snd_una != tp->write_seq)
61776524 break;
61786525
61796526 tcp_set_state(sk, TCP_FIN_WAIT2);
6180
- sk->sk_shutdown |= SEND_SHUTDOWN;
6527
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
61816528
61826529 sk_dst_confirm(sk);
61836530
....@@ -6244,9 +6591,12 @@
62446591 case TCP_CLOSE_WAIT:
62456592 case TCP_CLOSING:
62466593 case TCP_LAST_ACK:
6247
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6594
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
6595
+ if (sk_is_mptcp(sk))
6596
+ mptcp_incoming_options(sk, skb);
62486597 break;
6249
- /* fall through */
6598
+ }
6599
+ fallthrough;
62506600 case TCP_FIN_WAIT1:
62516601 case TCP_FIN_WAIT2:
62526602 /* RFC 793 says to queue data in these states,
....@@ -6261,7 +6611,7 @@
62616611 return 1;
62626612 }
62636613 }
6264
- /* Fall through */
6614
+ fallthrough;
62656615 case TCP_ESTABLISHED:
62666616 tcp_data_queue(sk, skb);
62676617 queued = 1;
....@@ -6307,6 +6657,11 @@
63076657 * congestion control: Linux DCTCP asserts ECT on all packets,
63086658 * including SYN, which is the most optimal solution; however,
63096659 * others, such as FreeBSD do not.
6660
+ *
6661
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
6662
+ * set, indicating the use of a future TCP extension (such as AccECN). See
6663
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
6664
+ * extensions.
63106665 */
63116666 static void tcp_ecn_create_request(struct request_sock *req,
63126667 const struct sk_buff *skb,
....@@ -6326,7 +6681,7 @@
63266681 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
63276682 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
63286683
6329
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6684
+ if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
63306685 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
63316686 tcp_bpf_ca_needs_ecn((struct sock *)req))
63326687 inet_rsk(req)->ecn_ok = 1;
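With this change a SYN that does not carry ECT can still negotiate ECN for the request when any reserved header bit (th->res1) is set, the AccECN-style exception described in the comment above, provided ECN is otherwise acceptable. A minimal predicate covering just that part of the test; the congestion-control and BPF overrides in the real condition are left out:

/* Illustration only: the reworked ECN acceptance test for incoming SYNs,
 * without the tcp_ca_needs_ecn()/BPF/DST overrides. */
#include <stdbool.h>
#include <stdio.h>

static bool request_ecn_ok(bool ect, unsigned int res1_bits, bool ecn_ok)
{
        return (!ect || res1_bits) && ecn_ok;
}

int main(void)
{
        printf("ECT set, no reserved bits: %d\n", request_ecn_ok(true, 0, true));  /* 0 */
        printf("ECT set, reserved bit set: %d\n", request_ecn_ok(true, 1, true));  /* 1 */
        printf("no ECT, ECN acceptable:    %d\n", request_ecn_ok(false, 0, true)); /* 1 */
        return 0;
}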
....@@ -6339,10 +6694,9 @@
63396694 struct inet_request_sock *ireq = inet_rsk(req);
63406695
63416696 req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
6342
- req->cookie_ts = 0;
63436697 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
63446698 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6345
- tcp_rsk(req)->snt_synack = tcp_clock_us();
6699
+ tcp_rsk(req)->snt_synack = 0;
63466700 tcp_rsk(req)->last_oow_ack_time = 0;
63476701 req->mss = rx_opt->mss_clamp;
63486702 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
....@@ -6387,17 +6741,18 @@
63876741 /*
63886742 * Return true if a syncookie should be sent
63896743 */
6390
-static bool tcp_syn_flood_action(const struct sock *sk,
6391
- const struct sk_buff *skb,
6392
- const char *proto)
6744
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
63936745 {
63946746 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
63956747 const char *msg = "Dropping request";
6396
- bool want_cookie = false;
63976748 struct net *net = sock_net(sk);
6749
+ bool want_cookie = false;
6750
+ u8 syncookies;
6751
+
6752
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
63986753
63996754 #ifdef CONFIG_SYN_COOKIES
6400
- if (net->ipv4.sysctl_tcp_syncookies) {
6755
+ if (syncookies) {
64016756 msg = "Sending cookies";
64026757 want_cookie = true;
64036758 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
....@@ -6405,11 +6760,10 @@
64056760 #endif
64066761 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
64076762
6408
- if (!queue->synflood_warned &&
6409
- net->ipv4.sysctl_tcp_syncookies != 2 &&
6763
+ if (!queue->synflood_warned && syncookies != 2 &&
64106764 xchg(&queue->synflood_warned, 1) == 0)
64116765 net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6412
- proto, ntohs(tcp_hdr(skb)->dest), msg);
6766
+ proto, sk->sk_num, msg);
64136767
64146768 return want_cookie;
64156769 }
....@@ -6420,16 +6774,60 @@
64206774 {
64216775 if (tcp_sk(sk)->save_syn) {
64226776 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6423
- u32 *copy;
6777
+ struct saved_syn *saved_syn;
6778
+ u32 mac_hdrlen;
6779
+ void *base;
64246780
6425
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
6426
- if (copy) {
6427
- copy[0] = len;
6428
- memcpy(&copy[1], skb_network_header(skb), len);
6429
- req->saved_syn = copy;
6781
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
6782
+ base = skb_mac_header(skb);
6783
+ mac_hdrlen = skb_mac_header_len(skb);
6784
+ len += mac_hdrlen;
6785
+ } else {
6786
+ base = skb_network_header(skb);
6787
+ mac_hdrlen = 0;
6788
+ }
6789
+
6790
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
6791
+ GFP_ATOMIC);
6792
+ if (saved_syn) {
6793
+ saved_syn->mac_hdrlen = mac_hdrlen;
6794
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
6795
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
6796
+ memcpy(saved_syn->data, base, len);
6797
+ req->saved_syn = saved_syn;
64306798 }
64316799 }
64326800 }
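The reworked save_syn path above keeps the header-length metadata and the copied header bytes in one allocation, using a flexible array member sized with struct_size(). A userspace sketch of the same allocation pattern, with made-up header bytes and a plain malloc in place of the kernel helpers:

/* Illustration only: the saved-SYN layout, fixed-size lengths followed by a
 * flexible array holding the copied headers, allocated in a single block. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct saved_syn_demo {
        unsigned int mac_hdrlen;
        unsigned int network_hdrlen;
        unsigned int tcp_hdrlen;
        unsigned char data[];           /* copied headers follow the struct */
};

int main(void)
{
        unsigned char fake_headers[60] = { 0x45, 0x00 };        /* made-up bytes */
        size_t len = sizeof(fake_headers);
        struct saved_syn_demo *s;

        s = malloc(sizeof(*s) + len);   /* what struct_size() computes, minus overflow checks */
        if (!s)
                return 1;
        s->mac_hdrlen = 0;
        s->network_hdrlen = 20;
        s->tcp_hdrlen = 40;
        memcpy(s->data, fake_headers, len);
        printf("saved %zu header bytes (net %u, tcp %u)\n",
               len, s->network_hdrlen, s->tcp_hdrlen);
        free(s);
        return 0;
}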
6801
+
6802
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
6803
+ * used for SYN cookie generation.
6804
+ */
6805
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
6806
+ const struct tcp_request_sock_ops *af_ops,
6807
+ struct sock *sk, struct tcphdr *th)
6808
+{
6809
+ struct tcp_sock *tp = tcp_sk(sk);
6810
+ u16 mss;
6811
+
6812
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
6813
+ !inet_csk_reqsk_queue_is_full(sk))
6814
+ return 0;
6815
+
6816
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
6817
+ return 0;
6818
+
6819
+ if (sk_acceptq_is_full(sk)) {
6820
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6821
+ return 0;
6822
+ }
6823
+
6824
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
6825
+ if (!mss)
6826
+ mss = af_ops->mss_clamp;
6827
+
6828
+ return mss;
6829
+}
6830
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
64336831
64346832 int tcp_conn_request(struct request_sock_ops *rsk_ops,
64356833 const struct tcp_request_sock_ops *af_ops,
....@@ -6445,14 +6843,16 @@
64456843 bool want_cookie = false;
64466844 struct dst_entry *dst;
64476845 struct flowi fl;
6846
+ u8 syncookies;
6847
+
6848
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
64486849
64496850 /* TW buckets are converted to open requests without
64506851 * limitations, they conserve resources and peer is
64516852 * evidently real one.
64526853 */
6453
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
6454
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6455
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6854
+ if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6855
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
64566856 if (!want_cookie)
64576857 goto drop;
64586858 }
....@@ -6466,8 +6866,12 @@
64666866 if (!req)
64676867 goto drop;
64686868
6869
+ req->syncookie = want_cookie;
64696870 tcp_rsk(req)->af_specific = af_ops;
64706871 tcp_rsk(req)->ts_off = 0;
6872
+#if IS_ENABLED(CONFIG_MPTCP)
6873
+ tcp_rsk(req)->is_mptcp = 0;
6874
+#endif
64716875
64726876 tcp_clear_options(&tmp_opt);
64736877 tmp_opt.mss_clamp = af_ops->mss_clamp;
....@@ -6501,10 +6905,12 @@
65016905 goto drop_and_free;
65026906
65036907 if (!want_cookie && !isn) {
6908
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
6909
+
65046910 /* Kill the following clause, if you dislike this way. */
6505
- if (!net->ipv4.sysctl_tcp_syncookies &&
6506
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6507
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6911
+ if (!syncookies &&
6912
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6913
+ (max_syn_backlog >> 2)) &&
65086914 !tcp_peer_is_proven(req, dst)) {
65096915 /* Without syncookies last quarter of
65106916 * backlog is filled with destinations,
....@@ -6525,13 +6931,13 @@
65256931
65266932 if (want_cookie) {
65276933 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6528
- req->cookie_ts = tmp_opt.tstamp_ok;
65296934 if (!tmp_opt.tstamp_ok)
65306935 inet_rsk(req)->ecn_ok = 0;
65316936 }
65326937
65336938 tcp_rsk(req)->snt_isn = isn;
65346939 tcp_rsk(req)->txhash = net_tx_rndhash();
6940
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
65356941 tcp_openreq_init_rwin(req, sk, dst);
65366942 sk_rx_queue_set(req_to_sk(req), skb);
65376943 if (!want_cookie) {
....@@ -6540,14 +6946,13 @@
65406946 }
65416947 if (fastopen_sk) {
65426948 af_ops->send_synack(fastopen_sk, dst, &fl, req,
6543
- &foc, TCP_SYNACK_FASTOPEN);
6949
+ &foc, TCP_SYNACK_FASTOPEN, skb);
65446950 /* Add the child socket directly into the accept queue */
65456951 if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
65466952 reqsk_fastopen_remove(fastopen_sk, req, false);
65476953 bh_unlock_sock(fastopen_sk);
65486954 sock_put(fastopen_sk);
6549
- reqsk_put(req);
6550
- goto drop;
6955
+ goto drop_and_free;
65516956 }
65526957 sk->sk_data_ready(sk);
65536958 bh_unlock_sock(fastopen_sk);
....@@ -6559,7 +6964,8 @@
65596964 tcp_timeout_init((struct sock *)req));
65606965 af_ops->send_synack(sk, dst, &fl, req, &foc,
65616966 !want_cookie ? TCP_SYNACK_NORMAL :
6562
- TCP_SYNACK_COOKIE);
6967
+ TCP_SYNACK_COOKIE,
6968
+ skb);
65636969 if (want_cookie) {
65646970 reqsk_free(req);
65656971 return 0;
....@@ -6571,7 +6977,7 @@
65716977 drop_and_release:
65726978 dst_release(dst);
65736979 drop_and_free:
6574
- reqsk_free(req);
6980
+ __reqsk_free(req);
65756981 drop:
65766982 tcp_listendrop(sk);
65776983 return 0;