2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/net/ipv4/tcp_input.c
....@@ -77,8 +77,10 @@
7777 #include <asm/unaligned.h>
7878 #include <linux/errqueue.h>
7979 #include <trace/events/tcp.h>
80
-#include <linux/static_key.h>
80
+#include <linux/jump_label_ratelimit.h>
8181 #include <net/busy_poll.h>
82
+#include <net/mptcp.h>
83
+#include <trace/hooks/net.h>
8284
8385 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
8486
....@@ -113,22 +115,91 @@
113115 #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
114116
115117 #if IS_ENABLED(CONFIG_TLS_DEVICE)
116
-static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
118
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
117119
118120 void clean_acked_data_enable(struct inet_connection_sock *icsk,
119121 void (*cad)(struct sock *sk, u32 ack_seq))
120122 {
121123 icsk->icsk_clean_acked = cad;
122
- static_branch_inc(&clean_acked_data_enabled);
124
+ static_branch_deferred_inc(&clean_acked_data_enabled);
123125 }
124126 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
125127
126128 void clean_acked_data_disable(struct inet_connection_sock *icsk)
127129 {
128
- static_branch_dec(&clean_acked_data_enabled);
130
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
129131 icsk->icsk_clean_acked = NULL;
130132 }
131133 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134
+
135
+void clean_acked_data_flush(void)
136
+{
137
+ static_key_deferred_flush(&clean_acked_data_enabled);
138
+}
139
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
140
+#endif
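The move to a deferred static key rate-limits the expensive branch re-patching when TLS offload sockets come and go in quick succession: enables take effect immediately, while the real disable is delayed by the HZ timeout given at definition time (and can be forced with the new clean_acked_data_flush()). A minimal standalone sketch of the pattern, with illustrative names not taken from this file:

#include <linux/jump_label_ratelimit.h>

/* false until the first enable; disables are batched for roughly one second (HZ) */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(demo_key, HZ);

static void demo_enable(void)
{
	static_branch_deferred_inc(&demo_key);		/* takes effect right away */
}

static void demo_disable(void)
{
	static_branch_slow_dec_deferred(&demo_key);	/* key may stay true for up to HZ */
}

static bool demo_fast_path(void)
{
	/* note the .key member, as in the tcp_ack() hunk further down */
	return static_branch_unlikely(&demo_key.key);
}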
141
+
142
+#ifdef CONFIG_CGROUP_BPF
143
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
144
+{
145
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
146
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
147
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
148
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
149
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
150
+ struct bpf_sock_ops_kern sock_ops;
151
+
152
+ if (likely(!unknown_opt && !parse_all_opt))
153
+ return;
154
+
155
+ /* The skb will be handled in the
156
+ * bpf_skops_established() or
157
+ * bpf_skops_write_hdr_opt().
158
+ */
159
+ switch (sk->sk_state) {
160
+ case TCP_SYN_RECV:
161
+ case TCP_SYN_SENT:
162
+ case TCP_LISTEN:
163
+ return;
164
+ }
165
+
166
+ sock_owned_by_me(sk);
167
+
168
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
169
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
170
+ sock_ops.is_fullsock = 1;
171
+ sock_ops.sk = sk;
172
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
173
+
174
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
175
+}
176
+
177
+static void bpf_skops_established(struct sock *sk, int bpf_op,
178
+ struct sk_buff *skb)
179
+{
180
+ struct bpf_sock_ops_kern sock_ops;
181
+
182
+ sock_owned_by_me(sk);
183
+
184
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
185
+ sock_ops.op = bpf_op;
186
+ sock_ops.is_fullsock = 1;
187
+ sock_ops.sk = sk;
188
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
189
+ if (skb)
190
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
191
+
192
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
193
+}
194
+#else
195
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
196
+{
197
+}
198
+
199
+static void bpf_skops_established(struct sock *sk, int bpf_op,
200
+ struct sk_buff *skb)
201
+{
202
+}
132203 #endif
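The bpf_skops_parse_hdr() helper above only fires BPF_SOCK_OPS_PARSE_HDR_OPT_CB once a cgroup sockops program has opted in via BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG (or the PARSE_ALL flag). A hedged sketch of what the BPF side could look like; the program name and option kind are illustrative, not part of this patch:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* assumes the flag was set earlier with bpf_sock_ops_cb_flags_set() */
SEC("sockops")
int parse_private_opt(struct bpf_sock_ops *skops)
{
	__u8 opt[8] = { 0x42, };	/* byte 0: hypothetical private option kind */

	if (skops->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
		return 1;

	if (bpf_load_hdr_opt(skops, opt, sizeof(opt), 0) > 0) {
		/* opt[] now holds the kind, length and payload of the option */
	}
	return 1;
}

char _license[] SEC("license") = "GPL";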
133204
134205 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
....@@ -221,7 +292,7 @@
221292 struct inet_connection_sock *icsk = inet_csk(sk);
222293
223294 tcp_incr_quickack(sk, max_quickacks);
224
- icsk->icsk_ack.pingpong = 0;
295
+ inet_csk_exit_pingpong_mode(sk);
225296 icsk->icsk_ack.ato = TCP_ATO_MIN;
226297 }
227298 EXPORT_SYMBOL(tcp_enter_quickack_mode);
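The direct writes to icsk->icsk_ack.pingpong here and in the next hunk are replaced by accessors, so callers stop depending on how the delayed-ACK "pingpong" state is stored. Roughly the shape of the helpers (a sketch only; the in-tree versions may track a counter against a threshold rather than a plain flag):

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong != 0;
}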
....@@ -236,7 +307,7 @@
236307 const struct dst_entry *dst = __sk_dst_get(sk);
237308
238309 return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
239
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
310
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
240311 }
241312
242313 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
....@@ -354,7 +425,8 @@
354425 sndmem *= nr_segs * per_mss;
355426
356427 if (sk->sk_sndbuf < sndmem)
357
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
428
+ WRITE_ONCE(sk->sk_sndbuf,
429
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
358430 }
359431
360432 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
....@@ -383,12 +455,13 @@
383455 */
384456
385457 /* Slow part of check#2. */
386
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
458
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
459
+ unsigned int skbtruesize)
387460 {
388461 struct tcp_sock *tp = tcp_sk(sk);
389462 /* Optimize this! */
390
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
391
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
463
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
464
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
392465
393466 while (tp->rcv_ssthresh <= window) {
394467 if (truesize <= skb->len)
....@@ -400,7 +473,27 @@
400473 return 0;
401474 }
402475
403
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
476
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
477
+ * can play nice with us, as sk_buff and skb->head might be either
478
+ * freed or shared with up to MAX_SKB_FRAGS segments.
479
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
480
+ * and if no payload was pulled in skb->head before reaching us.
481
+ */
482
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
483
+{
484
+ u32 truesize = skb->truesize;
485
+
486
+ if (adjust && !skb_headlen(skb)) {
487
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
488
+ /* paranoid check, some drivers might be buggy */
489
+ if (unlikely((int)truesize < (int)skb->len))
490
+ truesize = skb->truesize;
491
+ }
492
+ return truesize;
493
+}
494
+
495
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
496
+ bool adjust)
404497 {
405498 struct tcp_sock *tp = tcp_sk(sk);
406499 int room;
....@@ -409,15 +502,16 @@
409502
410503 /* Check #1 */
411504 if (room > 0 && !tcp_under_memory_pressure(sk)) {
505
+ unsigned int truesize = truesize_adjust(adjust, skb);
412506 int incr;
413507
414508 /* Check #2. Increase window, if skb with such overhead
415509 * will fit to rcvbuf in future.
416510 */
417
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
511
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
418512 incr = 2 * tp->advmss;
419513 else
420
- incr = __tcp_grow_window(sk, skb);
514
+ incr = __tcp_grow_window(sk, skb, truesize);
421515
422516 if (incr) {
423517 incr = max_t(int, incr, 2 * skb->len);
....@@ -430,9 +524,9 @@
430524 /* 3. Try to fixup all. It is made immediately after connection enters
431525 * established state.
432526 */
433
-void tcp_init_buffer_space(struct sock *sk)
527
+static void tcp_init_buffer_space(struct sock *sk)
434528 {
435
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
529
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
436530 struct tcp_sock *tp = tcp_sk(sk);
437531 int maxwin;
438532
....@@ -472,15 +566,17 @@
472566 struct tcp_sock *tp = tcp_sk(sk);
473567 struct inet_connection_sock *icsk = inet_csk(sk);
474568 struct net *net = sock_net(sk);
569
+ int rmem2;
475570
476571 icsk->icsk_ack.quick = 0;
572
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
477573
478
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
574
+ if (sk->sk_rcvbuf < rmem2 &&
479575 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
480576 !tcp_under_memory_pressure(sk) &&
481577 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
482
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
483
- net->ipv4.sysctl_tcp_rmem[2]);
578
+ WRITE_ONCE(sk->sk_rcvbuf,
579
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
484580 }
485581 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
486582 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
....@@ -510,7 +606,7 @@
510606 *
511607 * The algorithm for RTT estimation w/o timestamps is based on
512608 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
513
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
609
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
514610 *
515611 * More detail on this code can be found at
516612 * <http://staff.psc.edu/jheffner/>,
....@@ -621,7 +717,7 @@
621717 * <prev RTT . ><current RTT .. ><next RTT .... >
622718 */
623719
624
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
720
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
625721 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
626722 int rcvmem, rcvbuf;
627723 u64 rcvwin, grow;
....@@ -642,9 +738,9 @@
642738
643739 do_div(rcvwin, tp->advmss);
644740 rcvbuf = min_t(u64, rcvwin * rcvmem,
645
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
741
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
646742 if (rcvbuf > sk->sk_rcvbuf) {
647
- sk->sk_rcvbuf = rcvbuf;
743
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
648744
649745 /* Make the window clamp follow along. */
650746 tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
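A large share of the hunks in this patch apply the same idiom: sysctls and socket fields that may be read without the socket lock, or rewritten concurrently through /proc, are wrapped in READ_ONCE()/WRITE_ONCE() so the compiler cannot tear or re-load the access and KCSAN treats the race as annotated. Condensed sketch of the pattern (hypothetical helper name):

static void demo_tune_rcvbuf(struct sock *sk, int rcvbuf)
{
	/* sample the sysctl once; a /proc writer may change it concurrently */
	int rmem_max = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

	/* publish with a matching one-shot store; lockless readers exist */
	if (rcvbuf > sk->sk_rcvbuf)
		WRITE_ONCE(sk->sk_rcvbuf, min(rcvbuf, rmem_max));
}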
....@@ -710,7 +806,7 @@
710806 tcp_ecn_check_ce(sk, skb);
711807
712808 if (skb->len >= 128)
713
- tcp_grow_window(sk, skb);
809
+ tcp_grow_window(sk, skb, true);
714810 }
715811
716812 /* Called to compute a smoothed rtt estimate. The data fed to this
....@@ -774,6 +870,8 @@
774870 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
775871 tp->rtt_seq = tp->snd_nxt;
776872 tp->mdev_max_us = tcp_rto_min_us(sk);
873
+
874
+ tcp_bpf_rtt(sk);
777875 }
778876 } else {
779877 /* no previous measure. */
....@@ -782,6 +880,8 @@
782880 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
783881 tp->mdev_max_us = tp->rttvar_us;
784882 tp->rtt_seq = tp->snd_nxt;
883
+
884
+ tcp_bpf_rtt(sk);
785885 }
786886 tp->srtt_us = max(1U, srtt);
787887 }
....@@ -859,12 +959,54 @@
859959 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
860960 }
861961
862
-/* Take a notice that peer is sending D-SACKs */
863
-static void tcp_dsack_seen(struct tcp_sock *tp)
962
+struct tcp_sacktag_state {
963
+ /* Timestamps for earliest and latest never-retransmitted segment
964
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
965
+ * but congestion control should still get an accurate delay signal.
966
+ */
967
+ u64 first_sackt;
968
+ u64 last_sackt;
969
+ u32 reord;
970
+ u32 sack_delivered;
971
+ int flag;
972
+ unsigned int mss_now;
973
+ struct rate_sample *rate;
974
+};
975
+
976
+/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
977
+ * and spurious retransmission information if this DSACK is unlikely caused by
978
+ * sender's action:
979
+ * - DSACKed sequence range is larger than maximum receiver's window.
980
+ * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
981
+ */
982
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
983
+ u32 end_seq, struct tcp_sacktag_state *state)
864984 {
985
+ u32 seq_len, dup_segs = 1;
986
+
987
+ if (!before(start_seq, end_seq))
988
+ return 0;
989
+
990
+ seq_len = end_seq - start_seq;
991
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
992
+ if (seq_len > tp->max_window)
993
+ return 0;
994
+ if (seq_len > tp->mss_cache)
995
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
996
+
997
+ tp->dsack_dups += dup_segs;
998
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
999
+ if (tp->dsack_dups > tp->total_retrans)
1000
+ return 0;
1001
+
8651002 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
8661003 tp->rack.dsack_seen = 1;
867
- tp->dsack_dups++;
1004
+
1005
+ state->flag |= FLAG_DSACKING_ACK;
1006
+ /* A spurious retransmission is delivered */
1007
+ state->sack_delivered += dup_segs;
1008
+
1009
+ return dup_segs;
8681010 }
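The duplicate-segment estimate above is a ceiling division of the DSACKed byte range by the cached MSS: a 3000-byte DSACK block with mss_cache = 1448 counts as DIV_ROUND_UP(3000, 1448) = 3 duplicate segments, and the block is discarded as dubious if it is wider than the peer's maximum window or if the running dsack_dups total would exceed total_retrans. Stripped-down restatement with a hypothetical helper name:

static u32 dsack_dup_segs(u32 start_seq, u32 end_seq, u32 mss_cache, u32 max_window)
{
	u32 seq_len = end_seq - start_seq;

	if (!before(start_seq, end_seq) || seq_len > max_window)
		return 0;	/* empty/inverted range or wider than rwnd: dubious */
	return seq_len > mss_cache ? DIV_ROUND_UP(seq_len, mss_cache) : 1;
}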
8691011
8701012 /* It's reordering when higher sequence was delivered (i.e. sacked) before
....@@ -893,7 +1035,7 @@
8931035 tp->undo_marker ? tp->undo_retrans : 0);
8941036 #endif
8951037 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
896
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1038
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
8971039 }
8981040
8991041 /* This exciting event is worth to be remembered. 8) */
....@@ -902,7 +1044,11 @@
9021044 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
9031045 }
9041046
905
-/* This must be called before lost_out is incremented */
1047
+ /* This must be called before lost_out or retrans_out are updated
1048
+ * on a new loss, because we want to know if all skbs previously
1049
+ * known to be lost have already been retransmitted, indicating
1050
+ * that this newly lost skb is our next skb to retransmit.
1051
+ */
9061052 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
9071053 {
9081054 if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
....@@ -912,42 +1058,46 @@
9121058 tp->retransmit_skb_hint = skb;
9131059 }
9141060
915
-/* Sum the number of packets on the wire we have marked as lost.
916
- * There are two cases we care about here:
917
- * a) Packet hasn't been marked lost (nor retransmitted),
918
- * and this is the first loss.
919
- * b) Packet has been marked both lost and retransmitted,
920
- * and this means we think it was lost again.
1061
+/* Sum the number of packets on the wire we have marked as lost, and
1062
+ * notify the congestion control module that the given skb was marked lost.
9211063 */
922
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
1064
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
1065
+{
1066
+ tp->lost += tcp_skb_pcount(skb);
1067
+}
1068
+
1069
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
9231070 {
9241071 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1072
+ struct tcp_sock *tp = tcp_sk(sk);
9251073
926
- if (!(sacked & TCPCB_LOST) ||
927
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
928
- tp->lost += tcp_skb_pcount(skb);
929
-}
1074
+ if (sacked & TCPCB_SACKED_ACKED)
1075
+ return;
9301076
931
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
932
-{
933
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
934
- tcp_verify_retransmit_hint(tp, skb);
935
-
936
- tp->lost_out += tcp_skb_pcount(skb);
937
- tcp_sum_lost(tp, skb);
938
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
939
- }
940
-}
941
-
942
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
943
-{
9441077 tcp_verify_retransmit_hint(tp, skb);
945
-
946
- tcp_sum_lost(tp, skb);
947
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1078
+ if (sacked & TCPCB_LOST) {
1079
+ if (sacked & TCPCB_SACKED_RETRANS) {
1080
+ /* Account for retransmits that are lost again */
1081
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082
+ tp->retrans_out -= tcp_skb_pcount(skb);
1083
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1084
+ tcp_skb_pcount(skb));
1085
+ tcp_notify_skb_loss_event(tp, skb);
1086
+ }
1087
+ } else {
9481088 tp->lost_out += tcp_skb_pcount(skb);
9491089 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1090
+ tcp_notify_skb_loss_event(tp, skb);
9501091 }
1092
+}
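Condensing the branches of tcp_mark_skb_lost() above, by the skb's sacked bits (the retransmit hint is refreshed first in every case that is not already SACKed):

/*  SACKED_ACKED set       -> return, the data was delivered, nothing to mark
 *  LOST | SACKED_RETRANS  -> retransmit lost again: clear SACKED_RETRANS,
 *                            retrans_out -= pcount, bump TCPLOSTRETRANSMIT,
 *                            notify congestion control
 *  LOST only              -> already accounted, no change
 *  neither                -> first loss: lost_out += pcount, set TCPCB_LOST,
 *                            notify congestion control
 */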
1093
+
1094
+/* Updates the delivered and delivered_ce counts */
1095
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1096
+ bool ece_ack)
1097
+{
1098
+ tp->delivered += delivered;
1099
+ if (ece_ack)
1100
+ tp->delivered_ce += delivered;
9511101 }
9521102
9531103 /* This procedure tags the retransmission queue when SACKs arrive.
....@@ -1082,51 +1232,42 @@
10821232
10831233 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
10841234 struct tcp_sack_block_wire *sp, int num_sacks,
1085
- u32 prior_snd_una)
1235
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
10861236 {
10871237 struct tcp_sock *tp = tcp_sk(sk);
10881238 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
10891239 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1090
- bool dup_sack = false;
1240
+ u32 dup_segs;
10911241
10921242 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1093
- dup_sack = true;
1094
- tcp_dsack_seen(tp);
10951243 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
10961244 } else if (num_sacks > 1) {
10971245 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
10981246 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
10991247
1100
- if (!after(end_seq_0, end_seq_1) &&
1101
- !before(start_seq_0, start_seq_1)) {
1102
- dup_sack = true;
1103
- tcp_dsack_seen(tp);
1104
- NET_INC_STATS(sock_net(sk),
1105
- LINUX_MIB_TCPDSACKOFORECV);
1106
- }
1248
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
1249
+ return false;
1250
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1251
+ } else {
1252
+ return false;
11071253 }
11081254
1255
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1256
+ if (!dup_segs) { /* Skip dubious DSACK */
1257
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1258
+ return false;
1259
+ }
1260
+
1261
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1262
+
11091263 /* D-SACK for already forgotten data... Do dumb counting. */
1110
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1264
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
11111265 !after(end_seq_0, prior_snd_una) &&
11121266 after(end_seq_0, tp->undo_marker))
1113
- tp->undo_retrans--;
1267
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
11141268
1115
- return dup_sack;
1269
+ return true;
11161270 }
1117
-
1118
-struct tcp_sacktag_state {
1119
- u32 reord;
1120
- /* Timestamps for earliest and latest never-retransmitted segment
1121
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
1122
- * but congestion control should still get an accurate delay signal.
1123
- */
1124
- u64 first_sackt;
1125
- u64 last_sackt;
1126
- struct rate_sample *rate;
1127
- int flag;
1128
- unsigned int mss_now;
1129
-};
11301271
11311272 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
11321273 * the incoming SACK may not exactly match but we can find smaller MSS
....@@ -1246,7 +1387,8 @@
12461387 sacked |= TCPCB_SACKED_ACKED;
12471388 state->flag |= FLAG_DATA_SACKED;
12481389 tp->sacked_out += pcount;
1249
- tp->delivered += pcount; /* Out-of-order packets delivered */
1390
+ /* Out-of-order packets delivered */
1391
+ state->sack_delivered += pcount;
12501392
12511393 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
12521394 if (tp->lost_skb_hint &&
....@@ -1289,7 +1431,7 @@
12891431 */
12901432 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
12911433 start_seq, end_seq, dup_sack, pcount,
1292
- skb->skb_mstamp);
1434
+ tcp_skb_timestamp_us(skb));
12931435 tcp_rate_skb_delivered(sk, skb, state->rate);
12941436
12951437 if (skb == tp->lost_skb_hint)
....@@ -1413,7 +1555,7 @@
14131555 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
14141556 goto fallback;
14151557
1416
- if (!tcp_skb_can_collapse_to(prev))
1558
+ if (!tcp_skb_can_collapse(prev, skb))
14171559 goto fallback;
14181560
14191561 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
....@@ -1502,6 +1644,8 @@
15021644 (mss != tcp_skb_seglen(skb)))
15031645 goto out;
15041646
1647
+ if (!tcp_skb_can_collapse(prev, skb))
1648
+ goto out;
15051649 len = skb->len;
15061650 pcount = tcp_skb_pcount(skb);
15071651 if (tcp_skb_shift(prev, skb, pcount, len))
....@@ -1578,7 +1722,7 @@
15781722 TCP_SKB_CB(skb)->end_seq,
15791723 dup_sack,
15801724 tcp_skb_pcount(skb),
1581
- skb->skb_mstamp);
1725
+ tcp_skb_timestamp_us(skb));
15821726 tcp_rate_skb_delivered(sk, skb, state->rate);
15831727 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
15841728 list_del_init(&skb->tcp_tsorted_anchor);
....@@ -1591,9 +1735,7 @@
15911735 return skb;
15921736 }
15931737
1594
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1595
- struct tcp_sacktag_state *state,
1596
- u32 seq)
1738
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
15971739 {
15981740 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
15991741 struct sk_buff *skb;
....@@ -1615,13 +1757,12 @@
16151757 }
16161758
16171759 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1618
- struct tcp_sacktag_state *state,
16191760 u32 skip_to_seq)
16201761 {
16211762 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
16221763 return skb;
16231764
1624
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1765
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
16251766 }
16261767
16271768 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
....@@ -1634,7 +1775,7 @@
16341775 return skb;
16351776
16361777 if (before(next_dup->start_seq, skip_to_seq)) {
1637
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1778
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
16381779 skb = tcp_sacktag_walk(skb, sk, NULL, state,
16391780 next_dup->start_seq, next_dup->end_seq,
16401781 1);
....@@ -1672,11 +1813,7 @@
16721813 tcp_highest_sack_reset(sk);
16731814
16741815 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1675
- num_sacks, prior_snd_una);
1676
- if (found_dup_sack) {
1677
- state->flag |= FLAG_DSACKING_ACK;
1678
- tp->delivered++; /* A spurious retransmission is delivered */
1679
- }
1816
+ num_sacks, prior_snd_una, state);
16801817
16811818 /* Eliminate too old ACKs, but take into
16821819 * account more or less fresh ones, they can
....@@ -1778,8 +1915,7 @@
17781915
17791916 /* Head todo? */
17801917 if (before(start_seq, cache->start_seq)) {
1781
- skb = tcp_sacktag_skip(skb, sk, state,
1782
- start_seq);
1918
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
17831919 skb = tcp_sacktag_walk(skb, sk, next_dup,
17841920 state,
17851921 start_seq,
....@@ -1805,7 +1941,7 @@
18051941 goto walk;
18061942 }
18071943
1808
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1944
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
18091945 /* Check overlap against next cached too (past this one already) */
18101946 cache++;
18111947 continue;
....@@ -1816,7 +1952,7 @@
18161952 if (!skb)
18171953 break;
18181954 }
1819
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1955
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
18201956
18211957 walk:
18221958 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
....@@ -1878,34 +2014,39 @@
18782014 return;
18792015
18802016 tp->reordering = min_t(u32, tp->packets_out + addend,
1881
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
2017
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
18822018 tp->reord_seen++;
18832019 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
18842020 }
18852021
18862022 /* Emulate SACKs for SACKless connection: account for a new dupack. */
18872023
1888
-static void tcp_add_reno_sack(struct sock *sk)
2024
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
18892025 {
1890
- struct tcp_sock *tp = tcp_sk(sk);
1891
- u32 prior_sacked = tp->sacked_out;
2026
+ if (num_dupack) {
2027
+ struct tcp_sock *tp = tcp_sk(sk);
2028
+ u32 prior_sacked = tp->sacked_out;
2029
+ s32 delivered;
18922030
1893
- tp->sacked_out++;
1894
- tcp_check_reno_reordering(sk, 0);
1895
- if (tp->sacked_out > prior_sacked)
1896
- tp->delivered++; /* Some out-of-order packet is delivered */
1897
- tcp_verify_left_out(tp);
2031
+ tp->sacked_out += num_dupack;
2032
+ tcp_check_reno_reordering(sk, 0);
2033
+ delivered = tp->sacked_out - prior_sacked;
2034
+ if (delivered > 0)
2035
+ tcp_count_delivered(tp, delivered, ece_ack);
2036
+ tcp_verify_left_out(tp);
2037
+ }
18982038 }
18992039
19002040 /* Account for ACK, ACKing some data in Reno Recovery phase. */
19012041
1902
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
2042
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
19032043 {
19042044 struct tcp_sock *tp = tcp_sk(sk);
19052045
19062046 if (acked > 0) {
19072047 /* One ACK acked hole. The rest eat duplicate ACKs. */
1908
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
2048
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
2049
+ ece_ack);
19092050 if (acked - 1 >= tp->sacked_out)
19102051 tp->sacked_out = 0;
19112052 else
....@@ -1938,7 +2079,8 @@
19382079
19392080 static bool tcp_is_rack(const struct sock *sk)
19402081 {
1941
- return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
2082
+ return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2083
+ TCP_RACK_LOSS_DETECTION;
19422084 }
19432085
19442086 /* If we detect SACK reneging, forget all SACK information
....@@ -1982,6 +2124,7 @@
19822124 struct tcp_sock *tp = tcp_sk(sk);
19832125 struct net *net = sock_net(sk);
19842126 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2127
+ u8 reordering;
19852128
19862129 tcp_timeout_mark_lost(sk);
19872130
....@@ -2002,10 +2145,12 @@
20022145 /* Timeout in disordered state after receiving substantial DUPACKs
20032146 * suggests that the degree of reordering is over-estimated.
20042147 */
2148
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
20052149 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2006
- tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
2150
+ tp->sacked_out >= reordering)
20072151 tp->reordering = min_t(unsigned int, tp->reordering,
2008
- net->ipv4.sysctl_tcp_reordering);
2152
+ reordering);
2153
+
20092154 tcp_set_ca_state(sk, TCP_CA_Loss);
20102155 tp->high_seq = tp->snd_nxt;
20112156 tcp_ecn_queue_cwr(tp);
....@@ -2014,7 +2159,7 @@
20142159 * loss recovery is underway except recurring timeout(s) on
20152160 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
20162161 */
2017
- tp->frto = net->ipv4.sysctl_tcp_frto &&
2162
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
20182163 (new_recovery || icsk->icsk_retransmits) &&
20192164 !inet_csk(sk)->icsk_mtup.probe_size;
20202165 }
....@@ -2031,7 +2176,8 @@
20312176 */
20322177 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
20332178 {
2034
- if (flag & FLAG_SACK_RENEGING) {
2179
+ if (flag & FLAG_SACK_RENEGING &&
2180
+ flag & FLAG_SND_UNA_ADVANCED) {
20352181 struct tcp_sock *tp = tcp_sk(sk);
20362182 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
20372183 msecs_to_jiffies(10));
....@@ -2172,8 +2318,7 @@
21722318 }
21732319
21742320 /* Detect loss in event "A" above by marking head of queue up as lost.
2175
- * For non-SACK(Reno) senders, the first "packets" number of segments
2176
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2321
+ * For RFC3517 SACK, a segment is considered lost if it
21772322 * has at least tp->reordering SACKed segments above it; "packets" refers to
21782323 * the maximum SACKed segments to pass before reaching this limit.
21792324 */
....@@ -2181,10 +2326,9 @@
21812326 {
21822327 struct tcp_sock *tp = tcp_sk(sk);
21832328 struct sk_buff *skb;
2184
- int cnt, oldcnt, lost;
2185
- unsigned int mss;
2329
+ int cnt;
21862330 /* Use SACK to deduce losses of new sequences sent during recovery */
2187
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2331
+ const u32 loss_high = tp->snd_nxt;
21882332
21892333 WARN_ON(packets > tp->packets_out);
21902334 skb = tp->lost_skb_hint;
....@@ -2207,28 +2351,14 @@
22072351 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
22082352 break;
22092353
2210
- oldcnt = cnt;
2211
- if (tcp_is_reno(tp) ||
2212
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2354
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
22132355 cnt += tcp_skb_pcount(skb);
22142356
2215
- if (cnt > packets) {
2216
- if (tcp_is_sack(tp) ||
2217
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2218
- (oldcnt >= packets))
2219
- break;
2357
+ if (cnt > packets)
2358
+ break;
22202359
2221
- mss = tcp_skb_mss(skb);
2222
- /* If needed, chop off the prefix to mark as lost. */
2223
- lost = (packets - oldcnt) * mss;
2224
- if (lost < skb->len &&
2225
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2226
- lost, mss, GFP_ATOMIC) < 0)
2227
- break;
2228
- cnt = packets;
2229
- }
2230
-
2231
- tcp_skb_mark_lost(tp, skb);
2360
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2361
+ tcp_mark_skb_lost(sk, skb);
22322362
22332363 if (mark_head)
22342364 break;
....@@ -2272,7 +2402,7 @@
22722402 */
22732403 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
22742404 {
2275
- return !tp->retrans_stamp ||
2405
+ return tp->retrans_stamp &&
22762406 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
22772407 }
22782408
....@@ -2368,6 +2498,21 @@
23682498 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
23692499 }
23702500
2501
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2502
+{
2503
+ struct tcp_sock *tp = tcp_sk(sk);
2504
+
2505
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2506
+ /* Hold old state until something *above* high_seq
2507
+ * is ACKed. For Reno it is MUST to prevent false
2508
+ * fast retransmits (RFC2582). SACK TCP is safe. */
2509
+ if (!tcp_any_retrans_done(sk))
2510
+ tp->retrans_stamp = 0;
2511
+ return true;
2512
+ }
2513
+ return false;
2514
+}
2515
+
23712516 /* People celebrate: "We love our President!" */
23722517 static bool tcp_try_undo_recovery(struct sock *sk)
23732518 {
....@@ -2390,14 +2535,8 @@
23902535 } else if (tp->rack.reo_wnd_persist) {
23912536 tp->rack.reo_wnd_persist--;
23922537 }
2393
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2394
- /* Hold old state until something *above* high_seq
2395
- * is ACKed. For Reno it is MUST to prevent false
2396
- * fast retransmits (RFC2582). SACK TCP is safe. */
2397
- if (!tcp_any_retrans_done(sk))
2398
- tp->retrans_stamp = 0;
2538
+ if (tcp_is_non_sack_preventing_reopen(sk))
23992539 return true;
2400
- }
24012540 tcp_set_ca_state(sk, TCP_CA_Open);
24022541 tp->is_sack_reneg = 0;
24032542 return false;
....@@ -2433,6 +2572,8 @@
24332572 NET_INC_STATS(sock_net(sk),
24342573 LINUX_MIB_TCPSPURIOUSRTOS);
24352574 inet_csk(sk)->icsk_retransmits = 0;
2575
+ if (tcp_is_non_sack_preventing_reopen(sk))
2576
+ return true;
24362577 if (frto_undo || tcp_is_sack(tp)) {
24372578 tcp_set_ca_state(sk, TCP_CA_Open);
24382579 tp->is_sack_reneg = 0;
....@@ -2479,8 +2620,8 @@
24792620 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
24802621 tp->prior_cwnd - 1;
24812622 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2482
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2483
- !(flag & FLAG_LOST_RETRANS)) {
2623
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
2624
+ FLAG_RETRANS_DATA_ACKED) {
24842625 sndcnt = min_t(int, delta,
24852626 max_t(int, tp->prr_delivered - tp->prr_out,
24862627 newly_acked_sacked) + 1);
....@@ -2566,12 +2707,15 @@
25662707 {
25672708 struct tcp_sock *tp = tcp_sk(sk);
25682709 struct inet_connection_sock *icsk = inet_csk(sk);
2710
+ u64 val;
25692711
2570
- /* FIXME: breaks with very large cwnd */
25712712 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2572
- tp->snd_cwnd = tp->snd_cwnd *
2573
- tcp_mss_to_mtu(sk, tp->mss_cache) /
2574
- icsk->icsk_mtup.probe_size;
2713
+
2714
+ val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
2715
+ do_div(val, icsk->icsk_mtup.probe_size);
2716
+ WARN_ON_ONCE((u32)val != val);
2717
+ tp->snd_cwnd = max_t(u32, 1U, val);
2718
+
25752719 tp->snd_cwnd_cnt = 0;
25762720 tp->snd_cwnd_stamp = tcp_jiffies32;
25772721 tp->snd_ssthresh = tcp_current_ssthresh(sk);
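The u64 intermediate exists purely to avoid 32-bit overflow of snd_cwnd * mtu before the division; for example (illustrative numbers):

/* snd_cwnd = 4,000,000, old mtu = 1500, probe_size = 1600:
 *   32-bit: 4000000 * 1500 = 6000000000 wraps to 1705032704 -> cwnd 1065645 (bogus)
 *   64-bit: 6000000000 / 1600 = 3750000  -> cwnd 3750000, i.e. cwnd scaled by 1500/1600
 * the max_t() additionally guarantees the new cwnd never collapses to 0.
 */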
....@@ -2594,14 +2738,8 @@
25942738 unsigned int mss = tcp_current_mss(sk);
25952739
25962740 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2597
- if (tcp_skb_seglen(skb) > mss &&
2598
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2599
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2600
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2601
- tp->retrans_out -= tcp_skb_pcount(skb);
2602
- }
2603
- tcp_skb_mark_lost_uncond_verify(tp, skb);
2604
- }
2741
+ if (tcp_skb_seglen(skb) > mss)
2742
+ tcp_mark_skb_lost(sk, skb);
26052743 }
26062744
26072745 tcp_clear_retrans_hints_partial(tp);
....@@ -2656,13 +2794,13 @@
26562794 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
26572795 * recovered or spurious. Otherwise retransmits more on partial ACKs.
26582796 */
2659
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2797
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
26602798 int *rexmit)
26612799 {
26622800 struct tcp_sock *tp = tcp_sk(sk);
26632801 bool recovered = !before(tp->snd_una, tp->high_seq);
26642802
2665
- if ((flag & FLAG_SND_UNA_ADVANCED) &&
2803
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
26662804 tcp_try_undo_loss(sk, false))
26672805 return;
26682806
....@@ -2675,7 +2813,7 @@
26752813 return;
26762814
26772815 if (after(tp->snd_nxt, tp->high_seq)) {
2678
- if (flag & FLAG_DATA_SACKED || is_dupack)
2816
+ if (flag & FLAG_DATA_SACKED || num_dupack)
26792817 tp->frto = 0; /* Step 3.a. loss was real */
26802818 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
26812819 tp->high_seq = tp->snd_nxt;
....@@ -2701,16 +2839,25 @@
27012839 /* A Reno DUPACK means new data in F-RTO step 2.b above are
27022840 * delivered. Lower inflight to clock out (re)transmissions.
27032841 */
2704
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2705
- tcp_add_reno_sack(sk);
2842
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2843
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
27062844 else if (flag & FLAG_SND_UNA_ADVANCED)
27072845 tcp_reset_reno_sack(tp);
27082846 }
27092847 *rexmit = REXMIT_LOST;
27102848 }
27112849
2850
+static bool tcp_force_fast_retransmit(struct sock *sk)
2851
+{
2852
+ struct tcp_sock *tp = tcp_sk(sk);
2853
+
2854
+ return after(tcp_highest_sack_seq(tp),
2855
+ tp->snd_una + tp->reordering * tp->mss_cache);
2856
+}
2857
+
27122858 /* Undo during fast recovery after partial ACK. */
2713
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2859
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2860
+ bool *do_lost)
27142861 {
27152862 struct tcp_sock *tp = tcp_sk(sk);
27162863
....@@ -2735,7 +2882,9 @@
27352882 tcp_undo_cwnd_reduction(sk, true);
27362883 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
27372884 tcp_try_keep_open(sk);
2738
- return true;
2885
+ } else {
2886
+ /* Partial ACK arrived. Force fast retransmit. */
2887
+ *do_lost = tcp_force_fast_retransmit(sk);
27392888 }
27402889 return false;
27412890 }
....@@ -2759,14 +2908,6 @@
27592908 }
27602909 }
27612910
2762
-static bool tcp_force_fast_retransmit(struct sock *sk)
2763
-{
2764
- struct tcp_sock *tp = tcp_sk(sk);
2765
-
2766
- return after(tcp_highest_sack_seq(tp),
2767
- tp->snd_una + tp->reordering * tp->mss_cache);
2768
-}
2769
-
27702911 /* Process an event, which can update packets-in-flight not trivially.
27712912 * Main goal of this function is to calculate new estimate for left_out,
27722913 * taking into account both packets sitting in receiver's buffer and
....@@ -2780,20 +2921,21 @@
27802921 * tcp_xmit_retransmit_queue().
27812922 */
27822923 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2783
- bool is_dupack, int *ack_flag, int *rexmit)
2924
+ int num_dupack, int *ack_flag, int *rexmit)
27842925 {
27852926 struct inet_connection_sock *icsk = inet_csk(sk);
27862927 struct tcp_sock *tp = tcp_sk(sk);
27872928 int fast_rexmit = 0, flag = *ack_flag;
2788
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2789
- tcp_force_fast_retransmit(sk));
2929
+ bool ece_ack = flag & FLAG_ECE;
2930
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
2931
+ tcp_force_fast_retransmit(sk));
27902932
27912933 if (!tp->packets_out && tp->sacked_out)
27922934 tp->sacked_out = 0;
27932935
27942936 /* Now state machine starts.
27952937 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2796
- if (flag & FLAG_ECE)
2938
+ if (ece_ack)
27972939 tp->prior_ssthresh = 0;
27982940
27992941 /* B. In all the states check for reneging SACKs. */
....@@ -2833,35 +2975,37 @@
28332975 switch (icsk->icsk_ca_state) {
28342976 case TCP_CA_Recovery:
28352977 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2836
- if (tcp_is_reno(tp) && is_dupack)
2837
- tcp_add_reno_sack(sk);
2838
- } else {
2839
- if (tcp_try_undo_partial(sk, prior_snd_una))
2840
- return;
2841
- /* Partial ACK arrived. Force fast retransmit. */
2842
- do_lost = tcp_is_reno(tp) ||
2843
- tcp_force_fast_retransmit(sk);
2844
- }
2845
- if (tcp_try_undo_dsack(sk)) {
2846
- tcp_try_keep_open(sk);
2978
+ if (tcp_is_reno(tp))
2979
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
2980
+ } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
28472981 return;
2848
- }
2982
+
2983
+ if (tcp_try_undo_dsack(sk))
2984
+ tcp_try_keep_open(sk);
2985
+
28492986 tcp_identify_packet_loss(sk, ack_flag);
2987
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
2988
+ if (!tcp_time_to_recover(sk, flag))
2989
+ return;
2990
+ /* Undo reverts the recovery state. If loss is evident,
2991
+ * starts a new recovery (e.g. reordering then loss);
2992
+ */
2993
+ tcp_enter_recovery(sk, ece_ack);
2994
+ }
28502995 break;
28512996 case TCP_CA_Loss:
2852
- tcp_process_loss(sk, flag, is_dupack, rexmit);
2997
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
28532998 tcp_identify_packet_loss(sk, ack_flag);
28542999 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
28553000 (*ack_flag & FLAG_LOST_RETRANS)))
28563001 return;
28573002 /* Change state if cwnd is undone or retransmits are lost */
2858
- /* fall through */
3003
+ fallthrough;
28593004 default:
28603005 if (tcp_is_reno(tp)) {
28613006 if (flag & FLAG_SND_UNA_ADVANCED)
28623007 tcp_reset_reno_sack(tp);
2863
- if (is_dupack)
2864
- tcp_add_reno_sack(sk);
3008
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
28653009 }
28663010
28673011 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
....@@ -2885,7 +3029,7 @@
28853029 }
28863030
28873031 /* Otherwise enter Recovery state */
2888
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
3032
+ tcp_enter_recovery(sk, ece_ack);
28893033 fast_rexmit = 1;
28903034 }
28913035
....@@ -2896,7 +3040,7 @@
28963040
28973041 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
28983042 {
2899
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
3043
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
29003044 struct tcp_sock *tp = tcp_sk(sk);
29013045
29023046 if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
....@@ -2935,6 +3079,8 @@
29353079 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
29363080
29373081 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
3082
+ if (!delta)
3083
+ delta = 1;
29383084 seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
29393085 ca_rtt_us = seq_rtt_us;
29403086 }
....@@ -2988,7 +3134,7 @@
29883134 /* If the retrans timer is currently being used by Fast Open
29893135 * for SYN-ACK retrans purpose, stay put.
29903136 */
2991
- if (tp->fastopen_rsk)
3137
+ if (rcu_access_pointer(tp->fastopen_rsk))
29923138 return;
29933139
29943140 if (!tp->packets_out) {
....@@ -3004,8 +3150,8 @@
30043150 */
30053151 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
30063152 }
3007
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3008
- TCP_RTO_MAX);
3153
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3154
+ TCP_RTO_MAX);
30093155 }
30103156 }
30113157
....@@ -3061,7 +3207,7 @@
30613207 */
30623208 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
30633209 u32 prior_snd_una,
3064
- struct tcp_sacktag_state *sack)
3210
+ struct tcp_sacktag_state *sack, bool ece_ack)
30653211 {
30663212 const struct inet_connection_sock *icsk = inet_csk(sk);
30673213 u64 first_ackt, last_ackt;
....@@ -3086,8 +3232,6 @@
30863232 u8 sacked = scb->sacked;
30873233 u32 acked_pcount;
30883234
3089
- tcp_ack_tstamp(sk, skb, prior_snd_una);
3090
-
30913235 /* Determine how many packets and what bytes were acked, tso and else */
30923236 if (after(scb->end_seq, tp->snd_una)) {
30933237 if (tcp_skb_pcount(skb) == 1 ||
....@@ -3107,7 +3251,7 @@
31073251 tp->retrans_out -= acked_pcount;
31083252 flag |= FLAG_RETRANS_DATA_ACKED;
31093253 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3110
- last_ackt = skb->skb_mstamp;
3254
+ last_ackt = tcp_skb_timestamp_us(skb);
31113255 WARN_ON_ONCE(last_ackt == 0);
31123256 if (!first_ackt)
31133257 first_ackt = last_ackt;
....@@ -3122,10 +3266,10 @@
31223266 if (sacked & TCPCB_SACKED_ACKED) {
31233267 tp->sacked_out -= acked_pcount;
31243268 } else if (tcp_is_sack(tp)) {
3125
- tp->delivered += acked_pcount;
3269
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
31263270 if (!tcp_skb_spurious_retrans(tp, skb))
31273271 tcp_rack_advance(tp, sacked, scb->end_seq,
3128
- skb->skb_mstamp);
3272
+ tcp_skb_timestamp_us(skb));
31293273 }
31303274 if (sacked & TCPCB_LOST)
31313275 tp->lost_out -= acked_pcount;
....@@ -3151,6 +3295,8 @@
31513295 if (!fully_acked)
31523296 break;
31533297
3298
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3299
+
31543300 next = skb_rb_next(skb);
31553301 if (unlikely(skb == tp->retransmit_skb_hint))
31563302 tp->retransmit_skb_hint = NULL;
....@@ -3166,8 +3312,11 @@
31663312 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
31673313 tp->snd_up = tp->snd_una;
31683314
3169
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3170
- flag |= FLAG_SACK_RENEGING;
3315
+ if (skb) {
3316
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3317
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3318
+ flag |= FLAG_SACK_RENEGING;
3319
+ }
31713320
31723321 if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
31733322 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
....@@ -3199,7 +3348,7 @@
31993348 }
32003349
32013350 if (tcp_is_reno(tp)) {
3202
- tcp_remove_reno_sacks(sk, pkts_acked);
3351
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
32033352
32043353 /* If any of the cumulatively ACKed segments was
32053354 * retransmitted, non-SACK case cannot confirm that
....@@ -3220,7 +3369,8 @@
32203369 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
32213370 }
32223371 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3223
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3372
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3373
+ tcp_skb_timestamp_us(skb))) {
32243374 /* Do not re-arm RTO if the sack RTT is measured from data sent
32253375 * after when the head was last (re)transmitted. Otherwise the
32263376 * timeout may continue to extend in loss recovery.
....@@ -3273,6 +3423,7 @@
32733423 return;
32743424 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
32753425 icsk->icsk_backoff = 0;
3426
+ icsk->icsk_probes_tstamp = 0;
32763427 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
32773428 /* Socket must be waked up by subsequent tcp_data_snd_check().
32783429 * This function is not for random using!
....@@ -3280,8 +3431,8 @@
32803431 } else {
32813432 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
32823433
3283
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3284
- when, TCP_RTO_MAX);
3434
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
3435
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
32853436 }
32863437 }
32873438
....@@ -3300,7 +3451,8 @@
33003451 * new SACK or ECE mark may first advance cwnd here and later reduce
33013452 * cwnd in tcp_fastretrans_alert() based on more states.
33023453 */
3303
- if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3454
+ if (tcp_sk(sk)->reordering >
3455
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
33043456 return flag & FLAG_FORWARD_PROGRESS;
33053457
33063458 return flag & FLAG_DATA_ACKED;
....@@ -3412,7 +3564,8 @@
34123564 if (*last_oow_ack_time) {
34133565 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
34143566
3415
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
3567
+ if (0 <= elapsed &&
3568
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
34163569 NET_INC_STATS(net, mib_idx);
34173570 return true; /* rate-limited: don't send yet! */
34183571 }
....@@ -3459,11 +3612,11 @@
34593612
34603613 /* Then check host-wide RFC 5961 rate limit. */
34613614 now = jiffies / HZ;
3462
- if (now != challenge_timestamp) {
3463
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
3615
+ if (now != READ_ONCE(challenge_timestamp)) {
3616
+ u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
34643617 u32 half = (ack_limit + 1) >> 1;
34653618
3466
- challenge_timestamp = now;
3619
+ WRITE_ONCE(challenge_timestamp, now);
34673620 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
34683621 }
34693622 count = READ_ONCE(challenge_count);
....@@ -3544,10 +3697,10 @@
35443697 {
35453698 struct tcp_sock *tp = tcp_sk(sk);
35463699
3547
- if (rexmit == REXMIT_NONE)
3700
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
35483701 return;
35493702
3550
- if (unlikely(rexmit == 2)) {
3703
+ if (unlikely(rexmit == REXMIT_NEW)) {
35513704 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
35523705 TCP_NAGLE_OFF);
35533706 if (after(tp->snd_nxt, tp->high_seq))
....@@ -3566,10 +3719,9 @@
35663719
35673720 delivered = tp->delivered - prior_delivered;
35683721 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3569
- if (flag & FLAG_ECE) {
3570
- tp->delivered_ce += delivered;
3722
+ if (flag & FLAG_ECE)
35713723 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3572
- }
3724
+
35733725 return delivered;
35743726 }
35753727
....@@ -3584,7 +3736,7 @@
35843736 bool is_sack_reneg = tp->is_sack_reneg;
35853737 u32 ack_seq = TCP_SKB_CB(skb)->seq;
35863738 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3587
- bool is_dupack = false;
3739
+ int num_dupack = 0;
35883740 int prior_packets = tp->packets_out;
35893741 u32 delivered = tp->delivered;
35903742 u32 lost = tp->lost;
....@@ -3593,6 +3745,7 @@
35933745
35943746 sack_state.first_sackt = 0;
35953747 sack_state.rate = &rs;
3748
+ sack_state.sack_delivered = 0;
35963749
35973750 /* We very likely will need to access rtx queue. */
35983751 prefetch(sk->tcp_rtx_queue.rb_node);
....@@ -3614,14 +3767,14 @@
36143767 * this segment (RFC793 Section 3.9).
36153768 */
36163769 if (after(ack, tp->snd_nxt))
3617
- goto invalid_ack;
3770
+ return -1;
36183771
36193772 if (after(ack, prior_snd_una)) {
36203773 flag |= FLAG_SND_UNA_ADVANCED;
36213774 icsk->icsk_retransmits = 0;
36223775
36233776 #if IS_ENABLED(CONFIG_TLS_DEVICE)
3624
- if (static_branch_unlikely(&clean_acked_data_enabled))
3777
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
36253778 if (icsk->icsk_clean_acked)
36263779 icsk->icsk_clean_acked(sk, ack);
36273780 #endif
....@@ -3636,7 +3789,8 @@
36363789 if (flag & FLAG_UPDATE_TS_RECENT)
36373790 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
36383791
3639
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3792
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
3793
+ FLAG_SND_UNA_ADVANCED) {
36403794 /* Window is constant, pure forward advance.
36413795 * No more checks are required.
36423796 * Note, we use the fact that SND.UNA>=SND.WL2.
....@@ -3667,6 +3821,10 @@
36673821 ack_ev_flags |= CA_ACK_ECE;
36683822 }
36693823
3824
+ if (sack_state.sack_delivered)
3825
+ tcp_count_delivered(tp, sack_state.sack_delivered,
3826
+ flag & FLAG_ECE);
3827
+
36703828 if (flag & FLAG_WIN_UPDATE)
36713829 ack_ev_flags |= CA_ACK_WIN_UPDATE;
36723830
....@@ -3692,7 +3850,8 @@
36923850 goto no_queue;
36933851
36943852 /* See if we can take anything off of the retransmit queue. */
3695
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3853
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
3854
+ flag & FLAG_ECE);
36963855
36973856 tcp_rack_update_reo_wnd(sk, &rs);
36983857
....@@ -3700,8 +3859,14 @@
37003859 tcp_process_tlp_ack(sk, ack, flag);
37013860
37023861 if (tcp_ack_is_dubious(sk, flag)) {
3703
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3704
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3862
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
3863
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
3864
+ num_dupack = 1;
3865
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
3866
+ if (!(flag & FLAG_DATA))
3867
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
3868
+ }
3869
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37053870 &rexmit);
37063871 }
37073872
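Duplicate-ACK counting switches from a bool to num_dupack because tcp_add_backlog() can coalesce several pure ACKs into one skb; if, say, three pure duplicate ACKs were merged, the skb arrives with gso_segs == 3 and the SACK-less (Reno) accounting advances sacked_out by 3 in a single call instead of under-counting:

/* pure ACK (no data): one skb may stand for several coalesced duplicate ACKs */
num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);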
....@@ -3723,7 +3888,7 @@
37233888 no_queue:
37243889 /* If data was DSACKed, see if we can undo a cwnd reduction. */
37253890 if (flag & FLAG_DSACKING_ACK) {
3726
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3891
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37273892 &rexmit);
37283893 tcp_newly_delivered(sk, delivered, flag);
37293894 }
....@@ -3737,10 +3902,6 @@
37373902 tcp_process_tlp_ack(sk, ack, flag);
37383903 return 1;
37393904
3740
-invalid_ack:
3741
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3742
- return -1;
3743
-
37443905 old_ack:
37453906 /* If data was SACKed, tag it and see if we should send more data.
37463907 * If data was DSACKed, see if we can undo a cwnd reduction.
....@@ -3748,13 +3909,12 @@
37483909 if (TCP_SKB_CB(skb)->sacked) {
37493910 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
37503911 &sack_state);
3751
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3912
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37523913 &rexmit);
37533914 tcp_newly_delivered(sk, delivered, flag);
37543915 tcp_xmit_recovery(sk, rexmit);
37553916 }
37563917
3757
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
37583918 return 0;
37593919 }
37603920
....@@ -3775,7 +3935,7 @@
37753935 foc->exp = exp_opt;
37763936 }
37773937
3778
-static void smc_parse_options(const struct tcphdr *th,
3938
+static bool smc_parse_options(const struct tcphdr *th,
37793939 struct tcp_options_received *opt_rx,
37803940 const unsigned char *ptr,
37813941 int opsize)
....@@ -3784,10 +3944,56 @@
37843944 if (static_branch_unlikely(&tcp_have_smc)) {
37853945 if (th->syn && !(opsize & 1) &&
37863946 opsize >= TCPOLEN_EXP_SMC_BASE &&
3787
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3947
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
37883948 opt_rx->smc_ok = 1;
3949
+ return true;
3950
+ }
37893951 }
37903952 #endif
3953
+ return false;
3954
+}
3955
+
3956
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
3957
+ * value on success.
3958
+ */
3959
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
3960
+{
3961
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
3962
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
3963
+ u16 mss = 0;
3964
+
3965
+ while (length > 0) {
3966
+ int opcode = *ptr++;
3967
+ int opsize;
3968
+
3969
+ switch (opcode) {
3970
+ case TCPOPT_EOL:
3971
+ return mss;
3972
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3973
+ length--;
3974
+ continue;
3975
+ default:
3976
+ if (length < 2)
3977
+ return mss;
3978
+ opsize = *ptr++;
3979
+ if (opsize < 2) /* "silly options" */
3980
+ return mss;
3981
+ if (opsize > length)
3982
+ return mss; /* fail on partial options */
3983
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
3984
+ u16 in_mss = get_unaligned_be16(ptr);
3985
+
3986
+ if (in_mss) {
3987
+ if (user_mss && user_mss < in_mss)
3988
+ in_mss = user_mss;
3989
+ mss = in_mss;
3990
+ }
3991
+ }
3992
+ ptr += opsize - 2;
3993
+ length -= opsize;
3994
+ }
3995
+ }
3996
+ return mss;
37913997 }
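tcp_parse_mss_option() above walks only the option list, so it is usable on untrusted SYNs before any connection state exists; a hypothetical caller (names illustrative) clamps against user_mss and falls back to the protocol default when the option is absent:

static u16 demo_peer_mss(const struct sk_buff *skb, const struct tcp_sock *tp)
{
	u16 mss = tcp_parse_mss_option(tcp_hdr(skb), tp->rx_opt.user_mss);

	return mss ? : TCP_MSS_DEFAULT;		/* 536 when no valid MSS option */
}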
37923998
37933999 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
....@@ -3805,6 +4011,7 @@
38054011
38064012 ptr = (const unsigned char *)(th + 1);
38074013 opt_rx->saw_tstamp = 0;
4014
+ opt_rx->saw_unknown = 0;
38084015
38094016 while (length > 0) {
38104017 int opcode = *ptr++;
....@@ -3817,6 +4024,8 @@
38174024 length--;
38184025 continue;
38194026 default:
4027
+ if (length < 2)
4028
+ return;
38204029 opsize = *ptr++;
38214030 if (opsize < 2) /* "silly options" */
38224031 return;
....@@ -3836,7 +4045,7 @@
38364045 break;
38374046 case TCPOPT_WINDOW:
38384047 if (opsize == TCPOLEN_WINDOW && th->syn &&
3839
- !estab && net->ipv4.sysctl_tcp_window_scaling) {
4048
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
38404049 __u8 snd_wscale = *(__u8 *)ptr;
38414050 opt_rx->wscale_ok = 1;
38424051 if (snd_wscale > TCP_MAX_WSCALE) {
....@@ -3852,7 +4061,7 @@
38524061 case TCPOPT_TIMESTAMP:
38534062 if ((opsize == TCPOLEN_TIMESTAMP) &&
38544063 ((estab && opt_rx->tstamp_ok) ||
3855
- (!estab && net->ipv4.sysctl_tcp_timestamps))) {
4064
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
38564065 opt_rx->saw_tstamp = 1;
38574066 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
38584067 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
....@@ -3860,7 +4069,7 @@
38604069 break;
38614070 case TCPOPT_SACK_PERM:
38624071 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3863
- !estab && net->ipv4.sysctl_tcp_sack) {
4072
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
38644073 opt_rx->sack_ok = TCP_SACK_SEEN;
38654074 tcp_sack_reset(opt_rx);
38664075 }
....@@ -3893,15 +4102,21 @@
38934102 */
38944103 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
38954104 get_unaligned_be16(ptr) ==
3896
- TCPOPT_FASTOPEN_MAGIC)
4105
+ TCPOPT_FASTOPEN_MAGIC) {
38974106 tcp_parse_fastopen_option(opsize -
38984107 TCPOLEN_EXP_FASTOPEN_BASE,
38994108 ptr + 2, th->syn, foc, true);
3900
- else
3901
- smc_parse_options(th, opt_rx, ptr,
3902
- opsize);
4109
+ break;
4110
+ }
4111
+
4112
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
4113
+ break;
4114
+
4115
+ opt_rx->saw_unknown = 1;
39034116 break;
39044117
4118
+ default:
4119
+ opt_rx->saw_unknown = 1;
39054120 }
39064121 ptr += opsize-2;
39074122 length -= opsize;
....@@ -4117,7 +4332,7 @@
41174332 case TCP_ESTABLISHED:
41184333 /* Move to CLOSE_WAIT */
41194334 tcp_set_state(sk, TCP_CLOSE_WAIT);
4120
- inet_csk(sk)->icsk_ack.pingpong = 1;
4335
+ inet_csk_enter_pingpong_mode(sk);
41214336 break;
41224337
41234338 case TCP_CLOSE_WAIT:
....@@ -4189,7 +4404,7 @@
41894404 {
41904405 struct tcp_sock *tp = tcp_sk(sk);
41914406
4192
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4407
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
41934408 int mib_idx;
41944409
41954410 if (before(seq, tp->rcv_nxt))
....@@ -4215,6 +4430,18 @@
42154430 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
42164431 }
42174432
4433
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4434
+{
4435
+ /* When the ACK path fails or drops most ACKs, the sender would
4436
+ * timeout and spuriously retransmit the same segment repeatedly.
4437
+ * The receiver remembers and reflects via DSACKs. Leverage the
4438
+ * DSACK state and change the txhash to re-route speculatively.
4439
+ */
4440
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
4441
+ sk_rethink_txhash(sk))
4442
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4443
+}
4444
+
42184445 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
42194446 {
42204447 struct tcp_sock *tp = tcp_sk(sk);
....@@ -4224,9 +4451,10 @@
42244451 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
42254452 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
42264453
4227
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4454
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
42284455 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
42294456
4457
+ tcp_rcv_spurious_retrans(sk, skb);
42304458 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
42314459 end_seq = tp->rcv_nxt;
42324460 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
....@@ -4260,9 +4488,37 @@
42604488 sp[i] = sp[i + 1];
42614489 continue;
42624490 }
4263
- this_sack++, swalk++;
4491
+ this_sack++;
4492
+ swalk++;
42644493 }
42654494 }
4495
+
4496
+static void tcp_sack_compress_send_ack(struct sock *sk)
4497
+{
4498
+ struct tcp_sock *tp = tcp_sk(sk);
4499
+
4500
+ if (!tp->compressed_ack)
4501
+ return;
4502
+
4503
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
4504
+ __sock_put(sk);
4505
+
4506
+ /* Since we have to send one ack finally,
4507
+ * substract one from tp->compressed_ack to keep
4508
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
4509
+ */
4510
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4511
+ tp->compressed_ack - 1);
4512
+
4513
+ tp->compressed_ack = 0;
4514
+ tcp_send_ack(sk);
4515
+}
4516
+
4517
+/* Reasonable amount of sack blocks included in TCP SACK option
4518
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
4519
+ * Given that SACK packets might be lost, be conservative and use 2.
4520
+ */
4521
+#define TCP_SACK_BLOCKS_EXPECTED 2
42664522
42674523 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
42684524 {
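
tcp_sack_compress_send_ack(), added above, flushes any ACK that was being held back for compression: it cancels the pending hrtimer if needed, charges compressed_ack - 1 to LINUX_MIB_TCPACKCOMPRESSED (the one ACK about to go out no longer counts as compressed), and sends the ACK. TCP_SACK_BLOCKS_EXPECTED then lets tcp_sack_new_ofo_skb() force such a flush as soon as more than two SACK blocks would be needed, replacing the old check that only fired when the SACK array was completely full. A simplified accounting model (standalone sketch; the struct and field names are invented):

#include <stdio.h>

struct model_tp {
        unsigned int compressed_ack;        /* ACKs currently held back */
        unsigned long mib_ack_compressed;
};

static void sack_compress_send_ack(struct model_tp *tp)
{
        if (!tp->compressed_ack)
                return;

        /* One ACK goes out now, so only the remainder counts as compressed. */
        tp->mib_ack_compressed += tp->compressed_ack - 1;
        tp->compressed_ack = 0;
        printf("ACK sent, MIB total now %lu\n", tp->mib_ack_compressed);
}

int main(void)
{
        struct model_tp tp = { .compressed_ack = 4 };

        sack_compress_send_ack(&tp);        /* accounts 3, sends the 4th */
        return 0;
}
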
....@@ -4276,6 +4532,8 @@
42764532
42774533 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
42784534 if (tcp_sack_extend(sp, seq, end_seq)) {
4535
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4536
+ tcp_sack_compress_send_ack(sk);
42794537 /* Rotate this_sack to the first one. */
42804538 for (; this_sack > 0; this_sack--, sp--)
42814539 swap(*sp, *(sp - 1));
....@@ -4285,6 +4543,9 @@
42854543 }
42864544 }
42874545
4546
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4547
+ tcp_sack_compress_send_ack(sk);
4548
+
42884549 /* Could not find an adjacent existing SACK, build a new one,
42894550 * put it at the front, and shift everyone else down. We
42904551 * always know there is at least one SACK present already here.
....@@ -4292,8 +4553,6 @@
42924553 * If the sack array is full, forget about the last one.
42934554 */
42944555 if (this_sack >= TCP_NUM_SACKS) {
4295
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
4296
- tcp_send_ack(sk);
42974556 this_sack--;
42984557 tp->rx_opt.num_sacks--;
42994558 sp--;
....@@ -4345,7 +4604,6 @@
43454604 /**
43464605 * tcp_try_coalesce - try to merge skb to prior one
43474606 * @sk: socket
4348
- * @dest: destination queue
43494607 * @to: prior buffer
43504608 * @from: buffer to add in queue
43514609 * @fragstolen: pointer to boolean
....@@ -4367,6 +4625,9 @@
43674625
43684626 /* It's possible this segment overlaps with prior segment in queue */
43694627 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4628
+ return false;
4629
+
4630
+ if (!mptcp_skb_can_collapse(to, from))
43704631 return false;
43714632
43724633 #ifdef CONFIG_TLS_DEVICE
....@@ -4412,6 +4673,7 @@
44124673
44134674 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
44144675 {
4676
+ trace_android_vh_kfree_skb(skb);
44154677 sk_drops_add(sk, skb);
44164678 __kfree_skb(skb);
44174679 }
....@@ -4443,13 +4705,9 @@
44434705 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
44444706
44454707 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4446
- SOCK_DEBUG(sk, "ofo packet was already received\n");
44474708 tcp_drop(sk, skb);
44484709 continue;
44494710 }
4450
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4451
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4452
- TCP_SKB_CB(skb)->end_seq);
44534711
44544712 tail = skb_peek_tail(&sk->sk_receive_queue);
44554713 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
....@@ -4511,11 +4769,10 @@
45114769 tp->pred_flags = 0;
45124770 inet_csk_schedule_ack(sk);
45134771
4772
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
45144773 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
45154774 seq = TCP_SKB_CB(skb)->seq;
45164775 end_seq = TCP_SKB_CB(skb)->end_seq;
4517
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4518
- tp->rcv_nxt, seq, end_seq);
45194776
45204777 p = &tp->out_of_order_queue.rb_node;
45214778 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
....@@ -4541,7 +4798,7 @@
45414798 * and trigger fast retransmit.
45424799 */
45434800 if (tcp_is_sack(tp))
4544
- tcp_grow_window(sk, skb);
4801
+ tcp_grow_window(sk, skb, true);
45454802 kfree_skb_partial(skb, fragstolen);
45464803 skb = NULL;
45474804 goto add_sack;
....@@ -4629,19 +4886,18 @@
46294886 * and trigger fast retransmit.
46304887 */
46314888 if (tcp_is_sack(tp))
4632
- tcp_grow_window(sk, skb);
4889
+ tcp_grow_window(sk, skb, false);
46334890 skb_condense(skb);
46344891 skb_set_owner_r(skb, sk);
46354892 }
46364893 }
46374894
4638
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4639
- bool *fragstolen)
4895
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
4896
+ bool *fragstolen)
46404897 {
46414898 int eaten;
46424899 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
46434900
4644
- __skb_pull(skb, hdrlen);
46454901 eaten = (tail &&
46464902 tcp_try_coalesce(sk, tail,
46474903 skb, fragstolen)) ? 1 : 0;
....@@ -4692,7 +4948,7 @@
46924948 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
46934949 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
46944950
4695
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4951
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
46964952 WARN_ON_ONCE(fragstolen); /* should not happen */
46974953 __kfree_skb(skb);
46984954 }
....@@ -4724,6 +4980,9 @@
47244980 bool fragstolen;
47254981 int eaten;
47264982
4983
+ if (sk_is_mptcp(sk))
4984
+ mptcp_incoming_options(sk, skb);
4985
+
47274986 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
47284987 __kfree_skb(skb);
47294988 return;
....@@ -4753,7 +5012,7 @@
47535012 goto drop;
47545013 }
47555014
4756
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
5015
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
47575016 if (skb->len)
47585017 tcp_event_data_recv(sk, skb);
47595018 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
....@@ -4782,6 +5041,7 @@
47825041 }
47835042
47845043 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5044
+ tcp_rcv_spurious_retrans(sk, skb);
47855045 /* A retransmit, 2nd most common case. Force an immediate ack. */
47865046 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
47875047 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
....@@ -4800,10 +5060,6 @@
48005060
48015061 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
48025062 /* Partial packet, seq < rcv_next < end_seq */
4803
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4804
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4805
- TCP_SKB_CB(skb)->end_seq);
4806
-
48075063 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
48085064
48095065 /* If window is closed, drop tail of packet. But after
....@@ -4897,7 +5153,7 @@
48975153 /* The first skb to collapse is:
48985154 * - not SYN/FIN and
48995155 * - bloated or contains data before "start" or
4900
- * overlaps to the next one.
5156
+ * overlaps to the next one and mptcp allows collapsing.
49015157 */
49025158 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
49035159 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
....@@ -4906,7 +5162,7 @@
49065162 break;
49075163 }
49085164
4909
- if (n && n != tail &&
5165
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
49105166 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
49115167 end_of_skbs = false;
49125168 break;
....@@ -4939,6 +5195,7 @@
49395195 else
49405196 __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
49415197 skb_set_owner_r(nskb, sk);
5198
+ mptcp_skb_ext_move(nskb, skb);
49425199
49435200 /* Copy data, releasing collapsed skbs. */
49445201 while (copy > 0) {
....@@ -4958,6 +5215,7 @@
49585215 skb = tcp_collapse_one(sk, skb, list, root);
49595216 if (!skb ||
49605217 skb == tail ||
5218
+ !mptcp_skb_can_collapse(nskb, skb) ||
49615219 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
49625220 goto end;
49635221 #ifdef CONFIG_TLS_DEVICE
....@@ -5082,8 +5340,6 @@
50825340 {
50835341 struct tcp_sock *tp = tcp_sk(sk);
50845342
5085
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
5086
-
50875343 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
50885344
50895345 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
....@@ -5149,12 +5405,6 @@
51495405 return true;
51505406 }
51515407
5152
-/* When incoming ACK allowed to free some skb from write_queue,
5153
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
5154
- * on the exit from tcp input handler.
5155
- *
5156
- * PROBLEM: sndbuf expansion does not work well with largesend.
5157
- */
51585408 static void tcp_new_space(struct sock *sk)
51595409 {
51605410 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5167,18 +5417,25 @@
51675417 sk->sk_write_space(sk);
51685418 }
51695419
5170
-static void tcp_check_space(struct sock *sk)
5420
+/* Caller made space either from:
5421
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5422
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5423
+ *
5424
+ * We might be able to generate EPOLLOUT to the application if:
5425
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5426
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5427
+ * small enough that tcp_stream_memory_free() decides it
5428
+ * is time to generate EPOLLOUT.
5429
+ */
5430
+void tcp_check_space(struct sock *sk)
51715431 {
5172
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5173
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5174
- /* pairs with tcp_poll() */
5175
- smp_mb();
5176
- if (sk->sk_socket &&
5177
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5178
- tcp_new_space(sk);
5179
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5180
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5181
- }
5432
+ /* pairs with tcp_poll() */
5433
+ smp_mb();
5434
+ if (sk->sk_socket &&
5435
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5436
+ tcp_new_space(sk);
5437
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5438
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
51825439 }
51835440 }
51845441
....@@ -5220,20 +5477,18 @@
52205477 }
52215478
52225479 if (!tcp_is_sack(tp) ||
5223
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5480
+ tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
52245481 goto send_now;
52255482
52265483 if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
52275484 tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5228
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
5229
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
5230
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
5231
- tp->compressed_ack = 0;
5485
+ tp->dup_ack_counter = 0;
52325486 }
5233
-
5234
- if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
5487
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5488
+ tp->dup_ack_counter++;
52355489 goto send_now;
5236
-
5490
+ }
5491
+ tp->compressed_ack++;
52375492 if (hrtimer_is_queued(&tp->compressed_ack_timer))
52385493 return;
52395494
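
__tcp_ack_snd_check() now splits the old counter in two: dup_ack_counter tracks the first TCP_FASTRETRANS_THRESH duplicate ACKs, which are still sent immediately so the peer's fast retransmit is not delayed, and only ACKs beyond that are deferred and counted in compressed_ack for the hrtimer to flush. A toy model of that decision (standalone sketch, invented struct name):

#include <stdbool.h>
#include <stdio.h>

#define TCP_FASTRETRANS_THRESH 3

struct model_tp {
        unsigned int dup_ack_counter;
        unsigned int compressed_ack;
};

/* true: send the ACK right away; false: let the compression timer batch it */
static bool ack_send_now(struct model_tp *tp)
{
        if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
                tp->dup_ack_counter++;
                return true;
        }
        tp->compressed_ack++;
        return false;
}

int main(void)
{
        struct model_tp tp = { 0, 0 };

        for (int i = 1; i <= 6; i++)
                printf("dupack %d -> %s\n", i,
                       ack_send_now(&tp) ? "send now" : "compress");
        return 0;
}
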
....@@ -5243,11 +5498,13 @@
52435498 if (tp->srtt_us && tp->srtt_us < rtt)
52445499 rtt = tp->srtt_us;
52455500
5246
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5501
+ delay = min_t(unsigned long,
5502
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
52475503 rtt * (NSEC_PER_USEC >> 3)/20);
52485504 sock_hold(sk);
5249
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5250
- HRTIMER_MODE_REL_PINNED_SOFT);
5505
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
5506
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5507
+ HRTIMER_MODE_REL_PINNED_SOFT);
52515508 }
52525509
52535510 static inline void tcp_ack_snd_check(struct sock *sk)
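
The switch from hrtimer_start() to hrtimer_start_range_ns() passes the new sysctl_tcp_comp_sack_slack_ns as a slack window, which lets the timer subsystem coalesce nearby compressed-ACK expirations. The delay itself is still min(sysctl_tcp_comp_sack_delay_ns, roughly 5% of the RTT); a small sketch of that computation, remembering that srtt_us is stored left-shifted by 3 (units of 1/8 us):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* delay_ns = min(sysctl delay, rtt/20); rtt_us_x8 is the RTT stored <<3 */
static uint64_t comp_sack_delay_ns(uint64_t sysctl_delay_ns, uint32_t rtt_us_x8)
{
        uint64_t rtt_based = (uint64_t)rtt_us_x8 * (NSEC_PER_USEC >> 3) / 20;

        return rtt_based < sysctl_delay_ns ? rtt_based : sysctl_delay_ns;
}

int main(void)
{
        /* 10 ms RTT (80000 in 1/8 us units), 1 ms sysctl cap -> 500 us */
        printf("%llu ns\n",
               (unsigned long long)comp_sack_delay_ns(1000000, 80000));
        return 0;
}
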
....@@ -5274,7 +5531,7 @@
52745531 struct tcp_sock *tp = tcp_sk(sk);
52755532 u32 ptr = ntohs(th->urg_ptr);
52765533
5277
- if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5534
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
52785535 ptr--;
52795536 ptr += ntohl(th->seq);
52805537
....@@ -5328,7 +5585,7 @@
53285585 }
53295586
53305587 tp->urg_data = TCP_URG_NOTYET;
5331
- tp->urg_seq = ptr;
5588
+ WRITE_ONCE(tp->urg_seq, ptr);
53325589
53335590 /* Disable header prediction. */
53345591 tp->pred_flags = 0;
....@@ -5481,6 +5738,8 @@
54815738 goto discard;
54825739 }
54835740
5741
+ bpf_skops_parse_hdr(sk, skb);
5742
+
54845743 return true;
54855744
54865745 discard:
....@@ -5521,7 +5780,7 @@
55215780 trace_tcp_probe(sk, skb);
55225781
55235782 tcp_mstamp_refresh(tp);
5524
- if (unlikely(!sk->sk_rx_dst))
5783
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
55255784 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
55265785 /*
55275786 * Header prediction.
....@@ -5628,8 +5887,8 @@
56285887 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
56295888
56305889 /* Bulk data transfer: receiver */
5631
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5632
- &fragstolen);
5890
+ __skb_pull(skb, tcp_header_len);
5891
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
56335892
56345893 tcp_event_data_recv(sk, skb);
56355894
....@@ -5691,6 +5950,34 @@
56915950 }
56925951 EXPORT_SYMBOL(tcp_rcv_established);
56935952
5953
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
5954
+{
5955
+ struct inet_connection_sock *icsk = inet_csk(sk);
5956
+ struct tcp_sock *tp = tcp_sk(sk);
5957
+
5958
+ tcp_mtup_init(sk);
5959
+ icsk->icsk_af_ops->rebuild_header(sk);
5960
+ tcp_init_metrics(sk);
5961
+
5962
+ /* Initialize the congestion window to start the transfer.
5963
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
5964
+ * retransmitted. In light of RFC6298 more aggressive 1sec
5965
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
5966
+ * retransmission has occurred.
5967
+ */
5968
+ if (tp->total_retrans > 1 && tp->undo_marker)
5969
+ tp->snd_cwnd = 1;
5970
+ else
5971
+ tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
5972
+ tp->snd_cwnd_stamp = tcp_jiffies32;
5973
+
5974
+ bpf_skops_established(sk, bpf_op, skb);
5975
+ /* Initialize congestion control unless BPF initialized it already: */
5976
+ if (!icsk->icsk_ca_initialized)
5977
+ tcp_init_congestion_control(sk);
5978
+ tcp_init_buffer_space(sk);
5979
+}
5980
+
56945981 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
56955982 {
56965983 struct tcp_sock *tp = tcp_sk(sk);
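
tcp_init_transfer() now takes a bpf_op/skb pair so bpf_skops_established() can run as the connection completes, and it skips tcp_init_congestion_control() when a BPF program already initialized congestion control (icsk_ca_initialized). The initial-cwnd rule from its comment reduces to a small decision: fall back to a cwnd of 1 only when more than one SYN/SYN-ACK retransmission happened and the timeout was not later proven spurious. For illustration only (invented function and parameter names):

#include <stdbool.h>
#include <stdio.h>

static unsigned int initial_cwnd(unsigned int total_retrans, bool undo_marker,
                                 unsigned int route_init_cwnd)
{
        /* >1 SYN/SYN-ACK rtx and not undone: be conservative per RFC 5681 */
        if (total_retrans > 1 && undo_marker)
                return 1;
        return route_init_cwnd;             /* usually 10 segments nowadays */
}

int main(void)
{
        printf("%u\n", initial_cwnd(0, false, 10));     /* clean handshake: 10 */
        printf("%u\n", initial_cwnd(2, true, 10));      /* lossy handshake: 1 */
        return 0;
}
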
....@@ -5705,7 +5992,7 @@
57055992 sk_mark_napi_id(sk, skb);
57065993 }
57075994
5708
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5995
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
57095996
57105997 /* Prevent spurious tcp_cwnd_restart() on first data
57115998 * packet.
....@@ -5760,6 +6047,10 @@
57606047 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
57616048
57626049 if (data) { /* Retransmit unacked data in SYN */
6050
+ if (tp->total_retrans)
6051
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6052
+ else
6053
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
57636054 skb_rbtree_walk_from(data) {
57646055 if (__tcp_retransmit_skb(sk, data, 1))
57656056 break;
....@@ -5792,6 +6083,21 @@
57926083 #endif
57936084 }
57946085
6086
+static void tcp_try_undo_spurious_syn(struct sock *sk)
6087
+{
6088
+ struct tcp_sock *tp = tcp_sk(sk);
6089
+ u32 syn_stamp;
6090
+
6091
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
6092
+ * spurious if the ACK's timestamp option echo value matches the
6093
+ * original SYN timestamp.
6094
+ */
6095
+ syn_stamp = tp->retrans_stamp;
6096
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6097
+ syn_stamp == tp->rx_opt.rcv_tsecr)
6098
+ tp->undo_marker = 0;
6099
+}
6100
+
57956101 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
57966102 const struct tcphdr *th)
57976103 {
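
tcp_try_undo_spurious_syn() clears undo_marker when the incoming ACK echoes exactly the timestamp of the original SYN, which proves the SYN/SYNACK retransmission timeout was spurious and lets tcp_init_transfer() keep the normal initial cwnd. Restated as a standalone predicate for clarity (a sketch, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

static bool syn_timeout_was_spurious(bool undo_marker, unsigned int syn_stamp,
                                     bool saw_tstamp, unsigned int rcv_tsecr)
{
        /* Spurious if the peer echoes exactly the original SYN's timestamp. */
        return undo_marker && syn_stamp && saw_tstamp && rcv_tsecr == syn_stamp;
}

int main(void)
{
        printf("%d\n", syn_timeout_was_spurious(true, 42, true, 42));   /* 1 */
        printf("%d\n", syn_timeout_was_spurious(true, 42, true, 41));   /* 0 */
        return 0;
}
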
....@@ -5815,8 +6121,14 @@
58156121 * the segment and return)"
58166122 */
58176123 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5818
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
6124
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6125
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
6126
+ if (icsk->icsk_retransmits == 0)
6127
+ inet_csk_reset_xmit_timer(sk,
6128
+ ICSK_TIME_RETRANS,
6129
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
58196130 goto reset_and_undo;
6131
+ }
58206132
58216133 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
58226134 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
....@@ -5859,6 +6171,7 @@
58596171 tcp_ecn_rcv_synack(tp, th);
58606172
58616173 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6174
+ tcp_try_undo_spurious_syn(sk);
58626175 tcp_ack(sk, skb, FLAG_SLOWPATH);
58636176
58646177 /* Ok.. it's good. Set up sequence numbers and
....@@ -5912,7 +6225,7 @@
59126225 return -1;
59136226 if (sk->sk_write_pending ||
59146227 icsk->icsk_accept_queue.rskq_defer_accept ||
5915
- icsk->icsk_ack.pingpong) {
6228
+ inet_csk_in_pingpong_mode(sk)) {
59166229 /* Save one ACK. Data will be ready after
59176230 * several ticks, if write_pending is set.
59186231 *
....@@ -6017,6 +6330,38 @@
60176330 return 1;
60186331 }
60196332
6333
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6334
+{
6335
+ struct request_sock *req;
6336
+
6337
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
6338
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
6339
+ */
6340
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
6341
+ tcp_try_undo_loss(sk, false);
6342
+
6343
+ /* Reset rtx states to prevent spurious retransmits_timed_out() */
6344
+ tcp_sk(sk)->retrans_stamp = 0;
6345
+ inet_csk(sk)->icsk_retransmits = 0;
6346
+
6347
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
6348
+ * we no longer need req so release it.
6349
+ */
6350
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
6351
+ lockdep_sock_is_held(sk));
6352
+ reqsk_fastopen_remove(sk, req, false);
6353
+
6354
+ /* Re-arm the timer because data may have been sent out.
6355
+ * This is similar to the regular data transmission case
6356
+ * when new data has just been ack'ed.
6357
+ *
6358
+ * (TFO) - we could try to be more aggressive and
6359
+ * retransmitting any data sooner based on when they
6360
+ * are sent out.
6361
+ */
6362
+ tcp_rearm_rto(sk);
6363
+}
6364
+
60206365 /*
60216366 * This function implements the receiving procedure of RFC 793 for
60226367 * all states except ESTABLISHED and TIME_WAIT.
....@@ -6079,7 +6424,8 @@
60796424
60806425 tcp_mstamp_refresh(tp);
60816426 tp->rx_opt.saw_tstamp = 0;
6082
- req = tp->fastopen_rsk;
6427
+ req = rcu_dereference_protected(tp->fastopen_rsk,
6428
+ lockdep_sock_is_held(sk));
60836429 if (req) {
60846430 bool req_stolen;
60856431
....@@ -6113,23 +6459,13 @@
61136459 if (!tp->srtt_us)
61146460 tcp_synack_rtt_meas(sk, req);
61156461
6116
- /* Once we leave TCP_SYN_RECV, we no longer need req
6117
- * so release it.
6118
- */
61196462 if (req) {
6120
- inet_csk(sk)->icsk_retransmits = 0;
6121
- reqsk_fastopen_remove(sk, req, false);
6122
- /* Re-arm the timer because data may have been sent out.
6123
- * This is similar to the regular data transmission case
6124
- * when new data has just been ack'ed.
6125
- *
6126
- * (TFO) - we could try to be more aggressive and
6127
- * retransmitting any data sooner based on when they
6128
- * are sent out.
6129
- */
6130
- tcp_rearm_rto(sk);
6463
+ tcp_rcv_synrecv_state_fastopen(sk);
61316464 } else {
6132
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
6465
+ tcp_try_undo_spurious_syn(sk);
6466
+ tp->retrans_stamp = 0;
6467
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6468
+ skb);
61336469 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
61346470 }
61356471 smp_mb();
....@@ -6163,16 +6499,9 @@
61636499 case TCP_FIN_WAIT1: {
61646500 int tmo;
61656501
6166
- /* If we enter the TCP_FIN_WAIT1 state and we are a
6167
- * Fast Open socket and this is the first acceptable
6168
- * ACK we have received, this would have acknowledged
6169
- * our SYNACK so stop the SYNACK timer.
6170
- */
6171
- if (req) {
6172
- /* We no longer need the request sock. */
6173
- reqsk_fastopen_remove(sk, req, false);
6174
- tcp_rearm_rto(sk);
6175
- }
6502
+ if (req)
6503
+ tcp_rcv_synrecv_state_fastopen(sk);
6504
+
61766505 if (tp->snd_una != tp->write_seq)
61776506 break;
61786507
....@@ -6244,9 +6573,12 @@
62446573 case TCP_CLOSE_WAIT:
62456574 case TCP_CLOSING:
62466575 case TCP_LAST_ACK:
6247
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6576
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
6577
+ if (sk_is_mptcp(sk))
6578
+ mptcp_incoming_options(sk, skb);
62486579 break;
6249
- /* fall through */
6580
+ }
6581
+ fallthrough;
62506582 case TCP_FIN_WAIT1:
62516583 case TCP_FIN_WAIT2:
62526584 /* RFC 793 says to queue data in these states,
....@@ -6261,7 +6593,7 @@
62616593 return 1;
62626594 }
62636595 }
6264
- /* Fall through */
6596
+ fallthrough;
62656597 case TCP_ESTABLISHED:
62666598 tcp_data_queue(sk, skb);
62676599 queued = 1;
....@@ -6307,6 +6639,11 @@
63076639 * congestion control: Linux DCTCP asserts ECT on all packets,
63086640 * including SYN, which is most optimal solution; however,
63096641 * others, such as FreeBSD do not.
6642
+ *
6643
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
6644
+ * set, indicating the use of a future TCP extension (such as AccECN). See
6645
+ * RFC8311 ยง4.3 which updates RFC3168 to allow the development of such
6646
+ * extensions.
63106647 */
63116648 static void tcp_ecn_create_request(struct request_sock *req,
63126649 const struct sk_buff *skb,
....@@ -6326,7 +6663,7 @@
63266663 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
63276664 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
63286665
6329
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6666
+ if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
63306667 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
63316668 tcp_bpf_ca_needs_ecn((struct sock *)req))
63326669 inet_rsk(req)->ecn_ok = 1;
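
In line with the comment above, the ECN-acceptance test now also admits SYNs that are not ECT but set a reserved TCP header bit (th->res1), so future extensions in the AccECN style are not shut out of ECN negotiation. Only the first clause of the condition changes; a boolean sketch of it (invented function name):

#include <stdbool.h>
#include <stdio.h>

/* First clause of the updated test: a non-ECT SYN, or one using reserved
 * TCP header bits (e.g. a future AccECN-style extension), may still get
 * ecn_ok when the sysctl or the route allows ECN. */
static bool ecn_first_clause(bool ect, bool res1, bool ecn_ok)
{
        return (!ect || res1) && ecn_ok;
}

int main(void)
{
        printf("%d\n", ecn_first_clause(false, false, true));   /* classic: 1 */
        printf("%d\n", ecn_first_clause(true,  true,  true));   /* AccECN-ish: 1 */
        printf("%d\n", ecn_first_clause(true,  false, true));   /* plain ECT SYN: 0 */
        return 0;
}
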
....@@ -6339,10 +6676,9 @@
63396676 struct inet_request_sock *ireq = inet_rsk(req);
63406677
63416678 req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
6342
- req->cookie_ts = 0;
63436679 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
63446680 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6345
- tcp_rsk(req)->snt_synack = tcp_clock_us();
6681
+ tcp_rsk(req)->snt_synack = 0;
63466682 tcp_rsk(req)->last_oow_ack_time = 0;
63476683 req->mss = rx_opt->mss_clamp;
63486684 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
....@@ -6387,17 +6723,18 @@
63876723 /*
63886724 * Return true if a syncookie should be sent
63896725 */
6390
-static bool tcp_syn_flood_action(const struct sock *sk,
6391
- const struct sk_buff *skb,
6392
- const char *proto)
6726
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
63936727 {
63946728 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
63956729 const char *msg = "Dropping request";
6396
- bool want_cookie = false;
63976730 struct net *net = sock_net(sk);
6731
+ bool want_cookie = false;
6732
+ u8 syncookies;
6733
+
6734
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
63986735
63996736 #ifdef CONFIG_SYN_COOKIES
6400
- if (net->ipv4.sysctl_tcp_syncookies) {
6737
+ if (syncookies) {
64016738 msg = "Sending cookies";
64026739 want_cookie = true;
64036740 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
....@@ -6405,11 +6742,10 @@
64056742 #endif
64066743 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
64076744
6408
- if (!queue->synflood_warned &&
6409
- net->ipv4.sysctl_tcp_syncookies != 2 &&
6745
+ if (!queue->synflood_warned && syncookies != 2 &&
64106746 xchg(&queue->synflood_warned, 1) == 0)
64116747 net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6412
- proto, ntohs(tcp_hdr(skb)->dest), msg);
6748
+ proto, sk->sk_num, msg);
64136749
64146750 return want_cookie;
64156751 }
....@@ -6420,16 +6756,60 @@
64206756 {
64216757 if (tcp_sk(sk)->save_syn) {
64226758 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6423
- u32 *copy;
6759
+ struct saved_syn *saved_syn;
6760
+ u32 mac_hdrlen;
6761
+ void *base;
64246762
6425
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
6426
- if (copy) {
6427
- copy[0] = len;
6428
- memcpy(&copy[1], skb_network_header(skb), len);
6429
- req->saved_syn = copy;
6763
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
6764
+ base = skb_mac_header(skb);
6765
+ mac_hdrlen = skb_mac_header_len(skb);
6766
+ len += mac_hdrlen;
6767
+ } else {
6768
+ base = skb_network_header(skb);
6769
+ mac_hdrlen = 0;
6770
+ }
6771
+
6772
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
6773
+ GFP_ATOMIC);
6774
+ if (saved_syn) {
6775
+ saved_syn->mac_hdrlen = mac_hdrlen;
6776
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
6777
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
6778
+ memcpy(saved_syn->data, base, len);
6779
+ req->saved_syn = saved_syn;
64306780 }
64316781 }
64326782 }
6783
+
6784
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
6785
+ * used for SYN cookie generation.
6786
+ */
6787
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
6788
+ const struct tcp_request_sock_ops *af_ops,
6789
+ struct sock *sk, struct tcphdr *th)
6790
+{
6791
+ struct tcp_sock *tp = tcp_sk(sk);
6792
+ u16 mss;
6793
+
6794
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
6795
+ !inet_csk_reqsk_queue_is_full(sk))
6796
+ return 0;
6797
+
6798
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
6799
+ return 0;
6800
+
6801
+ if (sk_acceptq_is_full(sk)) {
6802
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6803
+ return 0;
6804
+ }
6805
+
6806
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
6807
+ if (!mss)
6808
+ mss = af_ops->mss_clamp;
6809
+
6810
+ return mss;
6811
+}
6812
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
64336813
64346814 int tcp_conn_request(struct request_sock_ops *rsk_ops,
64356815 const struct tcp_request_sock_ops *af_ops,
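
tcp_reqsk_record_syn() now saves the captured headers in a struct saved_syn carrying explicit mac/network/tcp header lengths (including the MAC header when save_syn == 2) rather than a bare length-prefixed u32 array, and it sizes the single allocation with struct_size(). The hunk also adds tcp_get_syncookie_mss(), which clamps the MSS for SYN-cookie generation when cookies are required. Below is a compact userspace model of the flexible-array allocation pattern only; the struct, the field values and the local struct_size macro are simplified stand-ins:

#include <stdlib.h>
#include <string.h>

struct saved_syn_model {
        unsigned int mac_hdrlen;
        unsigned int network_hdrlen;
        unsigned int tcp_hdrlen;
        unsigned char data[];               /* flexible array member */
};

/* Same shape as the kernel helper: header size plus n tail elements
 * (the real struct_size() additionally checks for overflow). */
#define struct_size(p, member, n) \
        (sizeof(*(p)) + (n) * sizeof((p)->member[0]))

static struct saved_syn_model *record_syn(const void *hdrs, size_t len)
{
        struct saved_syn_model *s = malloc(struct_size(s, data, len));

        if (!s)
                return NULL;
        s->mac_hdrlen = 0;                  /* save_syn == 1: no MAC header kept */
        s->network_hdrlen = 20;             /* hypothetical IPv4 header */
        s->tcp_hdrlen = 20;
        memcpy(s->data, hdrs, len);
        return s;
}

int main(void)
{
        unsigned char hdrs[40] = { 0 };

        free(record_syn(hdrs, sizeof(hdrs)));
        return 0;
}
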
....@@ -6445,14 +6825,16 @@
64456825 bool want_cookie = false;
64466826 struct dst_entry *dst;
64476827 struct flowi fl;
6828
+ u8 syncookies;
6829
+
6830
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
64486831
64496832 /* TW buckets are converted to open requests without
64506833 * limitations, they conserve resources and peer is
64516834 * evidently real one.
64526835 */
6453
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
6454
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6455
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6836
+ if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6837
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
64566838 if (!want_cookie)
64576839 goto drop;
64586840 }
....@@ -6466,8 +6848,12 @@
64666848 if (!req)
64676849 goto drop;
64686850
6851
+ req->syncookie = want_cookie;
64696852 tcp_rsk(req)->af_specific = af_ops;
64706853 tcp_rsk(req)->ts_off = 0;
6854
+#if IS_ENABLED(CONFIG_MPTCP)
6855
+ tcp_rsk(req)->is_mptcp = 0;
6856
+#endif
64716857
64726858 tcp_clear_options(&tmp_opt);
64736859 tmp_opt.mss_clamp = af_ops->mss_clamp;
....@@ -6501,10 +6887,12 @@
65016887 goto drop_and_free;
65026888
65036889 if (!want_cookie && !isn) {
6890
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
6891
+
65046892 /* Kill the following clause, if you dislike this way. */
6505
- if (!net->ipv4.sysctl_tcp_syncookies &&
6506
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6507
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6893
+ if (!syncookies &&
6894
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6895
+ (max_syn_backlog >> 2)) &&
65086896 !tcp_peer_is_proven(req, dst)) {
65096897 /* Without syncookies last quarter of
65106898 * backlog is filled with destinations,
....@@ -6525,13 +6913,13 @@
65256913
65266914 if (want_cookie) {
65276915 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6528
- req->cookie_ts = tmp_opt.tstamp_ok;
65296916 if (!tmp_opt.tstamp_ok)
65306917 inet_rsk(req)->ecn_ok = 0;
65316918 }
65326919
65336920 tcp_rsk(req)->snt_isn = isn;
65346921 tcp_rsk(req)->txhash = net_tx_rndhash();
6922
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
65356923 tcp_openreq_init_rwin(req, sk, dst);
65366924 sk_rx_queue_set(req_to_sk(req), skb);
65376925 if (!want_cookie) {
....@@ -6540,14 +6928,13 @@
65406928 }
65416929 if (fastopen_sk) {
65426930 af_ops->send_synack(fastopen_sk, dst, &fl, req,
6543
- &foc, TCP_SYNACK_FASTOPEN);
6931
+ &foc, TCP_SYNACK_FASTOPEN, skb);
65446932 /* Add the child socket directly into the accept queue */
65456933 if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
65466934 reqsk_fastopen_remove(fastopen_sk, req, false);
65476935 bh_unlock_sock(fastopen_sk);
65486936 sock_put(fastopen_sk);
6549
- reqsk_put(req);
6550
- goto drop;
6937
+ goto drop_and_free;
65516938 }
65526939 sk->sk_data_ready(sk);
65536940 bh_unlock_sock(fastopen_sk);
....@@ -6559,7 +6946,8 @@
65596946 tcp_timeout_init((struct sock *)req));
65606947 af_ops->send_synack(sk, dst, &fl, req, &foc,
65616948 !want_cookie ? TCP_SYNACK_NORMAL :
6562
- TCP_SYNACK_COOKIE);
6949
+ TCP_SYNACK_COOKIE,
6950
+ skb);
65636951 if (want_cookie) {
65646952 reqsk_free(req);
65656953 return 0;
....@@ -6571,7 +6959,7 @@
65716959 drop_and_release:
65726960 dst_release(dst);
65736961 drop_and_free:
6574
- reqsk_free(req);
6962
+ __reqsk_free(req);
65756963 drop:
65766964 tcp_listendrop(sk);
65776965 return 0;