2023-12-06 08f87f769b595151be1afeff53e144f543faa614
kernel/net/ipv4/tcp_input.c
....@@ -77,8 +77,10 @@
7777 #include <asm/unaligned.h>
7878 #include <linux/errqueue.h>
7979 #include <trace/events/tcp.h>
80
-#include <linux/static_key.h>
80
+#include <linux/jump_label_ratelimit.h>
8181 #include <net/busy_poll.h>
82
+#include <net/mptcp.h>
83
+#include <trace/hooks/net.h>
8284
8385 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
8486
....@@ -113,22 +115,91 @@
113115 #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
114116
115117 #if IS_ENABLED(CONFIG_TLS_DEVICE)
116
-static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
118
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
117119
118120 void clean_acked_data_enable(struct inet_connection_sock *icsk,
119121 void (*cad)(struct sock *sk, u32 ack_seq))
120122 {
121123 icsk->icsk_clean_acked = cad;
122
- static_branch_inc(&clean_acked_data_enabled);
124
+ static_branch_deferred_inc(&clean_acked_data_enabled);
123125 }
124126 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
125127
126128 void clean_acked_data_disable(struct inet_connection_sock *icsk)
127129 {
128
- static_branch_dec(&clean_acked_data_enabled);
130
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
129131 icsk->icsk_clean_acked = NULL;
130132 }
131133 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134
+
135
+void clean_acked_data_flush(void)
136
+{
137
+ static_key_deferred_flush(&clean_acked_data_enabled);
138
+}
139
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
140
+#endif
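The move to a deferred static key rate-limits the expensive branch re-patching when TLS offload sockets come and go in quick succession: enables take effect immediately, while the real disable is delayed by the HZ timeout given at definition time (and can be forced with the new clean_acked_data_flush()). A minimal standalone sketch of the pattern, with illustrative names not taken from this file:

#include <linux/jump_label_ratelimit.h>

/* false until the first enable; disables are batched for roughly one second (HZ) */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(demo_key, HZ);

static void demo_enable(void)
{
	static_branch_deferred_inc(&demo_key);		/* takes effect right away */
}

static void demo_disable(void)
{
	static_branch_slow_dec_deferred(&demo_key);	/* key may stay true for up to HZ */
}

static bool demo_fast_path(void)
{
	/* note the .key member, as in the tcp_ack() hunk further down */
	return static_branch_unlikely(&demo_key.key);
}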
141
+
142
+#ifdef CONFIG_CGROUP_BPF
143
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
144
+{
145
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
146
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
147
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
148
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
149
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
150
+ struct bpf_sock_ops_kern sock_ops;
151
+
152
+ if (likely(!unknown_opt && !parse_all_opt))
153
+ return;
154
+
155
+ /* The skb will be handled in the
156
+ * bpf_skops_established() or
157
+ * bpf_skops_write_hdr_opt().
158
+ */
159
+ switch (sk->sk_state) {
160
+ case TCP_SYN_RECV:
161
+ case TCP_SYN_SENT:
162
+ case TCP_LISTEN:
163
+ return;
164
+ }
165
+
166
+ sock_owned_by_me(sk);
167
+
168
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
169
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
170
+ sock_ops.is_fullsock = 1;
171
+ sock_ops.sk = sk;
172
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
173
+
174
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
175
+}
176
+
177
+static void bpf_skops_established(struct sock *sk, int bpf_op,
178
+ struct sk_buff *skb)
179
+{
180
+ struct bpf_sock_ops_kern sock_ops;
181
+
182
+ sock_owned_by_me(sk);
183
+
184
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
185
+ sock_ops.op = bpf_op;
186
+ sock_ops.is_fullsock = 1;
187
+ sock_ops.sk = sk;
188
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
189
+ if (skb)
190
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
191
+
192
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
193
+}
194
+#else
195
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
196
+{
197
+}
198
+
199
+static void bpf_skops_established(struct sock *sk, int bpf_op,
200
+ struct sk_buff *skb)
201
+{
202
+}
132203 #endif
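The bpf_skops_parse_hdr() helper above only fires BPF_SOCK_OPS_PARSE_HDR_OPT_CB once a cgroup sockops program has opted in via BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG (or the PARSE_ALL flag). A hedged sketch of what the BPF side could look like; the program name and option kind are illustrative, not part of this patch:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* assumes the flag was set earlier with bpf_sock_ops_cb_flags_set() */
SEC("sockops")
int parse_private_opt(struct bpf_sock_ops *skops)
{
	__u8 opt[8] = { 0x42, };	/* byte 0: hypothetical private option kind */

	if (skops->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
		return 1;

	if (bpf_load_hdr_opt(skops, opt, sizeof(opt), 0) > 0) {
		/* opt[] now holds the kind, length and payload of the option */
	}
	return 1;
}

char _license[] SEC("license") = "GPL";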
133204
134205 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
....@@ -221,7 +292,7 @@
221292 struct inet_connection_sock *icsk = inet_csk(sk);
222293
223294 tcp_incr_quickack(sk, max_quickacks);
224
- icsk->icsk_ack.pingpong = 0;
295
+ inet_csk_exit_pingpong_mode(sk);
225296 icsk->icsk_ack.ato = TCP_ATO_MIN;
226297 }
227298 EXPORT_SYMBOL(tcp_enter_quickack_mode);
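The direct writes to icsk->icsk_ack.pingpong here and in the next hunk are replaced by accessors, so callers stop depending on how the delayed-ACK "pingpong" state is stored. Roughly the shape of the helpers (a sketch only; the in-tree versions may track a counter against a threshold rather than a plain flag):

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
	inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
	return inet_csk(sk)->icsk_ack.pingpong != 0;
}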
....@@ -236,7 +307,7 @@
236307 const struct dst_entry *dst = __sk_dst_get(sk);
237308
238309 return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
239
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
310
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
240311 }
241312
242313 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
....@@ -354,7 +425,8 @@
354425 sndmem *= nr_segs * per_mss;
355426
356427 if (sk->sk_sndbuf < sndmem)
357
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
428
+ WRITE_ONCE(sk->sk_sndbuf,
429
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
358430 }
359431
360432 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
....@@ -383,12 +455,13 @@
383455 */
384456
385457 /* Slow part of check#2. */
386
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
458
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
459
+ unsigned int skbtruesize)
387460 {
388461 struct tcp_sock *tp = tcp_sk(sk);
389462 /* Optimize this! */
390
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
391
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
463
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
464
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
392465
393466 while (tp->rcv_ssthresh <= window) {
394467 if (truesize <= skb->len)
....@@ -400,7 +473,27 @@
400473 return 0;
401474 }
402475
403
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
476
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
477
+ * can play nice with us, as sk_buff and skb->head might be either
478
+ * freed or shared with up to MAX_SKB_FRAGS segments.
479
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
480
+ * and if no payload was pulled in skb->head before reaching us.
481
+ */
482
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
483
+{
484
+ u32 truesize = skb->truesize;
485
+
486
+ if (adjust && !skb_headlen(skb)) {
487
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
488
+ /* paranoid check, some drivers might be buggy */
489
+ if (unlikely((int)truesize < (int)skb->len))
490
+ truesize = skb->truesize;
491
+ }
492
+ return truesize;
493
+}
494
+
495
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
496
+ bool adjust)
404497 {
405498 struct tcp_sock *tp = tcp_sk(sk);
406499 int room;
....@@ -409,15 +502,16 @@
409502
410503 /* Check #1 */
411504 if (room > 0 && !tcp_under_memory_pressure(sk)) {
505
+ unsigned int truesize = truesize_adjust(adjust, skb);
412506 int incr;
413507
414508 /* Check #2. Increase window, if skb with such overhead
415509 * will fit to rcvbuf in future.
416510 */
417
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
511
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
418512 incr = 2 * tp->advmss;
419513 else
420
- incr = __tcp_grow_window(sk, skb);
514
+ incr = __tcp_grow_window(sk, skb, truesize);
421515
422516 if (incr) {
423517 incr = max_t(int, incr, 2 * skb->len);
....@@ -430,9 +524,9 @@
430524 /* 3. Try to fixup all. It is made immediately after connection enters
431525 * established state.
432526 */
433
-void tcp_init_buffer_space(struct sock *sk)
527
+static void tcp_init_buffer_space(struct sock *sk)
434528 {
435
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
529
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
436530 struct tcp_sock *tp = tcp_sk(sk);
437531 int maxwin;
438532
....@@ -472,15 +566,17 @@
472566 struct tcp_sock *tp = tcp_sk(sk);
473567 struct inet_connection_sock *icsk = inet_csk(sk);
474568 struct net *net = sock_net(sk);
569
+ int rmem2;
475570
476571 icsk->icsk_ack.quick = 0;
572
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
477573
478
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
574
+ if (sk->sk_rcvbuf < rmem2 &&
479575 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
480576 !tcp_under_memory_pressure(sk) &&
481577 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
482
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
483
- net->ipv4.sysctl_tcp_rmem[2]);
578
+ WRITE_ONCE(sk->sk_rcvbuf,
579
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
484580 }
485581 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
486582 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
....@@ -510,7 +606,7 @@
510606 *
511607 * The algorithm for RTT estimation w/o timestamps is based on
512608 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
513
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
609
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
514610 *
515611 * More detail on this code can be found at
516612 * <http://staff.psc.edu/jheffner/>,
....@@ -621,7 +717,7 @@
621717 * <prev RTT . ><current RTT .. ><next RTT .... >
622718 */
623719
624
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
720
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
625721 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
626722 int rcvmem, rcvbuf;
627723 u64 rcvwin, grow;
....@@ -642,9 +738,9 @@
642738
643739 do_div(rcvwin, tp->advmss);
644740 rcvbuf = min_t(u64, rcvwin * rcvmem,
645
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
741
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
646742 if (rcvbuf > sk->sk_rcvbuf) {
647
- sk->sk_rcvbuf = rcvbuf;
743
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
648744
649745 /* Make the window clamp follow along. */
650746 tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
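A large share of the hunks in this patch apply the same idiom: sysctls and socket fields that may be read without the socket lock, or rewritten concurrently through /proc, are wrapped in READ_ONCE()/WRITE_ONCE() so the compiler cannot tear or re-load the access and KCSAN treats the race as annotated. Condensed sketch of the pattern (hypothetical helper name):

static void demo_tune_rcvbuf(struct sock *sk, int rcvbuf)
{
	/* sample the sysctl once; a /proc writer may change it concurrently */
	int rmem_max = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);

	/* publish with a matching one-shot store; lockless readers exist */
	if (rcvbuf > sk->sk_rcvbuf)
		WRITE_ONCE(sk->sk_rcvbuf, min(rcvbuf, rmem_max));
}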
....@@ -710,7 +806,7 @@
710806 tcp_ecn_check_ce(sk, skb);
711807
712808 if (skb->len >= 128)
713
- tcp_grow_window(sk, skb);
809
+ tcp_grow_window(sk, skb, true);
714810 }
715811
716812 /* Called to compute a smoothed rtt estimate. The data fed to this
....@@ -774,6 +870,8 @@
774870 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
775871 tp->rtt_seq = tp->snd_nxt;
776872 tp->mdev_max_us = tcp_rto_min_us(sk);
873
+
874
+ tcp_bpf_rtt(sk);
777875 }
778876 } else {
779877 /* no previous measure. */
....@@ -782,6 +880,8 @@
782880 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
783881 tp->mdev_max_us = tp->rttvar_us;
784882 tp->rtt_seq = tp->snd_nxt;
883
+
884
+ tcp_bpf_rtt(sk);
785885 }
786886 tp->srtt_us = max(1U, srtt);
787887 }
....@@ -859,12 +959,54 @@
859959 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
860960 }
861961
862
-/* Take a notice that peer is sending D-SACKs */
863
-static void tcp_dsack_seen(struct tcp_sock *tp)
962
+struct tcp_sacktag_state {
963
+ /* Timestamps for earliest and latest never-retransmitted segment
964
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
965
+ * but congestion control should still get an accurate delay signal.
966
+ */
967
+ u64 first_sackt;
968
+ u64 last_sackt;
969
+ u32 reord;
970
+ u32 sack_delivered;
971
+ int flag;
972
+ unsigned int mss_now;
973
+ struct rate_sample *rate;
974
+};
975
+
976
+/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
977
+ * and spurious retransmission information if this DSACK is unlikely caused by
978
+ * sender's action:
979
+ * - DSACKed sequence range is larger than maximum receiver's window.
980
+ * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
981
+ */
982
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
983
+ u32 end_seq, struct tcp_sacktag_state *state)
864984 {
985
+ u32 seq_len, dup_segs = 1;
986
+
987
+ if (!before(start_seq, end_seq))
988
+ return 0;
989
+
990
+ seq_len = end_seq - start_seq;
991
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
992
+ if (seq_len > tp->max_window)
993
+ return 0;
994
+ if (seq_len > tp->mss_cache)
995
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
996
+
997
+ tp->dsack_dups += dup_segs;
998
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
999
+ if (tp->dsack_dups > tp->total_retrans)
1000
+ return 0;
1001
+
8651002 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
8661003 tp->rack.dsack_seen = 1;
867
- tp->dsack_dups++;
1004
+
1005
+ state->flag |= FLAG_DSACKING_ACK;
1006
+ /* A spurious retransmission is delivered */
1007
+ state->sack_delivered += dup_segs;
1008
+
1009
+ return dup_segs;
8681010 }
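The duplicate-segment estimate above is a ceiling division of the DSACKed byte range by the cached MSS: a 3000-byte DSACK block with mss_cache = 1448 counts as DIV_ROUND_UP(3000, 1448) = 3 duplicate segments, and the block is discarded as dubious if it is wider than the peer's maximum window or if the running dsack_dups total would exceed total_retrans. Stripped-down restatement with a hypothetical helper name:

static u32 dsack_dup_segs(u32 start_seq, u32 end_seq, u32 mss_cache, u32 max_window)
{
	u32 seq_len = end_seq - start_seq;

	if (!before(start_seq, end_seq) || seq_len > max_window)
		return 0;	/* empty/inverted range or wider than rwnd: dubious */
	return seq_len > mss_cache ? DIV_ROUND_UP(seq_len, mss_cache) : 1;
}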
8691011
8701012 /* It's reordering when higher sequence was delivered (i.e. sacked) before
....@@ -893,7 +1035,7 @@
8931035 tp->undo_marker ? tp->undo_retrans : 0);
8941036 #endif
8951037 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
896
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1038
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
8971039 }
8981040
8991041 /* This exciting event is worth to be remembered. 8) */
....@@ -902,7 +1044,11 @@
9021044 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
9031045 }
9041046
905
-/* This must be called before lost_out is incremented */
1047
+ /* This must be called before lost_out or retrans_out are updated
1048
+ * on a new loss, because we want to know if all skbs previously
1049
+ * known to be lost have already been retransmitted, indicating
1050
+ * that this newly lost skb is our next skb to retransmit.
1051
+ */
9061052 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
9071053 {
9081054 if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
....@@ -912,42 +1058,46 @@
9121058 tp->retransmit_skb_hint = skb;
9131059 }
9141060
915
-/* Sum the number of packets on the wire we have marked as lost.
916
- * There are two cases we care about here:
917
- * a) Packet hasn't been marked lost (nor retransmitted),
918
- * and this is the first loss.
919
- * b) Packet has been marked both lost and retransmitted,
920
- * and this means we think it was lost again.
1061
+/* Sum the number of packets on the wire we have marked as lost, and
1062
+ * notify the congestion control module that the given skb was marked lost.
9211063 */
922
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
1064
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
1065
+{
1066
+ tp->lost += tcp_skb_pcount(skb);
1067
+}
1068
+
1069
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
9231070 {
9241071 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1072
+ struct tcp_sock *tp = tcp_sk(sk);
9251073
926
- if (!(sacked & TCPCB_LOST) ||
927
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
928
- tp->lost += tcp_skb_pcount(skb);
929
-}
1074
+ if (sacked & TCPCB_SACKED_ACKED)
1075
+ return;
9301076
931
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
932
-{
933
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
934
- tcp_verify_retransmit_hint(tp, skb);
935
-
936
- tp->lost_out += tcp_skb_pcount(skb);
937
- tcp_sum_lost(tp, skb);
938
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
939
- }
940
-}
941
-
942
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
943
-{
9441077 tcp_verify_retransmit_hint(tp, skb);
945
-
946
- tcp_sum_lost(tp, skb);
947
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1078
+ if (sacked & TCPCB_LOST) {
1079
+ if (sacked & TCPCB_SACKED_RETRANS) {
1080
+ /* Account for retransmits that are lost again */
1081
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082
+ tp->retrans_out -= tcp_skb_pcount(skb);
1083
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1084
+ tcp_skb_pcount(skb));
1085
+ tcp_notify_skb_loss_event(tp, skb);
1086
+ }
1087
+ } else {
9481088 tp->lost_out += tcp_skb_pcount(skb);
9491089 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1090
+ tcp_notify_skb_loss_event(tp, skb);
9501091 }
1092
+}
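Condensing the branches of tcp_mark_skb_lost() above, by the skb's sacked bits (the retransmit hint is refreshed first in every case that is not already SACKed):

/*  SACKED_ACKED set       -> return, the data was delivered, nothing to mark
 *  LOST | SACKED_RETRANS  -> retransmit lost again: clear SACKED_RETRANS,
 *                            retrans_out -= pcount, bump TCPLOSTRETRANSMIT,
 *                            notify congestion control
 *  LOST only              -> already accounted, no change
 *  neither                -> first loss: lost_out += pcount, set TCPCB_LOST,
 *                            notify congestion control
 */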
1093
+
1094
+/* Updates the delivered and delivered_ce counts */
1095
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1096
+ bool ece_ack)
1097
+{
1098
+ tp->delivered += delivered;
1099
+ if (ece_ack)
1100
+ tp->delivered_ce += delivered;
9511101 }
9521102
9531103 /* This procedure tags the retransmission queue when SACKs arrive.
....@@ -1082,51 +1232,42 @@
10821232
10831233 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
10841234 struct tcp_sack_block_wire *sp, int num_sacks,
1085
- u32 prior_snd_una)
1235
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
10861236 {
10871237 struct tcp_sock *tp = tcp_sk(sk);
10881238 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
10891239 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1090
- bool dup_sack = false;
1240
+ u32 dup_segs;
10911241
10921242 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1093
- dup_sack = true;
1094
- tcp_dsack_seen(tp);
10951243 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
10961244 } else if (num_sacks > 1) {
10971245 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
10981246 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
10991247
1100
- if (!after(end_seq_0, end_seq_1) &&
1101
- !before(start_seq_0, start_seq_1)) {
1102
- dup_sack = true;
1103
- tcp_dsack_seen(tp);
1104
- NET_INC_STATS(sock_net(sk),
1105
- LINUX_MIB_TCPDSACKOFORECV);
1106
- }
1248
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
1249
+ return false;
1250
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1251
+ } else {
1252
+ return false;
11071253 }
11081254
1255
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1256
+ if (!dup_segs) { /* Skip dubious DSACK */
1257
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1258
+ return false;
1259
+ }
1260
+
1261
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1262
+
11091263 /* D-SACK for already forgotten data... Do dumb counting. */
1110
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1264
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
11111265 !after(end_seq_0, prior_snd_una) &&
11121266 after(end_seq_0, tp->undo_marker))
1113
- tp->undo_retrans--;
1267
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
11141268
1115
- return dup_sack;
1269
+ return true;
11161270 }
1117
-
1118
-struct tcp_sacktag_state {
1119
- u32 reord;
1120
- /* Timestamps for earliest and latest never-retransmitted segment
1121
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
1122
- * but congestion control should still get an accurate delay signal.
1123
- */
1124
- u64 first_sackt;
1125
- u64 last_sackt;
1126
- struct rate_sample *rate;
1127
- int flag;
1128
- unsigned int mss_now;
1129
-};
11301271
11311272 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
11321273 * the incoming SACK may not exactly match but we can find smaller MSS
....@@ -1246,7 +1387,8 @@
12461387 sacked |= TCPCB_SACKED_ACKED;
12471388 state->flag |= FLAG_DATA_SACKED;
12481389 tp->sacked_out += pcount;
1249
- tp->delivered += pcount; /* Out-of-order packets delivered */
1390
+ /* Out-of-order packets delivered */
1391
+ state->sack_delivered += pcount;
12501392
12511393 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
12521394 if (tp->lost_skb_hint &&
....@@ -1289,7 +1431,7 @@
12891431 */
12901432 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
12911433 start_seq, end_seq, dup_sack, pcount,
1292
- skb->skb_mstamp);
1434
+ tcp_skb_timestamp_us(skb));
12931435 tcp_rate_skb_delivered(sk, skb, state->rate);
12941436
12951437 if (skb == tp->lost_skb_hint)
....@@ -1413,7 +1555,7 @@
14131555 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
14141556 goto fallback;
14151557
1416
- if (!tcp_skb_can_collapse_to(prev))
1558
+ if (!tcp_skb_can_collapse(prev, skb))
14171559 goto fallback;
14181560
14191561 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
....@@ -1502,6 +1644,8 @@
15021644 (mss != tcp_skb_seglen(skb)))
15031645 goto out;
15041646
1647
+ if (!tcp_skb_can_collapse(prev, skb))
1648
+ goto out;
15051649 len = skb->len;
15061650 pcount = tcp_skb_pcount(skb);
15071651 if (tcp_skb_shift(prev, skb, pcount, len))
....@@ -1578,7 +1722,7 @@
15781722 TCP_SKB_CB(skb)->end_seq,
15791723 dup_sack,
15801724 tcp_skb_pcount(skb),
1581
- skb->skb_mstamp);
1725
+ tcp_skb_timestamp_us(skb));
15821726 tcp_rate_skb_delivered(sk, skb, state->rate);
15831727 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
15841728 list_del_init(&skb->tcp_tsorted_anchor);
....@@ -1591,9 +1735,7 @@
15911735 return skb;
15921736 }
15931737
1594
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1595
- struct tcp_sacktag_state *state,
1596
- u32 seq)
1738
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
15971739 {
15981740 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
15991741 struct sk_buff *skb;
....@@ -1615,13 +1757,12 @@
16151757 }
16161758
16171759 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1618
- struct tcp_sacktag_state *state,
16191760 u32 skip_to_seq)
16201761 {
16211762 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
16221763 return skb;
16231764
1624
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1765
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
16251766 }
16261767
16271768 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
....@@ -1634,7 +1775,7 @@
16341775 return skb;
16351776
16361777 if (before(next_dup->start_seq, skip_to_seq)) {
1637
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1778
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
16381779 skb = tcp_sacktag_walk(skb, sk, NULL, state,
16391780 next_dup->start_seq, next_dup->end_seq,
16401781 1);
....@@ -1672,11 +1813,7 @@
16721813 tcp_highest_sack_reset(sk);
16731814
16741815 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1675
- num_sacks, prior_snd_una);
1676
- if (found_dup_sack) {
1677
- state->flag |= FLAG_DSACKING_ACK;
1678
- tp->delivered++; /* A spurious retransmission is delivered */
1679
- }
1816
+ num_sacks, prior_snd_una, state);
16801817
16811818 /* Eliminate too old ACKs, but take into
16821819 * account more or less fresh ones, they can
....@@ -1778,8 +1915,7 @@
17781915
17791916 /* Head todo? */
17801917 if (before(start_seq, cache->start_seq)) {
1781
- skb = tcp_sacktag_skip(skb, sk, state,
1782
- start_seq);
1918
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
17831919 skb = tcp_sacktag_walk(skb, sk, next_dup,
17841920 state,
17851921 start_seq,
....@@ -1805,7 +1941,7 @@
18051941 goto walk;
18061942 }
18071943
1808
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1944
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
18091945 /* Check overlap against next cached too (past this one already) */
18101946 cache++;
18111947 continue;
....@@ -1816,7 +1952,7 @@
18161952 if (!skb)
18171953 break;
18181954 }
1819
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1955
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
18201956
18211957 walk:
18221958 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
....@@ -1878,34 +2014,39 @@
18782014 return;
18792015
18802016 tp->reordering = min_t(u32, tp->packets_out + addend,
1881
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
2017
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
18822018 tp->reord_seen++;
18832019 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
18842020 }
18852021
18862022 /* Emulate SACKs for SACKless connection: account for a new dupack. */
18872023
1888
-static void tcp_add_reno_sack(struct sock *sk)
2024
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
18892025 {
1890
- struct tcp_sock *tp = tcp_sk(sk);
1891
- u32 prior_sacked = tp->sacked_out;
2026
+ if (num_dupack) {
2027
+ struct tcp_sock *tp = tcp_sk(sk);
2028
+ u32 prior_sacked = tp->sacked_out;
2029
+ s32 delivered;
18922030
1893
- tp->sacked_out++;
1894
- tcp_check_reno_reordering(sk, 0);
1895
- if (tp->sacked_out > prior_sacked)
1896
- tp->delivered++; /* Some out-of-order packet is delivered */
1897
- tcp_verify_left_out(tp);
2031
+ tp->sacked_out += num_dupack;
2032
+ tcp_check_reno_reordering(sk, 0);
2033
+ delivered = tp->sacked_out - prior_sacked;
2034
+ if (delivered > 0)
2035
+ tcp_count_delivered(tp, delivered, ece_ack);
2036
+ tcp_verify_left_out(tp);
2037
+ }
18982038 }
18992039
19002040 /* Account for ACK, ACKing some data in Reno Recovery phase. */
19012041
1902
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
2042
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
19032043 {
19042044 struct tcp_sock *tp = tcp_sk(sk);
19052045
19062046 if (acked > 0) {
19072047 /* One ACK acked hole. The rest eat duplicate ACKs. */
1908
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
2048
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
2049
+ ece_ack);
19092050 if (acked - 1 >= tp->sacked_out)
19102051 tp->sacked_out = 0;
19112052 else
....@@ -1938,7 +2079,8 @@
19382079
19392080 static bool tcp_is_rack(const struct sock *sk)
19402081 {
1941
- return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
2082
+ return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2083
+ TCP_RACK_LOSS_DETECTION;
19422084 }
19432085
19442086 /* If we detect SACK reneging, forget all SACK information
....@@ -1982,6 +2124,7 @@
19822124 struct tcp_sock *tp = tcp_sk(sk);
19832125 struct net *net = sock_net(sk);
19842126 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2127
+ u8 reordering;
19852128
19862129 tcp_timeout_mark_lost(sk);
19872130
....@@ -2002,10 +2145,12 @@
20022145 /* Timeout in disordered state after receiving substantial DUPACKs
20032146 * suggests that the degree of reordering is over-estimated.
20042147 */
2148
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
20052149 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2006
- tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
2150
+ tp->sacked_out >= reordering)
20072151 tp->reordering = min_t(unsigned int, tp->reordering,
2008
- net->ipv4.sysctl_tcp_reordering);
2152
+ reordering);
2153
+
20092154 tcp_set_ca_state(sk, TCP_CA_Loss);
20102155 tp->high_seq = tp->snd_nxt;
20112156 tcp_ecn_queue_cwr(tp);
....@@ -2014,7 +2159,7 @@
20142159 * loss recovery is underway except recurring timeout(s) on
20152160 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
20162161 */
2017
- tp->frto = net->ipv4.sysctl_tcp_frto &&
2162
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
20182163 (new_recovery || icsk->icsk_retransmits) &&
20192164 !inet_csk(sk)->icsk_mtup.probe_size;
20202165 }
....@@ -2031,7 +2176,8 @@
20312176 */
20322177 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
20332178 {
2034
- if (flag & FLAG_SACK_RENEGING) {
2179
+ if (flag & FLAG_SACK_RENEGING &&
2180
+ flag & FLAG_SND_UNA_ADVANCED) {
20352181 struct tcp_sock *tp = tcp_sk(sk);
20362182 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
20372183 msecs_to_jiffies(10));
....@@ -2172,8 +2318,7 @@
21722318 }
21732319
21742320 /* Detect loss in event "A" above by marking head of queue up as lost.
2175
- * For non-SACK(Reno) senders, the first "packets" number of segments
2176
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2321
+ * For RFC3517 SACK, a segment is considered lost if it
21772322 * has at least tp->reordering SACKed segments above it; "packets" refers to
21782323 * the maximum SACKed segments to pass before reaching this limit.
21792324 */
....@@ -2181,10 +2326,9 @@
21812326 {
21822327 struct tcp_sock *tp = tcp_sk(sk);
21832328 struct sk_buff *skb;
2184
- int cnt, oldcnt, lost;
2185
- unsigned int mss;
2329
+ int cnt;
21862330 /* Use SACK to deduce losses of new sequences sent during recovery */
2187
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2331
+ const u32 loss_high = tp->snd_nxt;
21882332
21892333 WARN_ON(packets > tp->packets_out);
21902334 skb = tp->lost_skb_hint;
....@@ -2207,28 +2351,14 @@
22072351 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
22082352 break;
22092353
2210
- oldcnt = cnt;
2211
- if (tcp_is_reno(tp) ||
2212
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2354
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
22132355 cnt += tcp_skb_pcount(skb);
22142356
2215
- if (cnt > packets) {
2216
- if (tcp_is_sack(tp) ||
2217
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2218
- (oldcnt >= packets))
2219
- break;
2357
+ if (cnt > packets)
2358
+ break;
22202359
2221
- mss = tcp_skb_mss(skb);
2222
- /* If needed, chop off the prefix to mark as lost. */
2223
- lost = (packets - oldcnt) * mss;
2224
- if (lost < skb->len &&
2225
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2226
- lost, mss, GFP_ATOMIC) < 0)
2227
- break;
2228
- cnt = packets;
2229
- }
2230
-
2231
- tcp_skb_mark_lost(tp, skb);
2360
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2361
+ tcp_mark_skb_lost(sk, skb);
22322362
22332363 if (mark_head)
22342364 break;
....@@ -2272,7 +2402,7 @@
22722402 */
22732403 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
22742404 {
2275
- return !tp->retrans_stamp ||
2405
+ return tp->retrans_stamp &&
22762406 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
22772407 }
22782408
....@@ -2368,6 +2498,21 @@
23682498 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
23692499 }
23702500
2501
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2502
+{
2503
+ struct tcp_sock *tp = tcp_sk(sk);
2504
+
2505
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2506
+ /* Hold old state until something *above* high_seq
2507
+ * is ACKed. For Reno it is MUST to prevent false
2508
+ * fast retransmits (RFC2582). SACK TCP is safe. */
2509
+ if (!tcp_any_retrans_done(sk))
2510
+ tp->retrans_stamp = 0;
2511
+ return true;
2512
+ }
2513
+ return false;
2514
+}
2515
+
23712516 /* People celebrate: "We love our President!" */
23722517 static bool tcp_try_undo_recovery(struct sock *sk)
23732518 {
....@@ -2390,14 +2535,8 @@
23902535 } else if (tp->rack.reo_wnd_persist) {
23912536 tp->rack.reo_wnd_persist--;
23922537 }
2393
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2394
- /* Hold old state until something *above* high_seq
2395
- * is ACKed. For Reno it is MUST to prevent false
2396
- * fast retransmits (RFC2582). SACK TCP is safe. */
2397
- if (!tcp_any_retrans_done(sk))
2398
- tp->retrans_stamp = 0;
2538
+ if (tcp_is_non_sack_preventing_reopen(sk))
23992539 return true;
2400
- }
24012540 tcp_set_ca_state(sk, TCP_CA_Open);
24022541 tp->is_sack_reneg = 0;
24032542 return false;
....@@ -2433,6 +2572,8 @@
24332572 NET_INC_STATS(sock_net(sk),
24342573 LINUX_MIB_TCPSPURIOUSRTOS);
24352574 inet_csk(sk)->icsk_retransmits = 0;
2575
+ if (tcp_is_non_sack_preventing_reopen(sk))
2576
+ return true;
24362577 if (frto_undo || tcp_is_sack(tp)) {
24372578 tcp_set_ca_state(sk, TCP_CA_Open);
24382579 tp->is_sack_reneg = 0;
....@@ -2479,8 +2620,8 @@
24792620 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
24802621 tp->prior_cwnd - 1;
24812622 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2482
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2483
- !(flag & FLAG_LOST_RETRANS)) {
2623
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
2624
+ FLAG_RETRANS_DATA_ACKED) {
24842625 sndcnt = min_t(int, delta,
24852626 max_t(int, tp->prr_delivered - tp->prr_out,
24862627 newly_acked_sacked) + 1);
....@@ -2566,12 +2707,15 @@
25662707 {
25672708 struct tcp_sock *tp = tcp_sk(sk);
25682709 struct inet_connection_sock *icsk = inet_csk(sk);
2710
+ u64 val;
25692711
2570
- /* FIXME: breaks with very large cwnd */
25712712 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2572
- tp->snd_cwnd = tp->snd_cwnd *
2573
- tcp_mss_to_mtu(sk, tp->mss_cache) /
2574
- icsk->icsk_mtup.probe_size;
2713
+
2714
+ val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
2715
+ do_div(val, icsk->icsk_mtup.probe_size);
2716
+ WARN_ON_ONCE((u32)val != val);
2717
+ tp->snd_cwnd = max_t(u32, 1U, val);
2718
+
25752719 tp->snd_cwnd_cnt = 0;
25762720 tp->snd_cwnd_stamp = tcp_jiffies32;
25772721 tp->snd_ssthresh = tcp_current_ssthresh(sk);
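The u64 intermediate exists purely to avoid 32-bit overflow of snd_cwnd * mtu before the division; for example (illustrative numbers):

/* snd_cwnd = 4,000,000, old mtu = 1500, probe_size = 1600:
 *   32-bit: 4000000 * 1500 = 6000000000 wraps to 1705032704 -> cwnd 1065645 (bogus)
 *   64-bit: 6000000000 / 1600 = 3750000  -> cwnd 3750000, i.e. cwnd scaled by 1500/1600
 * the max_t() additionally guarantees the new cwnd never collapses to 0.
 */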
....@@ -2594,14 +2738,8 @@
25942738 unsigned int mss = tcp_current_mss(sk);
25952739
25962740 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2597
- if (tcp_skb_seglen(skb) > mss &&
2598
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2599
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2600
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2601
- tp->retrans_out -= tcp_skb_pcount(skb);
2602
- }
2603
- tcp_skb_mark_lost_uncond_verify(tp, skb);
2604
- }
2741
+ if (tcp_skb_seglen(skb) > mss)
2742
+ tcp_mark_skb_lost(sk, skb);
26052743 }
26062744
26072745 tcp_clear_retrans_hints_partial(tp);
....@@ -2656,13 +2794,13 @@
26562794 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
26572795 * recovered or spurious. Otherwise retransmits more on partial ACKs.
26582796 */
2659
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2797
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
26602798 int *rexmit)
26612799 {
26622800 struct tcp_sock *tp = tcp_sk(sk);
26632801 bool recovered = !before(tp->snd_una, tp->high_seq);
26642802
2665
- if ((flag & FLAG_SND_UNA_ADVANCED) &&
2803
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
26662804 tcp_try_undo_loss(sk, false))
26672805 return;
26682806
....@@ -2675,7 +2813,7 @@
26752813 return;
26762814
26772815 if (after(tp->snd_nxt, tp->high_seq)) {
2678
- if (flag & FLAG_DATA_SACKED || is_dupack)
2816
+ if (flag & FLAG_DATA_SACKED || num_dupack)
26792817 tp->frto = 0; /* Step 3.a. loss was real */
26802818 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
26812819 tp->high_seq = tp->snd_nxt;
....@@ -2701,16 +2839,25 @@
27012839 /* A Reno DUPACK means new data in F-RTO step 2.b above are
27022840 * delivered. Lower inflight to clock out (re)transmissions.
27032841 */
2704
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2705
- tcp_add_reno_sack(sk);
2842
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2843
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
27062844 else if (flag & FLAG_SND_UNA_ADVANCED)
27072845 tcp_reset_reno_sack(tp);
27082846 }
27092847 *rexmit = REXMIT_LOST;
27102848 }
27112849
2850
+static bool tcp_force_fast_retransmit(struct sock *sk)
2851
+{
2852
+ struct tcp_sock *tp = tcp_sk(sk);
2853
+
2854
+ return after(tcp_highest_sack_seq(tp),
2855
+ tp->snd_una + tp->reordering * tp->mss_cache);
2856
+}
2857
+
27122858 /* Undo during fast recovery after partial ACK. */
2713
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2859
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2860
+ bool *do_lost)
27142861 {
27152862 struct tcp_sock *tp = tcp_sk(sk);
27162863
....@@ -2735,7 +2882,9 @@
27352882 tcp_undo_cwnd_reduction(sk, true);
27362883 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
27372884 tcp_try_keep_open(sk);
2738
- return true;
2885
+ } else {
2886
+ /* Partial ACK arrived. Force fast retransmit. */
2887
+ *do_lost = tcp_force_fast_retransmit(sk);
27392888 }
27402889 return false;
27412890 }
....@@ -2759,14 +2908,6 @@
27592908 }
27602909 }
27612910
2762
-static bool tcp_force_fast_retransmit(struct sock *sk)
2763
-{
2764
- struct tcp_sock *tp = tcp_sk(sk);
2765
-
2766
- return after(tcp_highest_sack_seq(tp),
2767
- tp->snd_una + tp->reordering * tp->mss_cache);
2768
-}
2769
-
27702911 /* Process an event, which can update packets-in-flight not trivially.
27712912 * Main goal of this function is to calculate new estimate for left_out,
27722913 * taking into account both packets sitting in receiver's buffer and
....@@ -2780,20 +2921,21 @@
27802921 * tcp_xmit_retransmit_queue().
27812922 */
27822923 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2783
- bool is_dupack, int *ack_flag, int *rexmit)
2924
+ int num_dupack, int *ack_flag, int *rexmit)
27842925 {
27852926 struct inet_connection_sock *icsk = inet_csk(sk);
27862927 struct tcp_sock *tp = tcp_sk(sk);
27872928 int fast_rexmit = 0, flag = *ack_flag;
2788
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2789
- tcp_force_fast_retransmit(sk));
2929
+ bool ece_ack = flag & FLAG_ECE;
2930
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
2931
+ tcp_force_fast_retransmit(sk));
27902932
27912933 if (!tp->packets_out && tp->sacked_out)
27922934 tp->sacked_out = 0;
27932935
27942936 /* Now state machine starts.
27952937 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2796
- if (flag & FLAG_ECE)
2938
+ if (ece_ack)
27972939 tp->prior_ssthresh = 0;
27982940
27992941 /* B. In all the states check for reneging SACKs. */
....@@ -2833,35 +2975,37 @@
28332975 switch (icsk->icsk_ca_state) {
28342976 case TCP_CA_Recovery:
28352977 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2836
- if (tcp_is_reno(tp) && is_dupack)
2837
- tcp_add_reno_sack(sk);
2838
- } else {
2839
- if (tcp_try_undo_partial(sk, prior_snd_una))
2840
- return;
2841
- /* Partial ACK arrived. Force fast retransmit. */
2842
- do_lost = tcp_is_reno(tp) ||
2843
- tcp_force_fast_retransmit(sk);
2844
- }
2845
- if (tcp_try_undo_dsack(sk)) {
2846
- tcp_try_keep_open(sk);
2978
+ if (tcp_is_reno(tp))
2979
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
2980
+ } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
28472981 return;
2848
- }
2982
+
2983
+ if (tcp_try_undo_dsack(sk))
2984
+ tcp_try_keep_open(sk);
2985
+
28492986 tcp_identify_packet_loss(sk, ack_flag);
2987
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
2988
+ if (!tcp_time_to_recover(sk, flag))
2989
+ return;
2990
+ /* Undo reverts the recovery state. If loss is evident,
2991
+ * starts a new recovery (e.g. reordering then loss);
2992
+ */
2993
+ tcp_enter_recovery(sk, ece_ack);
2994
+ }
28502995 break;
28512996 case TCP_CA_Loss:
2852
- tcp_process_loss(sk, flag, is_dupack, rexmit);
2997
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
28532998 tcp_identify_packet_loss(sk, ack_flag);
28542999 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
28553000 (*ack_flag & FLAG_LOST_RETRANS)))
28563001 return;
28573002 /* Change state if cwnd is undone or retransmits are lost */
2858
- /* fall through */
3003
+ fallthrough;
28593004 default:
28603005 if (tcp_is_reno(tp)) {
28613006 if (flag & FLAG_SND_UNA_ADVANCED)
28623007 tcp_reset_reno_sack(tp);
2863
- if (is_dupack)
2864
- tcp_add_reno_sack(sk);
3008
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
28653009 }
28663010
28673011 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
....@@ -2885,7 +3029,7 @@
28853029 }
28863030
28873031 /* Otherwise enter Recovery state */
2888
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
3032
+ tcp_enter_recovery(sk, ece_ack);
28893033 fast_rexmit = 1;
28903034 }
28913035
....@@ -2896,7 +3040,7 @@
28963040
28973041 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
28983042 {
2899
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
3043
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
29003044 struct tcp_sock *tp = tcp_sk(sk);
29013045
29023046 if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
....@@ -2935,6 +3079,8 @@
29353079 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
29363080
29373081 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
3082
+ if (!delta)
3083
+ delta = 1;
29383084 seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
29393085 ca_rtt_us = seq_rtt_us;
29403086 }
....@@ -2988,7 +3134,7 @@
29883134 /* If the retrans timer is currently being used by Fast Open
29893135 * for SYN-ACK retrans purpose, stay put.
29903136 */
2991
- if (tp->fastopen_rsk)
3137
+ if (rcu_access_pointer(tp->fastopen_rsk))
29923138 return;
29933139
29943140 if (!tp->packets_out) {
....@@ -3004,8 +3150,8 @@
30043150 */
30053151 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
30063152 }
3007
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3008
- TCP_RTO_MAX);
3153
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3154
+ TCP_RTO_MAX);
30093155 }
30103156 }
30113157
....@@ -3061,7 +3207,7 @@
30613207 */
30623208 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
30633209 u32 prior_snd_una,
3064
- struct tcp_sacktag_state *sack)
3210
+ struct tcp_sacktag_state *sack, bool ece_ack)
30653211 {
30663212 const struct inet_connection_sock *icsk = inet_csk(sk);
30673213 u64 first_ackt, last_ackt;
....@@ -3086,8 +3232,6 @@
30863232 u8 sacked = scb->sacked;
30873233 u32 acked_pcount;
30883234
3089
- tcp_ack_tstamp(sk, skb, prior_snd_una);
3090
-
30913235 /* Determine how many packets and what bytes were acked, tso and else */
30923236 if (after(scb->end_seq, tp->snd_una)) {
30933237 if (tcp_skb_pcount(skb) == 1 ||
....@@ -3107,7 +3251,7 @@
31073251 tp->retrans_out -= acked_pcount;
31083252 flag |= FLAG_RETRANS_DATA_ACKED;
31093253 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3110
- last_ackt = skb->skb_mstamp;
3254
+ last_ackt = tcp_skb_timestamp_us(skb);
31113255 WARN_ON_ONCE(last_ackt == 0);
31123256 if (!first_ackt)
31133257 first_ackt = last_ackt;
....@@ -3122,10 +3266,10 @@
31223266 if (sacked & TCPCB_SACKED_ACKED) {
31233267 tp->sacked_out -= acked_pcount;
31243268 } else if (tcp_is_sack(tp)) {
3125
- tp->delivered += acked_pcount;
3269
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
31263270 if (!tcp_skb_spurious_retrans(tp, skb))
31273271 tcp_rack_advance(tp, sacked, scb->end_seq,
3128
- skb->skb_mstamp);
3272
+ tcp_skb_timestamp_us(skb));
31293273 }
31303274 if (sacked & TCPCB_LOST)
31313275 tp->lost_out -= acked_pcount;
....@@ -3151,6 +3295,8 @@
31513295 if (!fully_acked)
31523296 break;
31533297
3298
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3299
+
31543300 next = skb_rb_next(skb);
31553301 if (unlikely(skb == tp->retransmit_skb_hint))
31563302 tp->retransmit_skb_hint = NULL;
....@@ -3166,8 +3312,11 @@
31663312 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
31673313 tp->snd_up = tp->snd_una;
31683314
3169
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3170
- flag |= FLAG_SACK_RENEGING;
3315
+ if (skb) {
3316
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3317
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3318
+ flag |= FLAG_SACK_RENEGING;
3319
+ }
31713320
31723321 if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
31733322 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
....@@ -3199,7 +3348,7 @@
31993348 }
32003349
32013350 if (tcp_is_reno(tp)) {
3202
- tcp_remove_reno_sacks(sk, pkts_acked);
3351
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
32033352
32043353 /* If any of the cumulatively ACKed segments was
32053354 * retransmitted, non-SACK case cannot confirm that
....@@ -3220,7 +3369,8 @@
32203369 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
32213370 }
32223371 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3223
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3372
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3373
+ tcp_skb_timestamp_us(skb))) {
32243374 /* Do not re-arm RTO if the sack RTT is measured from data sent
32253375 * after when the head was last (re)transmitted. Otherwise the
32263376 * timeout may continue to extend in loss recovery.
....@@ -3273,6 +3423,7 @@
32733423 return;
32743424 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
32753425 icsk->icsk_backoff = 0;
3426
+ icsk->icsk_probes_tstamp = 0;
32763427 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
32773428 /* Socket must be waked up by subsequent tcp_data_snd_check().
32783429 * This function is not for random using!
....@@ -3280,8 +3431,8 @@
32803431 } else {
32813432 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
32823433
3283
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3284
- when, TCP_RTO_MAX);
3434
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
3435
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
32853436 }
32863437 }
32873438
....@@ -3300,7 +3451,8 @@
33003451 * new SACK or ECE mark may first advance cwnd here and later reduce
33013452 * cwnd in tcp_fastretrans_alert() based on more states.
33023453 */
3303
- if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3454
+ if (tcp_sk(sk)->reordering >
3455
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
33043456 return flag & FLAG_FORWARD_PROGRESS;
33053457
33063458 return flag & FLAG_DATA_ACKED;
....@@ -3412,7 +3564,8 @@
34123564 if (*last_oow_ack_time) {
34133565 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
34143566
3415
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
3567
+ if (0 <= elapsed &&
3568
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
34163569 NET_INC_STATS(net, mib_idx);
34173570 return true; /* rate-limited: don't send yet! */
34183571 }
....@@ -3459,11 +3612,11 @@
34593612
34603613 /* Then check host-wide RFC 5961 rate limit. */
34613614 now = jiffies / HZ;
3462
- if (now != challenge_timestamp) {
3463
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
3615
+ if (now != READ_ONCE(challenge_timestamp)) {
3616
+ u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
34643617 u32 half = (ack_limit + 1) >> 1;
34653618
3466
- challenge_timestamp = now;
3619
+ WRITE_ONCE(challenge_timestamp, now);
34673620 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
34683621 }
34693622 count = READ_ONCE(challenge_count);
....@@ -3544,10 +3697,10 @@
35443697 {
35453698 struct tcp_sock *tp = tcp_sk(sk);
35463699
3547
- if (rexmit == REXMIT_NONE)
3700
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
35483701 return;
35493702
3550
- if (unlikely(rexmit == 2)) {
3703
+ if (unlikely(rexmit == REXMIT_NEW)) {
35513704 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
35523705 TCP_NAGLE_OFF);
35533706 if (after(tp->snd_nxt, tp->high_seq))
....@@ -3566,10 +3719,9 @@
35663719
35673720 delivered = tp->delivered - prior_delivered;
35683721 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3569
- if (flag & FLAG_ECE) {
3570
- tp->delivered_ce += delivered;
3722
+ if (flag & FLAG_ECE)
35713723 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3572
- }
3724
+
35733725 return delivered;
35743726 }
35753727
....@@ -3584,7 +3736,7 @@
35843736 bool is_sack_reneg = tp->is_sack_reneg;
35853737 u32 ack_seq = TCP_SKB_CB(skb)->seq;
35863738 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3587
- bool is_dupack = false;
3739
+ int num_dupack = 0;
35883740 int prior_packets = tp->packets_out;
35893741 u32 delivered = tp->delivered;
35903742 u32 lost = tp->lost;
....@@ -3593,6 +3745,7 @@
35933745
35943746 sack_state.first_sackt = 0;
35953747 sack_state.rate = &rs;
3748
+ sack_state.sack_delivered = 0;
35963749
35973750 /* We very likely will need to access rtx queue. */
35983751 prefetch(sk->tcp_rtx_queue.rb_node);
....@@ -3614,14 +3767,14 @@
36143767 * this segment (RFC793 Section 3.9).
36153768 */
36163769 if (after(ack, tp->snd_nxt))
3617
- goto invalid_ack;
3770
+ return -1;
36183771
36193772 if (after(ack, prior_snd_una)) {
36203773 flag |= FLAG_SND_UNA_ADVANCED;
36213774 icsk->icsk_retransmits = 0;
36223775
36233776 #if IS_ENABLED(CONFIG_TLS_DEVICE)
3624
- if (static_branch_unlikely(&clean_acked_data_enabled))
3777
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
36253778 if (icsk->icsk_clean_acked)
36263779 icsk->icsk_clean_acked(sk, ack);
36273780 #endif
....@@ -3636,7 +3789,8 @@
36363789 if (flag & FLAG_UPDATE_TS_RECENT)
36373790 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
36383791
3639
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3792
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
3793
+ FLAG_SND_UNA_ADVANCED) {
36403794 /* Window is constant, pure forward advance.
36413795 * No more checks are required.
36423796 * Note, we use the fact that SND.UNA>=SND.WL2.
....@@ -3667,6 +3821,10 @@
36673821 ack_ev_flags |= CA_ACK_ECE;
36683822 }
36693823
3824
+ if (sack_state.sack_delivered)
3825
+ tcp_count_delivered(tp, sack_state.sack_delivered,
3826
+ flag & FLAG_ECE);
3827
+
36703828 if (flag & FLAG_WIN_UPDATE)
36713829 ack_ev_flags |= CA_ACK_WIN_UPDATE;
36723830
....@@ -3692,7 +3850,8 @@
36923850 goto no_queue;
36933851
36943852 /* See if we can take anything off of the retransmit queue. */
3695
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3853
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
3854
+ flag & FLAG_ECE);
36963855
36973856 tcp_rack_update_reo_wnd(sk, &rs);
36983857
....@@ -3700,8 +3859,14 @@
37003859 tcp_process_tlp_ack(sk, ack, flag);
37013860
37023861 if (tcp_ack_is_dubious(sk, flag)) {
3703
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3704
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3862
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
3863
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
3864
+ num_dupack = 1;
3865
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
3866
+ if (!(flag & FLAG_DATA))
3867
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
3868
+ }
3869
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37053870 &rexmit);
37063871 }
37073872
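Duplicate-ACK counting switches from a bool to num_dupack because tcp_add_backlog() can coalesce several pure ACKs into one skb; if, say, three pure duplicate ACKs were merged, the skb arrives with gso_segs == 3 and the SACK-less (Reno) accounting advances sacked_out by 3 in a single call instead of under-counting:

/* pure ACK (no data): one skb may stand for several coalesced duplicate ACKs */
num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);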
....@@ -3723,7 +3888,7 @@
37233888 no_queue:
37243889 /* If data was DSACKed, see if we can undo a cwnd reduction. */
37253890 if (flag & FLAG_DSACKING_ACK) {
3726
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3891
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37273892 &rexmit);
37283893 tcp_newly_delivered(sk, delivered, flag);
37293894 }
....@@ -3737,10 +3902,6 @@
37373902 tcp_process_tlp_ack(sk, ack, flag);
37383903 return 1;
37393904
3740
-invalid_ack:
3741
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3742
- return -1;
3743
-
37443905 old_ack:
37453906 /* If data was SACKed, tag it and see if we should send more data.
37463907 * If data was DSACKed, see if we can undo a cwnd reduction.
....@@ -3748,13 +3909,12 @@
37483909 if (TCP_SKB_CB(skb)->sacked) {
37493910 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
37503911 &sack_state);
3751
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3912
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37523913 &rexmit);
37533914 tcp_newly_delivered(sk, delivered, flag);
37543915 tcp_xmit_recovery(sk, rexmit);
37553916 }
37563917
3757
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
37583918 return 0;
37593919 }
37603920
....@@ -3775,7 +3935,7 @@
37753935 foc->exp = exp_opt;
37763936 }
37773937
3778
-static void smc_parse_options(const struct tcphdr *th,
3938
+static bool smc_parse_options(const struct tcphdr *th,
37793939 struct tcp_options_received *opt_rx,
37803940 const unsigned char *ptr,
37813941 int opsize)
....@@ -3784,10 +3944,56 @@
37843944 if (static_branch_unlikely(&tcp_have_smc)) {
37853945 if (th->syn && !(opsize & 1) &&
37863946 opsize >= TCPOLEN_EXP_SMC_BASE &&
3787
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3947
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
37883948 opt_rx->smc_ok = 1;
3949
+ return true;
3950
+ }
37893951 }
37903952 #endif
3953
+ return false;
3954
+}
3955
+
3956
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
3957
+ * value on success.
3958
+ */
3959
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
3960
+{
3961
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
3962
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
3963
+ u16 mss = 0;
3964
+
3965
+ while (length > 0) {
3966
+ int opcode = *ptr++;
3967
+ int opsize;
3968
+
3969
+ switch (opcode) {
3970
+ case TCPOPT_EOL:
3971
+ return mss;
3972
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3973
+ length--;
3974
+ continue;
3975
+ default:
3976
+ if (length < 2)
3977
+ return mss;
3978
+ opsize = *ptr++;
3979
+ if (opsize < 2) /* "silly options" */
3980
+ return mss;
3981
+ if (opsize > length)
3982
+ return mss; /* fail on partial options */
3983
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
3984
+ u16 in_mss = get_unaligned_be16(ptr);
3985
+
3986
+ if (in_mss) {
3987
+ if (user_mss && user_mss < in_mss)
3988
+ in_mss = user_mss;
3989
+ mss = in_mss;
3990
+ }
3991
+ }
3992
+ ptr += opsize - 2;
3993
+ length -= opsize;
3994
+ }
3995
+ }
3996
+ return mss;
37913997 }
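tcp_parse_mss_option() above walks only the option list, so it is usable on untrusted SYNs before any connection state exists; a hypothetical caller (names illustrative) clamps against user_mss and falls back to the protocol default when the option is absent:

static u16 demo_peer_mss(const struct sk_buff *skb, const struct tcp_sock *tp)
{
	u16 mss = tcp_parse_mss_option(tcp_hdr(skb), tp->rx_opt.user_mss);

	return mss ? : TCP_MSS_DEFAULT;		/* 536 when no valid MSS option */
}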
37923998
37933999 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
....@@ -3805,6 +4011,7 @@
38054011
38064012 ptr = (const unsigned char *)(th + 1);
38074013 opt_rx->saw_tstamp = 0;
4014
+ opt_rx->saw_unknown = 0;
38084015
38094016 while (length > 0) {
38104017 int opcode = *ptr++;
....@@ -3817,6 +4024,8 @@
38174024 length--;
38184025 continue;
38194026 default:
4027
+ if (length < 2)
4028
+ return;
38204029 opsize = *ptr++;
38214030 if (opsize < 2) /* "silly options" */
38224031 return;
....@@ -3836,7 +4045,7 @@
38364045 break;
38374046 case TCPOPT_WINDOW:
38384047 if (opsize == TCPOLEN_WINDOW && th->syn &&
3839
- !estab && net->ipv4.sysctl_tcp_window_scaling) {
4048
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
38404049 __u8 snd_wscale = *(__u8 *)ptr;
38414050 opt_rx->wscale_ok = 1;
38424051 if (snd_wscale > TCP_MAX_WSCALE) {
....@@ -3852,7 +4061,7 @@
38524061 case TCPOPT_TIMESTAMP:
38534062 if ((opsize == TCPOLEN_TIMESTAMP) &&
38544063 ((estab && opt_rx->tstamp_ok) ||
3855
- (!estab && net->ipv4.sysctl_tcp_timestamps))) {
4064
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
38564065 opt_rx->saw_tstamp = 1;
38574066 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
38584067 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
....@@ -3860,7 +4069,7 @@
38604069 break;
38614070 case TCPOPT_SACK_PERM:
38624071 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3863
- !estab && net->ipv4.sysctl_tcp_sack) {
4072
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
38644073 opt_rx->sack_ok = TCP_SACK_SEEN;
38654074 tcp_sack_reset(opt_rx);
38664075 }
....@@ -3893,15 +4102,21 @@
38934102 */
38944103 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
38954104 get_unaligned_be16(ptr) ==
3896
- TCPOPT_FASTOPEN_MAGIC)
4105
+ TCPOPT_FASTOPEN_MAGIC) {
38974106 tcp_parse_fastopen_option(opsize -
38984107 TCPOLEN_EXP_FASTOPEN_BASE,
38994108 ptr + 2, th->syn, foc, true);
3900
- else
3901
- smc_parse_options(th, opt_rx, ptr,
3902
- opsize);
4109
+ break;
4110
+ }
4111
+
4112
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
4113
+ break;
4114
+
4115
+ opt_rx->saw_unknown = 1;
39034116 break;
39044117
4118
+ default:
4119
+ opt_rx->saw_unknown = 1;
39054120 }
39064121 ptr += opsize-2;
39074122 length -= opsize;
....@@ -4117,7 +4332,7 @@
41174332 case TCP_ESTABLISHED:
41184333 /* Move to CLOSE_WAIT */
41194334 tcp_set_state(sk, TCP_CLOSE_WAIT);
4120
- inet_csk(sk)->icsk_ack.pingpong = 1;
4335
+ inet_csk_enter_pingpong_mode(sk);
41214336 break;
41224337
41234338 case TCP_CLOSE_WAIT:
....@@ -4189,7 +4404,7 @@
41894404 {
41904405 struct tcp_sock *tp = tcp_sk(sk);
41914406
4192
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4407
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
41934408 int mib_idx;
41944409
41954410 if (before(seq, tp->rcv_nxt))
....@@ -4215,6 +4430,18 @@
42154430 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
42164431 }
42174432
4433
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4434
+{
4435
+ /* When the ACK path fails or drops most ACKs, the sender would
4436
+ * timeout and spuriously retransmit the same segment repeatedly.
4437
+ * The receiver remembers and reflects via DSACKs. Leverage the
4438
+ * DSACK state and change the txhash to re-route speculatively.
4439
+ */
4440
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
4441
+ sk_rethink_txhash(sk))
4442
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4443
+}
4444
+
42184445 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
42194446 {
42204447 struct tcp_sock *tp = tcp_sk(sk);
....@@ -4224,9 +4451,10 @@
42244451 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
42254452 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
42264453
4227
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4454
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
42284455 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
42294456
4457
+ tcp_rcv_spurious_retrans(sk, skb);
42304458 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
42314459 end_seq = tp->rcv_nxt;
42324460 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
....@@ -4260,9 +4488,37 @@
42604488 sp[i] = sp[i + 1];
42614489 continue;
42624490 }
4263
- this_sack++, swalk++;
4491
+ this_sack++;
4492
+ swalk++;
42644493 }
42654494 }
4495
+
4496
+static void tcp_sack_compress_send_ack(struct sock *sk)
4497
+{
4498
+ struct tcp_sock *tp = tcp_sk(sk);
4499
+
4500
+ if (!tp->compressed_ack)
4501
+ return;
4502
+
4503
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
4504
+ __sock_put(sk);
4505
+
4506
+ /* Since we have to send one ack finally,
4507
+ * substract one from tp->compressed_ack to keep
4508
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
4509
+ */
4510
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4511
+ tp->compressed_ack - 1);
4512
+
4513
+ tp->compressed_ack = 0;
4514
+ tcp_send_ack(sk);
4515
+}
4516
+
4517
+/* Reasonable amount of sack blocks included in TCP SACK option
4518
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
4519
+ * Given that SACK packets might be lost, be conservative and use 2.
4520
+ */
4521
+#define TCP_SACK_BLOCKS_EXPECTED 2
42664522
42674523 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
42684524 {
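
tcp_sack_compress_send_ack(), added above, flushes any ACK that was being held back for compression: it cancels the pending hrtimer if needed, charges compressed_ack - 1 to LINUX_MIB_TCPACKCOMPRESSED (the one ACK about to go out no longer counts as compressed), and sends the ACK. TCP_SACK_BLOCKS_EXPECTED then lets tcp_sack_new_ofo_skb() force such a flush as soon as more than two SACK blocks would be needed, replacing the old check that only fired when the SACK array was completely full. A simplified accounting model (standalone sketch; the struct and field names are invented):

#include <stdio.h>

struct model_tp {
        unsigned int compressed_ack;        /* ACKs currently held back */
        unsigned long mib_ack_compressed;
};

static void sack_compress_send_ack(struct model_tp *tp)
{
        if (!tp->compressed_ack)
                return;

        /* One ACK goes out now, so only the remainder counts as compressed. */
        tp->mib_ack_compressed += tp->compressed_ack - 1;
        tp->compressed_ack = 0;
        printf("ACK sent, MIB total now %lu\n", tp->mib_ack_compressed);
}

int main(void)
{
        struct model_tp tp = { .compressed_ack = 4 };

        sack_compress_send_ack(&tp);        /* accounts 3, sends the 4th */
        return 0;
}
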
....@@ -4276,6 +4532,8 @@
42764532
42774533 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
42784534 if (tcp_sack_extend(sp, seq, end_seq)) {
4535
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4536
+ tcp_sack_compress_send_ack(sk);
42794537 /* Rotate this_sack to the first one. */
42804538 for (; this_sack > 0; this_sack--, sp--)
42814539 swap(*sp, *(sp - 1));
....@@ -4285,6 +4543,9 @@
42854543 }
42864544 }
42874545
4546
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4547
+ tcp_sack_compress_send_ack(sk);
4548
+
42884549 /* Could not find an adjacent existing SACK, build a new one,
42894550 * put it at the front, and shift everyone else down. We
42904551 * always know there is at least one SACK present already here.
....@@ -4292,8 +4553,6 @@
42924553 * If the sack array is full, forget about the last one.
42934554 */
42944555 if (this_sack >= TCP_NUM_SACKS) {
4295
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
4296
- tcp_send_ack(sk);
42974556 this_sack--;
42984557 tp->rx_opt.num_sacks--;
42994558 sp--;
....@@ -4345,7 +4604,6 @@
43454604 /**
43464605 * tcp_try_coalesce - try to merge skb to prior one
43474606 * @sk: socket
4348
- * @dest: destination queue
43494607 * @to: prior buffer
43504608 * @from: buffer to add in queue
43514609 * @fragstolen: pointer to boolean
....@@ -4367,6 +4625,9 @@
43674625
43684626 /* It's possible this segment overlaps with prior segment in queue */
43694627 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4628
+ return false;
4629
+
4630
+ if (!mptcp_skb_can_collapse(to, from))
43704631 return false;
43714632
43724633 #ifdef CONFIG_TLS_DEVICE
....@@ -4412,6 +4673,7 @@
44124673
44134674 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
44144675 {
4676
+ trace_android_vh_kfree_skb(skb);
44154677 sk_drops_add(sk, skb);
44164678 __kfree_skb(skb);
44174679 }
....@@ -4443,13 +4705,9 @@
44434705 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
44444706
44454707 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4446
- SOCK_DEBUG(sk, "ofo packet was already received\n");
44474708 tcp_drop(sk, skb);
44484709 continue;
44494710 }
4450
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4451
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4452
- TCP_SKB_CB(skb)->end_seq);
44534711
44544712 tail = skb_peek_tail(&sk->sk_receive_queue);
44554713 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
....@@ -4511,11 +4769,10 @@
45114769 tp->pred_flags = 0;
45124770 inet_csk_schedule_ack(sk);
45134771
4772
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
45144773 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
45154774 seq = TCP_SKB_CB(skb)->seq;
45164775 end_seq = TCP_SKB_CB(skb)->end_seq;
4517
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4518
- tp->rcv_nxt, seq, end_seq);
45194776
45204777 p = &tp->out_of_order_queue.rb_node;
45214778 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
....@@ -4541,7 +4798,7 @@
45414798 * and trigger fast retransmit.
45424799 */
45434800 if (tcp_is_sack(tp))
4544
- tcp_grow_window(sk, skb);
4801
+ tcp_grow_window(sk, skb, true);
45454802 kfree_skb_partial(skb, fragstolen);
45464803 skb = NULL;
45474804 goto add_sack;
....@@ -4629,19 +4886,18 @@
46294886 * and trigger fast retransmit.
46304887 */
46314888 if (tcp_is_sack(tp))
4632
- tcp_grow_window(sk, skb);
4889
+ tcp_grow_window(sk, skb, false);
46334890 skb_condense(skb);
46344891 skb_set_owner_r(skb, sk);
46354892 }
46364893 }
46374894
4638
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4639
- bool *fragstolen)
4895
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
4896
+ bool *fragstolen)
46404897 {
46414898 int eaten;
46424899 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
46434900
4644
- __skb_pull(skb, hdrlen);
46454901 eaten = (tail &&
46464902 tcp_try_coalesce(sk, tail,
46474903 skb, fragstolen)) ? 1 : 0;
....@@ -4692,7 +4948,7 @@
46924948 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
46934949 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
46944950
4695
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4951
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
46964952 WARN_ON_ONCE(fragstolen); /* should not happen */
46974953 __kfree_skb(skb);
46984954 }
....@@ -4724,6 +4980,9 @@
47244980 bool fragstolen;
47254981 int eaten;
47264982
4983
+ if (sk_is_mptcp(sk))
4984
+ mptcp_incoming_options(sk, skb);
4985
+
47274986 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
47284987 __kfree_skb(skb);
47294988 return;
....@@ -4753,7 +5012,7 @@
47535012 goto drop;
47545013 }
47555014
4756
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
5015
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
47575016 if (skb->len)
47585017 tcp_event_data_recv(sk, skb);
47595018 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
....@@ -4782,6 +5041,7 @@
47825041 }
47835042
47845043 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5044
+ tcp_rcv_spurious_retrans(sk, skb);
47855045 /* A retransmit, 2nd most common case. Force an immediate ack. */
47865046 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
47875047 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
....@@ -4800,10 +5060,6 @@
48005060
48015061 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
48025062 /* Partial packet, seq < rcv_next < end_seq */
4803
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4804
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4805
- TCP_SKB_CB(skb)->end_seq);
4806
-
48075063 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
48085064
48095065 /* If window is closed, drop tail of packet. But after
....@@ -4897,7 +5153,7 @@
48975153 /* The first skb to collapse is:
48985154 * - not SYN/FIN and
48995155 * - bloated or contains data before "start" or
4900
- * overlaps to the next one.
5156
+ * overlaps to the next one and mptcp allows collapsing.
49015157 */
49025158 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
49035159 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
....@@ -4906,7 +5162,7 @@
49065162 break;
49075163 }
49085164
4909
- if (n && n != tail &&
5165
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
49105166 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
49115167 end_of_skbs = false;
49125168 break;
....@@ -4939,6 +5195,7 @@
49395195 else
49405196 __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
49415197 skb_set_owner_r(nskb, sk);
5198
+ mptcp_skb_ext_move(nskb, skb);
49425199
49435200 /* Copy data, releasing collapsed skbs. */
49445201 while (copy > 0) {
....@@ -4958,6 +5215,7 @@
49585215 skb = tcp_collapse_one(sk, skb, list, root);
49595216 if (!skb ||
49605217 skb == tail ||
5218
+ !mptcp_skb_can_collapse(nskb, skb) ||
49615219 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
49625220 goto end;
49635221 #ifdef CONFIG_TLS_DEVICE
....@@ -5082,8 +5340,6 @@
50825340 {
50835341 struct tcp_sock *tp = tcp_sk(sk);
50845342
5085
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
5086
-
50875343 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
50885344
50895345 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
....@@ -5149,12 +5405,6 @@
51495405 return true;
51505406 }
51515407
5152
-/* When incoming ACK allowed to free some skb from write_queue,
5153
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
5154
- * on the exit from tcp input handler.
5155
- *
5156
- * PROBLEM: sndbuf expansion does not work well with largesend.
5157
- */
51585408 static void tcp_new_space(struct sock *sk)
51595409 {
51605410 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5167,18 +5417,25 @@
51675417 sk->sk_write_space(sk);
51685418 }
51695419
5170
-static void tcp_check_space(struct sock *sk)
5420
+/* Caller made space either from:
5421
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5422
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5423
+ *
5424
+ * We might be able to generate EPOLLOUT to the application if:
5425
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5426
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5427
+ * small enough that tcp_stream_memory_free() decides it
5428
+ * is time to generate EPOLLOUT.
5429
+ */
5430
+void tcp_check_space(struct sock *sk)
51715431 {
5172
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5173
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5174
- /* pairs with tcp_poll() */
5175
- smp_mb();
5176
- if (sk->sk_socket &&
5177
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5178
- tcp_new_space(sk);
5179
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5180
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5181
- }
5432
+ /* pairs with tcp_poll() */
5433
+ smp_mb();
5434
+ if (sk->sk_socket &&
5435
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5436
+ tcp_new_space(sk);
5437
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5438
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
51825439 }
51835440 }
51845441
....@@ -5220,20 +5477,18 @@
52205477 }
52215478
52225479 if (!tcp_is_sack(tp) ||
5223
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5480
+ tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
52245481 goto send_now;
52255482
52265483 if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
52275484 tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5228
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
5229
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
5230
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
5231
- tp->compressed_ack = 0;
5485
+ tp->dup_ack_counter = 0;
52325486 }
5233
-
5234
- if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
5487
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5488
+ tp->dup_ack_counter++;
52355489 goto send_now;
5236
-
5490
+ }
5491
+ tp->compressed_ack++;
52375492 if (hrtimer_is_queued(&tp->compressed_ack_timer))
52385493 return;
52395494
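
__tcp_ack_snd_check() now splits the old counter in two: dup_ack_counter tracks the first TCP_FASTRETRANS_THRESH duplicate ACKs, which are still sent immediately so the peer's fast retransmit is not delayed, and only ACKs beyond that are deferred and counted in compressed_ack for the hrtimer to flush. A toy model of that decision (standalone sketch, invented struct name):

#include <stdbool.h>
#include <stdio.h>

#define TCP_FASTRETRANS_THRESH 3

struct model_tp {
        unsigned int dup_ack_counter;
        unsigned int compressed_ack;
};

/* true: send the ACK right away; false: let the compression timer batch it */
static bool ack_send_now(struct model_tp *tp)
{
        if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
                tp->dup_ack_counter++;
                return true;
        }
        tp->compressed_ack++;
        return false;
}

int main(void)
{
        struct model_tp tp = { 0, 0 };

        for (int i = 1; i <= 6; i++)
                printf("dupack %d -> %s\n", i,
                       ack_send_now(&tp) ? "send now" : "compress");
        return 0;
}
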
....@@ -5243,11 +5498,13 @@
52435498 if (tp->srtt_us && tp->srtt_us < rtt)
52445499 rtt = tp->srtt_us;
52455500
5246
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5501
+ delay = min_t(unsigned long,
5502
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
52475503 rtt * (NSEC_PER_USEC >> 3)/20);
52485504 sock_hold(sk);
5249
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5250
- HRTIMER_MODE_REL_PINNED_SOFT);
5505
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
5506
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5507
+ HRTIMER_MODE_REL_PINNED_SOFT);
52515508 }
52525509
52535510 static inline void tcp_ack_snd_check(struct sock *sk)
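
The switch from hrtimer_start() to hrtimer_start_range_ns() passes the new sysctl_tcp_comp_sack_slack_ns as a slack window, which lets the timer subsystem coalesce nearby compressed-ACK expirations. The delay itself is still min(sysctl_tcp_comp_sack_delay_ns, roughly 5% of the RTT); a small sketch of that computation, remembering that srtt_us is stored left-shifted by 3 (units of 1/8 us):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* delay_ns = min(sysctl delay, rtt/20); rtt_us_x8 is the RTT stored <<3 */
static uint64_t comp_sack_delay_ns(uint64_t sysctl_delay_ns, uint32_t rtt_us_x8)
{
        uint64_t rtt_based = (uint64_t)rtt_us_x8 * (NSEC_PER_USEC >> 3) / 20;

        return rtt_based < sysctl_delay_ns ? rtt_based : sysctl_delay_ns;
}

int main(void)
{
        /* 10 ms RTT (80000 in 1/8 us units), 1 ms sysctl cap -> 500 us */
        printf("%llu ns\n",
               (unsigned long long)comp_sack_delay_ns(1000000, 80000));
        return 0;
}
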
....@@ -5274,7 +5531,7 @@
52745531 struct tcp_sock *tp = tcp_sk(sk);
52755532 u32 ptr = ntohs(th->urg_ptr);
52765533
5277
- if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5534
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
52785535 ptr--;
52795536 ptr += ntohl(th->seq);
52805537
....@@ -5328,7 +5585,7 @@
53285585 }
53295586
53305587 tp->urg_data = TCP_URG_NOTYET;
5331
- tp->urg_seq = ptr;
5588
+ WRITE_ONCE(tp->urg_seq, ptr);
53325589
53335590 /* Disable header prediction. */
53345591 tp->pred_flags = 0;
....@@ -5481,6 +5738,8 @@
54815738 goto discard;
54825739 }
54835740
5741
+ bpf_skops_parse_hdr(sk, skb);
5742
+
54845743 return true;
54855744
54865745 discard:
....@@ -5521,7 +5780,7 @@
55215780 trace_tcp_probe(sk, skb);
55225781
55235782 tcp_mstamp_refresh(tp);
5524
- if (unlikely(!sk->sk_rx_dst))
5783
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
55255784 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
55265785 /*
55275786 * Header prediction.
....@@ -5628,8 +5887,8 @@
56285887 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
56295888
56305889 /* Bulk data transfer: receiver */
5631
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5632
- &fragstolen);
5890
+ __skb_pull(skb, tcp_header_len);
5891
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
56335892
56345893 tcp_event_data_recv(sk, skb);
56355894
....@@ -5691,6 +5950,34 @@
56915950 }
56925951 EXPORT_SYMBOL(tcp_rcv_established);
56935952
5953
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
5954
+{
5955
+ struct inet_connection_sock *icsk = inet_csk(sk);
5956
+ struct tcp_sock *tp = tcp_sk(sk);
5957
+
5958
+ tcp_mtup_init(sk);
5959
+ icsk->icsk_af_ops->rebuild_header(sk);
5960
+ tcp_init_metrics(sk);
5961
+
5962
+ /* Initialize the congestion window to start the transfer.
5963
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
5964
+ * retransmitted. In light of RFC6298 more aggressive 1sec
5965
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
5966
+ * retransmission has occurred.
5967
+ */
5968
+ if (tp->total_retrans > 1 && tp->undo_marker)
5969
+ tp->snd_cwnd = 1;
5970
+ else
5971
+ tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
5972
+ tp->snd_cwnd_stamp = tcp_jiffies32;
5973
+
5974
+ bpf_skops_established(sk, bpf_op, skb);
5975
+ /* Initialize congestion control unless BPF initialized it already: */
5976
+ if (!icsk->icsk_ca_initialized)
5977
+ tcp_init_congestion_control(sk);
5978
+ tcp_init_buffer_space(sk);
5979
+}
5980
+
56945981 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
56955982 {
56965983 struct tcp_sock *tp = tcp_sk(sk);
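
tcp_init_transfer() now takes a bpf_op/skb pair so bpf_skops_established() can run as the connection completes, and it skips tcp_init_congestion_control() when a BPF program already initialized congestion control (icsk_ca_initialized). The initial-cwnd rule from its comment reduces to a small decision: fall back to a cwnd of 1 only when more than one SYN/SYN-ACK retransmission happened and the timeout was not later proven spurious. For illustration only (invented function and parameter names):

#include <stdbool.h>
#include <stdio.h>

static unsigned int initial_cwnd(unsigned int total_retrans, bool undo_marker,
                                 unsigned int route_init_cwnd)
{
        /* >1 SYN/SYN-ACK rtx and not undone: be conservative per RFC 5681 */
        if (total_retrans > 1 && undo_marker)
                return 1;
        return route_init_cwnd;             /* usually 10 segments nowadays */
}

int main(void)
{
        printf("%u\n", initial_cwnd(0, false, 10));     /* clean handshake: 10 */
        printf("%u\n", initial_cwnd(2, true, 10));      /* lossy handshake: 1 */
        return 0;
}
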
....@@ -5705,7 +5992,7 @@
57055992 sk_mark_napi_id(sk, skb);
57065993 }
57075994
5708
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5995
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
57095996
57105997 /* Prevent spurious tcp_cwnd_restart() on first data
57115998 * packet.
....@@ -5760,6 +6047,10 @@
57606047 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
57616048
57626049 if (data) { /* Retransmit unacked data in SYN */
6050
+ if (tp->total_retrans)
6051
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6052
+ else
6053
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
57636054 skb_rbtree_walk_from(data) {
57646055 if (__tcp_retransmit_skb(sk, data, 1))
57656056 break;
....@@ -5792,6 +6083,21 @@
57926083 #endif
57936084 }
57946085
6086
+static void tcp_try_undo_spurious_syn(struct sock *sk)
6087
+{
6088
+ struct tcp_sock *tp = tcp_sk(sk);
6089
+ u32 syn_stamp;
6090
+
6091
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
6092
+ * spurious if the ACK's timestamp option echo value matches the
6093
+ * original SYN timestamp.
6094
+ */
6095
+ syn_stamp = tp->retrans_stamp;
6096
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6097
+ syn_stamp == tp->rx_opt.rcv_tsecr)
6098
+ tp->undo_marker = 0;
6099
+}
6100
+
57956101 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
57966102 const struct tcphdr *th)
57976103 {
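
tcp_try_undo_spurious_syn() clears undo_marker when the incoming ACK echoes exactly the timestamp of the original SYN, which proves the SYN/SYNACK retransmission timeout was spurious and lets tcp_init_transfer() keep the normal initial cwnd. Restated as a standalone predicate for clarity (a sketch, not the kernel function):

#include <stdbool.h>
#include <stdio.h>

static bool syn_timeout_was_spurious(bool undo_marker, unsigned int syn_stamp,
                                     bool saw_tstamp, unsigned int rcv_tsecr)
{
        /* Spurious if the peer echoes exactly the original SYN's timestamp. */
        return undo_marker && syn_stamp && saw_tstamp && rcv_tsecr == syn_stamp;
}

int main(void)
{
        printf("%d\n", syn_timeout_was_spurious(true, 42, true, 42));   /* 1 */
        printf("%d\n", syn_timeout_was_spurious(true, 42, true, 41));   /* 0 */
        return 0;
}
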
....@@ -5815,8 +6121,14 @@
58156121 * the segment and return)"
58166122 */
58176123 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5818
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
6124
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6125
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
6126
+ if (icsk->icsk_retransmits == 0)
6127
+ inet_csk_reset_xmit_timer(sk,
6128
+ ICSK_TIME_RETRANS,
6129
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
58196130 goto reset_and_undo;
6131
+ }
58206132
58216133 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
58226134 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
....@@ -5859,6 +6171,7 @@
58596171 tcp_ecn_rcv_synack(tp, th);
58606172
58616173 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6174
+ tcp_try_undo_spurious_syn(sk);
58626175 tcp_ack(sk, skb, FLAG_SLOWPATH);
58636176
58646177 /* Ok.. it's good. Set up sequence numbers and
....@@ -5912,7 +6225,7 @@
59126225 return -1;
59136226 if (sk->sk_write_pending ||
59146227 icsk->icsk_accept_queue.rskq_defer_accept ||
5915
- icsk->icsk_ack.pingpong) {
6228
+ inet_csk_in_pingpong_mode(sk)) {
59166229 /* Save one ACK. Data will be ready after
59176230 * several ticks, if write_pending is set.
59186231 *
....@@ -6017,6 +6330,38 @@
60176330 return 1;
60186331 }
60196332
6333
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6334
+{
6335
+ struct request_sock *req;
6336
+
6337
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
6338
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
6339
+ */
6340
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
6341
+ tcp_try_undo_loss(sk, false);
6342
+
6343
+ /* Reset rtx states to prevent spurious retransmits_timed_out() */
6344
+ tcp_sk(sk)->retrans_stamp = 0;
6345
+ inet_csk(sk)->icsk_retransmits = 0;
6346
+
6347
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
6348
+ * we no longer need req so release it.
6349
+ */
6350
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
6351
+ lockdep_sock_is_held(sk));
6352
+ reqsk_fastopen_remove(sk, req, false);
6353
+
6354
+ /* Re-arm the timer because data may have been sent out.
6355
+ * This is similar to the regular data transmission case
6356
+ * when new data has just been ack'ed.
6357
+ *
6358
+ * (TFO) - we could try to be more aggressive and
6359
+ * retransmitting any data sooner based on when they
6360
+ * are sent out.
6361
+ */
6362
+ tcp_rearm_rto(sk);
6363
+}
6364
+
60206365 /*
60216366 * This function implements the receiving procedure of RFC 793 for
60226367 * all states except ESTABLISHED and TIME_WAIT.
....@@ -6079,7 +6424,8 @@
60796424
60806425 tcp_mstamp_refresh(tp);
60816426 tp->rx_opt.saw_tstamp = 0;
6082
- req = tp->fastopen_rsk;
6427
+ req = rcu_dereference_protected(tp->fastopen_rsk,
6428
+ lockdep_sock_is_held(sk));
60836429 if (req) {
60846430 bool req_stolen;
60856431
....@@ -6113,23 +6459,13 @@
61136459 if (!tp->srtt_us)
61146460 tcp_synack_rtt_meas(sk, req);
61156461
6116
- /* Once we leave TCP_SYN_RECV, we no longer need req
6117
- * so release it.
6118
- */
61196462 if (req) {
6120
- inet_csk(sk)->icsk_retransmits = 0;
6121
- reqsk_fastopen_remove(sk, req, false);
6122
- /* Re-arm the timer because data may have been sent out.
6123
- * This is similar to the regular data transmission case
6124
- * when new data has just been ack'ed.
6125
- *
6126
- * (TFO) - we could try to be more aggressive and
6127
- * retransmitting any data sooner based on when they
6128
- * are sent out.
6129
- */
6130
- tcp_rearm_rto(sk);
6463
+ tcp_rcv_synrecv_state_fastopen(sk);
61316464 } else {
6132
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
6465
+ tcp_try_undo_spurious_syn(sk);
6466
+ tp->retrans_stamp = 0;
6467
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6468
+ skb);
61336469 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
61346470 }
61356471 smp_mb();
....@@ -6163,16 +6499,9 @@
61636499 case TCP_FIN_WAIT1: {
61646500 int tmo;
61656501
6166
- /* If we enter the TCP_FIN_WAIT1 state and we are a
6167
- * Fast Open socket and this is the first acceptable
6168
- * ACK we have received, this would have acknowledged
6169
- * our SYNACK so stop the SYNACK timer.
6170
- */
6171
- if (req) {
6172
- /* We no longer need the request sock. */
6173
- reqsk_fastopen_remove(sk, req, false);
6174
- tcp_rearm_rto(sk);
6175
- }
6502
+ if (req)
6503
+ tcp_rcv_synrecv_state_fastopen(sk);
6504
+
61766505 if (tp->snd_una != tp->write_seq)
61776506 break;
61786507
....@@ -6244,9 +6573,12 @@
62446573 case TCP_CLOSE_WAIT:
62456574 case TCP_CLOSING:
62466575 case TCP_LAST_ACK:
6247
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6576
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
6577
+ if (sk_is_mptcp(sk))
6578
+ mptcp_incoming_options(sk, skb);
62486579 break;
6249
- /* fall through */
6580
+ }
6581
+ fallthrough;
62506582 case TCP_FIN_WAIT1:
62516583 case TCP_FIN_WAIT2:
62526584 /* RFC 793 says to queue data in these states,
....@@ -6261,7 +6593,7 @@
62616593 return 1;
62626594 }
62636595 }
6264
- /* Fall through */
6596
+ fallthrough;
62656597 case TCP_ESTABLISHED:
62666598 tcp_data_queue(sk, skb);
62676599 queued = 1;
....@@ -6307,6 +6639,11 @@
63076639 * congestion control: Linux DCTCP asserts ECT on all packets,
63086640 * including SYN, which is most optimal solution; however,
63096641 * others, such as FreeBSD do not.
6642
+ *
6643
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
6644
+ * set, indicating the use of a future TCP extension (such as AccECN). See
6645
+ * RFC8311 ยง4.3 which updates RFC3168 to allow the development of such
6646
+ * extensions.
63106647 */
63116648 static void tcp_ecn_create_request(struct request_sock *req,
63126649 const struct sk_buff *skb,
....@@ -6326,7 +6663,7 @@
63266663 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
63276664 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
63286665
6329
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6666
+ if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
63306667 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
63316668 tcp_bpf_ca_needs_ecn((struct sock *)req))
63326669 inet_rsk(req)->ecn_ok = 1;
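
In line with the comment above, the ECN-acceptance test now also admits SYNs that are not ECT but set a reserved TCP header bit (th->res1), so future extensions in the AccECN style are not shut out of ECN negotiation. Only the first clause of the condition changes; a boolean sketch of it (invented function name):

#include <stdbool.h>
#include <stdio.h>

/* First clause of the updated test: a non-ECT SYN, or one using reserved
 * TCP header bits (e.g. a future AccECN-style extension), may still get
 * ecn_ok when the sysctl or the route allows ECN. */
static bool ecn_first_clause(bool ect, bool res1, bool ecn_ok)
{
        return (!ect || res1) && ecn_ok;
}

int main(void)
{
        printf("%d\n", ecn_first_clause(false, false, true));   /* classic: 1 */
        printf("%d\n", ecn_first_clause(true,  true,  true));   /* AccECN-ish: 1 */
        printf("%d\n", ecn_first_clause(true,  false, true));   /* plain ECT SYN: 0 */
        return 0;
}
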
....@@ -6339,10 +6676,9 @@
63396676 struct inet_request_sock *ireq = inet_rsk(req);
63406677
63416678 req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
6342
- req->cookie_ts = 0;
63436679 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
63446680 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6345
- tcp_rsk(req)->snt_synack = tcp_clock_us();
6681
+ tcp_rsk(req)->snt_synack = 0;
63466682 tcp_rsk(req)->last_oow_ack_time = 0;
63476683 req->mss = rx_opt->mss_clamp;
63486684 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
....@@ -6387,17 +6723,18 @@
63876723 /*
63886724 * Return true if a syncookie should be sent
63896725 */
6390
-static bool tcp_syn_flood_action(const struct sock *sk,
6391
- const struct sk_buff *skb,
6392
- const char *proto)
6726
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
63936727 {
63946728 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
63956729 const char *msg = "Dropping request";
6396
- bool want_cookie = false;
63976730 struct net *net = sock_net(sk);
6731
+ bool want_cookie = false;
6732
+ u8 syncookies;
6733
+
6734
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
63986735
63996736 #ifdef CONFIG_SYN_COOKIES
6400
- if (net->ipv4.sysctl_tcp_syncookies) {
6737
+ if (syncookies) {
64016738 msg = "Sending cookies";
64026739 want_cookie = true;
64036740 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
....@@ -6405,11 +6742,10 @@
64056742 #endif
64066743 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
64076744
6408
- if (!queue->synflood_warned &&
6409
- net->ipv4.sysctl_tcp_syncookies != 2 &&
6745
+ if (!queue->synflood_warned && syncookies != 2 &&
64106746 xchg(&queue->synflood_warned, 1) == 0)
64116747 net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6412
- proto, ntohs(tcp_hdr(skb)->dest), msg);
6748
+ proto, sk->sk_num, msg);
64136749
64146750 return want_cookie;
64156751 }
....@@ -6420,16 +6756,60 @@
64206756 {
64216757 if (tcp_sk(sk)->save_syn) {
64226758 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6423
- u32 *copy;
6759
+ struct saved_syn *saved_syn;
6760
+ u32 mac_hdrlen;
6761
+ void *base;
64246762
6425
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
6426
- if (copy) {
6427
- copy[0] = len;
6428
- memcpy(&copy[1], skb_network_header(skb), len);
6429
- req->saved_syn = copy;
6763
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
6764
+ base = skb_mac_header(skb);
6765
+ mac_hdrlen = skb_mac_header_len(skb);
6766
+ len += mac_hdrlen;
6767
+ } else {
6768
+ base = skb_network_header(skb);
6769
+ mac_hdrlen = 0;
6770
+ }
6771
+
6772
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
6773
+ GFP_ATOMIC);
6774
+ if (saved_syn) {
6775
+ saved_syn->mac_hdrlen = mac_hdrlen;
6776
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
6777
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
6778
+ memcpy(saved_syn->data, base, len);
6779
+ req->saved_syn = saved_syn;
64306780 }
64316781 }
64326782 }
6783
+
6784
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
6785
+ * used for SYN cookie generation.
6786
+ */
6787
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
6788
+ const struct tcp_request_sock_ops *af_ops,
6789
+ struct sock *sk, struct tcphdr *th)
6790
+{
6791
+ struct tcp_sock *tp = tcp_sk(sk);
6792
+ u16 mss;
6793
+
6794
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
6795
+ !inet_csk_reqsk_queue_is_full(sk))
6796
+ return 0;
6797
+
6798
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
6799
+ return 0;
6800
+
6801
+ if (sk_acceptq_is_full(sk)) {
6802
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6803
+ return 0;
6804
+ }
6805
+
6806
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
6807
+ if (!mss)
6808
+ mss = af_ops->mss_clamp;
6809
+
6810
+ return mss;
6811
+}
6812
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
64336813
64346814 int tcp_conn_request(struct request_sock_ops *rsk_ops,
64356815 const struct tcp_request_sock_ops *af_ops,
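
tcp_reqsk_record_syn() now saves the captured headers in a struct saved_syn carrying explicit mac/network/tcp header lengths (including the MAC header when save_syn == 2) rather than a bare length-prefixed u32 array, and it sizes the single allocation with struct_size(). The hunk also adds tcp_get_syncookie_mss(), which clamps the MSS for SYN-cookie generation when cookies are required. Below is a compact userspace model of the flexible-array allocation pattern only; the struct, the field values and the local struct_size macro are simplified stand-ins:

#include <stdlib.h>
#include <string.h>

struct saved_syn_model {
        unsigned int mac_hdrlen;
        unsigned int network_hdrlen;
        unsigned int tcp_hdrlen;
        unsigned char data[];               /* flexible array member */
};

/* Same shape as the kernel helper: header size plus n tail elements
 * (the real struct_size() additionally checks for overflow). */
#define struct_size(p, member, n) \
        (sizeof(*(p)) + (n) * sizeof((p)->member[0]))

static struct saved_syn_model *record_syn(const void *hdrs, size_t len)
{
        struct saved_syn_model *s = malloc(struct_size(s, data, len));

        if (!s)
                return NULL;
        s->mac_hdrlen = 0;                  /* save_syn == 1: no MAC header kept */
        s->network_hdrlen = 20;             /* hypothetical IPv4 header */
        s->tcp_hdrlen = 20;
        memcpy(s->data, hdrs, len);
        return s;
}

int main(void)
{
        unsigned char hdrs[40] = { 0 };

        free(record_syn(hdrs, sizeof(hdrs)));
        return 0;
}
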
....@@ -6445,14 +6825,16 @@
64456825 bool want_cookie = false;
64466826 struct dst_entry *dst;
64476827 struct flowi fl;
6828
+ u8 syncookies;
6829
+
6830
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
64486831
64496832 /* TW buckets are converted to open requests without
64506833 * limitations, they conserve resources and peer is
64516834 * evidently real one.
64526835 */
6453
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
6454
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6455
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6836
+ if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6837
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
64566838 if (!want_cookie)
64576839 goto drop;
64586840 }
....@@ -6466,8 +6848,12 @@
64666848 if (!req)
64676849 goto drop;
64686850
6851
+ req->syncookie = want_cookie;
64696852 tcp_rsk(req)->af_specific = af_ops;
64706853 tcp_rsk(req)->ts_off = 0;
6854
+#if IS_ENABLED(CONFIG_MPTCP)
6855
+ tcp_rsk(req)->is_mptcp = 0;
6856
+#endif
64716857
64726858 tcp_clear_options(&tmp_opt);
64736859 tmp_opt.mss_clamp = af_ops->mss_clamp;
....@@ -6501,10 +6887,12 @@
65016887 goto drop_and_free;
65026888
65036889 if (!want_cookie && !isn) {
6890
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
6891
+
65046892 /* Kill the following clause, if you dislike this way. */
6505
- if (!net->ipv4.sysctl_tcp_syncookies &&
6506
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6507
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6893
+ if (!syncookies &&
6894
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6895
+ (max_syn_backlog >> 2)) &&
65086896 !tcp_peer_is_proven(req, dst)) {
65096897 /* Without syncookies last quarter of
65106898 * backlog is filled with destinations,
....@@ -6525,13 +6913,13 @@
65256913
65266914 if (want_cookie) {
65276915 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6528
- req->cookie_ts = tmp_opt.tstamp_ok;
65296916 if (!tmp_opt.tstamp_ok)
65306917 inet_rsk(req)->ecn_ok = 0;
65316918 }
65326919
65336920 tcp_rsk(req)->snt_isn = isn;
65346921 tcp_rsk(req)->txhash = net_tx_rndhash();
6922
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
65356923 tcp_openreq_init_rwin(req, sk, dst);
65366924 sk_rx_queue_set(req_to_sk(req), skb);
65376925 if (!want_cookie) {
....@@ -6540,14 +6928,13 @@
65406928 }
65416929 if (fastopen_sk) {
65426930 af_ops->send_synack(fastopen_sk, dst, &fl, req,
6543
- &foc, TCP_SYNACK_FASTOPEN);
6931
+ &foc, TCP_SYNACK_FASTOPEN, skb);
65446932 /* Add the child socket directly into the accept queue */
65456933 if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
65466934 reqsk_fastopen_remove(fastopen_sk, req, false);
65476935 bh_unlock_sock(fastopen_sk);
65486936 sock_put(fastopen_sk);
6549
- reqsk_put(req);
6550
- goto drop;
6937
+ goto drop_and_free;
65516938 }
65526939 sk->sk_data_ready(sk);
65536940 bh_unlock_sock(fastopen_sk);
....@@ -6559,7 +6946,8 @@
65596946 tcp_timeout_init((struct sock *)req));
65606947 af_ops->send_synack(sk, dst, &fl, req, &foc,
65616948 !want_cookie ? TCP_SYNACK_NORMAL :
6562
- TCP_SYNACK_COOKIE);
6949
+ TCP_SYNACK_COOKIE,
6950
+ skb);
65636951 if (want_cookie) {
65646952 reqsk_free(req);
65656953 return 0;
....@@ -6571,7 +6959,7 @@
65716959 drop_and_release:
65726960 dst_release(dst);
65736961 drop_and_free:
6574
- reqsk_free(req);
6962
+ __reqsk_free(req);
65756963 drop:
65766964 tcp_listendrop(sk);
65776965 return 0;