2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_input.c
....@@ -77,8 +77,10 @@
7777 #include <asm/unaligned.h>
7878 #include <linux/errqueue.h>
7979 #include <trace/events/tcp.h>
80
-#include <linux/static_key.h>
80
+#include <linux/jump_label_ratelimit.h>
8181 #include <net/busy_poll.h>
82
+#include <net/mptcp.h>
83
+#include <trace/hooks/net.h>
8284
8385 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
8486
....@@ -113,22 +115,91 @@
113115 #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
114116
115117 #if IS_ENABLED(CONFIG_TLS_DEVICE)
116
-static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
118
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
117119
118120 void clean_acked_data_enable(struct inet_connection_sock *icsk,
119121 void (*cad)(struct sock *sk, u32 ack_seq))
120122 {
121123 icsk->icsk_clean_acked = cad;
122
- static_branch_inc(&clean_acked_data_enabled);
124
+ static_branch_deferred_inc(&clean_acked_data_enabled);
123125 }
124126 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
125127
126128 void clean_acked_data_disable(struct inet_connection_sock *icsk)
127129 {
128
- static_branch_dec(&clean_acked_data_enabled);
130
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
129131 icsk->icsk_clean_acked = NULL;
130132 }
131133 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134
+
135
+void clean_acked_data_flush(void)
136
+{
137
+ static_key_deferred_flush(&clean_acked_data_enabled);
138
+}
139
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
140
+#endif
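
Background sketch (editorial, not part of the patch above): the switch from DEFINE_STATIC_KEY_FALSE to DEFINE_STATIC_KEY_DEFERRED_FALSE(..., HZ) rate-limits branch patching on the disable side; decrements are deferred for roughly HZ jiffies so churny enable/disable cycles do not repeatedly rewrite kernel text, and clean_acked_data_flush() forces any pending decrement. A minimal kernel-context illustration of the same API, with a made-up key name:

#include <linux/jump_label_ratelimit.h>
#include <linux/printk.h>

/* Key starts disabled; slow-path decrements are batched for ~1 second (HZ). */
static DEFINE_STATIC_KEY_DEFERRED_FALSE(demo_feature_enabled, HZ);

static void demo_feature_register(void)
{
	static_branch_deferred_inc(&demo_feature_enabled);	/* enable fast path */
}

static void demo_feature_unregister(void)
{
	static_branch_slow_dec_deferred(&demo_feature_enabled);	/* deferred disable */
}

static void demo_feature_shutdown(void)
{
	static_key_deferred_flush(&demo_feature_enabled);	/* apply pending dec now */
}

static void demo_hot_path(void)
{
	/* Deferred keys embed a plain static_key; test the .key member,
	 * exactly as tcp_ack() does with clean_acked_data_enabled.key. */
	if (static_branch_unlikely(&demo_feature_enabled.key))
		pr_debug("feature fast path active\n");
}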
141
+
142
+#ifdef CONFIG_CGROUP_BPF
143
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
144
+{
145
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
146
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
147
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
148
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
149
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
150
+ struct bpf_sock_ops_kern sock_ops;
151
+
152
+ if (likely(!unknown_opt && !parse_all_opt))
153
+ return;
154
+
155
+ /* The skb will be handled in the
156
+ * bpf_skops_established() or
157
+ * bpf_skops_write_hdr_opt().
158
+ */
159
+ switch (sk->sk_state) {
160
+ case TCP_SYN_RECV:
161
+ case TCP_SYN_SENT:
162
+ case TCP_LISTEN:
163
+ return;
164
+ }
165
+
166
+ sock_owned_by_me(sk);
167
+
168
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
169
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
170
+ sock_ops.is_fullsock = 1;
171
+ sock_ops.sk = sk;
172
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
173
+
174
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
175
+}
176
+
177
+static void bpf_skops_established(struct sock *sk, int bpf_op,
178
+ struct sk_buff *skb)
179
+{
180
+ struct bpf_sock_ops_kern sock_ops;
181
+
182
+ sock_owned_by_me(sk);
183
+
184
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
185
+ sock_ops.op = bpf_op;
186
+ sock_ops.is_fullsock = 1;
187
+ sock_ops.sk = sk;
188
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
189
+ if (skb)
190
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
191
+
192
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
193
+}
194
+#else
195
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
196
+{
197
+}
198
+
199
+static void bpf_skops_established(struct sock *sk, int bpf_op,
200
+ struct sk_buff *skb)
201
+{
202
+}
132203 #endif
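
Illustrative BPF counterpart (editorial sketch, not from this patch): bpf_skops_parse_hdr() above only runs when a cgroup sock_ops program has opted in via the PARSE_UNKNOWN/PARSE_ALL header-option flags. A rough libbpf-style program showing that handshake follows; the option kind 200, the function name, and the omission of error handling are all assumptions of the sketch.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int demo_parse_hdr_opt(struct bpf_sock_ops *skops)
{
	__u8 opt[4] = { 200 };	/* look up a (made-up) option by its kind */

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Opt in, so tcp_input.c will call bpf_skops_parse_hdr() for
		 * segments carrying options the stack itself does not know. */
		bpf_sock_ops_cb_flags_set(skops,
				skops->bpf_sock_ops_cb_flags |
				BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
		break;
	case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
		/* Invoked from the kernel hook above for each such skb. */
		bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";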
133204
134205 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
....@@ -172,6 +243,19 @@
172243 if (unlikely(len > icsk->icsk_ack.rcv_mss +
173244 MAX_TCP_OPTION_SPACE))
174245 tcp_gro_dev_warn(sk, skb, len);
246
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
247
+ * set then it is likely the end of an application write. So
248
+ * more data may not be arriving soon, and yet the data sender
249
+ * may be waiting for an ACK if cwnd-bound or using TX zero
250
+ * copy. So we set ICSK_ACK_PUSHED here so that
251
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
252
+ * reads all of the data and is not ping-pong. If len > MSS
253
+ * then this logic does not matter (and does not hurt) because
254
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
255
+ * reads data and there is more than an MSS of unACKed data.
256
+ */
257
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
258
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
175259 } else {
176260 /* Otherwise, we make more careful check taking into account,
177261 * that SACKs block is variable.
....@@ -216,15 +300,14 @@
216300 icsk->icsk_ack.quick = quickacks;
217301 }
218302
219
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
303
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
220304 {
221305 struct inet_connection_sock *icsk = inet_csk(sk);
222306
223307 tcp_incr_quickack(sk, max_quickacks);
224
- icsk->icsk_ack.pingpong = 0;
308
+ inet_csk_exit_pingpong_mode(sk);
225309 icsk->icsk_ack.ato = TCP_ATO_MIN;
226310 }
227
-EXPORT_SYMBOL(tcp_enter_quickack_mode);
228311
229312 /* Send ACKs quickly, if "quick" count is not exhausted
230313 * and the session is not interactive.
....@@ -236,7 +319,7 @@
236319 const struct dst_entry *dst = __sk_dst_get(sk);
237320
238321 return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
239
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
322
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
240323 }
241324
242325 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
....@@ -354,7 +437,8 @@
354437 sndmem *= nr_segs * per_mss;
355438
356439 if (sk->sk_sndbuf < sndmem)
357
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
440
+ WRITE_ONCE(sk->sk_sndbuf,
441
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
358442 }
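
The WRITE_ONCE()/READ_ONCE() conversions in this and many later hunks share one motive: sk_sndbuf, sk_rcvbuf and the tcp_* sysctls may be read locklessly while another context updates them, so both sides are annotated against load/store tearing, and a racy value is read once into a local before being used more than once. A self-contained userspace rendition of the idiom, with stand-in macros and made-up values:

#include <stdio.h>

/* Minimal stand-ins for the kernel annotations, enough to show the shape. */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

static int sysctl_tcp_rmem2 = 6291456;	/* pretend net.ipv4.tcp_rmem[2] */
static int sk_rcvbuf = 131072;

int main(void)
{
	/* Read the racy limit exactly once and reuse the local copy, so the
	 * comparison and the clamp cannot observe two different values. */
	int rmem2 = READ_ONCE(sysctl_tcp_rmem2);

	if (sk_rcvbuf < rmem2)
		WRITE_ONCE(sk_rcvbuf, rmem2 / 2);

	printf("sk_rcvbuf = %d\n", sk_rcvbuf);
	return 0;
}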
359443
360444 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
....@@ -383,12 +467,13 @@
383467 */
384468
385469 /* Slow part of check#2. */
386
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
470
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
471
+ unsigned int skbtruesize)
387472 {
388473 struct tcp_sock *tp = tcp_sk(sk);
389474 /* Optimize this! */
390
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
391
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
475
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
476
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
392477
393478 while (tp->rcv_ssthresh <= window) {
394479 if (truesize <= skb->len)
....@@ -400,7 +485,27 @@
400485 return 0;
401486 }
402487
403
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
488
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
489
+ * can play nice with us, as sk_buff and skb->head might be either
490
+ * freed or shared with up to MAX_SKB_FRAGS segments.
491
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
492
+ * and if no payload was pulled in skb->head before reaching us.
493
+ */
494
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
495
+{
496
+ u32 truesize = skb->truesize;
497
+
498
+ if (adjust && !skb_headlen(skb)) {
499
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
500
+ /* paranoid check, some drivers might be buggy */
501
+ if (unlikely((int)truesize < (int)skb->len))
502
+ truesize = skb->truesize;
503
+ }
504
+ return truesize;
505
+}
506
+
507
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
508
+ bool adjust)
404509 {
405510 struct tcp_sock *tp = tcp_sk(sk);
406511 int room;
....@@ -409,15 +514,16 @@
409514
410515 /* Check #1 */
411516 if (room > 0 && !tcp_under_memory_pressure(sk)) {
517
+ unsigned int truesize = truesize_adjust(adjust, skb);
412518 int incr;
413519
414520 /* Check #2. Increase window, if skb with such overhead
415521 * will fit to rcvbuf in future.
416522 */
417
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
523
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
418524 incr = 2 * tp->advmss;
419525 else
420
- incr = __tcp_grow_window(sk, skb);
526
+ incr = __tcp_grow_window(sk, skb, truesize);
421527
422528 if (incr) {
423529 incr = max_t(int, incr, 2 * skb->len);
....@@ -430,9 +536,9 @@
430536 /* 3. Try to fixup all. It is made immediately after connection enters
431537 * established state.
432538 */
433
-void tcp_init_buffer_space(struct sock *sk)
539
+static void tcp_init_buffer_space(struct sock *sk)
434540 {
435
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
541
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
436542 struct tcp_sock *tp = tcp_sk(sk);
437543 int maxwin;
438544
....@@ -472,15 +578,17 @@
472578 struct tcp_sock *tp = tcp_sk(sk);
473579 struct inet_connection_sock *icsk = inet_csk(sk);
474580 struct net *net = sock_net(sk);
581
+ int rmem2;
475582
476583 icsk->icsk_ack.quick = 0;
584
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
477585
478
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
586
+ if (sk->sk_rcvbuf < rmem2 &&
479587 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
480588 !tcp_under_memory_pressure(sk) &&
481589 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
482
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
483
- net->ipv4.sysctl_tcp_rmem[2]);
590
+ WRITE_ONCE(sk->sk_rcvbuf,
591
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
484592 }
485593 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
486594 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
....@@ -510,7 +618,7 @@
510618 *
511619 * The algorithm for RTT estimation w/o timestamps is based on
512620 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
513
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
621
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
514622 *
515623 * More detail on this code can be found at
516624 * <http://staff.psc.edu/jheffner/>,
....@@ -621,7 +729,7 @@
621729 * <prev RTT . ><current RTT .. ><next RTT .... >
622730 */
623731
624
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
732
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
625733 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
626734 int rcvmem, rcvbuf;
627735 u64 rcvwin, grow;
....@@ -642,9 +750,9 @@
642750
643751 do_div(rcvwin, tp->advmss);
644752 rcvbuf = min_t(u64, rcvwin * rcvmem,
645
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
753
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
646754 if (rcvbuf > sk->sk_rcvbuf) {
647
- sk->sk_rcvbuf = rcvbuf;
755
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
648756
649757 /* Make the window clamp follow along. */
650758 tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
....@@ -710,7 +818,7 @@
710818 tcp_ecn_check_ce(sk, skb);
711819
712820 if (skb->len >= 128)
713
- tcp_grow_window(sk, skb);
821
+ tcp_grow_window(sk, skb, true);
714822 }
715823
716824 /* Called to compute a smoothed rtt estimate. The data fed to this
....@@ -774,6 +882,8 @@
774882 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
775883 tp->rtt_seq = tp->snd_nxt;
776884 tp->mdev_max_us = tcp_rto_min_us(sk);
885
+
886
+ tcp_bpf_rtt(sk);
777887 }
778888 } else {
779889 /* no previous measure. */
....@@ -782,6 +892,8 @@
782892 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
783893 tp->mdev_max_us = tp->rttvar_us;
784894 tp->rtt_seq = tp->snd_nxt;
895
+
896
+ tcp_bpf_rtt(sk);
785897 }
786898 tp->srtt_us = max(1U, srtt);
787899 }
....@@ -859,12 +971,54 @@
859971 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
860972 }
861973
862
-/* Take a notice that peer is sending D-SACKs */
863
-static void tcp_dsack_seen(struct tcp_sock *tp)
974
+struct tcp_sacktag_state {
975
+ /* Timestamps for earliest and latest never-retransmitted segment
976
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
977
+ * but congestion control should still get an accurate delay signal.
978
+ */
979
+ u64 first_sackt;
980
+ u64 last_sackt;
981
+ u32 reord;
982
+ u32 sack_delivered;
983
+ int flag;
984
+ unsigned int mss_now;
985
+ struct rate_sample *rate;
986
+};
987
+
988
+/* Note that the peer is sending D-SACKs. Skip update of data delivery
989
+ * and spurious retransmission information if this DSACK is unlikely caused by
990
+ * sender's action:
991
+ * - DSACKed sequence range is larger than maximum receiver's window.
992
+ * - Total no. of DSACKed segments exceeds the total no. of retransmitted segs.
993
+ */
994
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
995
+ u32 end_seq, struct tcp_sacktag_state *state)
864996 {
997
+ u32 seq_len, dup_segs = 1;
998
+
999
+ if (!before(start_seq, end_seq))
1000
+ return 0;
1001
+
1002
+ seq_len = end_seq - start_seq;
1003
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
1004
+ if (seq_len > tp->max_window)
1005
+ return 0;
1006
+ if (seq_len > tp->mss_cache)
1007
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1008
+
1009
+ tp->dsack_dups += dup_segs;
1010
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
1011
+ if (tp->dsack_dups > tp->total_retrans)
1012
+ return 0;
1013
+
8651014 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
8661015 tp->rack.dsack_seen = 1;
867
- tp->dsack_dups++;
1016
+
1017
+ state->flag |= FLAG_DSACKING_ACK;
1018
+ /* A spurious retransmission is delivered */
1019
+ state->sack_delivered += dup_segs;
1020
+
1021
+ return dup_segs;
8681022 }
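
Worked example for the heuristic above (editorial sketch in plain C; sequence-number wraparound in before() is ignored): a DSACK block spanning 4000 bytes against an mss_cache of 1448 counts as DIV_ROUND_UP(4000, 1448) = 3 duplicate segments, while a block wider than the peer's largest advertised window is treated as dubious and dropped.

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Mirror of the dup_segs estimate in tcp_dsack_seen(); returns 0 for a
 * dubious DSACK (empty range or wider than the max advertised window). */
static uint32_t dsack_dup_segs(uint32_t start_seq, uint32_t end_seq,
			       uint32_t max_window, uint32_t mss_cache)
{
	uint32_t seq_len;

	if (!(start_seq < end_seq))
		return 0;
	seq_len = end_seq - start_seq;
	if (seq_len > max_window)
		return 0;
	return seq_len > mss_cache ? DIV_ROUND_UP(seq_len, mss_cache) : 1;
}

int main(void)
{
	printf("%u\n", dsack_dup_segs(1000, 5000, 65535, 1448));	/* 3 */
	printf("%u\n", dsack_dup_segs(1000, 200000, 65535, 1448));	/* 0: dubious */
	return 0;
}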
8691023
8701024 /* It's reordering when higher sequence was delivered (i.e. sacked) before
....@@ -893,7 +1047,7 @@
8931047 tp->undo_marker ? tp->undo_retrans : 0);
8941048 #endif
8951049 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
896
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
1050
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
8971051 }
8981052
8991053 /* This exciting event is worth to be remembered. 8) */
....@@ -902,7 +1056,11 @@
9021056 ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
9031057 }
9041058
905
-/* This must be called before lost_out is incremented */
1059
+ /* This must be called before lost_out or retrans_out are updated
1060
+ * on a new loss, because we want to know if all skbs previously
1061
+ * known to be lost have already been retransmitted, indicating
1062
+ * that this newly lost skb is our next skb to retransmit.
1063
+ */
9061064 static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
9071065 {
9081066 if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
....@@ -912,42 +1070,46 @@
9121070 tp->retransmit_skb_hint = skb;
9131071 }
9141072
915
-/* Sum the number of packets on the wire we have marked as lost.
916
- * There are two cases we care about here:
917
- * a) Packet hasn't been marked lost (nor retransmitted),
918
- * and this is the first loss.
919
- * b) Packet has been marked both lost and retransmitted,
920
- * and this means we think it was lost again.
1073
+/* Sum the number of packets on the wire we have marked as lost, and
1074
+ * notify the congestion control module that the given skb was marked lost.
9211075 */
922
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
1076
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
1077
+{
1078
+ tp->lost += tcp_skb_pcount(skb);
1079
+}
1080
+
1081
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
9231082 {
9241083 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1084
+ struct tcp_sock *tp = tcp_sk(sk);
9251085
926
- if (!(sacked & TCPCB_LOST) ||
927
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
928
- tp->lost += tcp_skb_pcount(skb);
929
-}
1086
+ if (sacked & TCPCB_SACKED_ACKED)
1087
+ return;
9301088
931
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
932
-{
933
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
934
- tcp_verify_retransmit_hint(tp, skb);
935
-
936
- tp->lost_out += tcp_skb_pcount(skb);
937
- tcp_sum_lost(tp, skb);
938
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
939
- }
940
-}
941
-
942
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
943
-{
9441089 tcp_verify_retransmit_hint(tp, skb);
945
-
946
- tcp_sum_lost(tp, skb);
947
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1090
+ if (sacked & TCPCB_LOST) {
1091
+ if (sacked & TCPCB_SACKED_RETRANS) {
1092
+ /* Account for retransmits that are lost again */
1093
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1094
+ tp->retrans_out -= tcp_skb_pcount(skb);
1095
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1096
+ tcp_skb_pcount(skb));
1097
+ tcp_notify_skb_loss_event(tp, skb);
1098
+ }
1099
+ } else {
9481100 tp->lost_out += tcp_skb_pcount(skb);
9491101 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1102
+ tcp_notify_skb_loss_event(tp, skb);
9501103 }
1104
+}
1105
+
1106
+/* Updates the delivered and delivered_ce counts */
1107
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1108
+ bool ece_ack)
1109
+{
1110
+ tp->delivered += delivered;
1111
+ if (ece_ack)
1112
+ tp->delivered_ce += delivered;
9511113 }
9521114
9531115 /* This procedure tags the retransmission queue when SACKs arrive.
....@@ -1082,51 +1244,42 @@
10821244
10831245 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
10841246 struct tcp_sack_block_wire *sp, int num_sacks,
1085
- u32 prior_snd_una)
1247
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
10861248 {
10871249 struct tcp_sock *tp = tcp_sk(sk);
10881250 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
10891251 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1090
- bool dup_sack = false;
1252
+ u32 dup_segs;
10911253
10921254 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1093
- dup_sack = true;
1094
- tcp_dsack_seen(tp);
10951255 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
10961256 } else if (num_sacks > 1) {
10971257 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
10981258 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
10991259
1100
- if (!after(end_seq_0, end_seq_1) &&
1101
- !before(start_seq_0, start_seq_1)) {
1102
- dup_sack = true;
1103
- tcp_dsack_seen(tp);
1104
- NET_INC_STATS(sock_net(sk),
1105
- LINUX_MIB_TCPDSACKOFORECV);
1106
- }
1260
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
1261
+ return false;
1262
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1263
+ } else {
1264
+ return false;
11071265 }
11081266
1267
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
1268
+ if (!dup_segs) { /* Skip dubious DSACK */
1269
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1270
+ return false;
1271
+ }
1272
+
1273
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1274
+
11091275 /* D-SACK for already forgotten data... Do dumb counting. */
1110
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1276
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
11111277 !after(end_seq_0, prior_snd_una) &&
11121278 after(end_seq_0, tp->undo_marker))
1113
- tp->undo_retrans--;
1279
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
11141280
1115
- return dup_sack;
1281
+ return true;
11161282 }
1117
-
1118
-struct tcp_sacktag_state {
1119
- u32 reord;
1120
- /* Timestamps for earliest and latest never-retransmitted segment
1121
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
1122
- * but congestion control should still get an accurate delay signal.
1123
- */
1124
- u64 first_sackt;
1125
- u64 last_sackt;
1126
- struct rate_sample *rate;
1127
- int flag;
1128
- unsigned int mss_now;
1129
-};
11301283
11311284 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
11321285 * the incoming SACK may not exactly match but we can find smaller MSS
....@@ -1246,7 +1399,8 @@
12461399 sacked |= TCPCB_SACKED_ACKED;
12471400 state->flag |= FLAG_DATA_SACKED;
12481401 tp->sacked_out += pcount;
1249
- tp->delivered += pcount; /* Out-of-order packets delivered */
1402
+ /* Out-of-order packets delivered */
1403
+ state->sack_delivered += pcount;
12501404
12511405 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
12521406 if (tp->lost_skb_hint &&
....@@ -1289,7 +1443,7 @@
12891443 */
12901444 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
12911445 start_seq, end_seq, dup_sack, pcount,
1292
- skb->skb_mstamp);
1446
+ tcp_skb_timestamp_us(skb));
12931447 tcp_rate_skb_delivered(sk, skb, state->rate);
12941448
12951449 if (skb == tp->lost_skb_hint)
....@@ -1413,7 +1567,7 @@
14131567 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
14141568 goto fallback;
14151569
1416
- if (!tcp_skb_can_collapse_to(prev))
1570
+ if (!tcp_skb_can_collapse(prev, skb))
14171571 goto fallback;
14181572
14191573 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
....@@ -1502,6 +1656,8 @@
15021656 (mss != tcp_skb_seglen(skb)))
15031657 goto out;
15041658
1659
+ if (!tcp_skb_can_collapse(prev, skb))
1660
+ goto out;
15051661 len = skb->len;
15061662 pcount = tcp_skb_pcount(skb);
15071663 if (tcp_skb_shift(prev, skb, pcount, len))
....@@ -1578,7 +1734,7 @@
15781734 TCP_SKB_CB(skb)->end_seq,
15791735 dup_sack,
15801736 tcp_skb_pcount(skb),
1581
- skb->skb_mstamp);
1737
+ tcp_skb_timestamp_us(skb));
15821738 tcp_rate_skb_delivered(sk, skb, state->rate);
15831739 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
15841740 list_del_init(&skb->tcp_tsorted_anchor);
....@@ -1591,9 +1747,7 @@
15911747 return skb;
15921748 }
15931749
1594
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
1595
- struct tcp_sacktag_state *state,
1596
- u32 seq)
1750
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
15971751 {
15981752 struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
15991753 struct sk_buff *skb;
....@@ -1615,13 +1769,12 @@
16151769 }
16161770
16171771 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1618
- struct tcp_sacktag_state *state,
16191772 u32 skip_to_seq)
16201773 {
16211774 if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
16221775 return skb;
16231776
1624
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
1777
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
16251778 }
16261779
16271780 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
....@@ -1634,7 +1787,7 @@
16341787 return skb;
16351788
16361789 if (before(next_dup->start_seq, skip_to_seq)) {
1637
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1790
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
16381791 skb = tcp_sacktag_walk(skb, sk, NULL, state,
16391792 next_dup->start_seq, next_dup->end_seq,
16401793 1);
....@@ -1672,11 +1825,7 @@
16721825 tcp_highest_sack_reset(sk);
16731826
16741827 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1675
- num_sacks, prior_snd_una);
1676
- if (found_dup_sack) {
1677
- state->flag |= FLAG_DSACKING_ACK;
1678
- tp->delivered++; /* A spurious retransmission is delivered */
1679
- }
1828
+ num_sacks, prior_snd_una, state);
16801829
16811830 /* Eliminate too old ACKs, but take into
16821831 * account more or less fresh ones, they can
....@@ -1778,8 +1927,7 @@
17781927
17791928 /* Head todo? */
17801929 if (before(start_seq, cache->start_seq)) {
1781
- skb = tcp_sacktag_skip(skb, sk, state,
1782
- start_seq);
1930
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
17831931 skb = tcp_sacktag_walk(skb, sk, next_dup,
17841932 state,
17851933 start_seq,
....@@ -1805,7 +1953,7 @@
18051953 goto walk;
18061954 }
18071955
1808
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1956
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
18091957 /* Check overlap against next cached too (past this one already) */
18101958 cache++;
18111959 continue;
....@@ -1816,7 +1964,7 @@
18161964 if (!skb)
18171965 break;
18181966 }
1819
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1967
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
18201968
18211969 walk:
18221970 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
....@@ -1878,34 +2026,39 @@
18782026 return;
18792027
18802028 tp->reordering = min_t(u32, tp->packets_out + addend,
1881
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
2029
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
18822030 tp->reord_seen++;
18832031 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
18842032 }
18852033
18862034 /* Emulate SACKs for SACKless connection: account for a new dupack. */
18872035
1888
-static void tcp_add_reno_sack(struct sock *sk)
2036
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
18892037 {
1890
- struct tcp_sock *tp = tcp_sk(sk);
1891
- u32 prior_sacked = tp->sacked_out;
2038
+ if (num_dupack) {
2039
+ struct tcp_sock *tp = tcp_sk(sk);
2040
+ u32 prior_sacked = tp->sacked_out;
2041
+ s32 delivered;
18922042
1893
- tp->sacked_out++;
1894
- tcp_check_reno_reordering(sk, 0);
1895
- if (tp->sacked_out > prior_sacked)
1896
- tp->delivered++; /* Some out-of-order packet is delivered */
1897
- tcp_verify_left_out(tp);
2043
+ tp->sacked_out += num_dupack;
2044
+ tcp_check_reno_reordering(sk, 0);
2045
+ delivered = tp->sacked_out - prior_sacked;
2046
+ if (delivered > 0)
2047
+ tcp_count_delivered(tp, delivered, ece_ack);
2048
+ tcp_verify_left_out(tp);
2049
+ }
18982050 }
18992051
19002052 /* Account for ACK, ACKing some data in Reno Recovery phase. */
19012053
1902
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
2054
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
19032055 {
19042056 struct tcp_sock *tp = tcp_sk(sk);
19052057
19062058 if (acked > 0) {
19072059 /* One ACK acked hole. The rest eat duplicate ACKs. */
1908
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
2060
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
2061
+ ece_ack);
19092062 if (acked - 1 >= tp->sacked_out)
19102063 tp->sacked_out = 0;
19112064 else
....@@ -1938,7 +2091,8 @@
19382091
19392092 static bool tcp_is_rack(const struct sock *sk)
19402093 {
1941
- return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
2094
+ return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2095
+ TCP_RACK_LOSS_DETECTION;
19422096 }
19432097
19442098 /* If we detect SACK reneging, forget all SACK information
....@@ -1982,6 +2136,7 @@
19822136 struct tcp_sock *tp = tcp_sk(sk);
19832137 struct net *net = sock_net(sk);
19842138 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2139
+ u8 reordering;
19852140
19862141 tcp_timeout_mark_lost(sk);
19872142
....@@ -2002,10 +2157,12 @@
20022157 /* Timeout in disordered state after receiving substantial DUPACKs
20032158 * suggests that the degree of reordering is over-estimated.
20042159 */
2160
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
20052161 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2006
- tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
2162
+ tp->sacked_out >= reordering)
20072163 tp->reordering = min_t(unsigned int, tp->reordering,
2008
- net->ipv4.sysctl_tcp_reordering);
2164
+ reordering);
2165
+
20092166 tcp_set_ca_state(sk, TCP_CA_Loss);
20102167 tp->high_seq = tp->snd_nxt;
20112168 tcp_ecn_queue_cwr(tp);
....@@ -2014,7 +2171,7 @@
20142171 * loss recovery is underway except recurring timeout(s) on
20152172 * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
20162173 */
2017
- tp->frto = net->ipv4.sysctl_tcp_frto &&
2174
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
20182175 (new_recovery || icsk->icsk_retransmits) &&
20192176 !inet_csk(sk)->icsk_mtup.probe_size;
20202177 }
....@@ -2031,7 +2188,8 @@
20312188 */
20322189 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
20332190 {
2034
- if (flag & FLAG_SACK_RENEGING) {
2191
+ if (flag & FLAG_SACK_RENEGING &&
2192
+ flag & FLAG_SND_UNA_ADVANCED) {
20352193 struct tcp_sock *tp = tcp_sk(sk);
20362194 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
20372195 msecs_to_jiffies(10));
....@@ -2172,8 +2330,7 @@
21722330 }
21732331
21742332 /* Detect loss in event "A" above by marking head of queue up as lost.
2175
- * For non-SACK(Reno) senders, the first "packets" number of segments
2176
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2333
+ * For RFC3517 SACK, a segment is considered lost if it
21772334 * has at least tp->reordering SACKed segments above it; "packets" refers to
21782335 * the maximum SACKed segments to pass before reaching this limit.
21792336 */
....@@ -2181,10 +2338,9 @@
21812338 {
21822339 struct tcp_sock *tp = tcp_sk(sk);
21832340 struct sk_buff *skb;
2184
- int cnt, oldcnt, lost;
2185
- unsigned int mss;
2341
+ int cnt;
21862342 /* Use SACK to deduce losses of new sequences sent during recovery */
2187
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2343
+ const u32 loss_high = tp->snd_nxt;
21882344
21892345 WARN_ON(packets > tp->packets_out);
21902346 skb = tp->lost_skb_hint;
....@@ -2207,28 +2363,14 @@
22072363 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
22082364 break;
22092365
2210
- oldcnt = cnt;
2211
- if (tcp_is_reno(tp) ||
2212
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2366
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
22132367 cnt += tcp_skb_pcount(skb);
22142368
2215
- if (cnt > packets) {
2216
- if (tcp_is_sack(tp) ||
2217
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2218
- (oldcnt >= packets))
2219
- break;
2369
+ if (cnt > packets)
2370
+ break;
22202371
2221
- mss = tcp_skb_mss(skb);
2222
- /* If needed, chop off the prefix to mark as lost. */
2223
- lost = (packets - oldcnt) * mss;
2224
- if (lost < skb->len &&
2225
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2226
- lost, mss, GFP_ATOMIC) < 0)
2227
- break;
2228
- cnt = packets;
2229
- }
2230
-
2231
- tcp_skb_mark_lost(tp, skb);
2372
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2373
+ tcp_mark_skb_lost(sk, skb);
22322374
22332375 if (mark_head)
22342376 break;
....@@ -2272,7 +2414,7 @@
22722414 */
22732415 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
22742416 {
2275
- return !tp->retrans_stamp ||
2417
+ return tp->retrans_stamp &&
22762418 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
22772419 }
22782420
....@@ -2368,6 +2510,21 @@
23682510 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
23692511 }
23702512
2513
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2514
+{
2515
+ struct tcp_sock *tp = tcp_sk(sk);
2516
+
2517
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2518
+ /* Hold old state until something *above* high_seq
2519
+ * is ACKed. For Reno it is MUST to prevent false
2520
+ * fast retransmits (RFC2582). SACK TCP is safe. */
2521
+ if (!tcp_any_retrans_done(sk))
2522
+ tp->retrans_stamp = 0;
2523
+ return true;
2524
+ }
2525
+ return false;
2526
+}
2527
+
23712528 /* People celebrate: "We love our President!" */
23722529 static bool tcp_try_undo_recovery(struct sock *sk)
23732530 {
....@@ -2390,14 +2547,8 @@
23902547 } else if (tp->rack.reo_wnd_persist) {
23912548 tp->rack.reo_wnd_persist--;
23922549 }
2393
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2394
- /* Hold old state until something *above* high_seq
2395
- * is ACKed. For Reno it is MUST to prevent false
2396
- * fast retransmits (RFC2582). SACK TCP is safe. */
2397
- if (!tcp_any_retrans_done(sk))
2398
- tp->retrans_stamp = 0;
2550
+ if (tcp_is_non_sack_preventing_reopen(sk))
23992551 return true;
2400
- }
24012552 tcp_set_ca_state(sk, TCP_CA_Open);
24022553 tp->is_sack_reneg = 0;
24032554 return false;
....@@ -2433,6 +2584,8 @@
24332584 NET_INC_STATS(sock_net(sk),
24342585 LINUX_MIB_TCPSPURIOUSRTOS);
24352586 inet_csk(sk)->icsk_retransmits = 0;
2587
+ if (tcp_is_non_sack_preventing_reopen(sk))
2588
+ return true;
24362589 if (frto_undo || tcp_is_sack(tp)) {
24372590 tcp_set_ca_state(sk, TCP_CA_Open);
24382591 tp->is_sack_reneg = 0;
....@@ -2479,8 +2632,8 @@
24792632 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
24802633 tp->prior_cwnd - 1;
24812634 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2482
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2483
- !(flag & FLAG_LOST_RETRANS)) {
2635
+ } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) ==
2636
+ FLAG_RETRANS_DATA_ACKED) {
24842637 sndcnt = min_t(int, delta,
24852638 max_t(int, tp->prr_delivered - tp->prr_out,
24862639 newly_acked_sacked) + 1);
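
As a refresher on the proportional part visible just above (unchanged by this hunk), PRR scales sending to the delivery rate: with ssthresh = 5, prior_cwnd = 10, prr_delivered = 4 and prr_out = 1, the dividend is 5*4 + 10 - 1 = 29, so sndcnt = 29/10 - 1 = 1 segment. A throwaway userspace check with those illustrative numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ssthresh = 5, prior_cwnd = 10, prr_delivered = 4, prr_out = 1;

	/* sndcnt = ceil(prr_delivered * ssthresh / prior_cwnd) - prr_out */
	uint64_t dividend = ssthresh * prr_delivered + prior_cwnd - 1;

	printf("sndcnt = %llu\n",
	       (unsigned long long)(dividend / prior_cwnd - prr_out));	/* 1 */
	return 0;
}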
....@@ -2566,12 +2719,15 @@
25662719 {
25672720 struct tcp_sock *tp = tcp_sk(sk);
25682721 struct inet_connection_sock *icsk = inet_csk(sk);
2722
+ u64 val;
25692723
2570
- /* FIXME: breaks with very large cwnd */
25712724 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2572
- tp->snd_cwnd = tp->snd_cwnd *
2573
- tcp_mss_to_mtu(sk, tp->mss_cache) /
2574
- icsk->icsk_mtup.probe_size;
2725
+
2726
+ val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache);
2727
+ do_div(val, icsk->icsk_mtup.probe_size);
2728
+ WARN_ON_ONCE((u32)val != val);
2729
+ tp->snd_cwnd = max_t(u32, 1U, val);
2730
+
25752731 tp->snd_cwnd_cnt = 0;
25762732 tp->snd_cwnd_stamp = tcp_jiffies32;
25772733 tp->snd_ssthresh = tcp_current_ssthresh(sk);
....@@ -2594,14 +2750,8 @@
25942750 unsigned int mss = tcp_current_mss(sk);
25952751
25962752 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2597
- if (tcp_skb_seglen(skb) > mss &&
2598
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2599
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2600
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2601
- tp->retrans_out -= tcp_skb_pcount(skb);
2602
- }
2603
- tcp_skb_mark_lost_uncond_verify(tp, skb);
2604
- }
2753
+ if (tcp_skb_seglen(skb) > mss)
2754
+ tcp_mark_skb_lost(sk, skb);
26052755 }
26062756
26072757 tcp_clear_retrans_hints_partial(tp);
....@@ -2656,13 +2806,13 @@
26562806 /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
26572807 * recovered or spurious. Otherwise retransmits more on partial ACKs.
26582808 */
2659
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2809
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
26602810 int *rexmit)
26612811 {
26622812 struct tcp_sock *tp = tcp_sk(sk);
26632813 bool recovered = !before(tp->snd_una, tp->high_seq);
26642814
2665
- if ((flag & FLAG_SND_UNA_ADVANCED) &&
2815
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
26662816 tcp_try_undo_loss(sk, false))
26672817 return;
26682818
....@@ -2675,7 +2825,7 @@
26752825 return;
26762826
26772827 if (after(tp->snd_nxt, tp->high_seq)) {
2678
- if (flag & FLAG_DATA_SACKED || is_dupack)
2828
+ if (flag & FLAG_DATA_SACKED || num_dupack)
26792829 tp->frto = 0; /* Step 3.a. loss was real */
26802830 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
26812831 tp->high_seq = tp->snd_nxt;
....@@ -2701,16 +2851,25 @@
27012851 /* A Reno DUPACK means new data in F-RTO step 2.b above are
27022852 * delivered. Lower inflight to clock out (re)transmissions.
27032853 */
2704
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2705
- tcp_add_reno_sack(sk);
2854
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2855
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
27062856 else if (flag & FLAG_SND_UNA_ADVANCED)
27072857 tcp_reset_reno_sack(tp);
27082858 }
27092859 *rexmit = REXMIT_LOST;
27102860 }
27112861
2862
+static bool tcp_force_fast_retransmit(struct sock *sk)
2863
+{
2864
+ struct tcp_sock *tp = tcp_sk(sk);
2865
+
2866
+ return after(tcp_highest_sack_seq(tp),
2867
+ tp->snd_una + tp->reordering * tp->mss_cache);
2868
+}
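
Numeric sketch of the threshold above (editorial, illustrative values): with SND.UNA at 1000, reordering = 3 and an MSS cache of 1000 bytes, fast retransmit is forced as soon as SACKed data reaches beyond sequence 4000, i.e. more than reordering * MSS past the unacknowledged hole.

#include <stdio.h>
#include <stdint.h>

/* Sequence-space "after", as in include/net/tcp.h */
static int after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq2 - seq1) < 0;
}

int main(void)
{
	uint32_t snd_una = 1000, reordering = 3, mss_cache = 1000;
	uint32_t highest_sack = 4500;

	/* SACKed data more than reordering * MSS beyond SND.UNA: the hole is
	 * presumed lost, so fast retransmit is forced. Prints 1 here. */
	printf("%d\n", after(highest_sack, snd_una + reordering * mss_cache));
	return 0;
}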
2869
+
27122870 /* Undo during fast recovery after partial ACK. */
2713
-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
2871
+static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2872
+ bool *do_lost)
27142873 {
27152874 struct tcp_sock *tp = tcp_sk(sk);
27162875
....@@ -2735,7 +2894,9 @@
27352894 tcp_undo_cwnd_reduction(sk, true);
27362895 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
27372896 tcp_try_keep_open(sk);
2738
- return true;
2897
+ } else {
2898
+ /* Partial ACK arrived. Force fast retransmit. */
2899
+ *do_lost = tcp_force_fast_retransmit(sk);
27392900 }
27402901 return false;
27412902 }
....@@ -2759,14 +2920,6 @@
27592920 }
27602921 }
27612922
2762
-static bool tcp_force_fast_retransmit(struct sock *sk)
2763
-{
2764
- struct tcp_sock *tp = tcp_sk(sk);
2765
-
2766
- return after(tcp_highest_sack_seq(tp),
2767
- tp->snd_una + tp->reordering * tp->mss_cache);
2768
-}
2769
-
27702923 /* Process an event, which can update packets-in-flight not trivially.
27712924 * Main goal of this function is to calculate new estimate for left_out,
27722925 * taking into account both packets sitting in receiver's buffer and
....@@ -2780,20 +2933,21 @@
27802933 * tcp_xmit_retransmit_queue().
27812934 */
27822935 static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
2783
- bool is_dupack, int *ack_flag, int *rexmit)
2936
+ int num_dupack, int *ack_flag, int *rexmit)
27842937 {
27852938 struct inet_connection_sock *icsk = inet_csk(sk);
27862939 struct tcp_sock *tp = tcp_sk(sk);
27872940 int fast_rexmit = 0, flag = *ack_flag;
2788
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2789
- tcp_force_fast_retransmit(sk));
2941
+ bool ece_ack = flag & FLAG_ECE;
2942
+ bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
2943
+ tcp_force_fast_retransmit(sk));
27902944
27912945 if (!tp->packets_out && tp->sacked_out)
27922946 tp->sacked_out = 0;
27932947
27942948 /* Now state machine starts.
27952949 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
2796
- if (flag & FLAG_ECE)
2950
+ if (ece_ack)
27972951 tp->prior_ssthresh = 0;
27982952
27992953 /* B. In all the states check for reneging SACKs. */
....@@ -2833,35 +2987,37 @@
28332987 switch (icsk->icsk_ca_state) {
28342988 case TCP_CA_Recovery:
28352989 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2836
- if (tcp_is_reno(tp) && is_dupack)
2837
- tcp_add_reno_sack(sk);
2838
- } else {
2839
- if (tcp_try_undo_partial(sk, prior_snd_una))
2840
- return;
2841
- /* Partial ACK arrived. Force fast retransmit. */
2842
- do_lost = tcp_is_reno(tp) ||
2843
- tcp_force_fast_retransmit(sk);
2844
- }
2845
- if (tcp_try_undo_dsack(sk)) {
2846
- tcp_try_keep_open(sk);
2990
+ if (tcp_is_reno(tp))
2991
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
2992
+ } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
28472993 return;
2848
- }
2994
+
2995
+ if (tcp_try_undo_dsack(sk))
2996
+ tcp_try_keep_open(sk);
2997
+
28492998 tcp_identify_packet_loss(sk, ack_flag);
2999
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
3000
+ if (!tcp_time_to_recover(sk, flag))
3001
+ return;
3002
+ /* Undo reverts the recovery state. If loss is evident,
3003
+ * starts a new recovery (e.g. reordering then loss);
3004
+ */
3005
+ tcp_enter_recovery(sk, ece_ack);
3006
+ }
28503007 break;
28513008 case TCP_CA_Loss:
2852
- tcp_process_loss(sk, flag, is_dupack, rexmit);
3009
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
28533010 tcp_identify_packet_loss(sk, ack_flag);
28543011 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
28553012 (*ack_flag & FLAG_LOST_RETRANS)))
28563013 return;
28573014 /* Change state if cwnd is undone or retransmits are lost */
2858
- /* fall through */
3015
+ fallthrough;
28593016 default:
28603017 if (tcp_is_reno(tp)) {
28613018 if (flag & FLAG_SND_UNA_ADVANCED)
28623019 tcp_reset_reno_sack(tp);
2863
- if (is_dupack)
2864
- tcp_add_reno_sack(sk);
3020
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
28653021 }
28663022
28673023 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
....@@ -2885,7 +3041,7 @@
28853041 }
28863042
28873043 /* Otherwise enter Recovery state */
2888
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
3044
+ tcp_enter_recovery(sk, ece_ack);
28893045 fast_rexmit = 1;
28903046 }
28913047
....@@ -2896,7 +3052,7 @@
28963052
28973053 static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
28983054 {
2899
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
3055
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
29003056 struct tcp_sock *tp = tcp_sk(sk);
29013057
29023058 if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
....@@ -2935,6 +3091,8 @@
29353091 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
29363092
29373093 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
3094
+ if (!delta)
3095
+ delta = 1;
29383096 seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
29393097 ca_rtt_us = seq_rtt_us;
29403098 }
....@@ -2988,7 +3146,7 @@
29883146 /* If the retrans timer is currently being used by Fast Open
29893147 * for SYN-ACK retrans purpose, stay put.
29903148 */
2991
- if (tp->fastopen_rsk)
3149
+ if (rcu_access_pointer(tp->fastopen_rsk))
29923150 return;
29933151
29943152 if (!tp->packets_out) {
....@@ -3004,8 +3162,8 @@
30043162 */
30053163 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
30063164 }
3007
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3008
- TCP_RTO_MAX);
3165
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3166
+ TCP_RTO_MAX);
30093167 }
30103168 }
30113169
....@@ -3061,7 +3219,7 @@
30613219 */
30623220 static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
30633221 u32 prior_snd_una,
3064
- struct tcp_sacktag_state *sack)
3222
+ struct tcp_sacktag_state *sack, bool ece_ack)
30653223 {
30663224 const struct inet_connection_sock *icsk = inet_csk(sk);
30673225 u64 first_ackt, last_ackt;
....@@ -3086,8 +3244,6 @@
30863244 u8 sacked = scb->sacked;
30873245 u32 acked_pcount;
30883246
3089
- tcp_ack_tstamp(sk, skb, prior_snd_una);
3090
-
30913247 /* Determine how many packets and what bytes were acked, tso and else */
30923248 if (after(scb->end_seq, tp->snd_una)) {
30933249 if (tcp_skb_pcount(skb) == 1 ||
....@@ -3107,7 +3263,7 @@
31073263 tp->retrans_out -= acked_pcount;
31083264 flag |= FLAG_RETRANS_DATA_ACKED;
31093265 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3110
- last_ackt = skb->skb_mstamp;
3266
+ last_ackt = tcp_skb_timestamp_us(skb);
31113267 WARN_ON_ONCE(last_ackt == 0);
31123268 if (!first_ackt)
31133269 first_ackt = last_ackt;
....@@ -3122,10 +3278,10 @@
31223278 if (sacked & TCPCB_SACKED_ACKED) {
31233279 tp->sacked_out -= acked_pcount;
31243280 } else if (tcp_is_sack(tp)) {
3125
- tp->delivered += acked_pcount;
3281
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
31263282 if (!tcp_skb_spurious_retrans(tp, skb))
31273283 tcp_rack_advance(tp, sacked, scb->end_seq,
3128
- skb->skb_mstamp);
3284
+ tcp_skb_timestamp_us(skb));
31293285 }
31303286 if (sacked & TCPCB_LOST)
31313287 tp->lost_out -= acked_pcount;
....@@ -3151,6 +3307,8 @@
31513307 if (!fully_acked)
31523308 break;
31533309
3310
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3311
+
31543312 next = skb_rb_next(skb);
31553313 if (unlikely(skb == tp->retransmit_skb_hint))
31563314 tp->retransmit_skb_hint = NULL;
....@@ -3166,8 +3324,11 @@
31663324 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
31673325 tp->snd_up = tp->snd_una;
31683326
3169
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3170
- flag |= FLAG_SACK_RENEGING;
3327
+ if (skb) {
3328
+ tcp_ack_tstamp(sk, skb, prior_snd_una);
3329
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3330
+ flag |= FLAG_SACK_RENEGING;
3331
+ }
31713332
31723333 if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
31733334 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
....@@ -3199,7 +3360,7 @@
31993360 }
32003361
32013362 if (tcp_is_reno(tp)) {
3202
- tcp_remove_reno_sacks(sk, pkts_acked);
3363
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
32033364
32043365 /* If any of the cumulatively ACKed segments was
32053366 * retransmitted, non-SACK case cannot confirm that
....@@ -3220,7 +3381,8 @@
32203381 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
32213382 }
32223383 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3223
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3384
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
3385
+ tcp_skb_timestamp_us(skb))) {
32243386 /* Do not re-arm RTO if the sack RTT is measured from data sent
32253387 * after when the head was last (re)transmitted. Otherwise the
32263388 * timeout may continue to extend in loss recovery.
....@@ -3273,6 +3435,7 @@
32733435 return;
32743436 if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
32753437 icsk->icsk_backoff = 0;
3438
+ icsk->icsk_probes_tstamp = 0;
32763439 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
32773440 /* Socket must be waked up by subsequent tcp_data_snd_check().
32783441 * This function is not for random using!
....@@ -3280,8 +3443,8 @@
32803443 } else {
32813444 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
32823445
3283
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3284
- when, TCP_RTO_MAX);
3446
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
3447
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
32853448 }
32863449 }
32873450
....@@ -3300,7 +3463,8 @@
33003463 * new SACK or ECE mark may first advance cwnd here and later reduce
33013464 * cwnd in tcp_fastretrans_alert() based on more states.
33023465 */
3303
- if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3466
+ if (tcp_sk(sk)->reordering >
3467
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
33043468 return flag & FLAG_FORWARD_PROGRESS;
33053469
33063470 return flag & FLAG_DATA_ACKED;
....@@ -3409,16 +3573,23 @@
34093573 static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
34103574 u32 *last_oow_ack_time)
34113575 {
3412
- if (*last_oow_ack_time) {
3413
- s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
3576
+ /* Paired with the WRITE_ONCE() in this function. */
3577
+ u32 val = READ_ONCE(*last_oow_ack_time);
34143578
3415
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
3579
+ if (val) {
3580
+ s32 elapsed = (s32)(tcp_jiffies32 - val);
3581
+
3582
+ if (0 <= elapsed &&
3583
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
34163584 NET_INC_STATS(net, mib_idx);
34173585 return true; /* rate-limited: don't send yet! */
34183586 }
34193587 }
34203588
3421
- *last_oow_ack_time = tcp_jiffies32;
3589
+ /* Paired with the prior READ_ONCE() and with itself,
3590
+ * as we might be lockless.
3591
+ */
3592
+ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
34223593
34233594 return false; /* not rate-limited: go ahead, send dupack now! */
34243595 }
....@@ -3459,11 +3630,11 @@
34593630
34603631 /* Then check host-wide RFC 5961 rate limit. */
34613632 now = jiffies / HZ;
3462
- if (now != challenge_timestamp) {
3463
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
3633
+ if (now != READ_ONCE(challenge_timestamp)) {
3634
+ u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
34643635 u32 half = (ack_limit + 1) >> 1;
34653636
3466
- challenge_timestamp = now;
3637
+ WRITE_ONCE(challenge_timestamp, now);
34673638 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
34683639 }
34693640 count = READ_ONCE(challenge_count);
....@@ -3544,10 +3715,10 @@
35443715 {
35453716 struct tcp_sock *tp = tcp_sk(sk);
35463717
3547
- if (rexmit == REXMIT_NONE)
3718
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
35483719 return;
35493720
3550
- if (unlikely(rexmit == 2)) {
3721
+ if (unlikely(rexmit == REXMIT_NEW)) {
35513722 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
35523723 TCP_NAGLE_OFF);
35533724 if (after(tp->snd_nxt, tp->high_seq))
....@@ -3566,10 +3737,9 @@
35663737
35673738 delivered = tp->delivered - prior_delivered;
35683739 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3569
- if (flag & FLAG_ECE) {
3570
- tp->delivered_ce += delivered;
3740
+ if (flag & FLAG_ECE)
35713741 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3572
- }
3742
+
35733743 return delivered;
35743744 }
35753745
....@@ -3584,7 +3754,7 @@
35843754 bool is_sack_reneg = tp->is_sack_reneg;
35853755 u32 ack_seq = TCP_SKB_CB(skb)->seq;
35863756 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3587
- bool is_dupack = false;
3757
+ int num_dupack = 0;
35883758 int prior_packets = tp->packets_out;
35893759 u32 delivered = tp->delivered;
35903760 u32 lost = tp->lost;
....@@ -3593,6 +3763,7 @@
35933763
35943764 sack_state.first_sackt = 0;
35953765 sack_state.rate = &rs;
3766
+ sack_state.sack_delivered = 0;
35963767
35973768 /* We very likely will need to access rtx queue. */
35983769 prefetch(sk->tcp_rtx_queue.rb_node);
....@@ -3614,14 +3785,14 @@
36143785 * this segment (RFC793 Section 3.9).
36153786 */
36163787 if (after(ack, tp->snd_nxt))
3617
- goto invalid_ack;
3788
+ return -1;
36183789
36193790 if (after(ack, prior_snd_una)) {
36203791 flag |= FLAG_SND_UNA_ADVANCED;
36213792 icsk->icsk_retransmits = 0;
36223793
36233794 #if IS_ENABLED(CONFIG_TLS_DEVICE)
3624
- if (static_branch_unlikely(&clean_acked_data_enabled))
3795
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
36253796 if (icsk->icsk_clean_acked)
36263797 icsk->icsk_clean_acked(sk, ack);
36273798 #endif
....@@ -3636,7 +3807,8 @@
36363807 if (flag & FLAG_UPDATE_TS_RECENT)
36373808 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
36383809
3639
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3810
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
3811
+ FLAG_SND_UNA_ADVANCED) {
36403812 /* Window is constant, pure forward advance.
36413813 * No more checks are required.
36423814 * Note, we use the fact that SND.UNA>=SND.WL2.
....@@ -3667,6 +3839,10 @@
36673839 ack_ev_flags |= CA_ACK_ECE;
36683840 }
36693841
3842
+ if (sack_state.sack_delivered)
3843
+ tcp_count_delivered(tp, sack_state.sack_delivered,
3844
+ flag & FLAG_ECE);
3845
+
36703846 if (flag & FLAG_WIN_UPDATE)
36713847 ack_ev_flags |= CA_ACK_WIN_UPDATE;
36723848
....@@ -3692,7 +3868,8 @@
36923868 goto no_queue;
36933869
36943870 /* See if we can take anything off of the retransmit queue. */
3695
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
3871
+ flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
3872
+ flag & FLAG_ECE);
36963873
36973874 tcp_rack_update_reo_wnd(sk, &rs);
36983875
....@@ -3700,8 +3877,14 @@
37003877 tcp_process_tlp_ack(sk, ack, flag);
37013878
37023879 if (tcp_ack_is_dubious(sk, flag)) {
3703
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3704
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3880
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
3881
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
3882
+ num_dupack = 1;
3883
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
3884
+ if (!(flag & FLAG_DATA))
3885
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
3886
+ }
3887
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37053888 &rexmit);
37063889 }
37073890
....@@ -3723,7 +3906,7 @@
37233906 no_queue:
37243907 /* If data was DSACKed, see if we can undo a cwnd reduction. */
37253908 if (flag & FLAG_DSACKING_ACK) {
3726
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3909
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37273910 &rexmit);
37283911 tcp_newly_delivered(sk, delivered, flag);
37293912 }
....@@ -3737,10 +3920,6 @@
37373920 tcp_process_tlp_ack(sk, ack, flag);
37383921 return 1;
37393922
3740
-invalid_ack:
3741
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3742
- return -1;
3743
-
37443923 old_ack:
37453924 /* If data was SACKed, tag it and see if we should send more data.
37463925 * If data was DSACKed, see if we can undo a cwnd reduction.
....@@ -3748,13 +3927,12 @@
37483927 if (TCP_SKB_CB(skb)->sacked) {
37493928 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
37503929 &sack_state);
3751
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
3930
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
37523931 &rexmit);
37533932 tcp_newly_delivered(sk, delivered, flag);
37543933 tcp_xmit_recovery(sk, rexmit);
37553934 }
37563935
3757
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
37583936 return 0;
37593937 }
37603938
....@@ -3775,7 +3953,7 @@
37753953 foc->exp = exp_opt;
37763954 }
37773955
3778
-static void smc_parse_options(const struct tcphdr *th,
3956
+static bool smc_parse_options(const struct tcphdr *th,
37793957 struct tcp_options_received *opt_rx,
37803958 const unsigned char *ptr,
37813959 int opsize)
....@@ -3784,10 +3962,56 @@
37843962 if (static_branch_unlikely(&tcp_have_smc)) {
37853963 if (th->syn && !(opsize & 1) &&
37863964 opsize >= TCPOLEN_EXP_SMC_BASE &&
3787
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
3965
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
37883966 opt_rx->smc_ok = 1;
3967
+ return true;
3968
+ }
37893969 }
37903970 #endif
3971
+ return false;
3972
+}
3973
+
3974
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
3975
+ * value on success.
3976
+ */
3977
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
3978
+{
3979
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
3980
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
3981
+ u16 mss = 0;
3982
+
3983
+ while (length > 0) {
3984
+ int opcode = *ptr++;
3985
+ int opsize;
3986
+
3987
+ switch (opcode) {
3988
+ case TCPOPT_EOL:
3989
+ return mss;
3990
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
3991
+ length--;
3992
+ continue;
3993
+ default:
3994
+ if (length < 2)
3995
+ return mss;
3996
+ opsize = *ptr++;
3997
+ if (opsize < 2) /* "silly options" */
3998
+ return mss;
3999
+ if (opsize > length)
4000
+ return mss; /* fail on partial options */
4001
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
4002
+ u16 in_mss = get_unaligned_be16(ptr);
4003
+
4004
+ if (in_mss) {
4005
+ if (user_mss && user_mss < in_mss)
4006
+ in_mss = user_mss;
4007
+ mss = in_mss;
4008
+ }
4009
+ }
4010
+ ptr += opsize - 2;
4011
+ length -= opsize;
4012
+ }
4013
+ }
4014
+ return mss;
37914015 }
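
Concrete illustration of the option walk above (editorial sketch): a SYN carrying the 4-byte MSS option 0x02 0x04 0x05 0xb4 advertises MSS 1460, and a non-zero user_mss only ever clamps the result downward. A self-contained userspace rendition of the same loop over a plain option buffer, with the helper name and sample bytes made up for the example:

#include <stdio.h>
#include <stdint.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOLEN_MSS	4

/* Same walk as tcp_parse_mss_option(), but over a bare option buffer. */
static uint16_t parse_mss(const uint8_t *ptr, int length, uint16_t user_mss)
{
	uint16_t mss = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return mss;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return mss;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return mss;	/* silly or truncated option */
			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
				uint16_t in_mss = (ptr[0] << 8) | ptr[1];

				if (in_mss) {
					if (user_mss && user_mss < in_mss)
						in_mss = user_mss;
					mss = in_mss;
				}
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return mss;
}

int main(void)
{
	/* MSS=1460, then NOP, NOP, SACK-permitted (kind 4, len 2) */
	const uint8_t opts[] = { 0x02, 0x04, 0x05, 0xb4, 0x01, 0x01, 0x04, 0x02 };

	printf("%u\n", parse_mss(opts, sizeof(opts), 0));	/* 1460 */
	printf("%u\n", parse_mss(opts, sizeof(opts), 1200));	/* 1200 */
	return 0;
}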
37924016
37934017 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
....@@ -3805,6 +4029,7 @@
38054029
38064030 ptr = (const unsigned char *)(th + 1);
38074031 opt_rx->saw_tstamp = 0;
4032
+ opt_rx->saw_unknown = 0;
38084033
38094034 while (length > 0) {
38104035 int opcode = *ptr++;
....@@ -3817,6 +4042,8 @@
38174042 length--;
38184043 continue;
38194044 default:
4045
+ if (length < 2)
4046
+ return;
38204047 opsize = *ptr++;
38214048 if (opsize < 2) /* "silly options" */
38224049 return;
....@@ -3836,7 +4063,7 @@
38364063 break;
38374064 case TCPOPT_WINDOW:
38384065 if (opsize == TCPOLEN_WINDOW && th->syn &&
3839
- !estab && net->ipv4.sysctl_tcp_window_scaling) {
4066
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
38404067 __u8 snd_wscale = *(__u8 *)ptr;
38414068 opt_rx->wscale_ok = 1;
38424069 if (snd_wscale > TCP_MAX_WSCALE) {
....@@ -3852,7 +4079,7 @@
38524079 case TCPOPT_TIMESTAMP:
38534080 if ((opsize == TCPOLEN_TIMESTAMP) &&
38544081 ((estab && opt_rx->tstamp_ok) ||
3855
- (!estab && net->ipv4.sysctl_tcp_timestamps))) {
4082
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
38564083 opt_rx->saw_tstamp = 1;
38574084 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
38584085 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
....@@ -3860,7 +4087,7 @@
38604087 break;
38614088 case TCPOPT_SACK_PERM:
38624089 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3863
- !estab && net->ipv4.sysctl_tcp_sack) {
4090
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
38644091 opt_rx->sack_ok = TCP_SACK_SEEN;
38654092 tcp_sack_reset(opt_rx);
38664093 }
....@@ -3893,15 +4120,21 @@
38934120 */
38944121 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
38954122 get_unaligned_be16(ptr) ==
3896
- TCPOPT_FASTOPEN_MAGIC)
4123
+ TCPOPT_FASTOPEN_MAGIC) {
38974124 tcp_parse_fastopen_option(opsize -
38984125 TCPOLEN_EXP_FASTOPEN_BASE,
38994126 ptr + 2, th->syn, foc, true);
3900
- else
3901
- smc_parse_options(th, opt_rx, ptr,
3902
- opsize);
4127
+ break;
4128
+ }
4129
+
4130
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
4131
+ break;
4132
+
4133
+ opt_rx->saw_unknown = 1;
39034134 break;
39044135
4136
+ default:
4137
+ opt_rx->saw_unknown = 1;
39054138 }
39064139 ptr += opsize-2;
39074140 length -= opsize;
....@@ -4109,7 +4342,7 @@
41094342
41104343 inet_csk_schedule_ack(sk);
41114344
4112
- sk->sk_shutdown |= RCV_SHUTDOWN;
4345
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
41134346 sock_set_flag(sk, SOCK_DONE);
41144347
41154348 switch (sk->sk_state) {
....@@ -4117,7 +4350,7 @@
41174350 case TCP_ESTABLISHED:
41184351 /* Move to CLOSE_WAIT */
41194352 tcp_set_state(sk, TCP_CLOSE_WAIT);
4120
- inet_csk(sk)->icsk_ack.pingpong = 1;
4353
+ inet_csk_enter_pingpong_mode(sk);
41214354 break;
41224355
41234356 case TCP_CLOSE_WAIT:
....@@ -4189,7 +4422,7 @@
41894422 {
41904423 struct tcp_sock *tp = tcp_sk(sk);
41914424
4192
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4425
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
41934426 int mib_idx;
41944427
41954428 if (before(seq, tp->rcv_nxt))
....@@ -4215,6 +4448,18 @@
42154448 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
42164449 }
42174450
4451
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
4452
+{
4453
+ /* When the ACK path fails or drops most ACKs, the sender would
4454
+ * time out and spuriously retransmit the same segment repeatedly.
4455
+ * The receiver remembers and reflects via DSACKs. Leverage the
4456
+ * DSACK state and change the txhash to re-route speculatively.
4457
+ */
4458
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
4459
+ sk_rethink_txhash(sk))
4460
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4461
+}
4462
+
42184463 static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
42194464 {
42204465 struct tcp_sock *tp = tcp_sk(sk);
....@@ -4224,9 +4469,10 @@
42244469 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
42254470 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
42264471
4227
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
4472
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
42284473 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
42294474
4475
+ tcp_rcv_spurious_retrans(sk, skb);
42304476 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
42314477 end_seq = tp->rcv_nxt;
42324478 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
....@@ -4260,9 +4506,37 @@
42604506 sp[i] = sp[i + 1];
42614507 continue;
42624508 }
4263
- this_sack++, swalk++;
4509
+ this_sack++;
4510
+ swalk++;
42644511 }
42654512 }
4513
+
4514
+static void tcp_sack_compress_send_ack(struct sock *sk)
4515
+{
4516
+ struct tcp_sock *tp = tcp_sk(sk);
4517
+
4518
+ if (!tp->compressed_ack)
4519
+ return;
4520
+
4521
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
4522
+ __sock_put(sk);
4523
+
4524
+ /* Since we finally have to send one ack,
4525
+ * subtract one from tp->compressed_ack to keep
4526
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
4527
+ */
4528
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4529
+ tp->compressed_ack - 1);
4530
+
4531
+ tp->compressed_ack = 0;
4532
+ tcp_send_ack(sk);
4533
+}
4534
+
4535
+/* Reasonable number of SACK blocks included in TCP SACK option
4536
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
4537
+ * Given that SACK packets might be lost, be conservative and use 2.
4538
+ */
4539
+#define TCP_SACK_BLOCKS_EXPECTED 2
42664540
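The "max is 4, or 3 with timestamps" figure behind TCP_SACK_BLOCKS_EXPECTED follows from the 40-byte TCP option budget: a SACK block costs 8 bytes on top of a 2-byte option header, and the timestamp option typically occupies 12 aligned bytes. A standalone check of that arithmetic, assuming those standard encodings:

/* Illustration only: how many SACK blocks fit in the TCP option space. */
#include <stdio.h>

int main(void)
{
        const int opt_space = 40;       /* maximum TCP option bytes */
        const int sack_hdr = 2;         /* kind + length */
        const int per_block = 8;        /* two 32-bit sequence numbers */
        const int tstamp_aligned = 12;  /* 10-byte timestamp option + 2 NOPs */

        printf("blocks without timestamps: %d\n",
               (opt_space - sack_hdr) / per_block);                    /* 4 */
        printf("blocks with timestamps:    %d\n",
               (opt_space - tstamp_aligned - sack_hdr) / per_block);   /* 3 */
        return 0;
}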
42674541 static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
42684542 {
....@@ -4276,6 +4550,8 @@
42764550
42774551 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
42784552 if (tcp_sack_extend(sp, seq, end_seq)) {
4553
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4554
+ tcp_sack_compress_send_ack(sk);
42794555 /* Rotate this_sack to the first one. */
42804556 for (; this_sack > 0; this_sack--, sp--)
42814557 swap(*sp, *(sp - 1));
....@@ -4285,6 +4561,9 @@
42854561 }
42864562 }
42874563
4564
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4565
+ tcp_sack_compress_send_ack(sk);
4566
+
42884567 /* Could not find an adjacent existing SACK, build a new one,
42894568 * put it at the front, and shift everyone else down. We
42904569 * always know there is at least one SACK present already here.
....@@ -4292,8 +4571,6 @@
42924571 * If the sack array is full, forget about the last one.
42934572 */
42944573 if (this_sack >= TCP_NUM_SACKS) {
4295
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
4296
- tcp_send_ack(sk);
42974574 this_sack--;
42984575 tp->rx_opt.num_sacks--;
42994576 sp--;
....@@ -4345,7 +4622,6 @@
43454622 /**
43464623 * tcp_try_coalesce - try to merge skb to prior one
43474624 * @sk: socket
4348
- * @dest: destination queue
43494625 * @to: prior buffer
43504626 * @from: buffer to add in queue
43514627 * @fragstolen: pointer to boolean
....@@ -4367,6 +4643,9 @@
43674643
43684644 /* It's possible this segment overlaps with prior segment in queue */
43694645 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4646
+ return false;
4647
+
4648
+ if (!mptcp_skb_can_collapse(to, from))
43704649 return false;
43714650
43724651 #ifdef CONFIG_TLS_DEVICE
....@@ -4412,6 +4691,7 @@
44124691
44134692 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
44144693 {
4694
+ trace_android_vh_kfree_skb(skb);
44154695 sk_drops_add(sk, skb);
44164696 __kfree_skb(skb);
44174697 }
....@@ -4443,13 +4723,9 @@
44434723 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
44444724
44454725 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4446
- SOCK_DEBUG(sk, "ofo packet was already received\n");
44474726 tcp_drop(sk, skb);
44484727 continue;
44494728 }
4450
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4451
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4452
- TCP_SKB_CB(skb)->end_seq);
44534729
44544730 tail = skb_peek_tail(&sk->sk_receive_queue);
44554731 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
....@@ -4511,11 +4787,10 @@
45114787 tp->pred_flags = 0;
45124788 inet_csk_schedule_ack(sk);
45134789
4790
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
45144791 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
45154792 seq = TCP_SKB_CB(skb)->seq;
45164793 end_seq = TCP_SKB_CB(skb)->end_seq;
4517
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4518
- tp->rcv_nxt, seq, end_seq);
45194794
45204795 p = &tp->out_of_order_queue.rb_node;
45214796 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
....@@ -4541,7 +4816,7 @@
45414816 * and trigger fast retransmit.
45424817 */
45434818 if (tcp_is_sack(tp))
4544
- tcp_grow_window(sk, skb);
4819
+ tcp_grow_window(sk, skb, true);
45454820 kfree_skb_partial(skb, fragstolen);
45464821 skb = NULL;
45474822 goto add_sack;
....@@ -4629,19 +4904,18 @@
46294904 * and trigger fast retransmit.
46304905 */
46314906 if (tcp_is_sack(tp))
4632
- tcp_grow_window(sk, skb);
4907
+ tcp_grow_window(sk, skb, false);
46334908 skb_condense(skb);
46344909 skb_set_owner_r(skb, sk);
46354910 }
46364911 }
46374912
4638
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4639
- bool *fragstolen)
4913
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
4914
+ bool *fragstolen)
46404915 {
46414916 int eaten;
46424917 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
46434918
4644
- __skb_pull(skb, hdrlen);
46454919 eaten = (tail &&
46464920 tcp_try_coalesce(sk, tail,
46474921 skb, fragstolen)) ? 1 : 0;
....@@ -4692,7 +4966,7 @@
46924966 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
46934967 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
46944968
4695
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4969
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
46964970 WARN_ON_ONCE(fragstolen); /* should not happen */
46974971 __kfree_skb(skb);
46984972 }
....@@ -4724,6 +4998,9 @@
47244998 bool fragstolen;
47254999 int eaten;
47265000
5001
+ if (sk_is_mptcp(sk))
5002
+ mptcp_incoming_options(sk, skb);
5003
+
47275004 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
47285005 __kfree_skb(skb);
47295006 return;
....@@ -4753,7 +5030,7 @@
47535030 goto drop;
47545031 }
47555032
4756
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
5033
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
47575034 if (skb->len)
47585035 tcp_event_data_recv(sk, skb);
47595036 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
....@@ -4782,6 +5059,7 @@
47825059 }
47835060
47845061 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5062
+ tcp_rcv_spurious_retrans(sk, skb);
47855063 /* A retransmit, 2nd most common case. Force an immediate ack. */
47865064 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
47875065 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
....@@ -4800,10 +5078,6 @@
48005078
48015079 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
48025080 /* Partial packet, seq < rcv_next < end_seq */
4803
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4804
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4805
- TCP_SKB_CB(skb)->end_seq);
4806
-
48075081 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
48085082
48095083 /* If window is closed, drop tail of packet. But after
....@@ -4897,7 +5171,7 @@
48975171 /* The first skb to collapse is:
48985172 * - not SYN/FIN and
48995173 * - bloated or contains data before "start" or
4900
- * overlaps to the next one.
5174
+ * overlaps to the next one and mptcp allows collapsing.
49015175 */
49025176 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
49035177 (tcp_win_from_space(sk, skb->truesize) > skb->len ||
....@@ -4906,7 +5180,7 @@
49065180 break;
49075181 }
49085182
4909
- if (n && n != tail &&
5183
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
49105184 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
49115185 end_of_skbs = false;
49125186 break;
....@@ -4939,6 +5213,7 @@
49395213 else
49405214 __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
49415215 skb_set_owner_r(nskb, sk);
5216
+ mptcp_skb_ext_move(nskb, skb);
49425217
49435218 /* Copy data, releasing collapsed skbs. */
49445219 while (copy > 0) {
....@@ -4958,6 +5233,7 @@
49585233 skb = tcp_collapse_one(sk, skb, list, root);
49595234 if (!skb ||
49605235 skb == tail ||
5236
+ !mptcp_skb_can_collapse(nskb, skb) ||
49615237 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
49625238 goto end;
49635239 #ifdef CONFIG_TLS_DEVICE
....@@ -5082,8 +5358,6 @@
50825358 {
50835359 struct tcp_sock *tp = tcp_sk(sk);
50845360
5085
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
5086
-
50875361 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
50885362
50895363 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
....@@ -5149,12 +5423,6 @@
51495423 return true;
51505424 }
51515425
5152
-/* When incoming ACK allowed to free some skb from write_queue,
5153
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
5154
- * on the exit from tcp input handler.
5155
- *
5156
- * PROBLEM: sndbuf expansion does not work well with largesend.
5157
- */
51585426 static void tcp_new_space(struct sock *sk)
51595427 {
51605428 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5167,18 +5435,25 @@
51675435 sk->sk_write_space(sk);
51685436 }
51695437
5170
-static void tcp_check_space(struct sock *sk)
5438
+/* Caller made space either from:
5439
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5440
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5441
+ *
5442
+ * We might be able to generate EPOLLOUT to the application if:
5443
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5444
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5445
+ * small enough that tcp_stream_memory_free() decides it
5446
+ * is time to generate EPOLLOUT.
5447
+ */
5448
+void tcp_check_space(struct sock *sk)
51715449 {
5172
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5173
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5174
- /* pairs with tcp_poll() */
5175
- smp_mb();
5176
- if (sk->sk_socket &&
5177
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5178
- tcp_new_space(sk);
5179
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5180
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5181
- }
5450
+ /* pairs with tcp_poll() */
5451
+ smp_mb();
5452
+ if (sk->sk_socket &&
5453
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5454
+ tcp_new_space(sk);
5455
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5456
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
51825457 }
51835458 }
51845459
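The two conditions listed in the comment above can be restated as a single predicate. The sketch below is an illustration with simplified stand-ins for the fields the kernel consults (sk_wmem_queued, sk_sndbuf, tp->write_seq, tp->snd_nxt and the notsent low-water mark); tcp_stream_memory_free() checks more than this:

/* Illustration only: when, per the comment above, raising EPOLLOUT may be
 * worthwhile. Plain integers stand in for the socket fields. */
#include <stdbool.h>
#include <stdio.h>

static bool may_signal_write_space(unsigned int wmem_queued, unsigned int sndbuf,
                                   unsigned int write_seq, unsigned int snd_nxt,
                                   unsigned int notsent_lowat)
{
        unsigned int notsent = write_seq - snd_nxt;     /* queued but not yet sent */

        return wmem_queued < sndbuf / 2 && notsent < notsent_lowat;
}

int main(void)
{
        printf("%d\n", may_signal_write_space(16384, 87380, 1000, 900, 16384));
        return 0;
}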
....@@ -5220,20 +5495,18 @@
52205495 }
52215496
52225497 if (!tcp_is_sack(tp) ||
5223
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5498
+ tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
52245499 goto send_now;
52255500
52265501 if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
52275502 tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5228
- if (tp->compressed_ack > TCP_FASTRETRANS_THRESH)
5229
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
5230
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
5231
- tp->compressed_ack = 0;
5503
+ tp->dup_ack_counter = 0;
52325504 }
5233
-
5234
- if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH)
5505
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5506
+ tp->dup_ack_counter++;
52355507 goto send_now;
5236
-
5508
+ }
5509
+ tp->compressed_ack++;
52375510 if (hrtimer_is_queued(&tp->compressed_ack_timer))
52385511 return;
52395512
....@@ -5243,11 +5516,13 @@
52435516 if (tp->srtt_us && tp->srtt_us < rtt)
52445517 rtt = tp->srtt_us;
52455518
5246
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5519
+ delay = min_t(unsigned long,
5520
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
52475521 rtt * (NSEC_PER_USEC >> 3)/20);
52485522 sock_hold(sk);
5249
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5250
- HRTIMER_MODE_REL_PINNED_SOFT);
5523
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
5524
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5525
+ HRTIMER_MODE_REL_PINNED_SOFT);
52515526 }
52525527
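In the delay computation above, tp->srtt_us is kept left-shifted by 3, so rtt * (NSEC_PER_USEC >> 3) / 20 works out to roughly 5% of the smoothed RTT in nanoseconds, capped by the tcp_comp_sack_delay_ns sysctl. A quick standalone check of that arithmetic, with made-up sample values:

/* Illustration only: the compressed-ACK timer is the smaller of the sysctl
 * ceiling and about 5% of the smoothed RTT (srtt stored as usec << 3). */
#include <stdio.h>

#define NSEC_PER_USEC 1000UL

static unsigned long comp_sack_delay_ns(unsigned long srtt_shifted,
                                        unsigned long sysctl_delay_ns)
{
        unsigned long delay = srtt_shifted * (NSEC_PER_USEC >> 3) / 20;

        return delay < sysctl_delay_ns ? delay : sysctl_delay_ns;
}

int main(void)
{
        /* 10 ms smoothed RTT (10000 us << 3), 1 ms sysctl ceiling */
        printf("%lu ns\n", comp_sack_delay_ns(10000UL << 3, 1000000UL));        /* 500000 */
        return 0;
}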
52535528 static inline void tcp_ack_snd_check(struct sock *sk)
....@@ -5274,7 +5549,7 @@
52745549 struct tcp_sock *tp = tcp_sk(sk);
52755550 u32 ptr = ntohs(th->urg_ptr);
52765551
5277
- if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
5552
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
52785553 ptr--;
52795554 ptr += ntohl(th->seq);
52805555
....@@ -5328,7 +5603,7 @@
53285603 }
53295604
53305605 tp->urg_data = TCP_URG_NOTYET;
5331
- tp->urg_seq = ptr;
5606
+ WRITE_ONCE(tp->urg_seq, ptr);
53325607
53335608 /* Disable header prediction. */
53345609 tp->pred_flags = 0;
....@@ -5481,6 +5756,8 @@
54815756 goto discard;
54825757 }
54835758
5759
+ bpf_skops_parse_hdr(sk, skb);
5760
+
54845761 return true;
54855762
54865763 discard:
....@@ -5521,7 +5798,7 @@
55215798 trace_tcp_probe(sk, skb);
55225799
55235800 tcp_mstamp_refresh(tp);
5524
- if (unlikely(!sk->sk_rx_dst))
5801
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
55255802 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
55265803 /*
55275804 * Header prediction.
....@@ -5628,8 +5905,8 @@
56285905 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
56295906
56305907 /* Bulk data transfer: receiver */
5631
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5632
- &fragstolen);
5908
+ __skb_pull(skb, tcp_header_len);
5909
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
56335910
56345911 tcp_event_data_recv(sk, skb);
56355912
....@@ -5691,6 +5968,34 @@
56915968 }
56925969 EXPORT_SYMBOL(tcp_rcv_established);
56935970
5971
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
5972
+{
5973
+ struct inet_connection_sock *icsk = inet_csk(sk);
5974
+ struct tcp_sock *tp = tcp_sk(sk);
5975
+
5976
+ tcp_mtup_init(sk);
5977
+ icsk->icsk_af_ops->rebuild_header(sk);
5978
+ tcp_init_metrics(sk);
5979
+
5980
+ /* Initialize the congestion window to start the transfer.
5981
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
5982
+ * retransmitted. In light of RFC6298 more aggressive 1sec
5983
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
5984
+ * retransmission has occurred.
5985
+ */
5986
+ if (tp->total_retrans > 1 && tp->undo_marker)
5987
+ tp->snd_cwnd = 1;
5988
+ else
5989
+ tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
5990
+ tp->snd_cwnd_stamp = tcp_jiffies32;
5991
+
5992
+ bpf_skops_established(sk, bpf_op, skb);
5993
+ /* Initialize congestion control unless BPF initialized it already: */
5994
+ if (!icsk->icsk_ca_initialized)
5995
+ tcp_init_congestion_control(sk);
5996
+ tcp_init_buffer_space(sk);
5997
+}
5998
+
56945999 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
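The cwnd choice in tcp_init_transfer() above reduces to a small decision: fall back to one segment only when more than one SYN/SYN-ACK retransmission happened while the undo marker is still set, otherwise keep the normal initial window. A sketch of that decision, with a fixed stand-in for whatever tcp_init_cwnd() would derive from route metrics:

/* Illustration only: the initial-cwnd decision described in the comment in
 * tcp_init_transfer(). INIT_CWND_SEGS stands in for tcp_init_cwnd(). */
#include <stdio.h>

#define INIT_CWND_SEGS 10       /* the usual initial window, in segments */

static unsigned int initial_cwnd(unsigned int total_retrans, int undo_marker)
{
        if (total_retrans > 1 && undo_marker)
                return 1;
        return INIT_CWND_SEGS;
}

int main(void)
{
        printf("clean handshake:     %u\n", initial_cwnd(0, 0));
        printf("2 SYN retransmits:   %u\n", initial_cwnd(2, 1));
        return 0;
}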
56956000 {
56966001 struct tcp_sock *tp = tcp_sk(sk);
....@@ -5705,7 +6010,7 @@
57056010 sk_mark_napi_id(sk, skb);
57066011 }
57076012
5708
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
6013
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
57096014
57106015 /* Prevent spurious tcp_cwnd_restart() on first data
57116016 * packet.
....@@ -5760,6 +6065,10 @@
57606065 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
57616066
57626067 if (data) { /* Retransmit unacked data in SYN */
6068
+ if (tp->total_retrans)
6069
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6070
+ else
6071
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
57636072 skb_rbtree_walk_from(data) {
57646073 if (__tcp_retransmit_skb(sk, data, 1))
57656074 break;
....@@ -5792,6 +6101,21 @@
57926101 #endif
57936102 }
57946103
6104
+static void tcp_try_undo_spurious_syn(struct sock *sk)
6105
+{
6106
+ struct tcp_sock *tp = tcp_sk(sk);
6107
+ u32 syn_stamp;
6108
+
6109
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
6110
+ * spurious if the ACK's timestamp option echo value matches the
6111
+ * original SYN timestamp.
6112
+ */
6113
+ syn_stamp = tp->retrans_stamp;
6114
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6115
+ syn_stamp == tp->rx_opt.rcv_tsecr)
6116
+ tp->undo_marker = 0;
6117
+}
6118
+
57956119 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
57966120 const struct tcphdr *th)
57976121 {
....@@ -5815,8 +6139,14 @@
58156139 * the segment and return)"
58166140 */
58176141 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5818
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
6142
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6143
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
6144
+ if (icsk->icsk_retransmits == 0)
6145
+ inet_csk_reset_xmit_timer(sk,
6146
+ ICSK_TIME_RETRANS,
6147
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
58196148 goto reset_and_undo;
6149
+ }
58206150
58216151 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
58226152 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
....@@ -5859,6 +6189,7 @@
58596189 tcp_ecn_rcv_synack(tp, th);
58606190
58616191 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6192
+ tcp_try_undo_spurious_syn(sk);
58626193 tcp_ack(sk, skb, FLAG_SLOWPATH);
58636194
58646195 /* Ok.. it's good. Set up sequence numbers and
....@@ -5912,7 +6243,7 @@
59126243 return -1;
59136244 if (sk->sk_write_pending ||
59146245 icsk->icsk_accept_queue.rskq_defer_accept ||
5915
- icsk->icsk_ack.pingpong) {
6246
+ inet_csk_in_pingpong_mode(sk)) {
59166247 /* Save one ACK. Data will be ready after
59176248 * several ticks, if write_pending is set.
59186249 *
....@@ -6017,6 +6348,38 @@
60176348 return 1;
60186349 }
60196350
6351
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6352
+{
6353
+ struct request_sock *req;
6354
+
6355
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
6356
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
6357
+ */
6358
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
6359
+ tcp_try_undo_loss(sk, false);
6360
+
6361
+ /* Reset rtx states to prevent spurious retransmits_timed_out() */
6362
+ tcp_sk(sk)->retrans_stamp = 0;
6363
+ inet_csk(sk)->icsk_retransmits = 0;
6364
+
6365
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
6366
+ * we no longer need req so release it.
6367
+ */
6368
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
6369
+ lockdep_sock_is_held(sk));
6370
+ reqsk_fastopen_remove(sk, req, false);
6371
+
6372
+ /* Re-arm the timer because data may have been sent out.
6373
+ * This is similar to the regular data transmission case
6374
+ * when new data has just been ack'ed.
6375
+ *
6376
+ * (TFO) - we could try to be more aggressive and
6377
+ * retransmit any data sooner based on when they
6378
+ * are sent out.
6379
+ */
6380
+ tcp_rearm_rto(sk);
6381
+}
6382
+
60206383 /*
60216384 * This function implements the receiving procedure of RFC 793 for
60226385 * all states except ESTABLISHED and TIME_WAIT.
....@@ -6079,7 +6442,8 @@
60796442
60806443 tcp_mstamp_refresh(tp);
60816444 tp->rx_opt.saw_tstamp = 0;
6082
- req = tp->fastopen_rsk;
6445
+ req = rcu_dereference_protected(tp->fastopen_rsk,
6446
+ lockdep_sock_is_held(sk));
60836447 if (req) {
60846448 bool req_stolen;
60856449
....@@ -6113,23 +6477,13 @@
61136477 if (!tp->srtt_us)
61146478 tcp_synack_rtt_meas(sk, req);
61156479
6116
- /* Once we leave TCP_SYN_RECV, we no longer need req
6117
- * so release it.
6118
- */
61196480 if (req) {
6120
- inet_csk(sk)->icsk_retransmits = 0;
6121
- reqsk_fastopen_remove(sk, req, false);
6122
- /* Re-arm the timer because data may have been sent out.
6123
- * This is similar to the regular data transmission case
6124
- * when new data has just been ack'ed.
6125
- *
6126
- * (TFO) - we could try to be more aggressive and
6127
- * retransmitting any data sooner based on when they
6128
- * are sent out.
6129
- */
6130
- tcp_rearm_rto(sk);
6481
+ tcp_rcv_synrecv_state_fastopen(sk);
61316482 } else {
6132
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
6483
+ tcp_try_undo_spurious_syn(sk);
6484
+ tp->retrans_stamp = 0;
6485
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6486
+ skb);
61336487 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
61346488 }
61356489 smp_mb();
....@@ -6163,21 +6517,14 @@
61636517 case TCP_FIN_WAIT1: {
61646518 int tmo;
61656519
6166
- /* If we enter the TCP_FIN_WAIT1 state and we are a
6167
- * Fast Open socket and this is the first acceptable
6168
- * ACK we have received, this would have acknowledged
6169
- * our SYNACK so stop the SYNACK timer.
6170
- */
6171
- if (req) {
6172
- /* We no longer need the request sock. */
6173
- reqsk_fastopen_remove(sk, req, false);
6174
- tcp_rearm_rto(sk);
6175
- }
6520
+ if (req)
6521
+ tcp_rcv_synrecv_state_fastopen(sk);
6522
+
61766523 if (tp->snd_una != tp->write_seq)
61776524 break;
61786525
61796526 tcp_set_state(sk, TCP_FIN_WAIT2);
6180
- sk->sk_shutdown |= SEND_SHUTDOWN;
6527
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
61816528
61826529 sk_dst_confirm(sk);
61836530
....@@ -6244,9 +6591,12 @@
62446591 case TCP_CLOSE_WAIT:
62456592 case TCP_CLOSING:
62466593 case TCP_LAST_ACK:
6247
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6594
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
6595
+ if (sk_is_mptcp(sk))
6596
+ mptcp_incoming_options(sk, skb);
62486597 break;
6249
- /* fall through */
6598
+ }
6599
+ fallthrough;
62506600 case TCP_FIN_WAIT1:
62516601 case TCP_FIN_WAIT2:
62526602 /* RFC 793 says to queue data in these states,
....@@ -6261,7 +6611,7 @@
62616611 return 1;
62626612 }
62636613 }
6264
- /* Fall through */
6614
+ fallthrough;
62656615 case TCP_ESTABLISHED:
62666616 tcp_data_queue(sk, skb);
62676617 queued = 1;
....@@ -6307,6 +6657,11 @@
63076657 * congestion control: Linux DCTCP asserts ECT on all packets,
63086658 * including SYN, which is the most optimal solution; however,
63096659 * others, such as FreeBSD do not.
6660
+ *
6661
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
6662
+ * set, indicating the use of a future TCP extension (such as AccECN). See
6663
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
6664
+ * extensions.
63106665 */
63116666 static void tcp_ecn_create_request(struct request_sock *req,
63126667 const struct sk_buff *skb,
....@@ -6326,7 +6681,7 @@
63266681 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
63276682 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
63286683
6329
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6684
+ if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
63306685 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
63316686 tcp_bpf_ca_needs_ecn((struct sock *)req))
63326687 inet_rsk(req)->ecn_ok = 1;
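With this change a SYN that does not carry ECT can still negotiate ECN for the request when any reserved header bit (th->res1) is set, the AccECN-style exception described in the comment above, provided ECN is otherwise acceptable. A minimal predicate covering just that part of the test; the congestion-control and BPF overrides in the real condition are left out:

/* Illustration only: the reworked ECN acceptance test for incoming SYNs,
 * without the tcp_ca_needs_ecn()/BPF/DST overrides. */
#include <stdbool.h>
#include <stdio.h>

static bool request_ecn_ok(bool ect, unsigned int res1_bits, bool ecn_ok)
{
        return (!ect || res1_bits) && ecn_ok;
}

int main(void)
{
        printf("ECT set, no reserved bits: %d\n", request_ecn_ok(true, 0, true));  /* 0 */
        printf("ECT set, reserved bit set: %d\n", request_ecn_ok(true, 1, true));  /* 1 */
        printf("no ECT, ECN acceptable:    %d\n", request_ecn_ok(false, 0, true)); /* 1 */
        return 0;
}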
....@@ -6339,10 +6694,9 @@
63396694 struct inet_request_sock *ireq = inet_rsk(req);
63406695
63416696 req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
6342
- req->cookie_ts = 0;
63436697 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
63446698 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6345
- tcp_rsk(req)->snt_synack = tcp_clock_us();
6699
+ tcp_rsk(req)->snt_synack = 0;
63466700 tcp_rsk(req)->last_oow_ack_time = 0;
63476701 req->mss = rx_opt->mss_clamp;
63486702 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
....@@ -6387,17 +6741,18 @@
63876741 /*
63886742 * Return true if a syncookie should be sent
63896743 */
6390
-static bool tcp_syn_flood_action(const struct sock *sk,
6391
- const struct sk_buff *skb,
6392
- const char *proto)
6744
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
63936745 {
63946746 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
63956747 const char *msg = "Dropping request";
6396
- bool want_cookie = false;
63976748 struct net *net = sock_net(sk);
6749
+ bool want_cookie = false;
6750
+ u8 syncookies;
6751
+
6752
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
63986753
63996754 #ifdef CONFIG_SYN_COOKIES
6400
- if (net->ipv4.sysctl_tcp_syncookies) {
6755
+ if (syncookies) {
64016756 msg = "Sending cookies";
64026757 want_cookie = true;
64036758 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
....@@ -6405,11 +6760,10 @@
64056760 #endif
64066761 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
64076762
6408
- if (!queue->synflood_warned &&
6409
- net->ipv4.sysctl_tcp_syncookies != 2 &&
6763
+ if (!queue->synflood_warned && syncookies != 2 &&
64106764 xchg(&queue->synflood_warned, 1) == 0)
64116765 net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6412
- proto, ntohs(tcp_hdr(skb)->dest), msg);
6766
+ proto, sk->sk_num, msg);
64136767
64146768 return want_cookie;
64156769 }
....@@ -6420,16 +6774,60 @@
64206774 {
64216775 if (tcp_sk(sk)->save_syn) {
64226776 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6423
- u32 *copy;
6777
+ struct saved_syn *saved_syn;
6778
+ u32 mac_hdrlen;
6779
+ void *base;
64246780
6425
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
6426
- if (copy) {
6427
- copy[0] = len;
6428
- memcpy(&copy[1], skb_network_header(skb), len);
6429
- req->saved_syn = copy;
6781
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
6782
+ base = skb_mac_header(skb);
6783
+ mac_hdrlen = skb_mac_header_len(skb);
6784
+ len += mac_hdrlen;
6785
+ } else {
6786
+ base = skb_network_header(skb);
6787
+ mac_hdrlen = 0;
6788
+ }
6789
+
6790
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
6791
+ GFP_ATOMIC);
6792
+ if (saved_syn) {
6793
+ saved_syn->mac_hdrlen = mac_hdrlen;
6794
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
6795
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
6796
+ memcpy(saved_syn->data, base, len);
6797
+ req->saved_syn = saved_syn;
64306798 }
64316799 }
64326800 }
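The reworked save_syn path above keeps the header-length metadata and the copied header bytes in one allocation, using a flexible array member sized with struct_size(). A userspace sketch of the same allocation pattern, with made-up header bytes and a plain malloc in place of the kernel helpers:

/* Illustration only: the saved-SYN layout, fixed-size lengths followed by a
 * flexible array holding the copied headers, allocated in a single block. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct saved_syn_demo {
        unsigned int mac_hdrlen;
        unsigned int network_hdrlen;
        unsigned int tcp_hdrlen;
        unsigned char data[];           /* copied headers follow the struct */
};

int main(void)
{
        unsigned char fake_headers[60] = { 0x45, 0x00 };        /* made-up bytes */
        size_t len = sizeof(fake_headers);
        struct saved_syn_demo *s;

        s = malloc(sizeof(*s) + len);   /* what struct_size() computes, minus overflow checks */
        if (!s)
                return 1;
        s->mac_hdrlen = 0;
        s->network_hdrlen = 20;
        s->tcp_hdrlen = 40;
        memcpy(s->data, fake_headers, len);
        printf("saved %zu header bytes (net %u, tcp %u)\n",
               len, s->network_hdrlen, s->tcp_hdrlen);
        free(s);
        return 0;
}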
6801
+
6802
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
6803
+ * used for SYN cookie generation.
6804
+ */
6805
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
6806
+ const struct tcp_request_sock_ops *af_ops,
6807
+ struct sock *sk, struct tcphdr *th)
6808
+{
6809
+ struct tcp_sock *tp = tcp_sk(sk);
6810
+ u16 mss;
6811
+
6812
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
6813
+ !inet_csk_reqsk_queue_is_full(sk))
6814
+ return 0;
6815
+
6816
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
6817
+ return 0;
6818
+
6819
+ if (sk_acceptq_is_full(sk)) {
6820
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6821
+ return 0;
6822
+ }
6823
+
6824
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
6825
+ if (!mss)
6826
+ mss = af_ops->mss_clamp;
6827
+
6828
+ return mss;
6829
+}
6830
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
64336831
64346832 int tcp_conn_request(struct request_sock_ops *rsk_ops,
64356833 const struct tcp_request_sock_ops *af_ops,
....@@ -6445,14 +6843,16 @@
64456843 bool want_cookie = false;
64466844 struct dst_entry *dst;
64476845 struct flowi fl;
6846
+ u8 syncookies;
6847
+
6848
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
64486849
64496850 /* TW buckets are converted to open requests without
64506851 * limitations, they conserve resources and peer is
64516852 * evidently real one.
64526853 */
6453
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
6454
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6455
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6854
+ if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6855
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
64566856 if (!want_cookie)
64576857 goto drop;
64586858 }
....@@ -6466,8 +6866,12 @@
64666866 if (!req)
64676867 goto drop;
64686868
6869
+ req->syncookie = want_cookie;
64696870 tcp_rsk(req)->af_specific = af_ops;
64706871 tcp_rsk(req)->ts_off = 0;
6872
+#if IS_ENABLED(CONFIG_MPTCP)
6873
+ tcp_rsk(req)->is_mptcp = 0;
6874
+#endif
64716875
64726876 tcp_clear_options(&tmp_opt);
64736877 tmp_opt.mss_clamp = af_ops->mss_clamp;
....@@ -6501,10 +6905,12 @@
65016905 goto drop_and_free;
65026906
65036907 if (!want_cookie && !isn) {
6908
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
6909
+
65046910 /* Kill the following clause, if you dislike this way. */
6505
- if (!net->ipv4.sysctl_tcp_syncookies &&
6506
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6507
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6911
+ if (!syncookies &&
6912
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6913
+ (max_syn_backlog >> 2)) &&
65086914 !tcp_peer_is_proven(req, dst)) {
65096915 /* Without syncookies last quarter of
65106916 * backlog is filled with destinations,
....@@ -6525,13 +6931,13 @@
65256931
65266932 if (want_cookie) {
65276933 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6528
- req->cookie_ts = tmp_opt.tstamp_ok;
65296934 if (!tmp_opt.tstamp_ok)
65306935 inet_rsk(req)->ecn_ok = 0;
65316936 }
65326937
65336938 tcp_rsk(req)->snt_isn = isn;
65346939 tcp_rsk(req)->txhash = net_tx_rndhash();
6940
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
65356941 tcp_openreq_init_rwin(req, sk, dst);
65366942 sk_rx_queue_set(req_to_sk(req), skb);
65376943 if (!want_cookie) {
....@@ -6540,14 +6946,13 @@
65406946 }
65416947 if (fastopen_sk) {
65426948 af_ops->send_synack(fastopen_sk, dst, &fl, req,
6543
- &foc, TCP_SYNACK_FASTOPEN);
6949
+ &foc, TCP_SYNACK_FASTOPEN, skb);
65446950 /* Add the child socket directly into the accept queue */
65456951 if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
65466952 reqsk_fastopen_remove(fastopen_sk, req, false);
65476953 bh_unlock_sock(fastopen_sk);
65486954 sock_put(fastopen_sk);
6549
- reqsk_put(req);
6550
- goto drop;
6955
+ goto drop_and_free;
65516956 }
65526957 sk->sk_data_ready(sk);
65536958 bh_unlock_sock(fastopen_sk);
....@@ -6559,7 +6964,8 @@
65596964 tcp_timeout_init((struct sock *)req));
65606965 af_ops->send_synack(sk, dst, &fl, req, &foc,
65616966 !want_cookie ? TCP_SYNACK_NORMAL :
6562
- TCP_SYNACK_COOKIE);
6967
+ TCP_SYNACK_COOKIE,
6968
+ skb);
65636969 if (want_cookie) {
65646970 reqsk_free(req);
65656971 return 0;
....@@ -6571,7 +6977,7 @@
65716977 drop_and_release:
65726978 dst_release(dst);
65736979 drop_and_free:
6574
- reqsk_free(req);
6980
+ __reqsk_free(req);
65756981 drop:
65766982 tcp_listendrop(sk);
65776983 return 0;