2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_output.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -37,6 +38,7 @@
3738 #define pr_fmt(fmt) "TCP: " fmt
3839
3940 #include <net/tcp.h>
41
+#include <net/mptcp.h>
4042
4143 #include <linux/compiler.h>
4244 #include <linux/gfp.h>
....@@ -44,6 +46,17 @@
4446 #include <linux/static_key.h>
4547
4648 #include <trace/events/tcp.h>
49
+
50
+/* Refresh clocks of a TCP socket,
51
+ * ensuring monotonically increasing values.
52
+ */
53
+void tcp_mstamp_refresh(struct tcp_sock *tp)
54
+{
55
+ u64 val = tcp_clock_ns();
56
+
57
+ tp->tcp_clock_cache = val;
58
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
59
+}
4760
4861 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
4962 int push_one, gfp_t gfp);
....@@ -55,7 +68,7 @@
5568 struct tcp_sock *tp = tcp_sk(sk);
5669 unsigned int prior_packets = tp->packets_out;
5770
58
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
71
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
5972
6073 __skb_unlink(skb, &sk->sk_write_queue);
6174 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
....@@ -69,6 +82,7 @@
6982
7083 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
7184 tcp_skb_pcount(skb));
85
+ tcp_check_space(sk);
7286 }
7387
7488 /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
....@@ -159,26 +173,25 @@
159173 * packet, enter pingpong mode.
160174 */
161175 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
162
- icsk->icsk_ack.pingpong = 1;
176
+ inet_csk_enter_pingpong_mode(sk);
163177 }
164178
165179 /* Account for an ACK we sent. */
166
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
167
- u32 rcv_nxt)
180
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
168181 {
169182 struct tcp_sock *tp = tcp_sk(sk);
170183
171
- if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
184
+ if (unlikely(tp->compressed_ack)) {
172185 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
173
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
174
- tp->compressed_ack = TCP_FASTRETRANS_THRESH;
186
+ tp->compressed_ack);
187
+ tp->compressed_ack = 0;
175188 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
176189 __sock_put(sk);
177190 }
178191
179192 if (unlikely(rcv_nxt != tp->rcv_nxt))
180193 return; /* Special ACK sent by DCTCP to reflect ECN */
181
- tcp_dec_quickack_mode(sk, pkts);
194
+ tcp_dec_quickack_mode(sk);
182195 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
183196 }
184197
....@@ -221,16 +234,14 @@
221234 if (init_rcv_wnd)
222235 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
223236
224
- (*rcv_wscale) = 0;
237
+ *rcv_wscale = 0;
225238 if (wscale_ok) {
226239 /* Set window scaling on max possible window */
227
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
228
- space = max_t(u32, space, sysctl_rmem_max);
240
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
241
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
229242 space = min_t(u32, space, *window_clamp);
230
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
231
- space >>= 1;
232
- (*rcv_wscale)++;
233
- }
243
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
244
+ 0, TCP_MAX_WSCALE);
234245 }
235246 /* Set the clamp no higher than max representable value */
236247 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
....@@ -401,6 +412,7 @@
401412 #define OPTION_WSCALE (1 << 3)
402413 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
403414 #define OPTION_SMC (1 << 9)
415
+#define OPTION_MPTCP (1 << 10)
404416
405417 static void smc_options_write(__be32 *ptr, u16 *options)
406418 {
....@@ -423,10 +435,159 @@
423435 u8 ws; /* window scale, 0 to disable */
424436 u8 num_sack_blocks; /* number of SACK blocks to include */
425437 u8 hash_size; /* bytes in hash_location */
438
+ u8 bpf_opt_len; /* length of BPF hdr option */
426439 __u8 *hash_location; /* temporary pointer, overloaded */
427440 __u32 tsval, tsecr; /* need to include OPTION_TS */
428441 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
442
+ struct mptcp_out_options mptcp;
429443 };
444
+
445
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
446
+{
447
+#if IS_ENABLED(CONFIG_MPTCP)
448
+ if (unlikely(OPTION_MPTCP & opts->options))
449
+ mptcp_write_options(ptr, &opts->mptcp);
450
+#endif
451
+}
452
+
453
+#ifdef CONFIG_CGROUP_BPF
454
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
455
+ enum tcp_synack_type synack_type)
456
+{
457
+ if (unlikely(!skb))
458
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
459
+
460
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
461
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
462
+
463
+ return 0;
464
+}
465
+
466
+/* req, syn_skb and synack_type are used when writing synack */
467
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
468
+ struct request_sock *req,
469
+ struct sk_buff *syn_skb,
470
+ enum tcp_synack_type synack_type,
471
+ struct tcp_out_options *opts,
472
+ unsigned int *remaining)
473
+{
474
+ struct bpf_sock_ops_kern sock_ops;
475
+ int err;
476
+
477
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
478
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
479
+ !*remaining)
480
+ return;
481
+
482
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
483
+
484
+ /* init sock_ops */
485
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
486
+
487
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
488
+
489
+ if (req) {
490
+ /* The listen "sk" cannot be passed here because
491
+ * it is not locked. It would not make too much
492
+ * sense to do bpf_setsockopt(listen_sk) based
493
+ * on an individual connection request either.
494
+ *
495
+ * Thus, "req" is passed here and the cgroup-bpf-progs
496
+ * of the listen "sk" will be run.
497
+ *
498
+ * "req" is also used here for fastopen even the "sk" here is
499
+ * a fullsock "child" sk. It is to keep the behavior
500
+ * consistent between fastopen and non-fastopen on
501
+ * the bpf programming side.
502
+ */
503
+ sock_ops.sk = (struct sock *)req;
504
+ sock_ops.syn_skb = syn_skb;
505
+ } else {
506
+ sock_owned_by_me(sk);
507
+
508
+ sock_ops.is_fullsock = 1;
509
+ sock_ops.sk = sk;
510
+ }
511
+
512
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
513
+ sock_ops.remaining_opt_len = *remaining;
514
+ /* tcp_current_mss() does not pass a skb */
515
+ if (skb)
516
+ bpf_skops_init_skb(&sock_ops, skb, 0);
517
+
518
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
519
+
520
+ if (err || sock_ops.remaining_opt_len == *remaining)
521
+ return;
522
+
523
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
524
+ /* round up to 4 bytes */
525
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
526
+
527
+ *remaining -= opts->bpf_opt_len;
528
+}
529
+
530
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
531
+ struct request_sock *req,
532
+ struct sk_buff *syn_skb,
533
+ enum tcp_synack_type synack_type,
534
+ struct tcp_out_options *opts)
535
+{
536
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
537
+ struct bpf_sock_ops_kern sock_ops;
538
+ int err;
539
+
540
+ if (likely(!max_opt_len))
541
+ return;
542
+
543
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
544
+
545
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
546
+
547
+ if (req) {
548
+ sock_ops.sk = (struct sock *)req;
549
+ sock_ops.syn_skb = syn_skb;
550
+ } else {
551
+ sock_owned_by_me(sk);
552
+
553
+ sock_ops.is_fullsock = 1;
554
+ sock_ops.sk = sk;
555
+ }
556
+
557
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
558
+ sock_ops.remaining_opt_len = max_opt_len;
559
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
560
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
561
+
562
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
563
+
564
+ if (err)
565
+ nr_written = 0;
566
+ else
567
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
568
+
569
+ if (nr_written < max_opt_len)
570
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
571
+ max_opt_len - nr_written);
572
+}
573
+#else
574
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
575
+ struct request_sock *req,
576
+ struct sk_buff *syn_skb,
577
+ enum tcp_synack_type synack_type,
578
+ struct tcp_out_options *opts,
579
+ unsigned int *remaining)
580
+{
581
+}
582
+
583
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
584
+ struct request_sock *req,
585
+ struct sk_buff *syn_skb,
586
+ enum tcp_synack_type synack_type,
587
+ struct tcp_out_options *opts)
588
+{
589
+}
590
+#endif
430591
431592 /* Write previously computed TCP options to the packet.
432593 *
....@@ -536,6 +697,8 @@
536697 }
537698
538699 smc_options_write(ptr, &options);
700
+
701
+ mptcp_options_write(ptr, opts);
539702 }
540703
541704 static void smc_set_option(const struct tcp_sock *tp,
....@@ -571,6 +734,22 @@
571734 #endif
572735 }
573736
737
+static void mptcp_set_option_cond(const struct request_sock *req,
738
+ struct tcp_out_options *opts,
739
+ unsigned int *remaining)
740
+{
741
+ if (rsk_is_mptcp(req)) {
742
+ unsigned int size;
743
+
744
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
745
+ if (*remaining >= size) {
746
+ opts->options |= OPTION_MPTCP;
747
+ *remaining -= size;
748
+ }
749
+ }
750
+ }
751
+}
752
+
574753 /* Compute TCP options for SYN packets. This is not the final
575754 * network wire format yet.
576755 */
....@@ -584,7 +763,8 @@
584763
585764 *md5 = NULL;
586765 #ifdef CONFIG_TCP_MD5SIG
587
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
766
+ if (static_branch_unlikely(&tcp_md5_needed) &&
767
+ rcu_access_pointer(tp->md5sig_info)) {
588768 *md5 = tp->af_specific->md5_lookup(sk, sk);
589769 if (*md5) {
590770 opts->options |= OPTION_MD5;
....@@ -605,18 +785,18 @@
605785 opts->mss = tcp_advertise_mss(sk);
606786 remaining -= TCPOLEN_MSS_ALIGNED;
607787
608
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
788
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
609789 opts->options |= OPTION_TS;
610790 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
611791 opts->tsecr = tp->rx_opt.ts_recent;
612792 remaining -= TCPOLEN_TSTAMP_ALIGNED;
613793 }
614
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
794
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
615795 opts->ws = tp->rx_opt.rcv_wscale;
616796 opts->options |= OPTION_WSCALE;
617797 remaining -= TCPOLEN_WSCALE_ALIGNED;
618798 }
619
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
799
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
620800 opts->options |= OPTION_SACK_ADVERTISE;
621801 if (unlikely(!(OPTION_TS & opts->options)))
622802 remaining -= TCPOLEN_SACKPERM_ALIGNED;
....@@ -639,6 +819,17 @@
639819
640820 smc_set_option(tp, opts, &remaining);
641821
822
+ if (sk_is_mptcp(sk)) {
823
+ unsigned int size;
824
+
825
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
826
+ opts->options |= OPTION_MPTCP;
827
+ remaining -= size;
828
+ }
829
+ }
830
+
831
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
832
+
642833 return MAX_TCP_OPTION_SPACE - remaining;
643834 }
644835
....@@ -649,7 +840,8 @@
649840 struct tcp_out_options *opts,
650841 const struct tcp_md5sig_key *md5,
651842 struct tcp_fastopen_cookie *foc,
652
- enum tcp_synack_type synack_type)
843
+ enum tcp_synack_type synack_type,
844
+ struct sk_buff *syn_skb)
653845 {
654846 struct inet_request_sock *ireq = inet_rsk(req);
655847 unsigned int remaining = MAX_TCP_OPTION_SPACE;
....@@ -681,7 +873,7 @@
681873 if (likely(ireq->tstamp_ok)) {
682874 opts->options |= OPTION_TS;
683875 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
684
- opts->tsecr = req->ts_recent;
876
+ opts->tsecr = READ_ONCE(req->ts_recent);
685877 remaining -= TCPOLEN_TSTAMP_ALIGNED;
686878 }
687879 if (likely(ireq->sack_ok)) {
....@@ -702,7 +894,12 @@
702894 }
703895 }
704896
897
+ mptcp_set_option_cond(req, opts, &remaining);
898
+
705899 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
900
+
901
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
902
+ synack_type, opts, &remaining);
706903
707904 return MAX_TCP_OPTION_SPACE - remaining;
708905 }
....@@ -722,7 +919,8 @@
722919
723920 *md5 = NULL;
724921 #ifdef CONFIG_TCP_MD5SIG
725
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
922
+ if (static_branch_unlikely(&tcp_md5_needed) &&
923
+ rcu_access_pointer(tp->md5sig_info)) {
726924 *md5 = tp->af_specific->md5_lookup(sk, sk);
727925 if (*md5) {
728926 opts->options |= OPTION_MD5;
....@@ -738,16 +936,46 @@
738936 size += TCPOLEN_TSTAMP_ALIGNED;
739937 }
740938
939
+ /* MPTCP options have precedence over SACK for the limited TCP
940
+ * option space because a MPTCP connection would be forced to
941
+ * fall back to regular TCP if a required multipath option is
942
+ * missing. SACK still gets a chance to use whatever space is
943
+ * left.
944
+ */
945
+ if (sk_is_mptcp(sk)) {
946
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
947
+ unsigned int opt_size = 0;
948
+
949
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
950
+ &opts->mptcp)) {
951
+ opts->options |= OPTION_MPTCP;
952
+ size += opt_size;
953
+ }
954
+ }
955
+
741956 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
742957 if (unlikely(eff_sacks)) {
743958 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
959
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
960
+ TCPOLEN_SACK_PERBLOCK))
961
+ return size;
962
+
744963 opts->num_sack_blocks =
745964 min_t(unsigned int, eff_sacks,
746965 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
747966 TCPOLEN_SACK_PERBLOCK);
748
- if (likely(opts->num_sack_blocks))
749
- size += TCPOLEN_SACK_BASE_ALIGNED +
750
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
967
+
968
+ size += TCPOLEN_SACK_BASE_ALIGNED +
969
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
970
+ }
971
+
972
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
973
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
974
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
975
+
976
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
977
+
978
+ size = MAX_TCP_OPTION_SPACE - remaining;
751979 }
752980
753981 return size;
....@@ -966,48 +1194,33 @@
9661194 return HRTIMER_NORESTART;
9671195 }
9681196
969
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
1197
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1198
+ u64 prior_wstamp)
9701199 {
9711200 struct tcp_sock *tp = tcp_sk(sk);
972
- ktime_t expire, now;
973
- u64 len_ns;
974
- u32 rate;
9751201
976
- if (!tcp_needs_internal_pacing(sk))
977
- return;
978
- rate = sk->sk_pacing_rate;
979
- if (!rate || rate == ~0U)
980
- return;
1202
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
1203
+ unsigned long rate = sk->sk_pacing_rate;
9811204
982
- len_ns = (u64)skb->len * NSEC_PER_SEC;
983
- do_div(len_ns, rate);
984
- now = ktime_get();
985
- /* If hrtimer is already armed, then our caller has not
986
- * used tcp_pacing_check().
987
- */
988
- if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) {
989
- expire = hrtimer_get_softexpires(&tp->pacing_timer);
990
- if (ktime_after(expire, now))
991
- now = expire;
992
- if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1)
993
- __sock_put(sk);
1205
+ /* Original sch_fq does not pace first 10 MSS
1206
+ * Note that tp->data_segs_out overflows after 2^32 packets,
1207
+ * this is a minor annoyance.
1208
+ */
1209
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
1210
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1211
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1212
+
1213
+ /* take into account OS jitter */
1214
+ len_ns -= min_t(u64, len_ns / 2, credit);
1215
+ tp->tcp_wstamp_ns += len_ns;
1216
+ }
9941217 }
995
- hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns),
996
- HRTIMER_MODE_ABS_PINNED_SOFT);
997
- sock_hold(sk);
998
-}
999
-
1000
-static bool tcp_pacing_check(const struct sock *sk)
1001
-{
1002
- return tcp_needs_internal_pacing(sk) &&
1003
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
1004
-}
1005
-
1006
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
1007
-{
1008
- skb->skb_mstamp = tp->tcp_mstamp;
10091218 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
10101219 }
1220
+
1221
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1222
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1223
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
10111224
10121225 /* This routine actually transmits TCP packets queued in by
10131226 * tcp_do_sendmsg(). This is used by both the initial
....@@ -1032,11 +1245,14 @@
10321245 struct sk_buff *oskb = NULL;
10331246 struct tcp_md5sig_key *md5;
10341247 struct tcphdr *th;
1248
+ u64 prior_wstamp;
10351249 int err;
10361250
10371251 BUG_ON(!skb || !tcp_skb_pcount(skb));
10381252 tp = tcp_sk(sk);
1039
-
1253
+ prior_wstamp = tp->tcp_wstamp_ns;
1254
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1255
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
10401256 if (clone_it) {
10411257 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
10421258 - tp->snd_una;
....@@ -1051,18 +1267,32 @@
10511267
10521268 if (unlikely(!skb))
10531269 return -ENOBUFS;
1270
+ /* retransmit skbs might have a non zero value in skb->dev
1271
+ * because skb->dev is aliased with skb->rbnode.rb_left
1272
+ */
1273
+ skb->dev = NULL;
10541274 }
1055
- skb->skb_mstamp = tp->tcp_mstamp;
10561275
10571276 inet = inet_sk(sk);
10581277 tcb = TCP_SKB_CB(skb);
10591278 memset(&opts, 0, sizeof(opts));
10601279
1061
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
1280
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
10621281 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1063
- else
1282
+ } else {
10641283 tcp_options_size = tcp_established_options(sk, skb, &opts,
10651284 &md5);
1285
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
1286
+ * at receiver : This slightly improve GRO performance.
1287
+ * Note that we do not force the PSH flag for non GSO packets,
1288
+ * because they might be sent under high congestion events,
1289
+ * and in this case it is better to delay the delivery of 1-MSS
1290
+ * packets and thus the corresponding ACK packet that would
1291
+ * release the following packet.
1292
+ */
1293
+ if (tcp_skb_pcount(skb) > 1)
1294
+ tcb->tcp_flags |= TCPHDR_PSH;
1295
+ }
10661296 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
10671297
10681298 /* if no packet is in qdisc/device queue, then allow XPS to select
....@@ -1135,16 +1365,20 @@
11351365 }
11361366 #endif
11371367
1138
- icsk->icsk_af_ops->send_check(sk, skb);
1368
+ /* BPF prog is the last one writing header option */
1369
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
1370
+
1371
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1372
+ tcp_v6_send_check, tcp_v4_send_check,
1373
+ sk, skb);
11391374
11401375 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1141
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1376
+ tcp_event_ack_sent(sk, rcv_nxt);
11421377
11431378 if (skb->len != tcp_header_size) {
11441379 tcp_event_data_sent(tp, sk);
11451380 tp->data_segs_out += tcp_skb_pcount(skb);
11461381 tp->bytes_sent += skb->len - tcp_header_size;
1147
- tcp_internal_pacing(sk, skb);
11481382 }
11491383
11501384 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
....@@ -1156,21 +1390,24 @@
11561390 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
11571391 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
11581392
1159
- /* Our usage of tstamp should remain private */
1160
- skb->tstamp = 0;
1393
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
11611394
11621395 /* Cleanup our debris for IP stacks */
11631396 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
11641397 sizeof(struct inet6_skb_parm)));
11651398
1166
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1399
+ tcp_add_tx_delay(skb, tp);
1400
+
1401
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1402
+ inet6_csk_xmit, ip_queue_xmit,
1403
+ sk, skb, &inet->cork.fl);
11671404
11681405 if (unlikely(err > 0)) {
11691406 tcp_enter_cwr(sk);
11701407 err = net_xmit_eval(err);
11711408 }
11721409 if (!err && oskb) {
1173
- tcp_update_skb_after_send(tp, oskb);
1410
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
11741411 tcp_rate_skb_sent(sk, oskb);
11751412 }
11761413 return err;
....@@ -1196,7 +1433,7 @@
11961433 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
11971434 __skb_header_release(skb);
11981435 tcp_add_write_queue_tail(sk, skb);
1199
- sk->sk_wmem_queued += skb->truesize;
1436
+ sk_wmem_queued_add(sk, skb->truesize);
12001437 sk_mem_charge(sk, skb->truesize);
12011438 }
12021439
....@@ -1321,15 +1558,16 @@
13211558 return -ENOMEM;
13221559 }
13231560
1324
- if (skb_unclone(skb, gfp))
1561
+ if (skb_unclone_keeptruesize(skb, gfp))
13251562 return -ENOMEM;
13261563
13271564 /* Get a new skb... force flag on. */
13281565 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
13291566 if (!buff)
13301567 return -ENOMEM; /* We'll just try again later. */
1568
+ skb_copy_decrypted(buff, skb);
13311569
1332
- sk->sk_wmem_queued += buff->truesize;
1570
+ sk_wmem_queued_add(sk, buff->truesize);
13331571 sk_mem_charge(sk, buff->truesize);
13341572 nlen = skb->len - len - nsize;
13351573 buff->truesize += nlen;
....@@ -1410,7 +1648,7 @@
14101648 } else {
14111649 shinfo->frags[k] = shinfo->frags[i];
14121650 if (eat) {
1413
- shinfo->frags[k].page_offset += eat;
1651
+ skb_frag_off_add(&shinfo->frags[k], eat);
14141652 skb_frag_size_sub(&shinfo->frags[k], eat);
14151653 eat = 0;
14161654 }
....@@ -1429,7 +1667,7 @@
14291667 {
14301668 u32 delta_truesize;
14311669
1432
- if (skb_unclone(skb, GFP_ATOMIC))
1670
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
14331671 return -ENOMEM;
14341672
14351673 delta_truesize = __pskb_trim_head(skb, len);
....@@ -1439,9 +1677,8 @@
14391677
14401678 if (delta_truesize) {
14411679 skb->truesize -= delta_truesize;
1442
- sk->sk_wmem_queued -= delta_truesize;
1680
+ sk_wmem_queued_add(sk, -delta_truesize);
14431681 sk_mem_uncharge(sk, delta_truesize);
1444
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
14451682 }
14461683
14471684 /* Any change of skb->len requires recalculation of tso factor. */
....@@ -1479,7 +1716,8 @@
14791716 mss_now -= icsk->icsk_ext_hdr_len;
14801717
14811718 /* Then reserve room for full set of TCP options and 8 bytes of data */
1482
- mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
1719
+ mss_now = max(mss_now,
1720
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
14831721 return mss_now;
14841722 }
14851723
....@@ -1522,10 +1760,10 @@
15221760 struct inet_connection_sock *icsk = inet_csk(sk);
15231761 struct net *net = sock_net(sk);
15241762
1525
- icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1763
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
15261764 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
15271765 icsk->icsk_af_ops->net_header_len;
1528
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1766
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
15291767 icsk->icsk_mtup.probe_size = 0;
15301768 if (icsk->icsk_mtup.enabled)
15311769 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
....@@ -1637,15 +1875,20 @@
16371875 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
16381876 struct tcp_sock *tp = tcp_sk(sk);
16391877
1640
- /* Track the maximum number of outstanding packets in each
1641
- * window, and remember whether we were cwnd-limited then.
1878
+ /* Track the strongest available signal of the degree to which the cwnd
1879
+ * is fully utilized. If cwnd-limited then remember that fact for the
1880
+ * current window. If not cwnd-limited then track the maximum number of
1881
+ * outstanding packets in the current window. (If cwnd-limited then we
1882
+ * chose to not update tp->max_packets_out to avoid an extra else
1883
+ * clause with no functional impact.)
16421884 */
1643
- if (!before(tp->snd_una, tp->max_packets_seq) ||
1644
- tp->packets_out > tp->max_packets_out ||
1645
- is_cwnd_limited) {
1646
- tp->max_packets_out = tp->packets_out;
1647
- tp->max_packets_seq = tp->snd_nxt;
1885
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
1886
+ is_cwnd_limited ||
1887
+ (!tp->is_cwnd_limited &&
1888
+ tp->packets_out > tp->max_packets_out)) {
16481889 tp->is_cwnd_limited = is_cwnd_limited;
1890
+ tp->max_packets_out = tp->packets_out;
1891
+ tp->cwnd_usage_seq = tp->snd_nxt;
16491892 }
16501893
16511894 if (tcp_is_cwnd_limited(sk)) {
....@@ -1657,7 +1900,7 @@
16571900 if (tp->packets_out > tp->snd_cwnd_used)
16581901 tp->snd_cwnd_used = tp->packets_out;
16591902
1660
- if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1903
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
16611904 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
16621905 !ca_ops->cong_control)
16631906 tcp_cwnd_application_limited(sk);
....@@ -1721,8 +1964,9 @@
17211964 {
17221965 u32 bytes, segs;
17231966
1724
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
1725
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1967
+ bytes = min_t(unsigned long,
1968
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
1969
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
17261970
17271971 /* Goal is to send at least one packet per ms,
17281972 * not one big TSO packet every 100 ms.
....@@ -1744,7 +1988,7 @@
17441988
17451989 min_tso = ca_ops->min_tso_segs ?
17461990 ca_ops->min_tso_segs(sk) :
1747
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1991
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
17481992
17491993 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
17501994 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
....@@ -1868,23 +2112,24 @@
18682112 * know that all the data is in scatter-gather pages, and that the
18692113 * packet has never been sent out before (and thus is not cloned).
18702114 */
1871
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1872
- struct sk_buff *skb, unsigned int len,
2115
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
18732116 unsigned int mss_now, gfp_t gfp)
18742117 {
1875
- struct sk_buff *buff;
18762118 int nlen = skb->len - len;
2119
+ struct sk_buff *buff;
18772120 u8 flags;
18782121
18792122 /* All of a TSO frame must be composed of paged data. */
18802123 if (skb->len != skb->data_len)
1881
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
2124
+ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2125
+ skb, len, mss_now, gfp);
18822126
18832127 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
18842128 if (unlikely(!buff))
18852129 return -ENOMEM;
2130
+ skb_copy_decrypted(buff, skb);
18862131
1887
- sk->sk_wmem_queued += buff->truesize;
2132
+ sk_wmem_queued_add(sk, buff->truesize);
18882133 sk_mem_charge(sk, buff->truesize);
18892134 buff->truesize += nlen;
18902135 skb->truesize -= nlen;
....@@ -1914,7 +2159,7 @@
19142159
19152160 /* Link BUFF into the send queue. */
19162161 __skb_header_release(buff);
1917
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
2162
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
19182163
19192164 return 0;
19202165 }
....@@ -1930,18 +2175,22 @@
19302175 u32 max_segs)
19312176 {
19322177 const struct inet_connection_sock *icsk = inet_csk(sk);
1933
- u32 age, send_win, cong_win, limit, in_flight;
2178
+ u32 send_win, cong_win, limit, in_flight;
19342179 struct tcp_sock *tp = tcp_sk(sk);
19352180 struct sk_buff *head;
19362181 int win_divisor;
2182
+ s64 delta;
19372183
19382184 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
19392185 goto send_now;
19402186
19412187 /* Avoid bursty behavior by allowing defer
1942
- * only if the last write was recent.
2188
+ * only if the last write was recent (1 ms).
2189
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
2190
+ * packets waiting in a qdisc or device for EDT delivery.
19432191 */
1944
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
2192
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2193
+ if (delta > 0)
19452194 goto send_now;
19462195
19472196 in_flight = tcp_packets_in_flight(tp);
....@@ -1988,9 +2237,9 @@
19882237 head = tcp_rtx_queue_head(sk);
19892238 if (!head)
19902239 goto send_now;
1991
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
2240
+ delta = tp->tcp_clock_cache - head->tstamp;
19922241 /* If next ACK is likely to come too late (half srtt), do not defer */
1993
- if (age < (tp->srtt_us >> 4))
2242
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
19942243 goto send_now;
19952244
19962245 /* Ok, it looks like it is advisable to defer.
....@@ -2012,7 +2261,8 @@
20122261 }
20132262
20142263 /* If this packet won't get more data, do not wait. */
2015
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2264
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2265
+ TCP_SKB_CB(skb)->eor)
20162266 goto send_now;
20172267
20182268 return true;
....@@ -2029,7 +2279,7 @@
20292279 u32 interval;
20302280 s32 delta;
20312281
2032
- interval = net->ipv4.sysctl_tcp_probe_interval;
2282
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
20332283 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
20342284 if (unlikely(delta >= interval * HZ)) {
20352285 int mss = tcp_current_mss(sk);
....@@ -2111,7 +2361,7 @@
21122362 * probing process by not resetting search range to its original.
21122362 */
21132363 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2114
- interval < net->ipv4.sysctl_tcp_probe_threshold) {
2364
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
21152365 /* Check whether enough time has elapsed for
21162366 * another round of probing.
21172367 */
....@@ -2139,17 +2389,15 @@
21392389 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
21402390 return -1;
21412391
2142
- if (tcp_pacing_check(sk))
2143
- return -1;
2144
-
21452392 /* We're allowed to probe. Build it now. */
21462393 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
21472394 if (!nskb)
21482395 return -1;
2149
- sk->sk_wmem_queued += nskb->truesize;
2396
+ sk_wmem_queued_add(sk, nskb->truesize);
21502397 sk_mem_charge(sk, nskb->truesize);
21512398
21522399 skb = tcp_send_head(sk);
2400
+ skb_copy_decrypted(nskb, skb);
21532401
21542402 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
21552403 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
....@@ -2215,6 +2463,25 @@
22152463 return -1;
22162464 }
22172465
2466
+static bool tcp_pacing_check(struct sock *sk)
2467
+{
2468
+ struct tcp_sock *tp = tcp_sk(sk);
2469
+
2470
+ if (!tcp_needs_internal_pacing(sk))
2471
+ return false;
2472
+
2473
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2474
+ return false;
2475
+
2476
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
2477
+ hrtimer_start(&tp->pacing_timer,
2478
+ ns_to_ktime(tp->tcp_wstamp_ns),
2479
+ HRTIMER_MODE_ABS_PINNED_SOFT);
2480
+ sock_hold(sk);
2481
+ }
2482
+ return true;
2483
+}
2484
+
22182485 /* TCP Small Queues :
22192486 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
22202487 * (These limits are doubled for retransmits)
....@@ -2229,13 +2496,28 @@
22292496 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
22302497 unsigned int factor)
22312498 {
2232
- unsigned int limit;
2499
+ unsigned long limit;
22332500
2234
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
2235
- limit = min_t(u32, limit,
2236
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2501
+ limit = max_t(unsigned long,
2502
+ 2 * skb->truesize,
2503
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
2504
+ if (sk->sk_pacing_status == SK_PACING_NONE)
2505
+ limit = min_t(unsigned long, limit,
2506
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
22372507 limit <<= factor;
22382508
2509
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2510
+ tcp_sk(sk)->tcp_tx_delay) {
2511
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2512
+
2513
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
2514
+ * approximate our needs assuming an ~100% skb->truesize overhead.
2515
+ * USEC_PER_SEC is approximated by 2^20.
2516
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2517
+ */
2518
+ extra_bytes >>= (20 - 1);
2519
+ limit += extra_bytes;
2520
+ }
22392521 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
22402522 /* Always send skb if rtx queue is empty.
22412523 * No need to wait for TX completion to call us back,
....@@ -2341,17 +2623,19 @@
23412623 while ((skb = tcp_send_head(sk))) {
23422624 unsigned int limit;
23432625
2626
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2627
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
2628
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2629
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2630
+ tcp_init_tso_segs(skb, mss_now);
2631
+ goto repair; /* Skip network transmission */
2632
+ }
2633
+
23442634 if (tcp_pacing_check(sk))
23452635 break;
23462636
23472637 tso_segs = tcp_init_tso_segs(skb, mss_now);
23482638 BUG_ON(!tso_segs);
2349
-
2350
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2351
- /* "skb_mstamp" is used as a start point for the retransmit timer */
2352
- tcp_update_skb_after_send(tp, skb);
2353
- goto repair; /* Skip network transmission */
2354
- }
23552639
23562640 cwnd_quota = tcp_cwnd_test(tp, skb);
23572641 if (!cwnd_quota) {
....@@ -2388,8 +2672,7 @@
23882672 nonagle);
23892673
23902674 if (skb->len > limit &&
2391
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2392
- skb, limit, mss_now, gfp)))
2675
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
23932676 break;
23942677
23952678 if (tcp_small_queue_check(sk, skb, 0))
....@@ -2450,10 +2733,10 @@
24502733 /* Don't do any loss probe on a Fast Open connection before 3WHS
24512734 * finishes.
24522735 */
2453
- if (tp->fastopen_rsk)
2736
+ if (rcu_access_pointer(tp->fastopen_rsk))
24542737 return false;
24552738
2456
- early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2739
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
24572740 /* Schedule a loss probe in 2*RTT for SACK capable connections
24582741 * not in loss recovery, that are either limited by cwnd or application.
24592742 */
....@@ -2484,8 +2767,7 @@
24842767 if (rto_delta_us > 0)
24852768 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
24862769
2487
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2488
- TCP_RTO_MAX);
2770
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
24892771 return true;
24902772 }
24912773
....@@ -2666,8 +2948,12 @@
26662948 int mss = icsk->icsk_ack.rcv_mss;
26672949 int free_space = tcp_space(sk);
26682950 int allowed_space = tcp_full_space(sk);
2669
- int full_space = min_t(int, tp->window_clamp, allowed_space);
2670
- int window;
2951
+ int full_space, window;
2952
+
2953
+ if (sk_is_mptcp(sk))
2954
+ mptcp_space(sk, &free_space, &allowed_space);
2955
+
2956
+ full_space = min_t(int, tp->window_clamp, allowed_space);
26712957
26722958 if (unlikely(mss > full_space)) {
26732959 mss = full_space;
....@@ -2815,7 +3101,7 @@
28153101 struct sk_buff *skb = to, *tmp;
28163102 bool first = true;
28173103
2818
- if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
3104
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
28193105 return;
28203106 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
28213107 return;
....@@ -2824,7 +3110,7 @@
28243110 if (!tcp_can_collapse(sk, skb))
28253111 break;
28263112
2827
- if (!tcp_skb_can_collapse_to(to))
3113
+ if (!tcp_skb_can_collapse(to, skb))
28283114 break;
28293115
28303116 space -= skb->len;
....@@ -2855,7 +3141,7 @@
28553141 struct tcp_sock *tp = tcp_sk(sk);
28563142 unsigned int cur_mss;
28573143 int diff, len, err;
2858
-
3144
+ int avail_wnd;
28593145
28603146 /* Inconclusive MTU probe */
28613147 if (icsk->icsk_mtup.probe_size)
....@@ -2885,23 +3171,31 @@
28853171 return -EHOSTUNREACH; /* Routing failure or similar. */
28863172
28873173 cur_mss = tcp_current_mss(sk);
3174
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
28883175
28893176 /* If receiver has shrunk his window, and skb is out of
28903177 * new window, do not retransmit it. The exception is the
28913178 * case, when window is shrunk to zero. In this case
2892
- * our retransmit serves as a zero window probe.
3179
+ * our retransmit of one segment serves as a zero window probe.
28933180 */
2894
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2895
- TCP_SKB_CB(skb)->seq != tp->snd_una)
2896
- return -EAGAIN;
3181
+ if (avail_wnd <= 0) {
3182
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3183
+ return -EAGAIN;
3184
+ avail_wnd = cur_mss;
3185
+ }
28973186
28983187 len = cur_mss * segs;
3188
+ if (len > avail_wnd) {
3189
+ len = rounddown(avail_wnd, cur_mss);
3190
+ if (!len)
3191
+ len = avail_wnd;
3192
+ }
28993193 if (skb->len > len) {
29003194 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
29013195 cur_mss, GFP_ATOMIC))
29023196 return -ENOMEM; /* We'll try again later. */
29033197 } else {
2904
- if (skb_unclone(skb, GFP_ATOMIC))
3198
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
29053199 return -ENOMEM;
29063200
29073201 diff = tcp_skb_pcount(skb);
....@@ -2909,8 +3203,9 @@
29093203 diff -= tcp_skb_pcount(skb);
29103204 if (diff)
29113205 tcp_adjust_pcount(sk, skb, diff);
2912
- if (skb->len < cur_mss)
2913
- tcp_retrans_try_collapse(sk, skb, cur_mss);
3206
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
3207
+ if (skb->len < avail_wnd)
3208
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
29143209 }
29153210
29163211 /* RFC3168, section 6.1.1.1. ECN fallback */
....@@ -2935,24 +3230,32 @@
29353230
29363231 tcp_skb_tsorted_save(skb) {
29373232 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2938
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2939
- -ENOBUFS;
3233
+ if (nskb) {
3234
+ nskb->dev = NULL;
3235
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3236
+ } else {
3237
+ err = -ENOBUFS;
3238
+ }
29403239 } tcp_skb_tsorted_restore(skb);
29413240
29423241 if (!err) {
2943
- tcp_update_skb_after_send(tp, skb);
3242
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
29443243 tcp_rate_skb_sent(sk, skb);
29453244 }
29463245 } else {
29473246 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
29483247 }
29493248
3249
+ /* To avoid taking spuriously low RTT samples based on a timestamp
3250
+ * for a transmit that never happened, always mark EVER_RETRANS
3251
+ */
3252
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3253
+
29503254 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
29513255 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
29523256 TCP_SKB_CB(skb)->seq, segs, err);
29533257
29543258 if (likely(!err)) {
2955
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
29563259 trace_tcp_retransmit_skb(sk, skb);
29573260 } else if (err != -EBUSY) {
29583261 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
....@@ -2995,6 +3298,7 @@
29953298 const struct inet_connection_sock *icsk = inet_csk(sk);
29963299 struct sk_buff *skb, *rtx_head, *hole = NULL;
29973300 struct tcp_sock *tp = tcp_sk(sk);
3301
+ bool rearm_timer = false;
29983302 u32 max_segs;
29993303 int mib_idx;
30003304
....@@ -3017,7 +3321,7 @@
30173321
30183322 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
30193323 if (segs <= 0)
3020
- return;
3324
+ break;
30213325 sacked = TCP_SKB_CB(skb)->sacked;
30223326 /* In case tcp_shift_skb_data() have aggregated large skbs,
30233327 * we need to make sure not sending too bigs TSO packets
....@@ -3042,10 +3346,10 @@
30423346 continue;
30433347
30443348 if (tcp_small_queue_check(sk, skb, 1))
3045
- return;
3349
+ break;
30463350
30473351 if (tcp_retransmit_skb(sk, skb, segs))
3048
- return;
3352
+ break;
30493353
30503354 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
30513355
....@@ -3054,10 +3358,13 @@
30543358
30553359 if (skb == rtx_head &&
30563360 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3057
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3058
- inet_csk(sk)->icsk_rto,
3059
- TCP_RTO_MAX);
3361
+ rearm_timer = true;
3362
+
30603363 }
3364
+ if (rearm_timer)
3365
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3366
+ inet_csk(sk)->icsk_rto,
3367
+ TCP_RTO_MAX);
30613368 }
30623369
30633370 /* We allow to exceed memory limits for FIN packets to expedite
....@@ -3069,11 +3376,12 @@
30693376 */
30703377 void sk_forced_mem_schedule(struct sock *sk, int size)
30713378 {
3072
- int amt;
3379
+ int delta, amt;
30733380
3074
- if (size <= sk->sk_forward_alloc)
3381
+ delta = size - sk->sk_forward_alloc;
3382
+ if (delta <= 0)
30753383 return;
3076
- amt = sk_mem_pages(size);
3384
+ amt = sk_mem_pages(delta);
30773385 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
30783386 sk_memory_allocated_add(sk, amt);
30793387
....@@ -3086,7 +3394,7 @@
30863394 */
30873395 void tcp_send_fin(struct sock *sk)
30883396 {
3089
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
3397
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
30903398 struct tcp_sock *tp = tcp_sk(sk);
30913399
30923400 /* Optimization, tack on the FIN if we have one skb in write queue and
....@@ -3094,31 +3402,29 @@
30943402 * Note: in the latter case, FIN packet will be sent after a timeout,
30953403 * as TCP stack thinks it has already been transmitted.
30963404 */
3405
+ tskb = tail;
30973406 if (!tskb && tcp_under_memory_pressure(sk))
30983407 tskb = skb_rb_last(&sk->tcp_rtx_queue);
30993408
31003409 if (tskb) {
3101
-coalesce:
31023410 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
31033411 TCP_SKB_CB(tskb)->end_seq++;
31043412 tp->write_seq++;
3105
- if (tcp_write_queue_empty(sk)) {
3413
+ if (!tail) {
31063414 /* This means tskb was already sent.
31073415 * Pretend we included the FIN on previous transmit.
31083416 * We need to set tp->snd_nxt to the value it would have
31093417 * if FIN had been sent. This is because retransmit path
31103418 * does not change tp->snd_nxt.
31113419 */
3112
- tp->snd_nxt++;
3420
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
31133421 return;
31143422 }
31153423 } else {
31163424 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3117
- if (unlikely(!skb)) {
3118
- if (tskb)
3119
- goto coalesce;
3425
+ if (unlikely(!skb))
31203426 return;
3121
- }
3427
+
31223428 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
31233429 skb_reserve(skb, MAX_TCP_HEADER);
31243430 sk_forced_mem_schedule(sk, skb->truesize);
....@@ -3192,7 +3498,7 @@
31923498 tcp_rtx_queue_unlink_and_free(skb, sk);
31933499 __skb_header_release(nskb);
31943500 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3195
- sk->sk_wmem_queued += nskb->truesize;
3501
+ sk_wmem_queued_add(sk, nskb->truesize);
31963502 sk_mem_charge(sk, nskb->truesize);
31973503 skb = nskb;
31983504 }
....@@ -3204,18 +3510,20 @@
32043510 }
32053511
32063512 /**
3207
- * tcp_make_synack - Prepare a SYN-ACK.
3208
- * sk: listener socket
3209
- * dst: dst entry attached to the SYNACK
3210
- * req: request_sock pointer
3211
- *
3212
- * Allocate one skb and build a SYNACK packet.
3213
- * @dst is consumed : Caller should not use it again.
3513
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
3514
+ * @sk: listener socket
3515
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
3516
+ * should not use it again.
3517
+ * @req: request_sock pointer
3518
+ * @foc: cookie for tcp fast open
3519
+ * @synack_type: Type of synack to prepare
3520
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
32143521 */
32153522 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32163523 struct request_sock *req,
32173524 struct tcp_fastopen_cookie *foc,
3218
- enum tcp_synack_type synack_type)
3525
+ enum tcp_synack_type synack_type,
3526
+ struct sk_buff *syn_skb)
32193527 {
32203528 struct inet_request_sock *ireq = inet_rsk(req);
32213529 const struct tcp_sock *tp = tcp_sk(sk);
....@@ -3225,6 +3533,7 @@
32253533 int tcp_header_size;
32263534 struct tcphdr *th;
32273535 int mss;
3536
+ u64 now;
32283537
32293538 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
32303539 if (unlikely(!skb)) {
....@@ -3256,20 +3565,28 @@
32563565 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
32573566
32583567 memset(&opts, 0, sizeof(opts));
3568
+ now = tcp_clock_ns();
32593569 #ifdef CONFIG_SYN_COOKIES
3260
- if (unlikely(req->cookie_ts))
3261
- skb->skb_mstamp = cookie_init_timestamp(req);
3570
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3571
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
32623572 else
32633573 #endif
3264
- skb->skb_mstamp = tcp_clock_us();
3574
+ {
3575
+ skb->skb_mstamp_ns = now;
3576
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
3577
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3578
+ }
32653579
32663580 #ifdef CONFIG_TCP_MD5SIG
32673581 rcu_read_lock();
32683582 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
32693583 #endif
32703584 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3585
+ /* bpf program will be interested in the tcp_flags */
3586
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
32713587 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3272
- foc, synack_type) + sizeof(*th);
3588
+ foc, synack_type,
3589
+ syn_skb) + sizeof(*th);
32733590
32743591 skb_push(skb, tcp_header_size);
32753592 skb_reset_transport_header(skb);
....@@ -3291,7 +3608,7 @@
32913608 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
32923609 tcp_options_write((__be32 *)(th + 1), NULL, &opts);
32933610 th->doff = (tcp_header_size >> 2);
3294
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3611
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
32953612
32963613 #ifdef CONFIG_TCP_MD5SIG
32973614 /* Okay, we have all we need - do the md5 hash if needed */
....@@ -3301,8 +3618,12 @@
33013618 rcu_read_unlock();
33023619 #endif
33033620
3304
- /* Do not fool tcpdump (if any), clean our debris */
3305
- skb->tstamp = 0;
3621
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
3622
+ synack_type, &opts);
3623
+
3624
+ skb->skb_mstamp_ns = now;
3625
+ tcp_add_tx_delay(skb, tp);
3626
+
33063627 return skb;
33073628 }
33083629 EXPORT_SYMBOL(tcp_make_synack);
....@@ -3318,8 +3639,8 @@
33183639
33193640 rcu_read_lock();
33203641 ca = tcp_ca_find_key(ca_key);
3321
- if (likely(ca && try_module_get(ca->owner))) {
3322
- module_put(icsk->icsk_ca_ops->owner);
3642
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3643
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
33233644 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
33243645 icsk->icsk_ca_ops = ca;
33253646 }
....@@ -3338,7 +3659,7 @@
33383659 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
33393660 */
33403661 tp->tcp_header_len = sizeof(struct tcphdr);
3341
- if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3662
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
33423663 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
33433664
33443665 #ifdef CONFIG_TCP_MD5SIG
....@@ -3374,7 +3695,7 @@
33743695 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
33753696 &tp->rcv_wnd,
33763697 &tp->window_clamp,
3377
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3698
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
33783699 &rcv_wscale,
33793700 rcv_wnd);
33803701
....@@ -3389,7 +3710,7 @@
33893710 tp->snd_una = tp->write_seq;
33903711 tp->snd_sml = tp->write_seq;
33913712 tp->snd_up = tp->write_seq;
3392
- tp->snd_nxt = tp->write_seq;
3713
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
33933714
33943715 if (likely(!tp->repair))
33953716 tp->rcv_nxt = 0;
....@@ -3410,7 +3731,7 @@
34103731
34113732 tcb->end_seq += skb->len;
34123733 __skb_header_release(skb);
3413
- sk->sk_wmem_queued += skb->truesize;
3734
+ sk_wmem_queued_add(sk, skb->truesize);
34143735 sk_mem_charge(sk, skb->truesize);
34153736 WRITE_ONCE(tp->write_seq, tcb->end_seq);
34163737 tp->packets_out += tcp_skb_pcount(skb);
....@@ -3425,6 +3746,7 @@
34253746 */
34263747 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
34273748 {
3749
+ struct inet_connection_sock *icsk = inet_csk(sk);
34283750 struct tcp_sock *tp = tcp_sk(sk);
34293751 struct tcp_fastopen_request *fo = tp->fastopen_req;
34303752 int space, err = 0;
....@@ -3439,8 +3761,10 @@
34393761 * private TCP options. The cost is reduced data space in SYN :(
34403762 */
34413763 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3764
+ /* Sync mss_cache after updating the mss_clamp */
3765
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
34423766
3443
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3767
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
34443768 MAX_TCP_OPTION_SPACE;
34453769
34463770 space = min_t(size_t, space, fo->size);
....@@ -3465,6 +3789,7 @@
34653789 skb_trim(syn_data, copied);
34663790 space = copied;
34673791 }
3792
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
34683793 }
34693794 /* No more data pending in inet_wait_for_connect() */
34703795 if (space == fo->size)
....@@ -3477,7 +3802,7 @@
34773802
34783803 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
34793804
3480
- syn->skb_mstamp = syn_data->skb_mstamp;
3805
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
34813806
34823807 /* Now full SYN+DATA was cloned and sent (or not),
34833808 * remove the SYN from the original skb (syn_data)
....@@ -3548,11 +3873,11 @@
35483873 /* We change tp->snd_nxt after the tcp_transmit_skb() call
35493874 * in order to make this packet get counted in tcpOutSegs.
35503875 */
3551
- tp->snd_nxt = tp->write_seq;
3876
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
35523877 tp->pushed_seq = tp->write_seq;
35533878 buff = tcp_send_head(sk);
35543879 if (unlikely(buff)) {
3555
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
3880
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
35563881 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
35573882 }
35583883 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
....@@ -3578,7 +3903,7 @@
35783903 const struct tcp_sock *tp = tcp_sk(sk);
35793904 int max_ato = HZ / 2;
35803905
3581
- if (icsk->icsk_ack.pingpong ||
3906
+ if (inet_csk_in_pingpong_mode(sk) ||
35823907 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
35833908 max_ato = TCP_DELACK_MAX;
35843909
....@@ -3599,16 +3924,15 @@
35993924 ato = min(ato, max_ato);
36003925 }
36013926
3927
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
3928
+
36023929 /* Stay within the limit we were given */
36033930 timeout = jiffies + ato;
36043931
36053932 /* Use new timeout only if there wasn't an older one earlier. */
36063933 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3607
- /* If delack timer was blocked or is about to expire,
3608
- * send ACK now.
3609
- */
3610
- if (icsk->icsk_ack.blocked ||
3611
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3934
+ /* If delack timer is about to expire, send ACK now. */
3935
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
36123936 tcp_send_ack(sk);
36133937 return;
36143938 }
....@@ -3637,10 +3961,15 @@
36373961 buff = alloc_skb(MAX_TCP_HEADER,
36383962 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
36393963 if (unlikely(!buff)) {
3964
+ struct inet_connection_sock *icsk = inet_csk(sk);
3965
+ unsigned long delay;
3966
+
3967
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
3968
+ if (delay < TCP_RTO_MAX)
3969
+ icsk->icsk_ack.retry++;
36403970 inet_csk_schedule_ack(sk);
3641
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3642
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3643
- TCP_DELACK_MAX, TCP_RTO_MAX);
3971
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
3972
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
36443973 return;
36453974 }
36463975
....@@ -3759,7 +4088,7 @@
37594088 struct inet_connection_sock *icsk = inet_csk(sk);
37604089 struct tcp_sock *tp = tcp_sk(sk);
37614090 struct net *net = sock_net(sk);
3762
- unsigned long probe_max;
4091
+ unsigned long timeout;
37634092 int err;
37644093
37654094 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
....@@ -3768,28 +4097,24 @@
37684097 /* Cancel probe timer, if it is not required. */
37694098 icsk->icsk_probes_out = 0;
37704099 icsk->icsk_backoff = 0;
4100
+ icsk->icsk_probes_tstamp = 0;
37714101 return;
37724102 }
37734103
4104
+ icsk->icsk_probes_out++;
37744105 if (err <= 0) {
3775
- if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
4106
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
37764107 icsk->icsk_backoff++;
3777
- icsk->icsk_probes_out++;
3778
- probe_max = TCP_RTO_MAX;
4108
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
37794109 } else {
37804110 /* If packet was not sent due to local congestion,
3781
- * do not backoff and do not remember icsk_probes_out.
3782
- * Let local senders to fight for local resources.
3783
- *
3784
- * Use accumulated backoff yet.
4111
+ * Let senders fight for local resources conservatively.
37854112 */
3786
- if (!icsk->icsk_probes_out)
3787
- icsk->icsk_probes_out = 1;
3788
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
4113
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
37894114 }
3790
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3791
- tcp_probe0_when(sk, probe_max),
3792
- TCP_RTO_MAX);
4115
+
4116
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
4117
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
37934118 }
37944119
37954120 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
....@@ -3799,10 +4124,11 @@
37994124 int res;
38004125
38014126 tcp_rsk(req)->txhash = net_tx_rndhash();
3802
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
4127
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4128
+ NULL);
38034129 if (!res) {
3804
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3805
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4130
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4131
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
38064132 if (unlikely(tcp_passive_fastopen(sk)))
38074133 tcp_sk(sk)->total_retrans++;
38084134 trace_tcp_retransmit_synack(sk, req);