forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/net/ipv4/tcp_output.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -37,6 +38,7 @@
3738 #define pr_fmt(fmt) "TCP: " fmt
3839
3940 #include <net/tcp.h>
41
+#include <net/mptcp.h>
4042
4143 #include <linux/compiler.h>
4244 #include <linux/gfp.h>
....@@ -44,6 +46,17 @@
4446 #include <linux/static_key.h>
4547
4648 #include <trace/events/tcp.h>
49
+
50
+/* Refresh clocks of a TCP socket,
51
+ * ensuring monotonically increasing values.
52
+ */
53
+void tcp_mstamp_refresh(struct tcp_sock *tp)
54
+{
55
+ u64 val = tcp_clock_ns();
56
+
57
+ tp->tcp_clock_cache = val;
58
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
59
+}
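
A minimal user-space sketch of the single-sample caching idea above: one monotonic clock read feeds both the nanosecond cache and the derived microsecond stamp, so the two can never disagree. clock_gettime() stands in for tcp_clock_ns(); the struct and helper names are invented for illustration, not kernel code.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_USEC 1000ULL

struct clock_cache {
	uint64_t clock_cache_ns;	/* last raw nanosecond sample */
	uint64_t mstamp_us;		/* same instant, in microseconds */
};

/* Refresh both cached values from one clock read, mirroring how
 * tcp_mstamp_refresh() derives tcp_mstamp from tcp_clock_cache.
 */
static void cache_refresh(struct clock_cache *c)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	c->clock_cache_ns = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	c->mstamp_us = c->clock_cache_ns / NSEC_PER_USEC;
}

int main(void)
{
	struct clock_cache c;

	cache_refresh(&c);
	printf("ns=%llu us=%llu\n",
	       (unsigned long long)c.clock_cache_ns,
	       (unsigned long long)c.mstamp_us);
	return 0;
}
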
4760
4861 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
4962 int push_one, gfp_t gfp);
....@@ -55,7 +68,7 @@
5568 struct tcp_sock *tp = tcp_sk(sk);
5669 unsigned int prior_packets = tp->packets_out;
5770
58
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
71
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
5972
6073 __skb_unlink(skb, &sk->sk_write_queue);
6174 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
....@@ -69,6 +82,7 @@
6982
7083 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
7184 tcp_skb_pcount(skb));
85
+ tcp_check_space(sk);
7286 }
7387
7488 /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
....@@ -159,7 +173,7 @@
159173 * packet, enter pingpong mode.
160174 */
161175 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
162
- icsk->icsk_ack.pingpong = 1;
176
+ inet_csk_enter_pingpong_mode(sk);
163177 }
164178
165179 /* Account for an ACK we sent. */
....@@ -168,10 +182,10 @@
168182 {
169183 struct tcp_sock *tp = tcp_sk(sk);
170184
171
- if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
185
+ if (unlikely(tp->compressed_ack)) {
172186 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
173
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
174
- tp->compressed_ack = TCP_FASTRETRANS_THRESH;
187
+ tp->compressed_ack);
188
+ tp->compressed_ack = 0;
175189 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
176190 __sock_put(sk);
177191 }
....@@ -221,16 +235,14 @@
221235 if (init_rcv_wnd)
222236 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
223237
224
- (*rcv_wscale) = 0;
238
+ *rcv_wscale = 0;
225239 if (wscale_ok) {
226240 /* Set window scaling on max possible window */
227
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
228
- space = max_t(u32, space, sysctl_rmem_max);
241
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
242
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
229243 space = min_t(u32, space, *window_clamp);
230
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
231
- space >>= 1;
232
- (*rcv_wscale)++;
233
- }
244
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
245
+ 0, TCP_MAX_WSCALE);
234246 }
235247 /* Set the clamp no higher than max representable value */
236248 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
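
The shift loop is replaced by a closed form: the window scale is the number of right shifts needed to bring the advertised space below 64 KiB, i.e. ilog2(space) - 15, clamped to [0, TCP_MAX_WSCALE]. A small sketch comparing the two computations; ilog2() is emulated here with a compiler builtin and the helper names are illustrative.

#include <stdio.h>

#define TCP_MAX_WSCALE 14
#define U16_MAX 0xffffU

/* User-space stand-in for the kernel's ilog2() on a non-zero value. */
static int ilog2_u32(unsigned int v)
{
	return 31 - __builtin_clz(v);
}

/* Old computation: shift until the space fits in 16 bits. */
static int wscale_loop(unsigned int space)
{
	int ws = 0;

	while (space > U16_MAX && ws < TCP_MAX_WSCALE) {
		space >>= 1;
		ws++;
	}
	return ws;
}

/* New computation from the patch: clamp(ilog2(space) - 15, 0, 14). */
static int wscale_formula(unsigned int space)
{
	int ws = ilog2_u32(space) - 15;

	if (ws < 0)
		ws = 0;
	if (ws > TCP_MAX_WSCALE)
		ws = TCP_MAX_WSCALE;
	return ws;
}

int main(void)
{
	unsigned int samples[] = { 4096, 65535, 65536, 1u << 20, 1u << 28 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("space=%-10u loop=%d formula=%d\n",
		       samples[i], wscale_loop(samples[i]),
		       wscale_formula(samples[i]));
	return 0;
}
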
....@@ -401,6 +413,7 @@
401413 #define OPTION_WSCALE (1 << 3)
402414 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
403415 #define OPTION_SMC (1 << 9)
416
+#define OPTION_MPTCP (1 << 10)
404417
405418 static void smc_options_write(__be32 *ptr, u16 *options)
406419 {
....@@ -423,10 +436,159 @@
423436 u8 ws; /* window scale, 0 to disable */
424437 u8 num_sack_blocks; /* number of SACK blocks to include */
425438 u8 hash_size; /* bytes in hash_location */
439
+ u8 bpf_opt_len; /* length of BPF hdr option */
426440 __u8 *hash_location; /* temporary pointer, overloaded */
427441 __u32 tsval, tsecr; /* need to include OPTION_TS */
428442 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
443
+ struct mptcp_out_options mptcp;
429444 };
445
+
446
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
447
+{
448
+#if IS_ENABLED(CONFIG_MPTCP)
449
+ if (unlikely(OPTION_MPTCP & opts->options))
450
+ mptcp_write_options(ptr, &opts->mptcp);
451
+#endif
452
+}
453
+
454
+#ifdef CONFIG_CGROUP_BPF
455
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
456
+ enum tcp_synack_type synack_type)
457
+{
458
+ if (unlikely(!skb))
459
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
460
+
461
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
462
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
463
+
464
+ return 0;
465
+}
466
+
467
+/* req, syn_skb and synack_type are used when writing synack */
468
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
469
+ struct request_sock *req,
470
+ struct sk_buff *syn_skb,
471
+ enum tcp_synack_type synack_type,
472
+ struct tcp_out_options *opts,
473
+ unsigned int *remaining)
474
+{
475
+ struct bpf_sock_ops_kern sock_ops;
476
+ int err;
477
+
478
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
479
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
480
+ !*remaining)
481
+ return;
482
+
483
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
484
+
485
+ /* init sock_ops */
486
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
487
+
488
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
489
+
490
+ if (req) {
491
+ /* The listen "sk" cannot be passed here because
492
+ * it is not locked. It would not make too much
493
+ * sense to do bpf_setsockopt(listen_sk) based
494
+ * on individual connection request also.
495
+ *
496
+ * Thus, "req" is passed here and the cgroup-bpf-progs
497
+ * of the listen "sk" will be run.
498
+ *
499
+ * "req" is also used here for fastopen even when the "sk" here is
500
+ * a fullsock "child" sk. It is to keep the behavior
501
+ * consistent between fastopen and non-fastopen on
502
+ * the bpf programming side.
503
+ */
504
+ sock_ops.sk = (struct sock *)req;
505
+ sock_ops.syn_skb = syn_skb;
506
+ } else {
507
+ sock_owned_by_me(sk);
508
+
509
+ sock_ops.is_fullsock = 1;
510
+ sock_ops.sk = sk;
511
+ }
512
+
513
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
514
+ sock_ops.remaining_opt_len = *remaining;
515
+ /* tcp_current_mss() does not pass a skb */
516
+ if (skb)
517
+ bpf_skops_init_skb(&sock_ops, skb, 0);
518
+
519
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
520
+
521
+ if (err || sock_ops.remaining_opt_len == *remaining)
522
+ return;
523
+
524
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
525
+ /* round up to 4 bytes */
526
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
527
+
528
+ *remaining -= opts->bpf_opt_len;
529
+}
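
The reserved BPF option length is rounded up to a multiple of 4 so it always occupies whole 32-bit option words; (len + 3) & ~3 is the idiom used in the hunk above. A tiny standalone sketch of that rounding:

#include <stdio.h>

/* Round a requested TCP option length up to a 4-byte boundary,
 * as done for opts->bpf_opt_len above.
 */
static unsigned int round_up_4(unsigned int len)
{
	return (len + 3) & ~3u;
}

int main(void)
{
	for (unsigned int len = 0; len <= 10; len++)
		printf("requested=%2u reserved=%2u\n", len, round_up_4(len));
	return 0;
}
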
530
+
531
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
532
+ struct request_sock *req,
533
+ struct sk_buff *syn_skb,
534
+ enum tcp_synack_type synack_type,
535
+ struct tcp_out_options *opts)
536
+{
537
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
538
+ struct bpf_sock_ops_kern sock_ops;
539
+ int err;
540
+
541
+ if (likely(!max_opt_len))
542
+ return;
543
+
544
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
545
+
546
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
547
+
548
+ if (req) {
549
+ sock_ops.sk = (struct sock *)req;
550
+ sock_ops.syn_skb = syn_skb;
551
+ } else {
552
+ sock_owned_by_me(sk);
553
+
554
+ sock_ops.is_fullsock = 1;
555
+ sock_ops.sk = sk;
556
+ }
557
+
558
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
559
+ sock_ops.remaining_opt_len = max_opt_len;
560
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
561
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
562
+
563
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
564
+
565
+ if (err)
566
+ nr_written = 0;
567
+ else
568
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
569
+
570
+ if (nr_written < max_opt_len)
571
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
572
+ max_opt_len - nr_written);
573
+}
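
When the BPF program writes fewer bytes than it reserved, the unused tail is filled with TCPOPT_NOP so the receiver still sees a valid option list. A minimal sketch of that padding step, using a local buffer in place of skb->data and an invented experimental option as the already-written part:

#include <stdio.h>
#include <string.h>

#define TCPOPT_NOP 1

/* Pad the unused tail of a reserved option area with NOPs,
 * mirroring the memset() at the end of the function above.
 */
static void pad_unwritten(unsigned char *opt, unsigned int reserved,
			  unsigned int written)
{
	if (written < reserved)
		memset(opt + written, TCPOPT_NOP, reserved - written);
}

int main(void)
{
	/* 8 bytes reserved, 4 bytes written by the (hypothetical) program. */
	unsigned char opt[8] = { 254, 4, 0xab, 0xcd };

	pad_unwritten(opt, sizeof(opt), 4);
	for (unsigned int i = 0; i < sizeof(opt); i++)
		printf("%02x ", opt[i]);
	printf("\n");
	return 0;
}
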
574
+#else
575
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
576
+ struct request_sock *req,
577
+ struct sk_buff *syn_skb,
578
+ enum tcp_synack_type synack_type,
579
+ struct tcp_out_options *opts,
580
+ unsigned int *remaining)
581
+{
582
+}
583
+
584
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
585
+ struct request_sock *req,
586
+ struct sk_buff *syn_skb,
587
+ enum tcp_synack_type synack_type,
588
+ struct tcp_out_options *opts)
589
+{
590
+}
591
+#endif
430592
431593 /* Write previously computed TCP options to the packet.
432594 *
....@@ -536,6 +698,8 @@
536698 }
537699
538700 smc_options_write(ptr, &options);
701
+
702
+ mptcp_options_write(ptr, opts);
539703 }
540704
541705 static void smc_set_option(const struct tcp_sock *tp,
....@@ -571,6 +735,22 @@
571735 #endif
572736 }
573737
738
+static void mptcp_set_option_cond(const struct request_sock *req,
739
+ struct tcp_out_options *opts,
740
+ unsigned int *remaining)
741
+{
742
+ if (rsk_is_mptcp(req)) {
743
+ unsigned int size;
744
+
745
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
746
+ if (*remaining >= size) {
747
+ opts->options |= OPTION_MPTCP;
748
+ *remaining -= size;
749
+ }
750
+ }
751
+ }
752
+}
753
+
574754 /* Compute TCP options for SYN packets. This is not the final
575755 * network wire format yet.
576756 */
....@@ -584,7 +764,8 @@
584764
585765 *md5 = NULL;
586766 #ifdef CONFIG_TCP_MD5SIG
587
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
767
+ if (static_branch_unlikely(&tcp_md5_needed) &&
768
+ rcu_access_pointer(tp->md5sig_info)) {
588769 *md5 = tp->af_specific->md5_lookup(sk, sk);
589770 if (*md5) {
590771 opts->options |= OPTION_MD5;
....@@ -605,18 +786,18 @@
605786 opts->mss = tcp_advertise_mss(sk);
606787 remaining -= TCPOLEN_MSS_ALIGNED;
607788
608
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
789
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
609790 opts->options |= OPTION_TS;
610791 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
611792 opts->tsecr = tp->rx_opt.ts_recent;
612793 remaining -= TCPOLEN_TSTAMP_ALIGNED;
613794 }
614
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
795
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
615796 opts->ws = tp->rx_opt.rcv_wscale;
616797 opts->options |= OPTION_WSCALE;
617798 remaining -= TCPOLEN_WSCALE_ALIGNED;
618799 }
619
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
800
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
620801 opts->options |= OPTION_SACK_ADVERTISE;
621802 if (unlikely(!(OPTION_TS & opts->options)))
622803 remaining -= TCPOLEN_SACKPERM_ALIGNED;
....@@ -639,6 +820,17 @@
639820
640821 smc_set_option(tp, opts, &remaining);
641822
823
+ if (sk_is_mptcp(sk)) {
824
+ unsigned int size;
825
+
826
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
827
+ opts->options |= OPTION_MPTCP;
828
+ remaining -= size;
829
+ }
830
+ }
831
+
832
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
833
+
642834 return MAX_TCP_OPTION_SPACE - remaining;
643835 }
644836
....@@ -649,7 +841,8 @@
649841 struct tcp_out_options *opts,
650842 const struct tcp_md5sig_key *md5,
651843 struct tcp_fastopen_cookie *foc,
652
- enum tcp_synack_type synack_type)
844
+ enum tcp_synack_type synack_type,
845
+ struct sk_buff *syn_skb)
653846 {
654847 struct inet_request_sock *ireq = inet_rsk(req);
655848 unsigned int remaining = MAX_TCP_OPTION_SPACE;
....@@ -702,7 +895,12 @@
702895 }
703896 }
704897
898
+ mptcp_set_option_cond(req, opts, &remaining);
899
+
705900 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
901
+
902
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
903
+ synack_type, opts, &remaining);
706904
707905 return MAX_TCP_OPTION_SPACE - remaining;
708906 }
....@@ -722,7 +920,8 @@
722920
723921 *md5 = NULL;
724922 #ifdef CONFIG_TCP_MD5SIG
725
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
923
+ if (static_branch_unlikely(&tcp_md5_needed) &&
924
+ rcu_access_pointer(tp->md5sig_info)) {
726925 *md5 = tp->af_specific->md5_lookup(sk, sk);
727926 if (*md5) {
728927 opts->options |= OPTION_MD5;
....@@ -738,16 +937,46 @@
738937 size += TCPOLEN_TSTAMP_ALIGNED;
739938 }
740939
940
+ /* MPTCP options have precedence over SACK for the limited TCP
941
+ * option space because an MPTCP connection would be forced to
942
+ * fall back to regular TCP if a required multipath option is
943
+ * missing. SACK still gets a chance to use whatever space is
944
+ * left.
945
+ */
946
+ if (sk_is_mptcp(sk)) {
947
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
948
+ unsigned int opt_size = 0;
949
+
950
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
951
+ &opts->mptcp)) {
952
+ opts->options |= OPTION_MPTCP;
953
+ size += opt_size;
954
+ }
955
+ }
956
+
741957 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
742958 if (unlikely(eff_sacks)) {
743959 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
960
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
961
+ TCPOLEN_SACK_PERBLOCK))
962
+ return size;
963
+
744964 opts->num_sack_blocks =
745965 min_t(unsigned int, eff_sacks,
746966 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
747967 TCPOLEN_SACK_PERBLOCK);
748
- if (likely(opts->num_sack_blocks))
749
- size += TCPOLEN_SACK_BASE_ALIGNED +
750
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
968
+
969
+ size += TCPOLEN_SACK_BASE_ALIGNED +
970
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
971
+ }
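
With MAX_TCP_OPTION_SPACE at 40 bytes, a 12-byte timestamp option leaves 28, and each SACK block needs 8 bytes on top of a 4-byte aligned SACK header, so at most three blocks fit; the new early return drops SACK entirely once fewer than 12 bytes remain, for example after a large MPTCP option. A short sketch of that arithmetic; the constants match the TCP option sizes, while the helper and the sample MPTCP option size are illustrative.

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE      40
#define TCPOLEN_TSTAMP_ALIGNED    12
#define TCPOLEN_SACK_BASE_ALIGNED  4
#define TCPOLEN_SACK_PERBLOCK      8

/* How many SACK blocks fit into what is left of the option space,
 * following the hunk above: bail out if not even one block fits,
 * otherwise divide the leftover space per block.
 */
static unsigned int sack_blocks_that_fit(unsigned int used, unsigned int eff_sacks)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - used;
	unsigned int max_blocks;

	if (remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)
		return 0;

	max_blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
	return eff_sacks < max_blocks ? eff_sacks : max_blocks;
}

int main(void)
{
	/* Timestamps enabled: 12 bytes used, up to 3 blocks fit. */
	printf("with TS:    %u blocks\n",
	       sack_blocks_that_fit(TCPOLEN_TSTAMP_ALIGNED, 4));
	/* Timestamps plus an assumed 16-byte MPTCP option: 1 block fits. */
	printf("with MPTCP: %u blocks\n",
	       sack_blocks_that_fit(TCPOLEN_TSTAMP_ALIGNED + 16, 4));
	return 0;
}
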
972
+
973
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
974
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
975
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
976
+
977
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
978
+
979
+ size = MAX_TCP_OPTION_SPACE - remaining;
751980 }
752981
753982 return size;
....@@ -966,48 +1195,33 @@
9661195 return HRTIMER_NORESTART;
9671196 }
9681197
969
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
1198
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1199
+ u64 prior_wstamp)
9701200 {
9711201 struct tcp_sock *tp = tcp_sk(sk);
972
- ktime_t expire, now;
973
- u64 len_ns;
974
- u32 rate;
9751202
976
- if (!tcp_needs_internal_pacing(sk))
977
- return;
978
- rate = sk->sk_pacing_rate;
979
- if (!rate || rate == ~0U)
980
- return;
1203
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
1204
+ unsigned long rate = sk->sk_pacing_rate;
9811205
982
- len_ns = (u64)skb->len * NSEC_PER_SEC;
983
- do_div(len_ns, rate);
984
- now = ktime_get();
985
- /* If hrtimer is already armed, then our caller has not
986
- * used tcp_pacing_check().
987
- */
988
- if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) {
989
- expire = hrtimer_get_softexpires(&tp->pacing_timer);
990
- if (ktime_after(expire, now))
991
- now = expire;
992
- if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1)
993
- __sock_put(sk);
1206
+ /* Original sch_fq does not pace first 10 MSS
1207
+ * Note that tp->data_segs_out overflows after 2^32 packets,
1208
+ * this is a minor annoyance.
1209
+ */
1210
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
1211
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1212
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1213
+
1214
+ /* take into account OS jitter */
1215
+ len_ns -= min_t(u64, len_ns / 2, credit);
1216
+ tp->tcp_wstamp_ns += len_ns;
1217
+ }
9941218 }
995
- hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns),
996
- HRTIMER_MODE_ABS_PINNED_SOFT);
997
- sock_hold(sk);
998
-}
999
-
1000
-static bool tcp_pacing_check(const struct sock *sk)
1001
-{
1002
- return tcp_needs_internal_pacing(sk) &&
1003
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
1004
-}
1005
-
1006
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
1007
-{
1008
- skb->skb_mstamp = tp->tcp_mstamp;
10091219 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
10101220 }
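
The rewritten helper implements internal pacing by advancing a per-socket earliest-departure timestamp rather than arming an hrtimer per packet: the ideal slot for the skb is len / pacing_rate, and time that already elapsed since the previous departure is credited back, capped at half the slot. A compact user-space sketch of that bookkeeping with invented structure and function names; it folds the "never schedule in the past" step from tcp_transmit_skb() into the same helper.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* Minimal stand-in for the socket state used by the EDT pacing math. */
struct pacer {
	uint64_t wstamp_ns;	/* earliest departure time of next packet */
	unsigned long rate_bps;	/* pacing rate in bytes per second */
};

/* Advance the departure clock after sending 'len' bytes at 'now_ns':
 * the slot length is len/rate, minus up to half of it as credit for
 * time that already passed (OS jitter, idle periods).
 */
static void pacer_account(struct pacer *p, unsigned int len, uint64_t now_ns)
{
	uint64_t prior = p->wstamp_ns;
	uint64_t len_ns, credit;

	if (!p->rate_bps)
		return;

	if (now_ns > p->wstamp_ns)	/* never schedule in the past */
		p->wstamp_ns = now_ns;

	len_ns = (uint64_t)len * NSEC_PER_SEC / p->rate_bps;
	credit = p->wstamp_ns - prior;
	len_ns -= credit < len_ns / 2 ? credit : len_ns / 2;
	p->wstamp_ns += len_ns;
}

int main(void)
{
	struct pacer p = { .wstamp_ns = 0, .rate_bps = 1250000 }; /* 10 Mbit/s */

	pacer_account(&p, 1500, 0);	/* 1500 B -> 1.2 ms slot */
	printf("next departure: %llu ns\n", (unsigned long long)p.wstamp_ns);
	return 0;
}
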
1221
+
1222
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1223
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1224
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
10111225
10121226 /* This routine actually transmits TCP packets queued in by
10131227 * tcp_do_sendmsg(). This is used by both the initial
....@@ -1032,11 +1246,14 @@
10321246 struct sk_buff *oskb = NULL;
10331247 struct tcp_md5sig_key *md5;
10341248 struct tcphdr *th;
1249
+ u64 prior_wstamp;
10351250 int err;
10361251
10371252 BUG_ON(!skb || !tcp_skb_pcount(skb));
10381253 tp = tcp_sk(sk);
1039
-
1254
+ prior_wstamp = tp->tcp_wstamp_ns;
1255
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1256
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
10401257 if (clone_it) {
10411258 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
10421259 - tp->snd_una;
....@@ -1051,18 +1268,32 @@
10511268
10521269 if (unlikely(!skb))
10531270 return -ENOBUFS;
1271
+ /* retransmit skbs might have a non zero value in skb->dev
1272
+ * because skb->dev is aliased with skb->rbnode.rb_left
1273
+ */
1274
+ skb->dev = NULL;
10541275 }
1055
- skb->skb_mstamp = tp->tcp_mstamp;
10561276
10571277 inet = inet_sk(sk);
10581278 tcb = TCP_SKB_CB(skb);
10591279 memset(&opts, 0, sizeof(opts));
10601280
1061
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
1281
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
10621282 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1063
- else
1283
+ } else {
10641284 tcp_options_size = tcp_established_options(sk, skb, &opts,
10651285 &md5);
1286
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
1287
+ * at receiver: this slightly improves GRO performance.
1288
+ * Note that we do not force the PSH flag for non GSO packets,
1289
+ * because they might be sent under high congestion events,
1290
+ * and in this case it is better to delay the delivery of 1-MSS
1291
+ * packets and thus the corresponding ACK packet that would
1292
+ * release the following packet.
1293
+ */
1294
+ if (tcp_skb_pcount(skb) > 1)
1295
+ tcb->tcp_flags |= TCPHDR_PSH;
1296
+ }
10661297 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
10671298
10681299 /* if no packet is in qdisc/device queue, then allow XPS to select
....@@ -1135,7 +1366,12 @@
11351366 }
11361367 #endif
11371368
1138
- icsk->icsk_af_ops->send_check(sk, skb);
1369
+ /* BPF prog is the last one writing header option */
1370
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
1371
+
1372
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1373
+ tcp_v6_send_check, tcp_v4_send_check,
1374
+ sk, skb);
11391375
11401376 if (likely(tcb->tcp_flags & TCPHDR_ACK))
11411377 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
....@@ -1144,7 +1380,6 @@
11441380 tcp_event_data_sent(tp, sk);
11451381 tp->data_segs_out += tcp_skb_pcount(skb);
11461382 tp->bytes_sent += skb->len - tcp_header_size;
1147
- tcp_internal_pacing(sk, skb);
11481383 }
11491384
11501385 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
....@@ -1156,21 +1391,24 @@
11561391 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
11571392 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
11581393
1159
- /* Our usage of tstamp should remain private */
1160
- skb->tstamp = 0;
1394
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
11611395
11621396 /* Cleanup our debris for IP stacks */
11631397 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
11641398 sizeof(struct inet6_skb_parm)));
11651399
1166
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1400
+ tcp_add_tx_delay(skb, tp);
1401
+
1402
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1403
+ inet6_csk_xmit, ip_queue_xmit,
1404
+ sk, skb, &inet->cork.fl);
11671405
11681406 if (unlikely(err > 0)) {
11691407 tcp_enter_cwr(sk);
11701408 err = net_xmit_eval(err);
11711409 }
11721410 if (!err && oskb) {
1173
- tcp_update_skb_after_send(tp, oskb);
1411
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
11741412 tcp_rate_skb_sent(sk, oskb);
11751413 }
11761414 return err;
....@@ -1196,7 +1434,7 @@
11961434 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
11971435 __skb_header_release(skb);
11981436 tcp_add_write_queue_tail(sk, skb);
1199
- sk->sk_wmem_queued += skb->truesize;
1437
+ sk_wmem_queued_add(sk, skb->truesize);
12001438 sk_mem_charge(sk, skb->truesize);
12011439 }
12021440
....@@ -1321,15 +1559,16 @@
13211559 return -ENOMEM;
13221560 }
13231561
1324
- if (skb_unclone(skb, gfp))
1562
+ if (skb_unclone_keeptruesize(skb, gfp))
13251563 return -ENOMEM;
13261564
13271565 /* Get a new skb... force flag on. */
13281566 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
13291567 if (!buff)
13301568 return -ENOMEM; /* We'll just try again later. */
1569
+ skb_copy_decrypted(buff, skb);
13311570
1332
- sk->sk_wmem_queued += buff->truesize;
1571
+ sk_wmem_queued_add(sk, buff->truesize);
13331572 sk_mem_charge(sk, buff->truesize);
13341573 nlen = skb->len - len - nsize;
13351574 buff->truesize += nlen;
....@@ -1410,7 +1649,7 @@
14101649 } else {
14111650 shinfo->frags[k] = shinfo->frags[i];
14121651 if (eat) {
1413
- shinfo->frags[k].page_offset += eat;
1652
+ skb_frag_off_add(&shinfo->frags[k], eat);
14141653 skb_frag_size_sub(&shinfo->frags[k], eat);
14151654 eat = 0;
14161655 }
....@@ -1429,7 +1668,7 @@
14291668 {
14301669 u32 delta_truesize;
14311670
1432
- if (skb_unclone(skb, GFP_ATOMIC))
1671
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
14331672 return -ENOMEM;
14341673
14351674 delta_truesize = __pskb_trim_head(skb, len);
....@@ -1439,9 +1678,8 @@
14391678
14401679 if (delta_truesize) {
14411680 skb->truesize -= delta_truesize;
1442
- sk->sk_wmem_queued -= delta_truesize;
1681
+ sk_wmem_queued_add(sk, -delta_truesize);
14431682 sk_mem_uncharge(sk, delta_truesize);
1444
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
14451683 }
14461684
14471685 /* Any change of skb->len requires recalculation of tso factor. */
....@@ -1479,7 +1717,8 @@
14791717 mss_now -= icsk->icsk_ext_hdr_len;
14801718
14811719 /* Then reserve room for full set of TCP options and 8 bytes of data */
1482
- mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
1720
+ mss_now = max(mss_now,
1721
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
14831722 return mss_now;
14841723 }
14851724
....@@ -1522,10 +1761,10 @@
15221761 struct inet_connection_sock *icsk = inet_csk(sk);
15231762 struct net *net = sock_net(sk);
15241763
1525
- icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1764
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
15261765 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
15271766 icsk->icsk_af_ops->net_header_len;
1528
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1767
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
15291768 icsk->icsk_mtup.probe_size = 0;
15301769 if (icsk->icsk_mtup.enabled)
15311770 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
....@@ -1637,15 +1876,20 @@
16371876 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
16381877 struct tcp_sock *tp = tcp_sk(sk);
16391878
1640
- /* Track the maximum number of outstanding packets in each
1641
- * window, and remember whether we were cwnd-limited then.
1879
+ /* Track the strongest available signal of the degree to which the cwnd
1880
+ * is fully utilized. If cwnd-limited then remember that fact for the
1881
+ * current window. If not cwnd-limited then track the maximum number of
1882
+ * outstanding packets in the current window. (If cwnd-limited then we
1883
+ * chose to not update tp->max_packets_out to avoid an extra else
1884
+ * clause with no functional impact.)
16421885 */
1643
- if (!before(tp->snd_una, tp->max_packets_seq) ||
1644
- tp->packets_out > tp->max_packets_out ||
1645
- is_cwnd_limited) {
1646
- tp->max_packets_out = tp->packets_out;
1647
- tp->max_packets_seq = tp->snd_nxt;
1886
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
1887
+ is_cwnd_limited ||
1888
+ (!tp->is_cwnd_limited &&
1889
+ tp->packets_out > tp->max_packets_out)) {
16481890 tp->is_cwnd_limited = is_cwnd_limited;
1891
+ tp->max_packets_out = tp->packets_out;
1892
+ tp->cwnd_usage_seq = tp->snd_nxt;
16491893 }
16501894
16511895 if (tcp_is_cwnd_limited(sk)) {
....@@ -1657,7 +1901,7 @@
16571901 if (tp->packets_out > tp->snd_cwnd_used)
16581902 tp->snd_cwnd_used = tp->packets_out;
16591903
1660
- if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1904
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
16611905 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
16621906 !ca_ops->cong_control)
16631907 tcp_cwnd_application_limited(sk);
....@@ -1721,8 +1965,9 @@
17211965 {
17221966 u32 bytes, segs;
17231967
1724
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
1725
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1968
+ bytes = min_t(unsigned long,
1969
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
1970
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
17261971
17271972 /* Goal is to send at least one packet per ms,
17281973 * not one big TSO packet every 100 ms.
....@@ -1744,7 +1989,7 @@
17441989
17451990 min_tso = ca_ops->min_tso_segs ?
17461991 ca_ops->min_tso_segs(sk) :
1747
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1992
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
17481993
17491994 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
17501995 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
....@@ -1868,23 +2113,24 @@
18682113 * know that all the data is in scatter-gather pages, and that the
18692114 * packet has never been sent out before (and thus is not cloned).
18702115 */
1871
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1872
- struct sk_buff *skb, unsigned int len,
2116
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
18732117 unsigned int mss_now, gfp_t gfp)
18742118 {
1875
- struct sk_buff *buff;
18762119 int nlen = skb->len - len;
2120
+ struct sk_buff *buff;
18772121 u8 flags;
18782122
18792123 /* All of a TSO frame must be composed of paged data. */
18802124 if (skb->len != skb->data_len)
1881
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
2125
+ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2126
+ skb, len, mss_now, gfp);
18822127
18832128 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
18842129 if (unlikely(!buff))
18852130 return -ENOMEM;
2131
+ skb_copy_decrypted(buff, skb);
18862132
1887
- sk->sk_wmem_queued += buff->truesize;
2133
+ sk_wmem_queued_add(sk, buff->truesize);
18882134 sk_mem_charge(sk, buff->truesize);
18892135 buff->truesize += nlen;
18902136 skb->truesize -= nlen;
....@@ -1914,7 +2160,7 @@
19142160
19152161 /* Link BUFF into the send queue. */
19162162 __skb_header_release(buff);
1917
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
2163
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
19182164
19192165 return 0;
19202166 }
....@@ -1930,18 +2176,22 @@
19302176 u32 max_segs)
19312177 {
19322178 const struct inet_connection_sock *icsk = inet_csk(sk);
1933
- u32 age, send_win, cong_win, limit, in_flight;
2179
+ u32 send_win, cong_win, limit, in_flight;
19342180 struct tcp_sock *tp = tcp_sk(sk);
19352181 struct sk_buff *head;
19362182 int win_divisor;
2183
+ s64 delta;
19372184
19382185 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
19392186 goto send_now;
19402187
19412188 /* Avoid bursty behavior by allowing defer
1942
- * only if the last write was recent.
2189
+ * only if the last write was recent (1 ms).
2190
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
2191
+ * packets waiting in a qdisc or device for EDT delivery.
19432192 */
1944
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
2193
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2194
+ if (delta > 0)
19452195 goto send_now;
19462196
19472197 in_flight = tcp_packets_in_flight(tp);
....@@ -1988,9 +2238,9 @@
19882238 head = tcp_rtx_queue_head(sk);
19892239 if (!head)
19902240 goto send_now;
1991
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
2241
+ delta = tp->tcp_clock_cache - head->tstamp;
19922242 /* If next ACK is likely to come too late (half srtt), do not defer */
1993
- if (age < (tp->srtt_us >> 4))
2243
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
19942244 goto send_now;
19952245
19962246 /* Ok, it looks like it is advisable to defer.
....@@ -2012,7 +2262,8 @@
20122262 }
20132263
20142264 /* If this packet won't get more data, do not wait. */
2015
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2265
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2266
+ TCP_SKB_CB(skb)->eor)
20162267 goto send_now;
20172268
20182269 return true;
....@@ -2029,7 +2280,7 @@
20292280 u32 interval;
20302281 s32 delta;
20312282
2032
- interval = net->ipv4.sysctl_tcp_probe_interval;
2283
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
20332284 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
20342285 if (unlikely(delta >= interval * HZ)) {
20352286 int mss = tcp_current_mss(sk);
....@@ -2111,7 +2362,7 @@
21112362 * probing process by not resetting search range to its original.
21122363 */
21132364 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2114
- interval < net->ipv4.sysctl_tcp_probe_threshold) {
2365
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
21152366 /* Check whether enough time has elapsed for
21162367 * another round of probing.
21172368 */
....@@ -2139,17 +2390,15 @@
21392390 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
21402391 return -1;
21412392
2142
- if (tcp_pacing_check(sk))
2143
- return -1;
2144
-
21452393 /* We're allowed to probe. Build it now. */
21462394 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
21472395 if (!nskb)
21482396 return -1;
2149
- sk->sk_wmem_queued += nskb->truesize;
2397
+ sk_wmem_queued_add(sk, nskb->truesize);
21502398 sk_mem_charge(sk, nskb->truesize);
21512399
21522400 skb = tcp_send_head(sk);
2401
+ skb_copy_decrypted(nskb, skb);
21532402
21542403 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
21552404 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
....@@ -2215,6 +2464,25 @@
22152464 return -1;
22162465 }
22172466
2467
+static bool tcp_pacing_check(struct sock *sk)
2468
+{
2469
+ struct tcp_sock *tp = tcp_sk(sk);
2470
+
2471
+ if (!tcp_needs_internal_pacing(sk))
2472
+ return false;
2473
+
2474
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2475
+ return false;
2476
+
2477
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
2478
+ hrtimer_start(&tp->pacing_timer,
2479
+ ns_to_ktime(tp->tcp_wstamp_ns),
2480
+ HRTIMER_MODE_ABS_PINNED_SOFT);
2481
+ sock_hold(sk);
2482
+ }
2483
+ return true;
2484
+}
2485
+
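
With earliest-departure pacing the "should we wait?" question reduces to comparing the cached clock with the departure stamp; only if the stamp is in the future is the single pacing timer armed for that instant, as the new tcp_pacing_check() does. A minimal sketch of that decision, all names illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decide whether transmission must wait for the earliest departure time.
 * Nothing to do if pacing is off or the stamp is not in the future;
 * otherwise the caller arms one timer for wstamp_ns and stops sending.
 */
static bool pacing_defer(bool pacing_enabled, uint64_t wstamp_ns,
			 uint64_t clock_cache_ns, uint64_t *timer_expiry_ns)
{
	if (!pacing_enabled)
		return false;
	if (wstamp_ns <= clock_cache_ns)
		return false;

	*timer_expiry_ns = wstamp_ns;
	return true;
}

int main(void)
{
	uint64_t expiry = 0;

	if (pacing_defer(true, 2000000, 1500000, &expiry))
		printf("defer until %llu ns\n", (unsigned long long)expiry);
	else
		printf("send now\n");
	return 0;
}
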
22182486 /* TCP Small Queues :
22192487 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
22202488 * (These limits are doubled for retransmits)
....@@ -2229,13 +2497,28 @@
22292497 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
22302498 unsigned int factor)
22312499 {
2232
- unsigned int limit;
2500
+ unsigned long limit;
22332501
2234
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
2235
- limit = min_t(u32, limit,
2236
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2502
+ limit = max_t(unsigned long,
2503
+ 2 * skb->truesize,
2504
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
2505
+ if (sk->sk_pacing_status == SK_PACING_NONE)
2506
+ limit = min_t(unsigned long, limit,
2507
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
22372508 limit <<= factor;
22382509
2510
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2511
+ tcp_sk(sk)->tcp_tx_delay) {
2512
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2513
+
2514
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
2515
+ * approximate our needs assuming an ~100% skb->truesize overhead.
2516
+ * USEC_PER_SEC is approximated by 2^20.
2517
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2518
+ */
2519
+ extra_bytes >>= (20 - 1);
2520
+ limit += extra_bytes;
2521
+ }
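
The TSQ budget is max(2 * skb->truesize, pacing_rate >> pacing_shift), optionally capped by the sysctl when the socket is not offloading pacing, doubled for retransmits via 'factor', and, when a TX delay is configured, grown by roughly rate * delay with the >> 19 standing in for a division by USEC_PER_SEC/2 (the factor of two covering truesize overhead). A worked numeric sketch of that budget; every value and the helper name are invented for illustration.

#include <stdio.h>

/* Rough TSQ byte budget, following the shape of tcp_small_queue_check();
 * this is an illustration, not kernel code.
 */
static unsigned long tsq_limit(unsigned long truesize,
			       unsigned long pacing_rate,	/* bytes/sec */
			       unsigned int pacing_shift,
			       unsigned long sysctl_limit,
			       int sysctl_applies,
			       unsigned int factor,		/* 1 for rtx */
			       unsigned long tx_delay_us)
{
	unsigned long limit = 2 * truesize;
	unsigned long paced = pacing_rate >> pacing_shift;

	if (paced > limit)
		limit = paced;
	if (sysctl_applies && limit > sysctl_limit)
		limit = sysctl_limit;
	limit <<= factor;

	if (tx_delay_us) {
		/* ~ rate * delay / (USEC_PER_SEC / 2): 2^19 approximates 500000 */
		unsigned long long extra = (unsigned long long)pacing_rate * tx_delay_us;

		limit += extra >> 19;
	}
	return limit;
}

int main(void)
{
	/* 1 Gbit/s pacing (125 MB/s), shift 10, 1 ms artificial TX delay. */
	printf("limit = %lu bytes\n",
	       tsq_limit(4096, 125000000UL, 10, 1048576UL, 1, 0, 1000));
	return 0;
}
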
22392522 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
22402523 /* Always send skb if rtx queue is empty.
22412524 * No need to wait for TX completion to call us back,
....@@ -2341,17 +2624,19 @@
23412624 while ((skb = tcp_send_head(sk))) {
23422625 unsigned int limit;
23432626
2627
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2628
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
2629
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2630
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2631
+ tcp_init_tso_segs(skb, mss_now);
2632
+ goto repair; /* Skip network transmission */
2633
+ }
2634
+
23442635 if (tcp_pacing_check(sk))
23452636 break;
23462637
23472638 tso_segs = tcp_init_tso_segs(skb, mss_now);
23482639 BUG_ON(!tso_segs);
2349
-
2350
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2351
- /* "skb_mstamp" is used as a start point for the retransmit timer */
2352
- tcp_update_skb_after_send(tp, skb);
2353
- goto repair; /* Skip network transmission */
2354
- }
23552640
23562641 cwnd_quota = tcp_cwnd_test(tp, skb);
23572642 if (!cwnd_quota) {
....@@ -2388,8 +2673,7 @@
23882673 nonagle);
23892674
23902675 if (skb->len > limit &&
2391
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2392
- skb, limit, mss_now, gfp)))
2676
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
23932677 break;
23942678
23952679 if (tcp_small_queue_check(sk, skb, 0))
....@@ -2450,10 +2734,10 @@
24502734 /* Don't do any loss probe on a Fast Open connection before 3WHS
24512735 * finishes.
24522736 */
2453
- if (tp->fastopen_rsk)
2737
+ if (rcu_access_pointer(tp->fastopen_rsk))
24542738 return false;
24552739
2456
- early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2740
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
24572741 /* Schedule a loss probe in 2*RTT for SACK capable connections
24582742 * not in loss recovery, that are either limited by cwnd or application.
24592743 */
....@@ -2484,8 +2768,7 @@
24842768 if (rto_delta_us > 0)
24852769 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
24862770
2487
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2488
- TCP_RTO_MAX);
2771
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
24892772 return true;
24902773 }
24912774
....@@ -2666,8 +2949,12 @@
26662949 int mss = icsk->icsk_ack.rcv_mss;
26672950 int free_space = tcp_space(sk);
26682951 int allowed_space = tcp_full_space(sk);
2669
- int full_space = min_t(int, tp->window_clamp, allowed_space);
2670
- int window;
2952
+ int full_space, window;
2953
+
2954
+ if (sk_is_mptcp(sk))
2955
+ mptcp_space(sk, &free_space, &allowed_space);
2956
+
2957
+ full_space = min_t(int, tp->window_clamp, allowed_space);
26712958
26722959 if (unlikely(mss > full_space)) {
26732960 mss = full_space;
....@@ -2815,7 +3102,7 @@
28153102 struct sk_buff *skb = to, *tmp;
28163103 bool first = true;
28173104
2818
- if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
3105
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
28193106 return;
28203107 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
28213108 return;
....@@ -2824,7 +3111,7 @@
28243111 if (!tcp_can_collapse(sk, skb))
28253112 break;
28263113
2827
- if (!tcp_skb_can_collapse_to(to))
3114
+ if (!tcp_skb_can_collapse(to, skb))
28283115 break;
28293116
28303117 space -= skb->len;
....@@ -2855,7 +3142,7 @@
28553142 struct tcp_sock *tp = tcp_sk(sk);
28563143 unsigned int cur_mss;
28573144 int diff, len, err;
2858
-
3145
+ int avail_wnd;
28593146
28603147 /* Inconclusive MTU probe */
28613148 if (icsk->icsk_mtup.probe_size)
....@@ -2885,23 +3172,31 @@
28853172 return -EHOSTUNREACH; /* Routing failure or similar. */
28863173
28873174 cur_mss = tcp_current_mss(sk);
3175
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
28883176
28893177 /* If receiver has shrunk his window, and skb is out of
28903178 * new window, do not retransmit it. The exception is the
28913179 * case, when window is shrunk to zero. In this case
2892
- * our retransmit serves as a zero window probe.
3180
+ * our retransmit of one segment serves as a zero window probe.
28933181 */
2894
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2895
- TCP_SKB_CB(skb)->seq != tp->snd_una)
2896
- return -EAGAIN;
3182
+ if (avail_wnd <= 0) {
3183
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3184
+ return -EAGAIN;
3185
+ avail_wnd = cur_mss;
3186
+ }
28973187
28983188 len = cur_mss * segs;
3189
+ if (len > avail_wnd) {
3190
+ len = rounddown(avail_wnd, cur_mss);
3191
+ if (!len)
3192
+ len = avail_wnd;
3193
+ }
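
When the receive window cannot cover a full cur_mss * segs burst, the retransmit length is rounded down to a whole number of MSS that still fits, falling back to whatever window is left (or to one segment as a zero-window probe). A short sketch of that sizing decision; it omits the snd_una check the kernel also performs, and the values are illustrative.

#include <stdio.h>

/* Pick how many bytes to retransmit given the available window,
 * following the rounddown() logic added above.
 */
static int rtx_len(int cur_mss, int segs, int avail_wnd)
{
	int len = cur_mss * segs;

	if (avail_wnd <= 0)
		avail_wnd = cur_mss;	/* zero-window probe: one segment */
	if (len > avail_wnd) {
		len = avail_wnd - (avail_wnd % cur_mss);	/* rounddown */
		if (!len)
			len = avail_wnd;
	}
	return len;
}

int main(void)
{
	printf("%d\n", rtx_len(1448, 3, 10000));	/* 4344: full burst fits */
	printf("%d\n", rtx_len(1448, 3, 3000));		/* 2896: two MSS fit   */
	printf("%d\n", rtx_len(1448, 3, 800));		/* 800: partial window */
	return 0;
}
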
28993194 if (skb->len > len) {
29003195 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
29013196 cur_mss, GFP_ATOMIC))
29023197 return -ENOMEM; /* We'll try again later. */
29033198 } else {
2904
- if (skb_unclone(skb, GFP_ATOMIC))
3199
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
29053200 return -ENOMEM;
29063201
29073202 diff = tcp_skb_pcount(skb);
....@@ -2909,8 +3204,9 @@
29093204 diff -= tcp_skb_pcount(skb);
29103205 if (diff)
29113206 tcp_adjust_pcount(sk, skb, diff);
2912
- if (skb->len < cur_mss)
2913
- tcp_retrans_try_collapse(sk, skb, cur_mss);
3207
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
3208
+ if (skb->len < avail_wnd)
3209
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
29143210 }
29153211
29163212 /* RFC3168, section 6.1.1.1. ECN fallback */
....@@ -2935,24 +3231,32 @@
29353231
29363232 tcp_skb_tsorted_save(skb) {
29373233 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2938
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2939
- -ENOBUFS;
3234
+ if (nskb) {
3235
+ nskb->dev = NULL;
3236
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3237
+ } else {
3238
+ err = -ENOBUFS;
3239
+ }
29403240 } tcp_skb_tsorted_restore(skb);
29413241
29423242 if (!err) {
2943
- tcp_update_skb_after_send(tp, skb);
3243
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
29443244 tcp_rate_skb_sent(sk, skb);
29453245 }
29463246 } else {
29473247 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
29483248 }
29493249
3250
+ /* To avoid taking spuriously low RTT samples based on a timestamp
3251
+ * for a transmit that never happened, always mark EVER_RETRANS
3252
+ */
3253
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3254
+
29503255 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
29513256 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
29523257 TCP_SKB_CB(skb)->seq, segs, err);
29533258
29543259 if (likely(!err)) {
2955
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
29563260 trace_tcp_retransmit_skb(sk, skb);
29573261 } else if (err != -EBUSY) {
29583262 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
....@@ -2995,6 +3299,7 @@
29953299 const struct inet_connection_sock *icsk = inet_csk(sk);
29963300 struct sk_buff *skb, *rtx_head, *hole = NULL;
29973301 struct tcp_sock *tp = tcp_sk(sk);
3302
+ bool rearm_timer = false;
29983303 u32 max_segs;
29993304 int mib_idx;
30003305
....@@ -3017,7 +3322,7 @@
30173322
30183323 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
30193324 if (segs <= 0)
3020
- return;
3325
+ break;
30213326 sacked = TCP_SKB_CB(skb)->sacked;
30223327 /* In case tcp_shift_skb_data() has aggregated large skbs,
30233328 * we need to make sure not to send too big TSO packets
....@@ -3042,10 +3347,10 @@
30423347 continue;
30433348
30443349 if (tcp_small_queue_check(sk, skb, 1))
3045
- return;
3350
+ break;
30463351
30473352 if (tcp_retransmit_skb(sk, skb, segs))
3048
- return;
3353
+ break;
30493354
30503355 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
30513356
....@@ -3054,10 +3359,13 @@
30543359
30553360 if (skb == rtx_head &&
30563361 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3057
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3058
- inet_csk(sk)->icsk_rto,
3059
- TCP_RTO_MAX);
3362
+ rearm_timer = true;
3363
+
30603364 }
3365
+ if (rearm_timer)
3366
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3367
+ inet_csk(sk)->icsk_rto,
3368
+ TCP_RTO_MAX);
30613369 }
30623370
30633371 /* We allow to exceed memory limits for FIN packets to expedite
....@@ -3069,11 +3377,12 @@
30693377 */
30703378 void sk_forced_mem_schedule(struct sock *sk, int size)
30713379 {
3072
- int amt;
3380
+ int delta, amt;
30733381
3074
- if (size <= sk->sk_forward_alloc)
3382
+ delta = size - sk->sk_forward_alloc;
3383
+ if (delta <= 0)
30753384 return;
3076
- amt = sk_mem_pages(size);
3385
+ amt = sk_mem_pages(delta);
30773386 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
30783387 sk_memory_allocated_add(sk, amt);
30793388
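
The fix charges memory only for the shortfall: the part of size not already covered by sk_forward_alloc is converted to pages and accounted. A small arithmetic sketch, assuming SK_MEM_QUANTUM is the usual 4 KiB page and with invented helper names:

#include <stdio.h>

#define SK_MEM_QUANTUM 4096

/* Pages needed to cover 'bytes', rounding up like sk_mem_pages(). */
static int mem_pages(int bytes)
{
	return (bytes + SK_MEM_QUANTUM - 1) / SK_MEM_QUANTUM;
}

/* Charge only the part of 'size' not already covered by forward_alloc,
 * as the patched sk_forced_mem_schedule() does.
 */
static int forced_charge_pages(int size, int forward_alloc)
{
	int delta = size - forward_alloc;

	return delta > 0 ? mem_pages(delta) : 0;
}

int main(void)
{
	/* 6000-byte FIN skb, 5000 bytes already reserved: one extra page. */
	printf("pages charged: %d\n", forced_charge_pages(6000, 5000));
	/* Fully covered: nothing charged. */
	printf("pages charged: %d\n", forced_charge_pages(3000, 4096));
	return 0;
}
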
....@@ -3086,7 +3395,7 @@
30863395 */
30873396 void tcp_send_fin(struct sock *sk)
30883397 {
3089
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
3398
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
30903399 struct tcp_sock *tp = tcp_sk(sk);
30913400
30923401 /* Optimization, tack on the FIN if we have one skb in write queue and
....@@ -3094,31 +3403,29 @@
30943403 * Note: in the latter case, FIN packet will be sent after a timeout,
30953404 * as TCP stack thinks it has already been transmitted.
30963405 */
3406
+ tskb = tail;
30973407 if (!tskb && tcp_under_memory_pressure(sk))
30983408 tskb = skb_rb_last(&sk->tcp_rtx_queue);
30993409
31003410 if (tskb) {
3101
-coalesce:
31023411 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
31033412 TCP_SKB_CB(tskb)->end_seq++;
31043413 tp->write_seq++;
3105
- if (tcp_write_queue_empty(sk)) {
3414
+ if (!tail) {
31063415 /* This means tskb was already sent.
31073416 * Pretend we included the FIN on previous transmit.
31083417 * We need to set tp->snd_nxt to the value it would have
31093418 * if FIN had been sent. This is because retransmit path
31103419 * does not change tp->snd_nxt.
31113420 */
3112
- tp->snd_nxt++;
3421
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
31133422 return;
31143423 }
31153424 } else {
31163425 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3117
- if (unlikely(!skb)) {
3118
- if (tskb)
3119
- goto coalesce;
3426
+ if (unlikely(!skb))
31203427 return;
3121
- }
3428
+
31223429 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
31233430 skb_reserve(skb, MAX_TCP_HEADER);
31243431 sk_forced_mem_schedule(sk, skb->truesize);
....@@ -3192,7 +3499,7 @@
31923499 tcp_rtx_queue_unlink_and_free(skb, sk);
31933500 __skb_header_release(nskb);
31943501 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3195
- sk->sk_wmem_queued += nskb->truesize;
3502
+ sk_wmem_queued_add(sk, nskb->truesize);
31963503 sk_mem_charge(sk, nskb->truesize);
31973504 skb = nskb;
31983505 }
....@@ -3204,18 +3511,20 @@
32043511 }
32053512
32063513 /**
3207
- * tcp_make_synack - Prepare a SYN-ACK.
3208
- * sk: listener socket
3209
- * dst: dst entry attached to the SYNACK
3210
- * req: request_sock pointer
3211
- *
3212
- * Allocate one skb and build a SYNACK packet.
3213
- * @dst is consumed : Caller should not use it again.
3514
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
3515
+ * @sk: listener socket
3516
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
3517
+ * should not use it again.
3518
+ * @req: request_sock pointer
3519
+ * @foc: cookie for tcp fast open
3520
+ * @synack_type: Type of synack to prepare
3521
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
32143522 */
32153523 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32163524 struct request_sock *req,
32173525 struct tcp_fastopen_cookie *foc,
3218
- enum tcp_synack_type synack_type)
3526
+ enum tcp_synack_type synack_type,
3527
+ struct sk_buff *syn_skb)
32193528 {
32203529 struct inet_request_sock *ireq = inet_rsk(req);
32213530 const struct tcp_sock *tp = tcp_sk(sk);
....@@ -3225,6 +3534,7 @@
32253534 int tcp_header_size;
32263535 struct tcphdr *th;
32273536 int mss;
3537
+ u64 now;
32283538
32293539 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
32303540 if (unlikely(!skb)) {
....@@ -3256,20 +3566,28 @@
32563566 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
32573567
32583568 memset(&opts, 0, sizeof(opts));
3569
+ now = tcp_clock_ns();
32593570 #ifdef CONFIG_SYN_COOKIES
3260
- if (unlikely(req->cookie_ts))
3261
- skb->skb_mstamp = cookie_init_timestamp(req);
3571
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3572
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
32623573 else
32633574 #endif
3264
- skb->skb_mstamp = tcp_clock_us();
3575
+ {
3576
+ skb->skb_mstamp_ns = now;
3577
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
3578
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3579
+ }
32653580
32663581 #ifdef CONFIG_TCP_MD5SIG
32673582 rcu_read_lock();
32683583 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
32693584 #endif
32703585 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3586
+ /* bpf program will be interested in the tcp_flags */
3587
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
32713588 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3272
- foc, synack_type) + sizeof(*th);
3589
+ foc, synack_type,
3590
+ syn_skb) + sizeof(*th);
32733591
32743592 skb_push(skb, tcp_header_size);
32753593 skb_reset_transport_header(skb);
....@@ -3301,8 +3619,12 @@
33013619 rcu_read_unlock();
33023620 #endif
33033621
3304
- /* Do not fool tcpdump (if any), clean our debris */
3305
- skb->tstamp = 0;
3622
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
3623
+ synack_type, &opts);
3624
+
3625
+ skb->skb_mstamp_ns = now;
3626
+ tcp_add_tx_delay(skb, tp);
3627
+
33063628 return skb;
33073629 }
33083630 EXPORT_SYMBOL(tcp_make_synack);
....@@ -3318,8 +3640,8 @@
33183640
33193641 rcu_read_lock();
33203642 ca = tcp_ca_find_key(ca_key);
3321
- if (likely(ca && try_module_get(ca->owner))) {
3322
- module_put(icsk->icsk_ca_ops->owner);
3643
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3644
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
33233645 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
33243646 icsk->icsk_ca_ops = ca;
33253647 }
....@@ -3338,7 +3660,7 @@
33383660 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
33393661 */
33403662 tp->tcp_header_len = sizeof(struct tcphdr);
3341
- if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3663
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
33423664 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
33433665
33443666 #ifdef CONFIG_TCP_MD5SIG
....@@ -3374,7 +3696,7 @@
33743696 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
33753697 &tp->rcv_wnd,
33763698 &tp->window_clamp,
3377
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3699
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
33783700 &rcv_wscale,
33793701 rcv_wnd);
33803702
....@@ -3389,7 +3711,7 @@
33893711 tp->snd_una = tp->write_seq;
33903712 tp->snd_sml = tp->write_seq;
33913713 tp->snd_up = tp->write_seq;
3392
- tp->snd_nxt = tp->write_seq;
3714
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
33933715
33943716 if (likely(!tp->repair))
33953717 tp->rcv_nxt = 0;
....@@ -3410,7 +3732,7 @@
34103732
34113733 tcb->end_seq += skb->len;
34123734 __skb_header_release(skb);
3413
- sk->sk_wmem_queued += skb->truesize;
3735
+ sk_wmem_queued_add(sk, skb->truesize);
34143736 sk_mem_charge(sk, skb->truesize);
34153737 WRITE_ONCE(tp->write_seq, tcb->end_seq);
34163738 tp->packets_out += tcp_skb_pcount(skb);
....@@ -3425,6 +3747,7 @@
34253747 */
34263748 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
34273749 {
3750
+ struct inet_connection_sock *icsk = inet_csk(sk);
34283751 struct tcp_sock *tp = tcp_sk(sk);
34293752 struct tcp_fastopen_request *fo = tp->fastopen_req;
34303753 int space, err = 0;
....@@ -3439,8 +3762,10 @@
34393762 * private TCP options. The cost is reduced data space in SYN :(
34403763 */
34413764 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3765
+ /* Sync mss_cache after updating the mss_clamp */
3766
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
34423767
3443
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3768
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
34443769 MAX_TCP_OPTION_SPACE;
34453770
34463771 space = min_t(size_t, space, fo->size);
....@@ -3465,6 +3790,7 @@
34653790 skb_trim(syn_data, copied);
34663791 space = copied;
34673792 }
3793
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
34683794 }
34693795 /* No more data pending in inet_wait_for_connect() */
34703796 if (space == fo->size)
....@@ -3477,7 +3803,7 @@
34773803
34783804 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
34793805
3480
- syn->skb_mstamp = syn_data->skb_mstamp;
3806
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
34813807
34823808 /* Now full SYN+DATA was cloned and sent (or not),
34833809 * remove the SYN from the original skb (syn_data)
....@@ -3548,11 +3874,11 @@
35483874 /* We change tp->snd_nxt after the tcp_transmit_skb() call
35493875 * in order to make this packet get counted in tcpOutSegs.
35503876 */
3551
- tp->snd_nxt = tp->write_seq;
3877
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
35523878 tp->pushed_seq = tp->write_seq;
35533879 buff = tcp_send_head(sk);
35543880 if (unlikely(buff)) {
3555
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
3881
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
35563882 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
35573883 }
35583884 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
....@@ -3578,7 +3904,7 @@
35783904 const struct tcp_sock *tp = tcp_sk(sk);
35793905 int max_ato = HZ / 2;
35803906
3581
- if (icsk->icsk_ack.pingpong ||
3907
+ if (inet_csk_in_pingpong_mode(sk) ||
35823908 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
35833909 max_ato = TCP_DELACK_MAX;
35843910
....@@ -3599,16 +3925,15 @@
35993925 ato = min(ato, max_ato);
36003926 }
36013927
3928
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
3929
+
36023930 /* Stay within the limit we were given */
36033931 timeout = jiffies + ato;
36043932
36053933 /* Use new timeout only if there wasn't a older one earlier. */
36063934 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3607
- /* If delack timer was blocked or is about to expire,
3608
- * send ACK now.
3609
- */
3610
- if (icsk->icsk_ack.blocked ||
3611
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3935
+ /* If delack timer is about to expire, send ACK now. */
3936
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
36123937 tcp_send_ack(sk);
36133938 return;
36143939 }
....@@ -3637,10 +3962,15 @@
36373962 buff = alloc_skb(MAX_TCP_HEADER,
36383963 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
36393964 if (unlikely(!buff)) {
3965
+ struct inet_connection_sock *icsk = inet_csk(sk);
3966
+ unsigned long delay;
3967
+
3968
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
3969
+ if (delay < TCP_RTO_MAX)
3970
+ icsk->icsk_ack.retry++;
36403971 inet_csk_schedule_ack(sk);
3641
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3642
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3643
- TCP_DELACK_MAX, TCP_RTO_MAX);
3972
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
3973
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
36443974 return;
36453975 }
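
If the ACK skb cannot be allocated, the delayed-ACK timer is re-armed with an exponentially growing delay: TCP_DELACK_MAX shifted by a retry counter that stops increasing once the delay reaches TCP_RTO_MAX, which also caps the armed timeout. A tiny sketch of that schedule, assuming the common HZ=1000 values for the two constants:

#include <stdio.h>

#define HZ             1000u
#define TCP_DELACK_MAX (HZ / 5)		/* 200 ms in jiffies, assuming HZ=1000 */
#define TCP_RTO_MAX    (120u * HZ)	/* 120 s */

int main(void)
{
	unsigned int retry = 0;

	/* Emulate repeated ACK-skb allocation failures: the delay doubles
	 * each time until it would reach TCP_RTO_MAX, which also acts as
	 * the cap passed to the timer.
	 */
	for (int i = 0; i < 12; i++) {
		unsigned int delay = TCP_DELACK_MAX << retry;

		if (delay < TCP_RTO_MAX)
			retry++;
		if (delay > TCP_RTO_MAX)
			delay = TCP_RTO_MAX;
		printf("attempt %2d: delay %6u ms\n", i + 1, delay);
	}
	return 0;
}
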
36463976
....@@ -3759,7 +4089,7 @@
37594089 struct inet_connection_sock *icsk = inet_csk(sk);
37604090 struct tcp_sock *tp = tcp_sk(sk);
37614091 struct net *net = sock_net(sk);
3762
- unsigned long probe_max;
4092
+ unsigned long timeout;
37634093 int err;
37644094
37654095 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
....@@ -3768,28 +4098,24 @@
37684098 /* Cancel probe timer, if it is not required. */
37694099 icsk->icsk_probes_out = 0;
37704100 icsk->icsk_backoff = 0;
4101
+ icsk->icsk_probes_tstamp = 0;
37714102 return;
37724103 }
37734104
4105
+ icsk->icsk_probes_out++;
37744106 if (err <= 0) {
3775
- if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
4107
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
37764108 icsk->icsk_backoff++;
3777
- icsk->icsk_probes_out++;
3778
- probe_max = TCP_RTO_MAX;
4109
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
37794110 } else {
37804111 /* If packet was not sent due to local congestion,
3781
- * do not backoff and do not remember icsk_probes_out.
3782
- * Let local senders to fight for local resources.
3783
- *
3784
- * Use accumulated backoff yet.
4112
+ * Let senders fight for local resources conservatively.
37854113 */
3786
- if (!icsk->icsk_probes_out)
3787
- icsk->icsk_probes_out = 1;
3788
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
4114
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
37894115 }
3790
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3791
- tcp_probe0_when(sk, probe_max),
3792
- TCP_RTO_MAX);
4116
+
4117
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
4118
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
37934119 }
37944120
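
The zero-window probe timer is now always counted per probe and backed off as (probe base << icsk_backoff) capped at TCP_RTO_MAX, then clamped so it cannot outlive a configured TCP_USER_TIMEOUT. A schedule sketch loosely following tcp_probe0_when(), under the assumption that the base is the current RTO floored at TCP_RTO_MIN; the RTO and HZ values are invented for illustration.

#include <stdio.h>

#define HZ          1000u
#define TCP_RTO_MIN (HZ / 5)		/* 200 ms */
#define TCP_RTO_MAX (120u * HZ)		/* 120 s  */

/* Probe0 delay for a given backoff count: (max(rto, RTO_MIN) << backoff),
 * capped at TCP_RTO_MAX.
 */
static unsigned long probe0_when(unsigned long rto, unsigned int backoff)
{
	unsigned long long base = rto > TCP_RTO_MIN ? rto : TCP_RTO_MIN;
	unsigned long long when = base << backoff;

	return when < TCP_RTO_MAX ? (unsigned long)when : TCP_RTO_MAX;
}

int main(void)
{
	unsigned long rto = 400;	/* assumed current RTO, in ms (HZ=1000) */

	for (unsigned int backoff = 0; backoff < 12; backoff++)
		printf("backoff %2u -> probe in %6lu ms\n",
		       backoff, probe0_when(rto, backoff));
	return 0;
}
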
37954121 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
....@@ -3799,10 +4125,11 @@
37994125 int res;
38004126
38014127 tcp_rsk(req)->txhash = net_tx_rndhash();
3802
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
4128
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4129
+ NULL);
38034130 if (!res) {
3804
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3805
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4131
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4132
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
38064133 if (unlikely(tcp_passive_fastopen(sk)))
38074134 tcp_sk(sk)->total_retrans++;
38084135 trace_tcp_retransmit_synack(sk, req);