2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_output.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -37,6 +38,7 @@
3738 #define pr_fmt(fmt) "TCP: " fmt
3839
3940 #include <net/tcp.h>
41
+#include <net/mptcp.h>
4042
4143 #include <linux/compiler.h>
4244 #include <linux/gfp.h>
....@@ -44,6 +46,17 @@
4446 #include <linux/static_key.h>
4547
4648 #include <trace/events/tcp.h>
49
+
50
+/* Refresh clocks of a TCP socket,
51
+ * ensuring monotonically increasing values.
52
+ */
53
+void tcp_mstamp_refresh(struct tcp_sock *tp)
54
+{
55
+ u64 val = tcp_clock_ns();
56
+
57
+ tp->tcp_clock_cache = val;
58
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
59
+}
4760
4861 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
4962 int push_one, gfp_t gfp);
....@@ -55,7 +68,7 @@
5568 struct tcp_sock *tp = tcp_sk(sk);
5669 unsigned int prior_packets = tp->packets_out;
5770
58
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
71
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
5972
6073 __skb_unlink(skb, &sk->sk_write_queue);
6174 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
....@@ -69,6 +82,7 @@
6982
7083 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
7184 tcp_skb_pcount(skb));
85
+ tcp_check_space(sk);
7286 }
7387
7488 /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
....@@ -159,26 +173,25 @@
159173 * packet, enter pingpong mode.
160174 */
161175 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
162
- icsk->icsk_ack.pingpong = 1;
176
+ inet_csk_enter_pingpong_mode(sk);
163177 }
164178
165179 /* Account for an ACK we sent. */
166
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
167
- u32 rcv_nxt)
180
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
168181 {
169182 struct tcp_sock *tp = tcp_sk(sk);
170183
171
- if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
184
+ if (unlikely(tp->compressed_ack)) {
172185 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
173
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
174
- tp->compressed_ack = TCP_FASTRETRANS_THRESH;
186
+ tp->compressed_ack);
187
+ tp->compressed_ack = 0;
175188 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
176189 __sock_put(sk);
177190 }
178191
179192 if (unlikely(rcv_nxt != tp->rcv_nxt))
180193 return; /* Special ACK sent by DCTCP to reflect ECN */
181
- tcp_dec_quickack_mode(sk, pkts);
194
+ tcp_dec_quickack_mode(sk);
182195 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
183196 }
184197
....@@ -221,16 +234,14 @@
221234 if (init_rcv_wnd)
222235 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
223236
224
- (*rcv_wscale) = 0;
237
+ *rcv_wscale = 0;
225238 if (wscale_ok) {
226239 /* Set window scaling on max possible window */
227
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
228
- space = max_t(u32, space, sysctl_rmem_max);
240
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
241
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
229242 space = min_t(u32, space, *window_clamp);
230
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
231
- space >>= 1;
232
- (*rcv_wscale)++;
233
- }
243
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
244
+ 0, TCP_MAX_WSCALE);
234245 }
235246 /* Set the clamp no higher than max representable value */
236247 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
....@@ -401,6 +412,7 @@
401412 #define OPTION_WSCALE (1 << 3)
402413 #define OPTION_FAST_OPEN_COOKIE (1 << 8)
403414 #define OPTION_SMC (1 << 9)
415
+#define OPTION_MPTCP (1 << 10)
404416
405417 static void smc_options_write(__be32 *ptr, u16 *options)
406418 {
....@@ -423,10 +435,159 @@
423435 u8 ws; /* window scale, 0 to disable */
424436 u8 num_sack_blocks; /* number of SACK blocks to include */
425437 u8 hash_size; /* bytes in hash_location */
438
+ u8 bpf_opt_len; /* length of BPF hdr option */
426439 __u8 *hash_location; /* temporary pointer, overloaded */
427440 __u32 tsval, tsecr; /* need to include OPTION_TS */
428441 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
442
+ struct mptcp_out_options mptcp;
429443 };
444
+
445
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
446
+{
447
+#if IS_ENABLED(CONFIG_MPTCP)
448
+ if (unlikely(OPTION_MPTCP & opts->options))
449
+ mptcp_write_options(ptr, &opts->mptcp);
450
+#endif
451
+}
452
+
453
+#ifdef CONFIG_CGROUP_BPF
454
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
455
+ enum tcp_synack_type synack_type)
456
+{
457
+ if (unlikely(!skb))
458
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
459
+
460
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
461
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
462
+
463
+ return 0;
464
+}
465
+
466
+/* req, syn_skb and synack_type are used when writing synack */
467
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
468
+ struct request_sock *req,
469
+ struct sk_buff *syn_skb,
470
+ enum tcp_synack_type synack_type,
471
+ struct tcp_out_options *opts,
472
+ unsigned int *remaining)
473
+{
474
+ struct bpf_sock_ops_kern sock_ops;
475
+ int err;
476
+
477
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
478
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
479
+ !*remaining)
480
+ return;
481
+
482
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
483
+
484
+ /* init sock_ops */
485
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
486
+
487
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
488
+
489
+ if (req) {
490
+ /* The listen "sk" cannot be passed here because
491
+ * it is not locked. It would not make too much
492
+ * sense to do bpf_setsockopt(listen_sk) based
493
+ * on an individual connection request either.
494
+ *
495
+ * Thus, "req" is passed here and the cgroup-bpf-progs
496
+ * of the listen "sk" will be run.
497
+ *
498
+ * "req" is also used here for fastopen even the "sk" here is
499
+ * a fullsock "child" sk. It is to keep the behavior
500
+ * consistent between fastopen and non-fastopen on
501
+ * the bpf programming side.
502
+ */
503
+ sock_ops.sk = (struct sock *)req;
504
+ sock_ops.syn_skb = syn_skb;
505
+ } else {
506
+ sock_owned_by_me(sk);
507
+
508
+ sock_ops.is_fullsock = 1;
509
+ sock_ops.sk = sk;
510
+ }
511
+
512
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
513
+ sock_ops.remaining_opt_len = *remaining;
514
+ /* tcp_current_mss() does not pass a skb */
515
+ if (skb)
516
+ bpf_skops_init_skb(&sock_ops, skb, 0);
517
+
518
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
519
+
520
+ if (err || sock_ops.remaining_opt_len == *remaining)
521
+ return;
522
+
523
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
524
+ /* round up to 4 bytes */
525
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
526
+
527
+ *remaining -= opts->bpf_opt_len;
528
+}
529
+
530
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
531
+ struct request_sock *req,
532
+ struct sk_buff *syn_skb,
533
+ enum tcp_synack_type synack_type,
534
+ struct tcp_out_options *opts)
535
+{
536
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
537
+ struct bpf_sock_ops_kern sock_ops;
538
+ int err;
539
+
540
+ if (likely(!max_opt_len))
541
+ return;
542
+
543
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
544
+
545
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
546
+
547
+ if (req) {
548
+ sock_ops.sk = (struct sock *)req;
549
+ sock_ops.syn_skb = syn_skb;
550
+ } else {
551
+ sock_owned_by_me(sk);
552
+
553
+ sock_ops.is_fullsock = 1;
554
+ sock_ops.sk = sk;
555
+ }
556
+
557
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
558
+ sock_ops.remaining_opt_len = max_opt_len;
559
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
560
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
561
+
562
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
563
+
564
+ if (err)
565
+ nr_written = 0;
566
+ else
567
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
568
+
569
+ if (nr_written < max_opt_len)
570
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
571
+ max_opt_len - nr_written);
572
+}
573
+#else
574
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
575
+ struct request_sock *req,
576
+ struct sk_buff *syn_skb,
577
+ enum tcp_synack_type synack_type,
578
+ struct tcp_out_options *opts,
579
+ unsigned int *remaining)
580
+{
581
+}
582
+
583
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
584
+ struct request_sock *req,
585
+ struct sk_buff *syn_skb,
586
+ enum tcp_synack_type synack_type,
587
+ struct tcp_out_options *opts)
588
+{
589
+}
590
+#endif
430591
431592 /* Write previously computed TCP options to the packet.
432593 *
....@@ -536,6 +697,8 @@
536697 }
537698
538699 smc_options_write(ptr, &options);
700
+
701
+ mptcp_options_write(ptr, opts);
539702 }
540703
541704 static void smc_set_option(const struct tcp_sock *tp,
....@@ -571,6 +734,22 @@
571734 #endif
572735 }
573736
737
+static void mptcp_set_option_cond(const struct request_sock *req,
738
+ struct tcp_out_options *opts,
739
+ unsigned int *remaining)
740
+{
741
+ if (rsk_is_mptcp(req)) {
742
+ unsigned int size;
743
+
744
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
745
+ if (*remaining >= size) {
746
+ opts->options |= OPTION_MPTCP;
747
+ *remaining -= size;
748
+ }
749
+ }
750
+ }
751
+}
752
+
574753 /* Compute TCP options for SYN packets. This is not the final
575754 * network wire format yet.
576755 */
....@@ -584,7 +763,8 @@
584763
585764 *md5 = NULL;
586765 #ifdef CONFIG_TCP_MD5SIG
587
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
766
+ if (static_branch_unlikely(&tcp_md5_needed) &&
767
+ rcu_access_pointer(tp->md5sig_info)) {
588768 *md5 = tp->af_specific->md5_lookup(sk, sk);
589769 if (*md5) {
590770 opts->options |= OPTION_MD5;
....@@ -605,18 +785,18 @@
605785 opts->mss = tcp_advertise_mss(sk);
606786 remaining -= TCPOLEN_MSS_ALIGNED;
607787
608
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
788
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
609789 opts->options |= OPTION_TS;
610790 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
611791 opts->tsecr = tp->rx_opt.ts_recent;
612792 remaining -= TCPOLEN_TSTAMP_ALIGNED;
613793 }
614
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
794
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
615795 opts->ws = tp->rx_opt.rcv_wscale;
616796 opts->options |= OPTION_WSCALE;
617797 remaining -= TCPOLEN_WSCALE_ALIGNED;
618798 }
619
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
799
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
620800 opts->options |= OPTION_SACK_ADVERTISE;
621801 if (unlikely(!(OPTION_TS & opts->options)))
622802 remaining -= TCPOLEN_SACKPERM_ALIGNED;
....@@ -639,6 +819,17 @@
639819
640820 smc_set_option(tp, opts, &remaining);
641821
822
+ if (sk_is_mptcp(sk)) {
823
+ unsigned int size;
824
+
825
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
826
+ opts->options |= OPTION_MPTCP;
827
+ remaining -= size;
828
+ }
829
+ }
830
+
831
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
832
+
642833 return MAX_TCP_OPTION_SPACE - remaining;
643834 }
644835
....@@ -649,7 +840,8 @@
649840 struct tcp_out_options *opts,
650841 const struct tcp_md5sig_key *md5,
651842 struct tcp_fastopen_cookie *foc,
652
- enum tcp_synack_type synack_type)
843
+ enum tcp_synack_type synack_type,
844
+ struct sk_buff *syn_skb)
653845 {
654846 struct inet_request_sock *ireq = inet_rsk(req);
655847 unsigned int remaining = MAX_TCP_OPTION_SPACE;
....@@ -681,7 +873,7 @@
681873 if (likely(ireq->tstamp_ok)) {
682874 opts->options |= OPTION_TS;
683875 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
684
- opts->tsecr = req->ts_recent;
876
+ opts->tsecr = READ_ONCE(req->ts_recent);
685877 remaining -= TCPOLEN_TSTAMP_ALIGNED;
686878 }
687879 if (likely(ireq->sack_ok)) {
....@@ -702,7 +894,12 @@
702894 }
703895 }
704896
897
+ mptcp_set_option_cond(req, opts, &remaining);
898
+
705899 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
900
+
901
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
902
+ synack_type, opts, &remaining);
706903
707904 return MAX_TCP_OPTION_SPACE - remaining;
708905 }
....@@ -722,7 +919,8 @@
722919
723920 *md5 = NULL;
724921 #ifdef CONFIG_TCP_MD5SIG
725
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
922
+ if (static_branch_unlikely(&tcp_md5_needed) &&
923
+ rcu_access_pointer(tp->md5sig_info)) {
726924 *md5 = tp->af_specific->md5_lookup(sk, sk);
727925 if (*md5) {
728926 opts->options |= OPTION_MD5;
....@@ -738,16 +936,46 @@
738936 size += TCPOLEN_TSTAMP_ALIGNED;
739937 }
740938
939
+ /* MPTCP options have precedence over SACK for the limited TCP
940
+ * option space because a MPTCP connection would be forced to
941
+ * fall back to regular TCP if a required multipath option is
942
+ * missing. SACK still gets a chance to use whatever space is
943
+ * left.
944
+ */
945
+ if (sk_is_mptcp(sk)) {
946
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
947
+ unsigned int opt_size = 0;
948
+
949
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
950
+ &opts->mptcp)) {
951
+ opts->options |= OPTION_MPTCP;
952
+ size += opt_size;
953
+ }
954
+ }
955
+
741956 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
742957 if (unlikely(eff_sacks)) {
743958 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
959
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
960
+ TCPOLEN_SACK_PERBLOCK))
961
+ return size;
962
+
744963 opts->num_sack_blocks =
745964 min_t(unsigned int, eff_sacks,
746965 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
747966 TCPOLEN_SACK_PERBLOCK);
748
- if (likely(opts->num_sack_blocks))
749
- size += TCPOLEN_SACK_BASE_ALIGNED +
750
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
967
+
968
+ size += TCPOLEN_SACK_BASE_ALIGNED +
969
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
970
+ }
971
+
972
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
973
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
974
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
975
+
976
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
977
+
978
+ size = MAX_TCP_OPTION_SPACE - remaining;
751979 }
752980
753981 return size;
....@@ -966,48 +1194,33 @@
9661194 return HRTIMER_NORESTART;
9671195 }
9681196
969
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
1197
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1198
+ u64 prior_wstamp)
9701199 {
9711200 struct tcp_sock *tp = tcp_sk(sk);
972
- ktime_t expire, now;
973
- u64 len_ns;
974
- u32 rate;
9751201
976
- if (!tcp_needs_internal_pacing(sk))
977
- return;
978
- rate = sk->sk_pacing_rate;
979
- if (!rate || rate == ~0U)
980
- return;
1202
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
1203
+ unsigned long rate = sk->sk_pacing_rate;
9811204
982
- len_ns = (u64)skb->len * NSEC_PER_SEC;
983
- do_div(len_ns, rate);
984
- now = ktime_get();
985
- /* If hrtimer is already armed, then our caller has not
986
- * used tcp_pacing_check().
987
- */
988
- if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) {
989
- expire = hrtimer_get_softexpires(&tp->pacing_timer);
990
- if (ktime_after(expire, now))
991
- now = expire;
992
- if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1)
993
- __sock_put(sk);
1205
+ /* Original sch_fq does not pace first 10 MSS
1206
+ * Note that tp->data_segs_out overflows after 2^32 packets,
1207
+ * this is a minor annoyance.
1208
+ */
1209
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
1210
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1211
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1212
+
1213
+ /* take into account OS jitter */
1214
+ len_ns -= min_t(u64, len_ns / 2, credit);
1215
+ tp->tcp_wstamp_ns += len_ns;
1216
+ }
9941217 }
995
- hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns),
996
- HRTIMER_MODE_ABS_PINNED_SOFT);
997
- sock_hold(sk);
998
-}
999
-
1000
-static bool tcp_pacing_check(const struct sock *sk)
1001
-{
1002
- return tcp_needs_internal_pacing(sk) &&
1003
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
1004
-}
1005
-
1006
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
1007
-{
1008
- skb->skb_mstamp = tp->tcp_mstamp;
10091218 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
10101219 }
1220
+
1221
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1222
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1223
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
10111224
10121225 /* This routine actually transmits TCP packets queued in by
10131226 * tcp_do_sendmsg(). This is used by both the initial
....@@ -1032,11 +1245,14 @@
10321245 struct sk_buff *oskb = NULL;
10331246 struct tcp_md5sig_key *md5;
10341247 struct tcphdr *th;
1248
+ u64 prior_wstamp;
10351249 int err;
10361250
10371251 BUG_ON(!skb || !tcp_skb_pcount(skb));
10381252 tp = tcp_sk(sk);
1039
-
1253
+ prior_wstamp = tp->tcp_wstamp_ns;
1254
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1255
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
10401256 if (clone_it) {
10411257 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
10421258 - tp->snd_una;
....@@ -1051,18 +1267,32 @@
10511267
10521268 if (unlikely(!skb))
10531269 return -ENOBUFS;
1270
+ /* retransmit skbs might have a non zero value in skb->dev
1271
+ * because skb->dev is aliased with skb->rbnode.rb_left
1272
+ */
1273
+ skb->dev = NULL;
10541274 }
1055
- skb->skb_mstamp = tp->tcp_mstamp;
10561275
10571276 inet = inet_sk(sk);
10581277 tcb = TCP_SKB_CB(skb);
10591278 memset(&opts, 0, sizeof(opts));
10601279
1061
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
1280
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
10621281 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1063
- else
1282
+ } else {
10641283 tcp_options_size = tcp_established_options(sk, skb, &opts,
10651284 &md5);
1285
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
1286
+ * at receiver : This slightly improve GRO performance.
1287
+ * Note that we do not force the PSH flag for non GSO packets,
1288
+ * because they might be sent under high congestion events,
1289
+ * and in this case it is better to delay the delivery of 1-MSS
1290
+ * packets and thus the corresponding ACK packet that would
1291
+ * release the following packet.
1292
+ */
1293
+ if (tcp_skb_pcount(skb) > 1)
1294
+ tcb->tcp_flags |= TCPHDR_PSH;
1295
+ }
10661296 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
10671297
10681298 /* if no packet is in qdisc/device queue, then allow XPS to select
....@@ -1135,16 +1365,20 @@
11351365 }
11361366 #endif
11371367
1138
- icsk->icsk_af_ops->send_check(sk, skb);
1368
+ /* BPF prog is the last one writing header option */
1369
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
1370
+
1371
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1372
+ tcp_v6_send_check, tcp_v4_send_check,
1373
+ sk, skb);
11391374
11401375 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1141
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1376
+ tcp_event_ack_sent(sk, rcv_nxt);
11421377
11431378 if (skb->len != tcp_header_size) {
11441379 tcp_event_data_sent(tp, sk);
11451380 tp->data_segs_out += tcp_skb_pcount(skb);
11461381 tp->bytes_sent += skb->len - tcp_header_size;
1147
- tcp_internal_pacing(sk, skb);
11481382 }
11491383
11501384 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
....@@ -1156,21 +1390,24 @@
11561390 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
11571391 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
11581392
1159
- /* Our usage of tstamp should remain private */
1160
- skb->tstamp = 0;
1393
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
11611394
11621395 /* Cleanup our debris for IP stacks */
11631396 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
11641397 sizeof(struct inet6_skb_parm)));
11651398
1166
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1399
+ tcp_add_tx_delay(skb, tp);
1400
+
1401
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1402
+ inet6_csk_xmit, ip_queue_xmit,
1403
+ sk, skb, &inet->cork.fl);
11671404
11681405 if (unlikely(err > 0)) {
11691406 tcp_enter_cwr(sk);
11701407 err = net_xmit_eval(err);
11711408 }
11721409 if (!err && oskb) {
1173
- tcp_update_skb_after_send(tp, oskb);
1410
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
11741411 tcp_rate_skb_sent(sk, oskb);
11751412 }
11761413 return err;
....@@ -1196,7 +1433,7 @@
11961433 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
11971434 __skb_header_release(skb);
11981435 tcp_add_write_queue_tail(sk, skb);
1199
- sk->sk_wmem_queued += skb->truesize;
1436
+ sk_wmem_queued_add(sk, skb->truesize);
12001437 sk_mem_charge(sk, skb->truesize);
12011438 }
12021439
....@@ -1321,15 +1558,16 @@
13211558 return -ENOMEM;
13221559 }
13231560
1324
- if (skb_unclone(skb, gfp))
1561
+ if (skb_unclone_keeptruesize(skb, gfp))
13251562 return -ENOMEM;
13261563
13271564 /* Get a new skb... force flag on. */
13281565 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
13291566 if (!buff)
13301567 return -ENOMEM; /* We'll just try again later. */
1568
+ skb_copy_decrypted(buff, skb);
13311569
1332
- sk->sk_wmem_queued += buff->truesize;
1570
+ sk_wmem_queued_add(sk, buff->truesize);
13331571 sk_mem_charge(sk, buff->truesize);
13341572 nlen = skb->len - len - nsize;
13351573 buff->truesize += nlen;
....@@ -1410,7 +1648,7 @@
14101648 } else {
14111649 shinfo->frags[k] = shinfo->frags[i];
14121650 if (eat) {
1413
- shinfo->frags[k].page_offset += eat;
1651
+ skb_frag_off_add(&shinfo->frags[k], eat);
14141652 skb_frag_size_sub(&shinfo->frags[k], eat);
14151653 eat = 0;
14161654 }
....@@ -1429,7 +1667,7 @@
14291667 {
14301668 u32 delta_truesize;
14311669
1432
- if (skb_unclone(skb, GFP_ATOMIC))
1670
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
14331671 return -ENOMEM;
14341672
14351673 delta_truesize = __pskb_trim_head(skb, len);
....@@ -1439,9 +1677,8 @@
14391677
14401678 if (delta_truesize) {
14411679 skb->truesize -= delta_truesize;
1442
- sk->sk_wmem_queued -= delta_truesize;
1680
+ sk_wmem_queued_add(sk, -delta_truesize);
14431681 sk_mem_uncharge(sk, delta_truesize);
1444
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
14451682 }
14461683
14471684 /* Any change of skb->len requires recalculation of tso factor. */
....@@ -1479,7 +1716,8 @@
14791716 mss_now -= icsk->icsk_ext_hdr_len;
14801717
14811718 /* Then reserve room for full set of TCP options and 8 bytes of data */
1482
- mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
1719
+ mss_now = max(mss_now,
1720
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
14831721 return mss_now;
14841722 }
14851723
....@@ -1522,10 +1760,10 @@
15221760 struct inet_connection_sock *icsk = inet_csk(sk);
15231761 struct net *net = sock_net(sk);
15241762
1525
- icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1763
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
15261764 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
15271765 icsk->icsk_af_ops->net_header_len;
1528
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1766
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
15291767 icsk->icsk_mtup.probe_size = 0;
15301768 if (icsk->icsk_mtup.enabled)
15311769 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
....@@ -1637,15 +1875,20 @@
16371875 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
16381876 struct tcp_sock *tp = tcp_sk(sk);
16391877
1640
- /* Track the maximum number of outstanding packets in each
1641
- * window, and remember whether we were cwnd-limited then.
1878
+ /* Track the strongest available signal of the degree to which the cwnd
1879
+ * is fully utilized. If cwnd-limited then remember that fact for the
1880
+ * current window. If not cwnd-limited then track the maximum number of
1881
+ * outstanding packets in the current window. (If cwnd-limited then we
1882
+ * chose to not update tp->max_packets_out to avoid an extra else
1883
+ * clause with no functional impact.)
16421884 */
1643
- if (!before(tp->snd_una, tp->max_packets_seq) ||
1644
- tp->packets_out > tp->max_packets_out ||
1645
- is_cwnd_limited) {
1646
- tp->max_packets_out = tp->packets_out;
1647
- tp->max_packets_seq = tp->snd_nxt;
1885
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
1886
+ is_cwnd_limited ||
1887
+ (!tp->is_cwnd_limited &&
1888
+ tp->packets_out > tp->max_packets_out)) {
16481889 tp->is_cwnd_limited = is_cwnd_limited;
1890
+ tp->max_packets_out = tp->packets_out;
1891
+ tp->cwnd_usage_seq = tp->snd_nxt;
16491892 }
16501893
16511894 if (tcp_is_cwnd_limited(sk)) {
....@@ -1657,7 +1900,7 @@
16571900 if (tp->packets_out > tp->snd_cwnd_used)
16581901 tp->snd_cwnd_used = tp->packets_out;
16591902
1660
- if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1903
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
16611904 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
16621905 !ca_ops->cong_control)
16631906 tcp_cwnd_application_limited(sk);
....@@ -1721,8 +1964,9 @@
17211964 {
17221965 u32 bytes, segs;
17231966
1724
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
1725
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1967
+ bytes = min_t(unsigned long,
1968
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
1969
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
17261970
17271971 /* Goal is to send at least one packet per ms,
17281972 * not one big TSO packet every 100 ms.
....@@ -1744,7 +1988,7 @@
17441988
17451989 min_tso = ca_ops->min_tso_segs ?
17461990 ca_ops->min_tso_segs(sk) :
1747
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1991
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
17481992
17491993 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
17501994 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
....@@ -1868,23 +2112,24 @@
18682112 * know that all the data is in scatter-gather pages, and that the
18692113 * packet has never been sent out before (and thus is not cloned).
18702114 */
1871
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1872
- struct sk_buff *skb, unsigned int len,
2115
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
18732116 unsigned int mss_now, gfp_t gfp)
18742117 {
1875
- struct sk_buff *buff;
18762118 int nlen = skb->len - len;
2119
+ struct sk_buff *buff;
18772120 u8 flags;
18782121
18792122 /* All of a TSO frame must be composed of paged data. */
18802123 if (skb->len != skb->data_len)
1881
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
2124
+ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2125
+ skb, len, mss_now, gfp);
18822126
18832127 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
18842128 if (unlikely(!buff))
18852129 return -ENOMEM;
2130
+ skb_copy_decrypted(buff, skb);
18862131
1887
- sk->sk_wmem_queued += buff->truesize;
2132
+ sk_wmem_queued_add(sk, buff->truesize);
18882133 sk_mem_charge(sk, buff->truesize);
18892134 buff->truesize += nlen;
18902135 skb->truesize -= nlen;
....@@ -1914,7 +2159,7 @@
19142159
19152160 /* Link BUFF into the send queue. */
19162161 __skb_header_release(buff);
1917
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
2162
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
19182163
19192164 return 0;
19202165 }
....@@ -1930,18 +2175,22 @@
19302175 u32 max_segs)
19312176 {
19322177 const struct inet_connection_sock *icsk = inet_csk(sk);
1933
- u32 age, send_win, cong_win, limit, in_flight;
2178
+ u32 send_win, cong_win, limit, in_flight;
19342179 struct tcp_sock *tp = tcp_sk(sk);
19352180 struct sk_buff *head;
19362181 int win_divisor;
2182
+ s64 delta;
19372183
19382184 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
19392185 goto send_now;
19402186
19412187 /* Avoid bursty behavior by allowing defer
1942
- * only if the last write was recent.
2188
+ * only if the last write was recent (1 ms).
2189
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
2190
+ * packets waiting in a qdisc or device for EDT delivery.
19432191 */
1944
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
2192
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2193
+ if (delta > 0)
19452194 goto send_now;
19462195
19472196 in_flight = tcp_packets_in_flight(tp);
....@@ -1988,9 +2237,9 @@
19882237 head = tcp_rtx_queue_head(sk);
19892238 if (!head)
19902239 goto send_now;
1991
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
2240
+ delta = tp->tcp_clock_cache - head->tstamp;
19922241 /* If next ACK is likely to come too late (half srtt), do not defer */
1993
- if (age < (tp->srtt_us >> 4))
2242
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
19942243 goto send_now;
19952244
19962245 /* Ok, it looks like it is advisable to defer.
....@@ -2012,7 +2261,8 @@
20122261 }
20132262
20142263 /* If this packet won't get more data, do not wait. */
2015
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2264
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2265
+ TCP_SKB_CB(skb)->eor)
20162266 goto send_now;
20172267
20182268 return true;
....@@ -2029,7 +2279,7 @@
20292279 u32 interval;
20302280 s32 delta;
20312281
2032
- interval = net->ipv4.sysctl_tcp_probe_interval;
2282
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
20332283 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
20342284 if (unlikely(delta >= interval * HZ)) {
20352285 int mss = tcp_current_mss(sk);
....@@ -2111,7 +2361,7 @@
21122362 * probing process by not resetting search range to its original.
21122362 */
21132363 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2114
- interval < net->ipv4.sysctl_tcp_probe_threshold) {
2364
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
21152365 /* Check whether enough time has elapsed for
21162366 * another round of probing.
21172367 */
....@@ -2139,17 +2389,15 @@
21392389 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
21402390 return -1;
21412391
2142
- if (tcp_pacing_check(sk))
2143
- return -1;
2144
-
21452392 /* We're allowed to probe. Build it now. */
21462393 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
21472394 if (!nskb)
21482395 return -1;
2149
- sk->sk_wmem_queued += nskb->truesize;
2396
+ sk_wmem_queued_add(sk, nskb->truesize);
21502397 sk_mem_charge(sk, nskb->truesize);
21512398
21522399 skb = tcp_send_head(sk);
2400
+ skb_copy_decrypted(nskb, skb);
21532401
21542402 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
21552403 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
....@@ -2215,6 +2463,25 @@
22152463 return -1;
22162464 }
22172465
2466
+static bool tcp_pacing_check(struct sock *sk)
2467
+{
2468
+ struct tcp_sock *tp = tcp_sk(sk);
2469
+
2470
+ if (!tcp_needs_internal_pacing(sk))
2471
+ return false;
2472
+
2473
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2474
+ return false;
2475
+
2476
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
2477
+ hrtimer_start(&tp->pacing_timer,
2478
+ ns_to_ktime(tp->tcp_wstamp_ns),
2479
+ HRTIMER_MODE_ABS_PINNED_SOFT);
2480
+ sock_hold(sk);
2481
+ }
2482
+ return true;
2483
+}
2484
+
22182485 /* TCP Small Queues :
22192486 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
22202487 * (These limits are doubled for retransmits)
....@@ -2229,13 +2496,28 @@
22292496 static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
22302497 unsigned int factor)
22312498 {
2232
- unsigned int limit;
2499
+ unsigned long limit;
22332500
2234
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
2235
- limit = min_t(u32, limit,
2236
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2501
+ limit = max_t(unsigned long,
2502
+ 2 * skb->truesize,
2503
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
2504
+ if (sk->sk_pacing_status == SK_PACING_NONE)
2505
+ limit = min_t(unsigned long, limit,
2506
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
22372507 limit <<= factor;
22382508
2509
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2510
+ tcp_sk(sk)->tcp_tx_delay) {
2511
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2512
+
2513
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
2514
+ * approximate our needs assuming an ~100% skb->truesize overhead.
2515
+ * USEC_PER_SEC is approximated by 2^20.
2516
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2517
+ */
2518
+ extra_bytes >>= (20 - 1);
2519
+ limit += extra_bytes;
2520
+ }
22392521 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
22402522 /* Always send skb if rtx queue is empty.
22412523 * No need to wait for TX completion to call us back,
....@@ -2341,17 +2623,19 @@
23412623 while ((skb = tcp_send_head(sk))) {
23422624 unsigned int limit;
23432625
2626
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2627
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
2628
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2629
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2630
+ tcp_init_tso_segs(skb, mss_now);
2631
+ goto repair; /* Skip network transmission */
2632
+ }
2633
+
23442634 if (tcp_pacing_check(sk))
23452635 break;
23462636
23472637 tso_segs = tcp_init_tso_segs(skb, mss_now);
23482638 BUG_ON(!tso_segs);
2349
-
2350
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2351
- /* "skb_mstamp" is used as a start point for the retransmit timer */
2352
- tcp_update_skb_after_send(tp, skb);
2353
- goto repair; /* Skip network transmission */
2354
- }
23552639
23562640 cwnd_quota = tcp_cwnd_test(tp, skb);
23572641 if (!cwnd_quota) {
....@@ -2388,8 +2672,7 @@
23882672 nonagle);
23892673
23902674 if (skb->len > limit &&
2391
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2392
- skb, limit, mss_now, gfp)))
2675
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
23932676 break;
23942677
23952678 if (tcp_small_queue_check(sk, skb, 0))
....@@ -2450,10 +2733,10 @@
24502733 /* Don't do any loss probe on a Fast Open connection before 3WHS
24512734 * finishes.
24522735 */
2453
- if (tp->fastopen_rsk)
2736
+ if (rcu_access_pointer(tp->fastopen_rsk))
24542737 return false;
24552738
2456
- early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2739
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
24572740 /* Schedule a loss probe in 2*RTT for SACK capable connections
24582741 * not in loss recovery, that are either limited by cwnd or application.
24592742 */
....@@ -2484,8 +2767,7 @@
24842767 if (rto_delta_us > 0)
24852768 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
24862769
2487
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
2488
- TCP_RTO_MAX);
2770
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
24892771 return true;
24902772 }
24912773
....@@ -2666,8 +2948,12 @@
26662948 int mss = icsk->icsk_ack.rcv_mss;
26672949 int free_space = tcp_space(sk);
26682950 int allowed_space = tcp_full_space(sk);
2669
- int full_space = min_t(int, tp->window_clamp, allowed_space);
2670
- int window;
2951
+ int full_space, window;
2952
+
2953
+ if (sk_is_mptcp(sk))
2954
+ mptcp_space(sk, &free_space, &allowed_space);
2955
+
2956
+ full_space = min_t(int, tp->window_clamp, allowed_space);
26712957
26722958 if (unlikely(mss > full_space)) {
26732959 mss = full_space;
....@@ -2815,7 +3101,7 @@
28153101 struct sk_buff *skb = to, *tmp;
28163102 bool first = true;
28173103
2818
- if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
3104
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
28193105 return;
28203106 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
28213107 return;
....@@ -2824,7 +3110,7 @@
28243110 if (!tcp_can_collapse(sk, skb))
28253111 break;
28263112
2827
- if (!tcp_skb_can_collapse_to(to))
3113
+ if (!tcp_skb_can_collapse(to, skb))
28283114 break;
28293115
28303116 space -= skb->len;
....@@ -2855,7 +3141,7 @@
28553141 struct tcp_sock *tp = tcp_sk(sk);
28563142 unsigned int cur_mss;
28573143 int diff, len, err;
2858
-
3144
+ int avail_wnd;
28593145
28603146 /* Inconclusive MTU probe */
28613147 if (icsk->icsk_mtup.probe_size)
....@@ -2885,23 +3171,31 @@
28853171 return -EHOSTUNREACH; /* Routing failure or similar. */
28863172
28873173 cur_mss = tcp_current_mss(sk);
3174
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
28883175
28893176 /* If receiver has shrunk his window, and skb is out of
28903177 * new window, do not retransmit it. The exception is the
28913178 * case, when window is shrunk to zero. In this case
2892
- * our retransmit serves as a zero window probe.
3179
+ * our retransmit of one segment serves as a zero window probe.
28933180 */
2894
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
2895
- TCP_SKB_CB(skb)->seq != tp->snd_una)
2896
- return -EAGAIN;
3181
+ if (avail_wnd <= 0) {
3182
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3183
+ return -EAGAIN;
3184
+ avail_wnd = cur_mss;
3185
+ }
28973186
28983187 len = cur_mss * segs;
3188
+ if (len > avail_wnd) {
3189
+ len = rounddown(avail_wnd, cur_mss);
3190
+ if (!len)
3191
+ len = avail_wnd;
3192
+ }
28993193 if (skb->len > len) {
29003194 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
29013195 cur_mss, GFP_ATOMIC))
29023196 return -ENOMEM; /* We'll try again later. */
29033197 } else {
2904
- if (skb_unclone(skb, GFP_ATOMIC))
3198
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
29053199 return -ENOMEM;
29063200
29073201 diff = tcp_skb_pcount(skb);
....@@ -2909,8 +3203,9 @@
29093203 diff -= tcp_skb_pcount(skb);
29103204 if (diff)
29113205 tcp_adjust_pcount(sk, skb, diff);
2912
- if (skb->len < cur_mss)
2913
- tcp_retrans_try_collapse(sk, skb, cur_mss);
3206
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
3207
+ if (skb->len < avail_wnd)
3208
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
29143209 }
29153210
29163211 /* RFC3168, section 6.1.1.1. ECN fallback */
....@@ -2935,24 +3230,32 @@
29353230
29363231 tcp_skb_tsorted_save(skb) {
29373232 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
2938
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
2939
- -ENOBUFS;
3233
+ if (nskb) {
3234
+ nskb->dev = NULL;
3235
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3236
+ } else {
3237
+ err = -ENOBUFS;
3238
+ }
29403239 } tcp_skb_tsorted_restore(skb);
29413240
29423241 if (!err) {
2943
- tcp_update_skb_after_send(tp, skb);
3242
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
29443243 tcp_rate_skb_sent(sk, skb);
29453244 }
29463245 } else {
29473246 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
29483247 }
29493248
3249
+ /* To avoid taking spuriously low RTT samples based on a timestamp
3250
+ * for a transmit that never happened, always mark EVER_RETRANS
3251
+ */
3252
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3253
+
29503254 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
29513255 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
29523256 TCP_SKB_CB(skb)->seq, segs, err);
29533257
29543258 if (likely(!err)) {
2955
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
29563259 trace_tcp_retransmit_skb(sk, skb);
29573260 } else if (err != -EBUSY) {
29583261 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
....@@ -2995,6 +3298,7 @@
29953298 const struct inet_connection_sock *icsk = inet_csk(sk);
29963299 struct sk_buff *skb, *rtx_head, *hole = NULL;
29973300 struct tcp_sock *tp = tcp_sk(sk);
3301
+ bool rearm_timer = false;
29983302 u32 max_segs;
29993303 int mib_idx;
30003304
....@@ -3017,7 +3321,7 @@
30173321
30183322 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
30193323 if (segs <= 0)
3020
- return;
3324
+ break;
30213325 sacked = TCP_SKB_CB(skb)->sacked;
30223326 /* In case tcp_shift_skb_data() have aggregated large skbs,
30233327 * we need to make sure not sending too bigs TSO packets
....@@ -3042,10 +3346,10 @@
30423346 continue;
30433347
30443348 if (tcp_small_queue_check(sk, skb, 1))
3045
- return;
3349
+ break;
30463350
30473351 if (tcp_retransmit_skb(sk, skb, segs))
3048
- return;
3352
+ break;
30493353
30503354 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
30513355
....@@ -3054,10 +3358,13 @@
30543358
30553359 if (skb == rtx_head &&
30563360 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3057
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3058
- inet_csk(sk)->icsk_rto,
3059
- TCP_RTO_MAX);
3361
+ rearm_timer = true;
3362
+
30603363 }
3364
+ if (rearm_timer)
3365
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3366
+ inet_csk(sk)->icsk_rto,
3367
+ TCP_RTO_MAX);
30613368 }
30623369
30633370 /* We allow to exceed memory limits for FIN packets to expedite
....@@ -3069,11 +3376,12 @@
30693376 */
30703377 void sk_forced_mem_schedule(struct sock *sk, int size)
30713378 {
3072
- int amt;
3379
+ int delta, amt;
30733380
3074
- if (size <= sk->sk_forward_alloc)
3381
+ delta = size - sk->sk_forward_alloc;
3382
+ if (delta <= 0)
30753383 return;
3076
- amt = sk_mem_pages(size);
3384
+ amt = sk_mem_pages(delta);
30773385 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
30783386 sk_memory_allocated_add(sk, amt);
30793387
....@@ -3086,7 +3394,7 @@
30863394 */
30873395 void tcp_send_fin(struct sock *sk)
30883396 {
3089
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
3397
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
30903398 struct tcp_sock *tp = tcp_sk(sk);
30913399
30923400 /* Optimization, tack on the FIN if we have one skb in write queue and
....@@ -3094,31 +3402,29 @@
30943402 * Note: in the latter case, FIN packet will be sent after a timeout,
30953403 * as TCP stack thinks it has already been transmitted.
30963404 */
3405
+ tskb = tail;
30973406 if (!tskb && tcp_under_memory_pressure(sk))
30983407 tskb = skb_rb_last(&sk->tcp_rtx_queue);
30993408
31003409 if (tskb) {
3101
-coalesce:
31023410 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
31033411 TCP_SKB_CB(tskb)->end_seq++;
31043412 tp->write_seq++;
3105
- if (tcp_write_queue_empty(sk)) {
3413
+ if (!tail) {
31063414 /* This means tskb was already sent.
31073415 * Pretend we included the FIN on previous transmit.
31083416 * We need to set tp->snd_nxt to the value it would have
31093417 * if FIN had been sent. This is because retransmit path
31103418 * does not change tp->snd_nxt.
31113419 */
3112
- tp->snd_nxt++;
3420
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
31133421 return;
31143422 }
31153423 } else {
31163424 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3117
- if (unlikely(!skb)) {
3118
- if (tskb)
3119
- goto coalesce;
3425
+ if (unlikely(!skb))
31203426 return;
3121
- }
3427
+
31223428 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
31233429 skb_reserve(skb, MAX_TCP_HEADER);
31243430 sk_forced_mem_schedule(sk, skb->truesize);
....@@ -3192,7 +3498,7 @@
31923498 tcp_rtx_queue_unlink_and_free(skb, sk);
31933499 __skb_header_release(nskb);
31943500 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3195
- sk->sk_wmem_queued += nskb->truesize;
3501
+ sk_wmem_queued_add(sk, nskb->truesize);
31963502 sk_mem_charge(sk, nskb->truesize);
31973503 skb = nskb;
31983504 }
....@@ -3204,18 +3510,20 @@
32043510 }
32053511
32063512 /**
3207
- * tcp_make_synack - Prepare a SYN-ACK.
3208
- * sk: listener socket
3209
- * dst: dst entry attached to the SYNACK
3210
- * req: request_sock pointer
3211
- *
3212
- * Allocate one skb and build a SYNACK packet.
3213
- * @dst is consumed : Caller should not use it again.
3513
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
3514
+ * @sk: listener socket
3515
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
3516
+ * should not use it again.
3517
+ * @req: request_sock pointer
3518
+ * @foc: cookie for tcp fast open
3519
+ * @synack_type: Type of synack to prepare
3520
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
32143521 */
32153522 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
32163523 struct request_sock *req,
32173524 struct tcp_fastopen_cookie *foc,
3218
- enum tcp_synack_type synack_type)
3525
+ enum tcp_synack_type synack_type,
3526
+ struct sk_buff *syn_skb)
32193527 {
32203528 struct inet_request_sock *ireq = inet_rsk(req);
32213529 const struct tcp_sock *tp = tcp_sk(sk);
....@@ -3225,6 +3533,7 @@
32253533 int tcp_header_size;
32263534 struct tcphdr *th;
32273535 int mss;
3536
+ u64 now;
32283537
32293538 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
32303539 if (unlikely(!skb)) {
....@@ -3256,20 +3565,28 @@
32563565 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
32573566
32583567 memset(&opts, 0, sizeof(opts));
3568
+ now = tcp_clock_ns();
32593569 #ifdef CONFIG_SYN_COOKIES
3260
- if (unlikely(req->cookie_ts))
3261
- skb->skb_mstamp = cookie_init_timestamp(req);
3570
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3571
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
32623572 else
32633573 #endif
3264
- skb->skb_mstamp = tcp_clock_us();
3574
+ {
3575
+ skb->skb_mstamp_ns = now;
3576
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
3577
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3578
+ }
32653579
32663580 #ifdef CONFIG_TCP_MD5SIG
32673581 rcu_read_lock();
32683582 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
32693583 #endif
32703584 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3585
+ /* bpf program will be interested in the tcp_flags */
3586
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
32713587 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3272
- foc, synack_type) + sizeof(*th);
3588
+ foc, synack_type,
3589
+ syn_skb) + sizeof(*th);
32733590
32743591 skb_push(skb, tcp_header_size);
32753592 skb_reset_transport_header(skb);
....@@ -3291,7 +3608,7 @@
32913608 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
32923609 tcp_options_write((__be32 *)(th + 1), NULL, &opts);
32933610 th->doff = (tcp_header_size >> 2);
3294
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3611
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
32953612
32963613 #ifdef CONFIG_TCP_MD5SIG
32973614 /* Okay, we have all we need - do the md5 hash if needed */
....@@ -3301,8 +3618,12 @@
33013618 rcu_read_unlock();
33023619 #endif
33033620
3304
- /* Do not fool tcpdump (if any), clean our debris */
3305
- skb->tstamp = 0;
3621
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
3622
+ synack_type, &opts);
3623
+
3624
+ skb->skb_mstamp_ns = now;
3625
+ tcp_add_tx_delay(skb, tp);
3626
+
33063627 return skb;
33073628 }
33083629 EXPORT_SYMBOL(tcp_make_synack);
....@@ -3318,8 +3639,8 @@
33183639
33193640 rcu_read_lock();
33203641 ca = tcp_ca_find_key(ca_key);
3321
- if (likely(ca && try_module_get(ca->owner))) {
3322
- module_put(icsk->icsk_ca_ops->owner);
3642
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3643
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
33233644 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
33243645 icsk->icsk_ca_ops = ca;
33253646 }
....@@ -3338,7 +3659,7 @@
33383659 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
33393660 */
33403661 tp->tcp_header_len = sizeof(struct tcphdr);
3341
- if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3662
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
33423663 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
33433664
33443665 #ifdef CONFIG_TCP_MD5SIG
....@@ -3374,7 +3695,7 @@
33743695 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
33753696 &tp->rcv_wnd,
33763697 &tp->window_clamp,
3377
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3698
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
33783699 &rcv_wscale,
33793700 rcv_wnd);
33803701
....@@ -3389,7 +3710,7 @@
33893710 tp->snd_una = tp->write_seq;
33903711 tp->snd_sml = tp->write_seq;
33913712 tp->snd_up = tp->write_seq;
3392
- tp->snd_nxt = tp->write_seq;
3713
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
33933714
33943715 if (likely(!tp->repair))
33953716 tp->rcv_nxt = 0;
....@@ -3410,7 +3731,7 @@
34103731
34113732 tcb->end_seq += skb->len;
34123733 __skb_header_release(skb);
3413
- sk->sk_wmem_queued += skb->truesize;
3734
+ sk_wmem_queued_add(sk, skb->truesize);
34143735 sk_mem_charge(sk, skb->truesize);
34153736 WRITE_ONCE(tp->write_seq, tcb->end_seq);
34163737 tp->packets_out += tcp_skb_pcount(skb);
....@@ -3425,6 +3746,7 @@
34253746 */
34263747 static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
34273748 {
3749
+ struct inet_connection_sock *icsk = inet_csk(sk);
34283750 struct tcp_sock *tp = tcp_sk(sk);
34293751 struct tcp_fastopen_request *fo = tp->fastopen_req;
34303752 int space, err = 0;
....@@ -3439,8 +3761,10 @@
34393761 * private TCP options. The cost is reduced data space in SYN :(
34403762 */
34413763 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3764
+ /* Sync mss_cache after updating the mss_clamp */
3765
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
34423766
3443
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3767
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
34443768 MAX_TCP_OPTION_SPACE;
34453769
34463770 space = min_t(size_t, space, fo->size);
....@@ -3465,6 +3789,7 @@
34653789 skb_trim(syn_data, copied);
34663790 space = copied;
34673791 }
3792
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
34683793 }
34693794 /* No more data pending in inet_wait_for_connect() */
34703795 if (space == fo->size)
....@@ -3477,7 +3802,7 @@
34773802
34783803 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
34793804
3480
- syn->skb_mstamp = syn_data->skb_mstamp;
3805
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
34813806
34823807 /* Now full SYN+DATA was cloned and sent (or not),
34833808 * remove the SYN from the original skb (syn_data)
....@@ -3548,11 +3873,11 @@
35483873 /* We change tp->snd_nxt after the tcp_transmit_skb() call
35493874 * in order to make this packet get counted in tcpOutSegs.
35503875 */
3551
- tp->snd_nxt = tp->write_seq;
3876
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
35523877 tp->pushed_seq = tp->write_seq;
35533878 buff = tcp_send_head(sk);
35543879 if (unlikely(buff)) {
3555
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
3880
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
35563881 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
35573882 }
35583883 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
....@@ -3578,7 +3903,7 @@
35783903 const struct tcp_sock *tp = tcp_sk(sk);
35793904 int max_ato = HZ / 2;
35803905
3581
- if (icsk->icsk_ack.pingpong ||
3906
+ if (inet_csk_in_pingpong_mode(sk) ||
35823907 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
35833908 max_ato = TCP_DELACK_MAX;
35843909
....@@ -3599,16 +3924,15 @@
35993924 ato = min(ato, max_ato);
36003925 }
36013926
3927
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
3928
+
36023929 /* Stay within the limit we were given */
36033930 timeout = jiffies + ato;
36043931
36053932 /* Use new timeout only if there wasn't an older one earlier. */
36063933 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3607
- /* If delack timer was blocked or is about to expire,
3608
- * send ACK now.
3609
- */
3610
- if (icsk->icsk_ack.blocked ||
3611
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3934
+ /* If delack timer is about to expire, send ACK now. */
3935
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
36123936 tcp_send_ack(sk);
36133937 return;
36143938 }
....@@ -3637,10 +3961,15 @@
36373961 buff = alloc_skb(MAX_TCP_HEADER,
36383962 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
36393963 if (unlikely(!buff)) {
3964
+ struct inet_connection_sock *icsk = inet_csk(sk);
3965
+ unsigned long delay;
3966
+
3967
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
3968
+ if (delay < TCP_RTO_MAX)
3969
+ icsk->icsk_ack.retry++;
36403970 inet_csk_schedule_ack(sk);
3641
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3642
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3643
- TCP_DELACK_MAX, TCP_RTO_MAX);
3971
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
3972
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
36443973 return;
36453974 }
36463975
....@@ -3759,7 +4088,7 @@
37594088 struct inet_connection_sock *icsk = inet_csk(sk);
37604089 struct tcp_sock *tp = tcp_sk(sk);
37614090 struct net *net = sock_net(sk);
3762
- unsigned long probe_max;
4091
+ unsigned long timeout;
37634092 int err;
37644093
37654094 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
....@@ -3768,28 +4097,24 @@
37684097 /* Cancel probe timer, if it is not required. */
37694098 icsk->icsk_probes_out = 0;
37704099 icsk->icsk_backoff = 0;
4100
+ icsk->icsk_probes_tstamp = 0;
37714101 return;
37724102 }
37734103
4104
+ icsk->icsk_probes_out++;
37744105 if (err <= 0) {
3775
- if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
4106
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
37764107 icsk->icsk_backoff++;
3777
- icsk->icsk_probes_out++;
3778
- probe_max = TCP_RTO_MAX;
4108
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
37794109 } else {
37804110 /* If packet was not sent due to local congestion,
3781
- * do not backoff and do not remember icsk_probes_out.
3782
- * Let local senders to fight for local resources.
3783
- *
3784
- * Use accumulated backoff yet.
4111
+ * Let senders fight for local resources conservatively.
37854112 */
3786
- if (!icsk->icsk_probes_out)
3787
- icsk->icsk_probes_out = 1;
3788
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
4113
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
37894114 }
3790
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3791
- tcp_probe0_when(sk, probe_max),
3792
- TCP_RTO_MAX);
4115
+
4116
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
4117
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
37934118 }
37944119
37954120 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
....@@ -3799,10 +4124,11 @@
37994124 int res;
38004125
38014126 tcp_rsk(req)->txhash = net_tx_rndhash();
3802
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
4127
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4128
+ NULL);
38034129 if (!res) {
3804
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3805
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4130
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4131
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
38064132 if (unlikely(tcp_passive_fastopen(sk)))
38074133 tcp_sk(sk)->total_retrans++;
38084134 trace_tcp_retransmit_synack(sk, req);