From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/net/ipv4/tcp_output.c | 748 +++++++++++++++++++++++++++++++++++++++++----------------
1 files changed, 537 insertions(+), 211 deletions(-)
diff --git a/kernel/net/ipv4/tcp_output.c b/kernel/net/ipv4/tcp_output.c
index c97c027..c2b1657 100644
--- a/kernel/net/ipv4/tcp_output.c
+++ b/kernel/net/ipv4/tcp_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -37,6 +38,7 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -44,6 +46,17 @@
#include <linux/static_key.h>
#include <trace/events/tcp.h>
+
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_ns();
+
+ tp->tcp_clock_cache = val;
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
+}
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -55,7 +68,7 @@
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
@@ -69,6 +82,7 @@
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
+ tcp_check_space(sk);
}
/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
@@ -159,26 +173,25 @@
* packet, enter pingpong mode.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ inet_csk_enter_pingpong_mode(sk);
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
- u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
+ if (unlikely(tp->compressed_ack)) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
- tp->compressed_ack - TCP_FASTRETRANS_THRESH);
- tp->compressed_ack = TCP_FASTRETRANS_THRESH;
+ tp->compressed_ack);
+ tp->compressed_ack = 0;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
if (unlikely(rcv_nxt != tp->rcv_nxt))
return; /* Special ACK sent by DCTCP to reflect ECN */
- tcp_dec_quickack_mode(sk, pkts);
+ tcp_dec_quickack_mode(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -221,16 +234,14 @@
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
- (*rcv_wscale) = 0;
+ *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
- space = max_t(u32, space, sysctl_rmem_max);
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
space = min_t(u32, space, *window_clamp);
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
- space >>= 1;
- (*rcv_wscale)++;
- }
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
@@ -401,6 +412,7 @@
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
#define OPTION_SMC (1 << 9)
+#define OPTION_MPTCP (1 << 10)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -423,10 +435,159 @@
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct mptcp_out_options mptcp;
};
+
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (unlikely(OPTION_MPTCP & opts->options))
+ mptcp_write_options(ptr, &opts->mptcp);
+#endif
+}
+
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
/* Write previously computed TCP options to the packet.
*
@@ -536,6 +697,8 @@
}
smc_options_write(ptr, &options);
+
+ mptcp_options_write(ptr, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
@@ -571,6 +734,22 @@
#endif
}
+static void mptcp_set_option_cond(const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ if (rsk_is_mptcp(req)) {
+ unsigned int size;
+
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+ if (*remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ *remaining -= size;
+ }
+ }
+ }
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -584,7 +763,8 @@
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_branch_unlikely(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -605,18 +785,18 @@
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
@@ -639,6 +819,17 @@
smc_set_option(tp, opts, &remaining);
+ if (sk_is_mptcp(sk)) {
+ unsigned int size;
+
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
+ }
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -649,7 +840,8 @@
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -681,7 +873,7 @@
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
- opts->tsecr = req->ts_recent;
+ opts->tsecr = READ_ONCE(req->ts_recent);
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
@@ -702,7 +894,12 @@
}
}
+ mptcp_set_option_cond(req, opts, &remaining);
+
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -722,7 +919,8 @@
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ if (static_branch_unlikely(&tcp_md5_needed) &&
+ rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
@@ -738,16 +936,46 @@
size += TCPOLEN_TSTAMP_ALIGNED;
}
+ /* MPTCP options have precedence over SACK for the limited TCP
+ * option space because a MPTCP connection would be forced to
+ * fall back to regular TCP if a required multipath option is
+ * missing. SACK still gets a chance to use whatever space is
+ * left.
+ */
+ if (sk_is_mptcp(sk)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ unsigned int opt_size = 0;
+
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
+ &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ size += opt_size;
+ }
+ }
+
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK))
+ return size;
+
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
- if (likely(opts->num_sack_blocks))
- size += TCPOLEN_SACK_BASE_ALIGNED +
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ }
+
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
}
return size;
@@ -966,48 +1194,33 @@
return HRTIMER_NORESTART;
}
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+ u64 prior_wstamp)
{
struct tcp_sock *tp = tcp_sk(sk);
- ktime_t expire, now;
- u64 len_ns;
- u32 rate;
- if (!tcp_needs_internal_pacing(sk))
- return;
- rate = sk->sk_pacing_rate;
- if (!rate || rate == ~0U)
- return;
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
+ unsigned long rate = sk->sk_pacing_rate;
- len_ns = (u64)skb->len * NSEC_PER_SEC;
- do_div(len_ns, rate);
- now = ktime_get();
- /* If hrtimer is already armed, then our caller has not
- * used tcp_pacing_check().
- */
- if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) {
- expire = hrtimer_get_softexpires(&tp->pacing_timer);
- if (ktime_after(expire, now))
- now = expire;
- if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1)
- __sock_put(sk);
+ /* Original sch_fq does not pace first 10 MSS
+ * Note that tp->data_segs_out overflows after 2^32 packets,
+ * this is a minor annoyance.
+ */
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
+
+ /* take into account OS jitter */
+ len_ns -= min_t(u64, len_ns / 2, credit);
+ tp->tcp_wstamp_ns += len_ns;
+ }
}
- hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns),
- HRTIMER_MODE_ABS_PINNED_SOFT);
- sock_hold(sk);
-}
-
-static bool tcp_pacing_check(const struct sock *sk)
-{
- return tcp_needs_internal_pacing(sk) &&
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
-}
-
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
- skb->skb_mstamp = tp->tcp_mstamp;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
+
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
@@ -1032,11 +1245,14 @@
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
+ u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
-
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
@@ -1051,18 +1267,32 @@
if (unlikely(!skb))
return -ENOBUFS;
+ /* retransmit skbs might have a non zero value in skb->dev
+ * because skb->dev is aliased with skb->rbnode.rb_left
+ */
+ skb->dev = NULL;
}
- skb->skb_mstamp = tp->tcp_mstamp;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
- else
+ } else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at receiver : This slightly improve GRO performance.
+ * Note that we do not force the PSH flag for non GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
+ }
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
@@ -1135,16 +1365,20 @@
}
#endif
- icsk->icsk_af_ops->send_check(sk, skb);
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
+ tcp_v6_send_check, tcp_v4_send_check,
+ sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+ tcp_event_ack_sent(sk, rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
tp->bytes_sent += skb->len - tcp_header_size;
- tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1156,21 +1390,24 @@
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
- /* Our usage of tstamp should remain private */
- skb->tstamp = 0;
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+ tcp_add_tx_delay(skb, tp);
+
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
+ inet6_csk_xmit, ip_queue_xmit,
+ sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) {
tcp_enter_cwr(sk);
err = net_xmit_eval(err);
}
if (!err && oskb) {
- tcp_update_skb_after_send(tp, oskb);
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1196,7 +1433,7 @@
WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}
@@ -1321,15 +1558,16 @@
return -ENOMEM;
}
- if (skb_unclone(skb, gfp))
+ if (skb_unclone_keeptruesize(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
+ skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
@@ -1410,7 +1648,7 @@
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- shinfo->frags[k].page_offset += eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
@@ -1429,7 +1667,7 @@
{
u32 delta_truesize;
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
delta_truesize = __pskb_trim_head(skb, len);
@@ -1439,9 +1677,8 @@
if (delta_truesize) {
skb->truesize -= delta_truesize;
- sk->sk_wmem_queued -= delta_truesize;
+ sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
/* Any change of skb->len requires recalculation of tso factor. */
@@ -1479,7 +1716,8 @@
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
- mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
+ mss_now = max(mss_now,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
return mss_now;
}
@@ -1522,10 +1760,10 @@
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
- icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
icsk->icsk_af_ops->net_header_len;
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
@@ -1637,15 +1875,20 @@
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
struct tcp_sock *tp = tcp_sk(sk);
- /* Track the maximum number of outstanding packets in each
- * window, and remember whether we were cwnd-limited then.
+ /* Track the strongest available signal of the degree to which the cwnd
+ * is fully utilized. If cwnd-limited then remember that fact for the
+ * current window. If not cwnd-limited then track the maximum number of
+ * outstanding packets in the current window. (If cwnd-limited then we
+ * chose to not update tp->max_packets_out to avoid an extra else
+ * clause with no functional impact.)
*/
- if (!before(tp->snd_una, tp->max_packets_seq) ||
- tp->packets_out > tp->max_packets_out ||
- is_cwnd_limited) {
- tp->max_packets_out = tp->packets_out;
- tp->max_packets_seq = tp->snd_nxt;
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
+ is_cwnd_limited ||
+ (!tp->is_cwnd_limited &&
+ tp->packets_out > tp->max_packets_out)) {
tp->is_cwnd_limited = is_cwnd_limited;
+ tp->max_packets_out = tp->packets_out;
+ tp->cwnd_usage_seq = tp->snd_nxt;
}
if (tcp_is_cwnd_limited(sk)) {
@@ -1657,7 +1900,7 @@
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
@@ -1721,8 +1964,9 @@
{
u32 bytes, segs;
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
@@ -1744,7 +1988,7 @@
min_tso = ca_ops->min_tso_segs ?
ca_ops->min_tso_segs(sk) :
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
@@ -1868,23 +2112,24 @@
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
- struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
- struct sk_buff *buff;
int nlen = skb->len - len;
+ struct sk_buff *buff;
u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
+ return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+ skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
+ skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1914,7 +2159,7 @@
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
return 0;
}
@@ -1930,18 +2175,22 @@
u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
+ s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1988,9 +2237,9 @@
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+ delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
/* Ok, it looks like it is advisable to defer.
@@ -2012,7 +2261,8 @@
}
/* If this packet won't get more data, do not wait. */
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+ TCP_SKB_CB(skb)->eor)
goto send_now;
return true;
@@ -2029,7 +2279,7 @@
u32 interval;
s32 delta;
- interval = net->ipv4.sysctl_tcp_probe_interval;
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
if (unlikely(delta >= interval * HZ)) {
int mss = tcp_current_mss(sk);
@@ -2111,7 +2361,7 @@
* probing process by not resetting search range to its orignal.
*/
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
- interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
/* Check whether enough time has elaplased for
* another round of probing.
*/
@@ -2139,17 +2389,15 @@
if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
return -1;
- if (tcp_pacing_check(sk))
- return -1;
-
/* We're allowed to probe. Build it now. */
nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
@@ -2215,6 +2463,25 @@
return -1;
}
+static bool tcp_pacing_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_needs_internal_pacing(sk))
+ return false;
+
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+ return false;
+
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
+ hrtimer_start(&tp->pacing_timer,
+ ns_to_ktime(tp->tcp_wstamp_ns),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
+ }
+ return true;
+}
+
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
@@ -2229,13 +2496,28 @@
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
- unsigned int limit;
+ unsigned long limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(u32, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ limit = max_t(unsigned long,
+ 2 * skb->truesize,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ limit = min_t(unsigned long, limit,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+ tcp_sk(sk)->tcp_tx_delay) {
+ u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+ * approximate our needs assuming an ~100% skb->truesize overhead.
+ * USEC_PER_SEC is approximated by 2^20.
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+ */
+ extra_bytes >>= (20 - 1);
+ limit += extra_bytes;
+ }
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
@@ -2341,17 +2623,19 @@
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ tcp_init_tso_segs(skb, mss_now);
+ goto repair; /* Skip network transmission */
+ }
+
if (tcp_pacing_check(sk))
break;
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
-
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
- /* "skb_mstamp" is used as a start point for the retransmit timer */
- tcp_update_skb_after_send(tp, skb);
- goto repair; /* Skip network transmission */
- }
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
@@ -2388,8 +2672,7 @@
nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
- skb, limit, mss_now, gfp)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (tcp_small_queue_check(sk, skb, 0))
@@ -2450,10 +2733,10 @@
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return false;
- early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
/* Schedule a loss probe in 2*RTT for SACK capable connections
* not in loss recovery, that are either limited by cwnd or application.
*/
@@ -2484,8 +2767,7 @@
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
return true;
}
@@ -2666,8 +2948,12 @@
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int allowed_space = tcp_full_space(sk);
- int full_space = min_t(int, tp->window_clamp, allowed_space);
- int window;
+ int full_space, window;
+
+ if (sk_is_mptcp(sk))
+ mptcp_space(sk, &free_space, &allowed_space);
+
+ full_space = min_t(int, tp->window_clamp, allowed_space);
if (unlikely(mss > full_space)) {
mss = full_space;
@@ -2815,7 +3101,7 @@
struct sk_buff *skb = to, *tmp;
bool first = true;
- if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
return;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
@@ -2824,7 +3110,7 @@
if (!tcp_can_collapse(sk, skb))
break;
- if (!tcp_skb_can_collapse_to(to))
+ if (!tcp_skb_can_collapse(to, skb))
break;
space -= skb->len;
@@ -2855,7 +3141,7 @@
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
int diff, len, err;
-
+ int avail_wnd;
/* Inconclusive MTU probe */
if (icsk->icsk_mtup.probe_size)
@@ -2885,23 +3171,31 @@
return -EHOSTUNREACH; /* Routing failure or similar. */
cur_mss = tcp_current_mss(sk);
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
- * our retransmit serves as a zero window probe.
+ * our retransmit of one segment serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
- TCP_SKB_CB(skb)->seq != tp->snd_una)
- return -EAGAIN;
+ if (avail_wnd <= 0) {
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una)
+ return -EAGAIN;
+ avail_wnd = cur_mss;
+ }
len = cur_mss * segs;
+ if (len > avail_wnd) {
+ len = rounddown(avail_wnd, cur_mss);
+ if (!len)
+ len = avail_wnd;
+ }
if (skb->len > len) {
if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
diff = tcp_skb_pcount(skb);
@@ -2909,8 +3203,9 @@
diff -= tcp_skb_pcount(skb);
if (diff)
tcp_adjust_pcount(sk, skb, diff);
- if (skb->len < cur_mss)
- tcp_retrans_try_collapse(sk, skb, cur_mss);
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
+ if (skb->len < avail_wnd)
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
@@ -2935,24 +3230,32 @@
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ if (nskb) {
+ nskb->dev = NULL;
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+ } else {
+ err = -ENOBUFS;
+ }
} tcp_skb_tsorted_restore(skb);
if (!err) {
- tcp_update_skb_after_send(tp, skb);
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
if (likely(!err)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
@@ -2995,6 +3298,7 @@
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
+ bool rearm_timer = false;
u32 max_segs;
int mib_idx;
@@ -3017,7 +3321,7 @@
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
- return;
+ break;
sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too bigs TSO packets
@@ -3042,10 +3346,10 @@
continue;
if (tcp_small_queue_check(sk, skb, 1))
- return;
+ break;
if (tcp_retransmit_skb(sk, skb, segs))
- return;
+ break;
NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
@@ -3054,10 +3358,13 @@
if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ rearm_timer = true;
+
}
+ if (rearm_timer)
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
}
/* We allow to exceed memory limits for FIN packets to expedite
@@ -3069,11 +3376,12 @@
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
- int amt;
+ int delta, amt;
- if (size <= sk->sk_forward_alloc)
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
return;
- amt = sk_mem_pages(size);
+ amt = sk_mem_pages(delta);
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
sk_memory_allocated_add(sk, amt);
@@ -3086,7 +3394,7 @@
*/
void tcp_send_fin(struct sock *sk)
{
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Optimization, tack on the FIN if we have one skb in write queue and
@@ -3094,31 +3402,29 @@
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
+ tskb = tail;
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
if (tskb) {
-coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (tcp_write_queue_empty(sk)) {
+ if (!tail) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
- tp->snd_nxt++;
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
return;
}
} else {
skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
- if (unlikely(!skb)) {
- if (tskb)
- goto coalesce;
+ if (unlikely(!skb))
return;
- }
+
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
@@ -3192,7 +3498,7 @@
tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -3204,18 +3510,20 @@
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3225,6 +3533,7 @@
int tcp_header_size;
struct tcphdr *th;
int mss;
+ u64 now;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
@@ -3256,20 +3565,28 @@
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
+ now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
- if (unlikely(req->cookie_ts))
- skb->skb_mstamp = cookie_init_timestamp(req);
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
else
#endif
- skb->skb_mstamp = tcp_clock_us();
+ {
+ skb->skb_mstamp_ns = now;
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+ }
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc, synack_type) + sizeof(*th);
+ foc, synack_type,
+ syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3291,7 +3608,7 @@
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -3301,8 +3618,12 @@
rcu_read_unlock();
#endif
- /* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp = 0;
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
+ skb->skb_mstamp_ns = now;
+ tcp_add_tx_delay(skb, tp);
+
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -3318,8 +3639,8 @@
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
- module_put(icsk->icsk_ca_ops->owner);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
@@ -3338,7 +3659,7 @@
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
tp->tcp_header_len = sizeof(struct tcphdr);
- if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
@@ -3374,7 +3695,7 @@
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
&rcv_wscale,
rcv_wnd);
@@ -3389,7 +3710,7 @@
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
@@ -3410,7 +3731,7 @@
tcb->end_seq += skb->len;
__skb_header_release(skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
WRITE_ONCE(tp->write_seq, tcb->end_seq);
tp->packets_out += tcp_skb_pcount(skb);
@@ -3425,6 +3746,7 @@
*/
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
int space, err = 0;
@@ -3439,8 +3761,10 @@
* private TCP options. The cost is reduced data space in SYN :(
*/
tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+ /* Sync mss_cache after updating the mss_clamp */
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
space = min_t(size_t, space, fo->size);
@@ -3465,6 +3789,7 @@
skb_trim(syn_data, copied);
space = copied;
}
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
}
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
@@ -3477,7 +3802,7 @@
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp = syn_data->skb_mstamp;
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -3548,11 +3873,11 @@
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
buff = tcp_send_head(sk);
if (unlikely(buff)) {
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
tp->pushed_seq = TCP_SKB_CB(buff)->seq;
}
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
@@ -3578,7 +3903,7 @@
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
- if (icsk->icsk_ack.pingpong ||
+ if (inet_csk_in_pingpong_mode(sk) ||
(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
@@ -3599,16 +3924,15 @@
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- */
- if (icsk->icsk_ack.blocked ||
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ /* If delack timer is about to expire, send ACK now. */
+ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
@@ -3637,10 +3961,15 @@
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long delay;
+
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+ if (delay < TCP_RTO_MAX)
+ icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
return;
}
@@ -3759,7 +4088,7 @@
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- unsigned long probe_max;
+ unsigned long timeout;
int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
@@ -3768,28 +4097,24 @@
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
return;
}
+ icsk->icsk_probes_out++;
if (err <= 0) {
- if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
- icsk->icsk_probes_out++;
- probe_max = TCP_RTO_MAX;
+ timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember icsk_probes_out.
- * Let local senders to fight for local resources.
- *
- * Use accumulated backoff yet.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_probes_out)
- icsk->icsk_probes_out = 1;
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX);
+
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
@@ -3799,10 +4124,11 @@
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
if (unlikely(tcp_passive_fastopen(sk)))
tcp_sk(sk)->total_retrans++;
trace_tcp_retransmit_synack(sk, req);
--
Gitblit v1.6.2