From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/net/ipv4/tcp.c | 1054 +++++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 709 insertions(+), 345 deletions(-)
diff --git a/kernel/net/ipv4/tcp.c b/kernel/net/ipv4/tcp.c
index 4dce1b4..9e3ed6d 100644
--- a/kernel/net/ipv4/tcp.c
+++ b/kernel/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -205,11 +206,6 @@
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
- *
* Description of States:
*
* TCP_SYN_SENT sent a connection request, waiting for ack
@@ -262,7 +258,7 @@
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
@@ -275,6 +271,7 @@
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -282,6 +279,8 @@
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+
+#include <trace/hooks/ipv4.h>
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -320,6 +319,11 @@
*/
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
+
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
void tcp_enter_memory_pressure(struct sock *sk)
{
@@ -416,6 +420,8 @@
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -428,6 +434,7 @@
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -436,38 +443,24 @@
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
-
- sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
sk_sockets_allocated_inc(sk);
sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
-
-void tcp_init_transfer(struct sock *sk, int bpf_op)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- tcp_mtup_init(sk);
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_init_metrics(sk);
- tcp_call_bpf(sk, bpf_op, 0, NULL);
- tcp_init_congestion_control(sk);
- tcp_init_buffer_space(sk);
-}
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
@@ -515,6 +508,7 @@
__poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ u8 shutdown;
int state;
sock_poll_wait(file, sock, wait);
@@ -557,17 +551,18 @@
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* Connected or passive Fast Open socket? */
if (state != TCP_SYN_SENT &&
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
- if (tp->urg_seq == READ_ONCE(tp->copied_seq) &&
+ if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
target++;
@@ -575,8 +570,8 @@
if (tcp_stream_is_readable(tp, target, sk))
mask |= EPOLLIN | EPOLLRDNORM;
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (!(shutdown & SEND_SHUTDOWN)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -588,7 +583,7 @@
* pairs with the input side.
*/
smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
mask |= EPOLLOUT | EPOLLWRNORM;
}
} else
@@ -628,7 +623,8 @@
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == READ_ONCE(tp->copied_seq);
+ answ = tp->urg_data &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
if (sk->sk_state == TCP_LISTEN)
@@ -646,7 +642,8 @@
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = READ_ONCE(tp->write_seq) - tp->snd_nxt;
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
break;
default:
return -ENOIOCTLCMD;
@@ -678,7 +675,7 @@
tcb->sacked = 0;
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
@@ -706,13 +703,13 @@
int size_goal)
{
return skb->len < size_goal &&
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
!tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
-static void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle, int size_goal)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -875,6 +872,18 @@
{
struct sk_buff *skb;
+ if (likely(!size)) {
+ skb = sk->sk_tx_skb_cache;
+ if (skb) {
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ sk->sk_tx_skb_cache = NULL;
+ pskb_trim(skb, 0);
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
+ skb_shinfo(skb)->tx_flags = 0;
+ memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
+ return skb;
+ }
+ }
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
@@ -934,7 +943,7 @@
return max(size_goal, mss_now);
}
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
@@ -969,6 +978,11 @@
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ WARN_ONCE(!sendpage_ok(page),
+ "page must not be a Slab one and have page_count > 0"))
+ return -EINVAL;
+
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
@@ -998,13 +1012,16 @@
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
tcp_rtx_and_write_queues_empty(sk));
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
skb_entail(sk, skb);
copy = size_goal;
}
@@ -1019,7 +1036,7 @@
goto new_segment;
}
if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
@@ -1034,7 +1051,7 @@
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
- sk->sk_wmem_queued += copy;
+ sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
@@ -1060,9 +1077,8 @@
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1120,30 +1136,6 @@
}
EXPORT_SYMBOL(tcp_sendpage);
-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
-{
- if (first_skb)
- return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
- return 0;
-}
-
-static int select_size(bool first_skb, bool zc)
-{
- if (zc)
- return 0;
- return linear_payload_sz(first_skb);
-}
-
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
if (tp->fastopen_req) {
@@ -1153,14 +1145,16 @@
}
static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
- int *copied, size_t size)
+ int *copied, size_t size,
+ struct ubuf_info *uarg)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1173,6 +1167,7 @@
return -ENOBUFS;
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ tp->fastopen_req->uarg = uarg;
if (inet->defer_connect) {
err = tcp_connect(sk);
@@ -1205,18 +1200,14 @@
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
+ int process_backlog = 0;
bool zc = false;
long timeo;
+ trace_android_rvh_tcp_sendmsg_locked(sk, size);
flags = msg->msg_flags;
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
- if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
- err = -EINVAL;
- goto out_err;
- }
-
skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
@@ -1231,7 +1222,7 @@
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
!tp->repair) {
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
@@ -1297,31 +1288,30 @@
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
- int linear;
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- linear = select_size(first_skb, zc);
- skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
+ skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
- process_backlog = true;
+ process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
- * already been sent. skb_mstamp isn't set to
+ * already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
@@ -1345,7 +1335,7 @@
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
- goto wait_for_memory;
+ goto wait_for_space;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
@@ -1359,7 +1349,7 @@
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
@@ -1378,6 +1368,9 @@
}
pfrag->offset += copy;
} else {
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_space;
+
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
@@ -1412,9 +1405,8 @@
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1443,7 +1435,7 @@
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg);
+ sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
@@ -1546,7 +1538,7 @@
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
@@ -1559,10 +1551,8 @@
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- /* Delayed ACKs frequently hit locked sockets during bulk
- * receive. */
- if (icsk->icsk_ack.blocked ||
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
@@ -1573,7 +1563,7 @@
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
- !icsk->icsk_ack.pingpong)) &&
+ !inet_csk_in_pingpong_mode(sk))) &&
!atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = true;
}
@@ -1669,11 +1659,13 @@
if (!copied)
copied = used;
break;
- } else if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
}
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
/* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
@@ -1725,9 +1717,9 @@
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
val = min(val, cap);
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
/* Check if we need to signal EPOLLIN right now */
tcp_data_ready(sk);
@@ -1737,7 +1729,7 @@
val <<= 1;
if (val > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = val;
+ WRITE_ONCE(sk->sk_rcvbuf, val);
tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
}
return 0;
@@ -1755,7 +1747,7 @@
return -EPERM;
vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
- /* Instruct vm_insert_page() to not down_read(mmap_sem) */
+ /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
vma->vm_flags |= VM_MIXEDMAP;
vma->vm_ops = &tcp_vm_ops;
@@ -1763,16 +1755,126 @@
}
EXPORT_SYMBOL(tcp_mmap);
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+ u32 *offset_frag)
+{
+ skb_frag_t *frag;
+
+ if (unlikely(offset_skb >= skb->len))
+ return NULL;
+
+ offset_skb -= skb_headlen(skb);
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+ return NULL;
+
+ frag = skb_shinfo(skb)->frags;
+ while (offset_skb) {
+ if (skb_frag_size(frag) > offset_skb) {
+ *offset_frag = offset_skb;
+ return frag;
+ }
+ offset_skb -= skb_frag_size(frag);
+ ++frag;
+ }
+ *offset_frag = 0;
+ return frag;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 copylen,
+ u32 *offset, u32 *seq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ struct iovec iov;
+ int err;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_single_range(READ, (void __user *)copy_address,
+ copylen, &iov, &msg.msg_iter);
+ if (err)
+ return err;
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+ if (err)
+ return err;
+ zc->recv_skip_hint -= copylen;
+ *offset += copylen;
+ *seq += copylen;
+ return (__s32)copylen;
+}
+
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len)
+{
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+ if (!copylen)
+ return 0;
+ /* skb is null if inq < PAGE_SIZE. */
+ if (skb)
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ else
+ skb = tcp_recv_skb(sk, *seq, &offset);
+
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+ seq);
+ return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+ struct page **pages,
+ unsigned long pages_to_map,
+ unsigned long *insert_addr,
+ u32 *length_with_pending,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long pages_remaining = pages_to_map;
+ int bytes_mapped;
+ int ret;
+
+ ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
+ * mapping (some but not all of the pages).
+ */
+ *seq += bytes_mapped;
+ *insert_addr += bytes_mapped;
+ if (ret) {
+ /* But if vm_insert_pages did fail, we have to unroll some state
+ * we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+ *length_with_pending -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return ret;
+}
+
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
+ u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
unsigned long address = (unsigned long)zc->address;
+ s32 copybuf_len = zc->copybuf_len;
+ struct tcp_sock *tp = tcp_sk(sk);
+ #define PAGE_BATCH_SIZE 8
+ struct page *pages[PAGE_BATCH_SIZE];
const skb_frag_t *frags = NULL;
- u32 length = 0, seq, offset;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
- struct tcp_sock *tp;
+ unsigned long pg_idx = 0;
+ unsigned long curr_addr;
+ u32 seq = tp->copied_seq;
+ int inq = tcp_inq(sk);
int ret;
+
+ zc->copybuf_len = 0;
if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
@@ -1782,65 +1884,98 @@
sock_rps_record_flow(sk);
- down_read(¤t->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = find_vma(current->mm, address);
if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
- up_read(¤t->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
return -EINVAL;
}
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
-
- tp = tcp_sk(sk);
- seq = tp->copied_seq;
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
- zc->length &= ~(PAGE_SIZE - 1);
-
- zap_page_range(vma, address, zc->length);
-
- zc->recv_skip_hint = 0;
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+ avail_len = min_t(u32, vma_len, inq);
+ aligned_len = avail_len & ~(PAGE_SIZE - 1);
+ if (aligned_len) {
+ zap_page_range(vma, address, aligned_len);
+ zc->length = aligned_len;
+ zc->recv_skip_hint = 0;
+ } else {
+ zc->length = avail_len;
+ zc->recv_skip_hint = avail_len;
+ }
ret = 0;
+ curr_addr = address;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
+ u32 offset_frag;
+
+ /* If we're here, finish the current batch. */
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pg_idx,
+ &curr_addr,
+ &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
if (skb) {
+ if (zc->recv_skip_hint > 0)
+ break;
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
} else {
skb = tcp_recv_skb(sk, seq, &offset);
}
-
zc->recv_skip_hint = skb->len - offset;
- offset -= skb_headlen(skb);
- if ((int)offset < 0 || skb_has_frag_list(skb))
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
+ if (!frags || offset_frag)
break;
- frags = skb_shinfo(skb)->frags;
- while (offset) {
- if (frags->size > offset)
- goto out;
- offset -= frags->size;
+ }
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
+ int remaining = zc->recv_skip_hint;
+
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+ skb_frag_off(frags))) {
+ remaining -= skb_frag_size(frags);
frags++;
}
+ zc->recv_skip_hint -= remaining;
+ break;
}
- if (frags->size != PAGE_SIZE || frags->page_offset)
- break;
- ret = vm_insert_page(vma, address + length,
- skb_frag_page(frags));
- if (ret)
- break;
+ pages[pg_idx] = skb_frag_page(frags);
+ pg_idx++;
length += PAGE_SIZE;
- seq += PAGE_SIZE;
zc->recv_skip_hint -= PAGE_SIZE;
frags++;
+ if (pg_idx == PAGE_BATCH_SIZE) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
+ }
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length, &seq,
+ zc);
}
out:
- up_read(¤t->mm->mmap_sem);
- if (length) {
+ mmap_read_unlock(current->mm);
+ /* Try to copy straggler data. */
+ if (!ret)
+ copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
+ copybuf_len);
+
+ if (length + copylen) {
WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
tcp_recv_skb(sk, seq, &offset);
- tcp_cleanup_rbuf(sk, length);
+ tcp_cleanup_rbuf(sk, length + copylen);
ret = 0;
if (length == zc->length)
zc->recv_skip_hint = 0;
@@ -1854,57 +1989,82 @@
#endif
static void tcp_update_recv_tstamps(struct sk_buff *skb,
- struct scm_timestamping *tss)
+ struct scm_timestamping_internal *tss)
{
if (skb->tstamp)
- tss->ts[0] = ktime_to_timespec(skb->tstamp);
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
else
- tss->ts[0] = (struct timespec) {0};
+ tss->ts[0] = (struct timespec64) {0};
if (skb_hwtstamps(skb)->hwtstamp)
- tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
else
- tss->ts[2] = (struct timespec) {0};
+ tss->ts[2] = (struct timespec64) {0};
}
/* Similar to __sock_recv_timestamp, but does not require an skb */
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
- struct scm_timestamping *tss)
+ struct scm_timestamping_internal *tss)
{
- struct timeval tv;
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
bool has_timestamping = false;
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
- sizeof(tss->ts[0]), &tss->ts[0]);
+ if (new_tstamp) {
+ struct __kernel_timespec kts = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+ sizeof(kts), &kts);
+ } else {
+ struct __kernel_old_timespec ts_old = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+ sizeof(ts_old), &ts_old);
+ }
} else {
- tv.tv_sec = tss->ts[0].tv_sec;
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
-
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
- sizeof(tv), &tv);
+ if (new_tstamp) {
+ struct __kernel_sock_timeval stv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+ sizeof(stv), &stv);
+ } else {
+ struct __kernel_old_timeval tv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+ sizeof(tv), &tv);
+ }
}
}
if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
has_timestamping = true;
else
- tss->ts[0] = (struct timespec) {0};
+ tss->ts[0] = (struct timespec64) {0};
}
if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
has_timestamping = true;
else
- tss->ts[2] = (struct timespec) {0};
+ tss->ts[2] = (struct timespec64) {0};
}
if (has_timestamping) {
- tss->ts[1] = (struct timespec) {0};
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
- sizeof(*tss), tss);
+ tss->ts[1] = (struct timespec64) {0};
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
+ put_cmsg_scm_timestamping64(msg, tss);
+ else
+ put_cmsg_scm_timestamping(msg, tss);
}
}
@@ -1950,12 +2110,12 @@
long timeo;
struct sk_buff *skb, *last;
u32 urg_hole = 0;
- struct scm_timestamping tss;
- bool has_tss = false;
- bool has_cmsg;
+ struct scm_timestamping_internal tss;
+ int cmsg_flags;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
+ trace_android_rvh_tcp_recvmsg(sk);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
@@ -1967,7 +2127,7 @@
if (sk->sk_state == TCP_LISTEN)
goto out;
- has_cmsg = tp->recvmsg_inq;
+ cmsg_flags = tp->recvmsg_inq ? 1 : 0;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2100,7 +2260,7 @@
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
@@ -2148,8 +2308,7 @@
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
- has_tss = true;
- has_cmsg = true;
+ cmsg_flags |= 2;
}
if (used + offset < skb->len)
@@ -2161,7 +2320,7 @@
sk_eat_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
@@ -2169,6 +2328,7 @@
break;
} while (len > 0);
+ trace_android_rvh_tcp_recvmsg_stat(sk, copied);
/* According to UNIX98, msg_name/msg_namelen are ignored
* on connected socket. I was just happy when found this 8) --ANK
*/
@@ -2178,10 +2338,10 @@
release_sock(sk);
- if (has_cmsg) {
- if (has_tss)
+ if (cmsg_flags) {
+ if (cmsg_flags & 2)
tcp_recv_timestamp(msg, sk, &tss);
- if (tp->recvmsg_inq) {
+ if (cmsg_flags & 1) {
inq = tcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
@@ -2245,7 +2405,7 @@
if (inet_csk(sk)->icsk_bind_hash &&
!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
inet_put_port(sk);
- /* fall through */
+ fallthrough;
default:
if (oldstate == TCP_ESTABLISHED)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
@@ -2255,10 +2415,6 @@
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -2335,14 +2491,13 @@
return too_many_orphans || out_of_socket_memory;
}
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
int state;
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -2488,7 +2643,10 @@
}
if (sk->sk_state == TCP_CLOSE) {
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
/* We could get here with a non-NULL req if the socket is
* aborted (e.g., closed with unread data) before 3WHS
* finishes.
@@ -2502,6 +2660,12 @@
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
release_sock(sk);
sock_put(sk);
}
@@ -2543,6 +2707,11 @@
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
+ skb = sk->sk_tx_skb_cache;
+ if (skb) {
+ __kfree_skb(skb);
+ sk->sk_tx_skb_cache = NULL;
+ }
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2579,6 +2748,10 @@
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_rx_skb_cache) {
+ __kfree_skb(sk->sk_rx_skb_cache);
+ sk->sk_rx_skb_cache = NULL;
+ }
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
@@ -2590,9 +2763,10 @@
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rcv_rtt_last_tsecr = 0;
seq = tp->write_seq + tp->max_window + 2;
@@ -2600,16 +2774,24 @@
seq = 1;
WRITE_ONCE(tp->write_seq, seq);
- tp->snd_cwnd = 2;
+ icsk->icsk_backoff = 0;
icsk->icsk_probes_out = 0;
+ icsk->icsk_probes_tstamp = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
+ tp->is_cwnd_limited = 0;
+ tp->max_packets_out = 0;
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ icsk->icsk_ca_initialized = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
@@ -2621,8 +2803,7 @@
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- dst_release(sk->sk_rx_dst);
- sk->sk_rx_dst = NULL;
+ dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->segs_in = 0;
@@ -2633,12 +2814,34 @@
tp->bytes_retrans = 0;
tp->data_segs_in = 0;
tp->data_segs_out = 0;
+ tp->duplicate_sack[0].start_seq = 0;
+ tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
tp->reord_seen = 0;
+ tp->retrans_out = 0;
+ tp->sacked_out = 0;
+ tp->tlp_high_seq = 0;
+ tp->last_oow_ack_time = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
+ tp->rack.mstamp = 0;
+ tp->rack.advanced = 0;
+ tp->rack.reo_wnd_steps = 1;
+ tp->rack.last_delivered = 0;
+ tp->rack.reo_wnd_persist = 0;
+ tp->rack.dsack_seen = 0;
+ tp->syn_data_acked = 0;
+ tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
+
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
inet->defer_connect = 0;
+ tp->fastopen_client_fail = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2659,7 +2862,7 @@
(sk->sk_state != TCP_LISTEN);
}
-static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
struct tcp_repair_window opt;
@@ -2669,7 +2872,7 @@
if (len != sizeof(opt))
return -EINVAL;
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
return -EFAULT;
if (opt.max_window < opt.snd_wnd)
@@ -2691,17 +2894,18 @@
return 0;
}
-static int tcp_repair_options_est(struct sock *sk,
- struct tcp_repair_opt __user *optbuf, unsigned int len)
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
+ unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_repair_opt opt;
+ size_t offset = 0;
while (len >= sizeof(opt)) {
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
return -EFAULT;
- optbuf++;
+ offset += sizeof(opt);
len -= sizeof(opt);
switch (opt.opt_code) {
@@ -2740,11 +2944,185 @@
return 0;
}
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+}
+
+/* When set indicates to always queue non-full frames. Later the user clears
+ * this option and we transmit any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly filled frames when the
+ * user (for example) must write out headers with a write() call first and then
+ * use sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
+ * TCP_NODELAY.
+ */
+static void __tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (on) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle & TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ }
+}
+
+void tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ lock_sock(sk);
+ __tcp_sock_set_cork(sk, on);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_cork);
+
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
+ * remembered, but it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
+ * even TCP_CORK for currently queued segments.
+ */
+static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+ if (on) {
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ } else {
+ tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
+ }
+}
+
+void tcp_sock_set_nodelay(struct sock *sk)
+{
+ lock_sock(sk);
+ __tcp_sock_set_nodelay(sk, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
+
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ if (!val) {
+ inet_csk_enter_pingpong_mode(sk);
+ return;
+ }
+
+ inet_csk_exit_pingpong_mode(sk);
+ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ tcp_cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ inet_csk_enter_pingpong_mode(sk);
+ }
+}
+
+void tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __tcp_sock_set_quickack(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_quickack);
+
+int tcp_sock_set_syncnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet_csk(sk)->icsk_syn_retries = val;
+ release_sock(sk);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
+
+void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
+{
+ lock_sock(sk);
+ WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
+
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ return -EINVAL;
+
+ /* Paired with WRITE_ONCE() in keepalive_time_when() */
+ WRITE_ONCE(tp->keepalive_time, val * HZ);
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+ u32 elapsed = keepalive_time_elapsed(tp);
+
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ inet_csk_reset_keepalive_timer(sk, elapsed);
+ }
+
+ return 0;
+}
+
+int tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+ int err;
+
+ lock_sock(sk);
+ err = tcp_sock_set_keepidle_locked(sk, val);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
+
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ return -EINVAL;
+
+ lock_sock(sk);
+ WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
+ release_sock(sk);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
+
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ return -EINVAL;
+
+ lock_sock(sk);
+ /* Paired with READ_ONCE() in keepalive_probes() */
+ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
+ release_sock(sk);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+
/*
* Socket option code for TCP.
*/
-static int do_tcp_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, unsigned int optlen)
+static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2760,14 +3138,14 @@
if (optlen < 1)
return -EINVAL;
- val = strncpy_from_user(name, optval,
+ val = strncpy_from_sockptr(name, optval,
min_t(long, TCP_CA_NAME_MAX-1, optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true, true,
+ err = tcp_set_congestion_control(sk, name, true,
ns_capable(sock_net(sk)->user_ns,
CAP_NET_ADMIN));
release_sock(sk);
@@ -2779,7 +3157,7 @@
if (optlen < 1)
return -EINVAL;
- val = strncpy_from_user(name, optval,
+ val = strncpy_from_sockptr(name, optval,
min_t(long, TCP_ULP_NAME_MAX - 1,
optlen));
if (val < 0)
@@ -2792,15 +3170,23 @@
return err;
}
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
- if (optlen != sizeof(key))
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
return -EINVAL;
- if (copy_from_user(key, optval, optlen))
+ if (copy_from_sockptr(key, optval, optlen))
return -EFAULT;
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
}
default:
/* fallthru */
@@ -2810,7 +3196,7 @@
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
lock_sock(sk);
@@ -2829,20 +3215,7 @@
break;
case TCP_NODELAY:
- if (val) {
- /* TCP_NODELAY is weaker than TCP_CORK, so that
- * this option on corked socket is remembered, but
- * it is not activated until cork is cleared.
- *
- * However, when TCP_NODELAY is set we make
- * an explicit push, which overrides even TCP_CORK
- * for currently queued segments.
- */
- tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk);
- } else {
- tp->nonagle &= ~TCP_NAGLE_OFF;
- }
+ __tcp_sock_set_nodelay(sk, val);
break;
case TCP_THIN_LINEAR_TIMEOUTS:
@@ -2908,64 +3281,30 @@
case TCP_REPAIR_OPTIONS:
if (!tp->repair)
err = -EINVAL;
- else if (sk->sk_state == TCP_ESTABLISHED)
- err = tcp_repair_options_est(sk,
- (struct tcp_repair_opt __user *)optval,
- optlen);
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
+ err = tcp_repair_options_est(sk, optval, optlen);
else
err = -EPERM;
break;
case TCP_CORK:
- /* When set indicates to always queue non-full frames.
- * Later the user clears this option and we transmit
- * any pending partial frames in the queue. This is
- * meant to be used alongside sendfile() to get properly
- * filled frames when the user (for example) must write
- * out headers with a write() call first and then use
- * sendfile to send out the data parts.
- *
- * TCP_CORK can be set together with TCP_NODELAY and it is
- * stronger than TCP_NODELAY.
- */
- if (val) {
- tp->nonagle |= TCP_NAGLE_CORK;
- } else {
- tp->nonagle &= ~TCP_NAGLE_CORK;
- if (tp->nonagle&TCP_NAGLE_OFF)
- tp->nonagle |= TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk);
- }
+ __tcp_sock_set_cork(sk, val);
break;
case TCP_KEEPIDLE:
- if (val < 1 || val > MAX_TCP_KEEPIDLE)
- err = -EINVAL;
- else {
- tp->keepalive_time = val * HZ;
- if (sock_flag(sk, SOCK_KEEPOPEN) &&
- !((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN))) {
- u32 elapsed = keepalive_time_elapsed(tp);
- if (tp->keepalive_time > elapsed)
- elapsed = tp->keepalive_time - elapsed;
- else
- elapsed = 0;
- inet_csk_reset_keepalive_timer(sk, elapsed);
- }
- }
+ err = tcp_sock_set_keepidle_locked(sk, val);
break;
case TCP_KEEPINTVL:
if (val < 1 || val > MAX_TCP_KEEPINTVL)
err = -EINVAL;
else
- tp->keepalive_intvl = val * HZ;
+ WRITE_ONCE(tp->keepalive_intvl, val * HZ);
break;
case TCP_KEEPCNT:
if (val < 1 || val > MAX_TCP_KEEPCNT)
err = -EINVAL;
else
- tp->keepalive_probes = val;
+ WRITE_ONCE(tp->keepalive_probes, val);
break;
case TCP_SYNCNT:
if (val < 1 || val > MAX_TCP_SYNCNT)
@@ -2975,7 +3314,8 @@
break;
case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
err = -EINVAL;
else
tp->save_syn = val;
@@ -2983,18 +3323,18 @@
case TCP_LINGER2:
if (val < 0)
- tp->linger2 = -1;
- else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
- tp->linger2 = 0;
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
- tp->linger2 = val * HZ;
+ WRITE_ONCE(tp->linger2, val * HZ);
break;
case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
- icsk->icsk_accept_queue.rskq_defer_accept =
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
- TCP_RTO_MAX / HZ);
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
break;
case TCP_WINDOW_CLAMP:
@@ -3010,19 +3350,7 @@
break;
case TCP_QUICKACK:
- if (!val) {
- icsk->icsk_ack.pingpong = 1;
- } else {
- icsk->icsk_ack.pingpong = 0;
- if ((1 << sk->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- inet_csk_ack_scheduled(sk)) {
- icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
- tcp_cleanup_rbuf(sk, 1);
- if (!(val & 1))
- icsk->icsk_ack.pingpong = 1;
- }
- }
+ __tcp_sock_set_quickack(sk, val);
break;
#ifdef CONFIG_TCP_MD5SIG
@@ -3038,7 +3366,7 @@
if (val < 0)
err = -EINVAL;
else
- icsk->icsk_user_timeout = val;
+ WRITE_ONCE(icsk->icsk_user_timeout, val);
break;
case TCP_FASTOPEN:
@@ -3054,7 +3382,8 @@
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
@@ -3081,7 +3410,7 @@
err = tcp_repair_set_window(tp, optval, optlen);
break;
case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
+ WRITE_ONCE(tp->notsent_lowat, val);
sk->sk_write_space(sk);
break;
case TCP_INQ:
@@ -3089,6 +3418,11 @@
err = -EINVAL;
else
tp->recvmsg_inq = val;
+ break;
+ case TCP_TX_DELAY:
+ if (val)
+ tcp_enable_tx_delay();
+ WRITE_ONCE(tp->tcp_tx_delay, val);
break;
default:
err = -ENOPROTOOPT;
@@ -3099,7 +3433,7 @@
return err;
}
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3110,18 +3444,6 @@
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level != SOL_TCP)
- return inet_csk_compat_setsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_tcp_setsockopt);
-#endif
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
struct tcp_info *info)
@@ -3147,10 +3469,10 @@
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long rate;
u32 now;
u64 rate64;
bool slow;
- u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
@@ -3160,11 +3482,11 @@
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_pacing_rate = rate64;
rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
@@ -3175,8 +3497,8 @@
* tcpi_unacked -> Number of children ready for accept()
* tcpi_sacked -> max backlog
*/
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
return;
}
@@ -3254,6 +3576,9 @@
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3282,16 +3607,21 @@
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
0;
}
-struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
+ const struct sk_buff *orig_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ unsigned long rate;
u64 rate64;
- u32 rate;
stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
@@ -3310,7 +3640,7 @@
tp->total_retrans, TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
rate64 = tcp_compute_delivery_rate(tp);
@@ -3335,6 +3665,12 @@
TCP_NLA_PAD);
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
+ TCP_NLA_PAD);
return stats;
}
@@ -3358,7 +3694,8 @@
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ if (tp->rx_opt.user_mss &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
@@ -3382,13 +3719,14 @@
val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
- val = tp->linger2;
+ val = READ_ONCE(tp->linger2);
if (val >= 0)
- val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
- TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -3429,7 +3767,7 @@
return 0;
}
case TCP_QUICKACK:
- val = !icsk->icsk_ack.pingpong;
+ val = !inet_csk_in_pingpong_mode(sk);
break;
case TCP_CONGESTION:
@@ -3458,21 +3796,15 @@
return 0;
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
- struct tcp_fastopen_context *ctx;
+ u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
+ unsigned int key_len;
if (get_user(len, optlen))
return -EFAULT;
- rcu_read_lock();
- ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
- if (ctx)
- memcpy(key, ctx->key, sizeof(key));
- else
- len = 0;
- rcu_read_unlock();
-
- len = min_t(unsigned int, len, sizeof(key));
+ key_len = tcp_fastopen_get_cipher(net, icsk, key) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ len = min_t(unsigned int, len, key_len);
if (put_user(len, optlen))
return -EFAULT;
if (copy_to_user(optval, key, len))
@@ -3530,11 +3862,11 @@
break;
case TCP_USER_TIMEOUT:
- val = icsk->icsk_user_timeout;
+ val = READ_ONCE(icsk->icsk_user_timeout);
break;
case TCP_FASTOPEN:
- val = icsk->icsk_accept_queue.fastopenq.max_qlen;
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
break;
case TCP_FASTOPEN_CONNECT:
@@ -3545,11 +3877,15 @@
val = tp->fastopen_no_cookie;
break;
+ case TCP_TX_DELAY:
+ val = READ_ONCE(tp->tcp_tx_delay);
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
case TCP_NOTSENT_LOWAT:
- val = tp->notsent_lowat;
+ val = READ_ONCE(tp->notsent_lowat);
break;
case TCP_INQ:
val = tp->recvmsg_inq;
@@ -3563,20 +3899,21 @@
lock_sock(sk);
if (tp->saved_syn) {
- if (len < tp->saved_syn[0]) {
- if (put_user(tp->saved_syn[0], optlen)) {
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
+ optlen)) {
release_sock(sk);
return -EFAULT;
}
release_sock(sk);
return -EINVAL;
}
- len = tp->saved_syn[0];
+ len = tcp_saved_syn_len(tp->saved_syn);
if (put_user(len, optlen)) {
release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
release_sock(sk);
return -EFAULT;
}
@@ -3592,18 +3929,41 @@
}
#ifdef CONFIG_MMU
case TCP_ZEROCOPY_RECEIVE: {
- struct tcp_zerocopy_receive zc;
+ struct tcp_zerocopy_receive zc = {};
int err;
if (get_user(len, optlen))
return -EFAULT;
- if (len != sizeof(zc))
+ if (len < 0 ||
+ len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
+ if (len > sizeof(zc)) {
+ len = sizeof(zc);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
if (copy_from_user(&zc, optval, len))
return -EFAULT;
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
release_sock(sk);
+ if (len >= offsetofend(struct tcp_zerocopy_receive, err))
+ goto zerocopy_rcv_sk_err;
+ switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, err):
+ goto zerocopy_rcv_sk_err;
+ case offsetofend(struct tcp_zerocopy_receive, inq):
+ goto zerocopy_rcv_inq;
+ case offsetofend(struct tcp_zerocopy_receive, length):
+ default:
+ goto zerocopy_rcv_out;
+ }
+zerocopy_rcv_sk_err:
+ if (!err)
+ zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+ zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
if (!err && copy_to_user(optval, &zc, len))
err = -EFAULT;
return err;
@@ -3631,18 +3991,6 @@
return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_TCP)
- return inet_csk_compat_getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_tcp_getsockopt);
-#endif
#ifdef CONFIG_TCP_MD5SIG
static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
@@ -3686,20 +4034,28 @@
* to memory. See smp_rmb() in tcp_get_md5sig_pool()
*/
smp_wmb();
- tcp_md5sig_pool_populated = true;
+ /* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
+ * and tcp_get_md5sig_pool().
+ */
+ WRITE_ONCE(tcp_md5sig_pool_populated, true);
}
bool tcp_alloc_md5sig_pool(void)
{
- if (unlikely(!tcp_md5sig_pool_populated)) {
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
mutex_lock(&tcp_md5sig_mutex);
- if (!tcp_md5sig_pool_populated)
+ if (!tcp_md5sig_pool_populated) {
__tcp_alloc_md5sig_pool();
+ if (tcp_md5sig_pool_populated)
+ static_branch_inc(&tcp_md5_needed);
+ }
mutex_unlock(&tcp_md5sig_mutex);
}
- return tcp_md5sig_pool_populated;
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ return READ_ONCE(tcp_md5sig_pool_populated);
}
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -3715,7 +4071,8 @@
{
local_bh_disable();
- if (tcp_md5sig_pool_populated) {
+ /* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
+ if (READ_ONCE(tcp_md5sig_pool_populated)) {
/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
smp_rmb();
return this_cpu_ptr(&tcp_md5sig_pool);
@@ -3745,8 +4102,8 @@
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
sg_set_page(&sg, page, skb_frag_size(f),
@@ -3772,8 +4129,8 @@
sg_init_one(&sg, key->key, keylen);
ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
- /* tcp_md5_do_add() might change key->key under us */
- return crypto_ahash_update(hp->md5_req);
+ /* We use data_race() because tcp_md5_do_add() might change key->key under us */
+ return data_race(crypto_ahash_update(hp->md5_req));
}
EXPORT_SYMBOL(tcp_md5_hash_key);
@@ -3781,7 +4138,13 @@
void tcp_done(struct sock *sk)
{
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ /* We might be called with a new socket, after
+ * inet_csk_prepare_forced_close() has been called
+ * so we can not use lockdep_sock_is_held(sk)
+ */
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
@@ -3791,7 +4154,7 @@
if (req)
reqsk_fastopen_remove(sk, req, false);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_state_change(sk);
@@ -3880,7 +4243,7 @@
BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
+ sizeof_field(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
@@ -3954,4 +4317,5 @@
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
+ mptcp_init();
}
--
Gitblit v1.6.2