From 6778948f9de86c3cfaf36725a7c87dcff9ba247f Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 08:20:59 +0000
Subject: [PATCH] kernel_5.10: update include/net/sock.h (non-PREEMPT_RT)

Bring include/net/sock.h up to the 5.10 upstream API: switch the proto
setsockopt hook to sockptr_t and drop the compat_{set,get}sockopt
methods, add the sk_user_data flag-bit helpers, annotate lockless field
accesses with READ_ONCE()/WRITE_ONCE(), and remove the
CONFIG_PREEMPT_RT conditionals so sk_peer_lock stays in
ANDROID_KABI_RESERVE(1) on non-debug builds.
---
kernel/include/net/sock.h | 484 ++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 343 insertions(+), 141 deletions(-)
diff --git a/kernel/include/net/sock.h b/kernel/include/net/sock.h
index c080fc9..c604052 100644
--- a/kernel/include/net/sock.h
+++ b/kernel/include/net/sock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -30,12 +31,6 @@
* respective headers and ipv4/v6, etc now
* use private slabcaches for its socks
* Pedro Hortas : New flags field for socket options
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _SOCK_H
#define _SOCK_H
@@ -64,6 +59,7 @@
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
+#include <linux/sockptr.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
@@ -71,9 +67,9 @@
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
-#include <net/smc.h>
#include <net/l3mdev.h>
#include <linux/android_kabi.h>
+#include <linux/android_vendor.h>
/*
* This structure really needs to be cleaned up.
@@ -124,19 +120,26 @@
* struct sock_common - minimal network layer representation of sockets
* @skc_daddr: Foreign IPv4 addr
* @skc_rcv_saddr: Bound local IPv4 addr
+ * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
* @skc_hash: hash value used with various protocol lookup tables
* @skc_u16hashes: two u16 hash values used by UDP lookup tables
* @skc_dport: placeholder for inet_dport/tw_dport
* @skc_num: placeholder for inet_num/tw_num
+ * @skc_portpair: __u32 union of @skc_dport & @skc_num
* @skc_family: network address family
* @skc_state: Connection state
* @skc_reuse: %SO_REUSEADDR setting
* @skc_reuseport: %SO_REUSEPORT setting
+ * @skc_ipv6only: socket is IPV6 only
+ * @skc_net_refcnt: socket is using net ref counting
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
* @skc_prot: protocol handlers inside a network family
* @skc_net: reference to the network namespace of this socket
+ * @skc_v6_daddr: IPV6 destination address
+ * @skc_v6_rcv_saddr: IPV6 source address
+ * @skc_cookie: socket's cookie value
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
* @skc_tx_queue_mapping: tx queue number for this connection
@@ -144,16 +147,21 @@
* @skc_flags: place holder for sk_flags
* %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
+ * @skc_listener: connection request listener socket (aka rsk_listener)
+ * [union with @skc_flags]
+ * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
+ * [union with @skc_flags]
* @skc_incoming_cpu: record/match cpu processing incoming packets
+ * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
+ * [union with @skc_incoming_cpu]
+ * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
+ * [union with @skc_incoming_cpu]
* @skc_refcnt: reference count
*
* This is the minimal network layer representation of sockets, the header
* for struct sock and struct inet_timewait_sock.
*/
struct sock_common {
- /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
- * address on 64bit arches : cf INET_MATCH()
- */
union {
__addrpair skc_addrpair;
struct {
@@ -237,6 +245,8 @@
/* public: */
};
+struct bpf_local_storage;
+
/**
* struct sock - network layer representation of sockets
* @__sk_common: shared layout with inet_timewait_sock
@@ -250,6 +260,7 @@
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
+ * @sk_rx_skb_cache: cache copy of recently accessed RX skb
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -270,6 +281,8 @@
* @sk_no_check_rx: allow zero checksum in RX packets
* @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
* @sk_route_nocaps: forbidden route capabilities (e.g. %NETIF_F_GSO_MASK)
+ * @sk_route_forced_caps: static, forced route capabilities
+ * (set in tcp_init_sock())
* @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
* @sk_gso_max_size: Maximum GSO segment size to build
* @sk_gso_max_segs: Maximum number of GSO segments
@@ -308,6 +321,8 @@
* @sk_frag: cached page frag
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
+ * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
+ * @sk_tx_skb_cache: cache copy of recently accessed TX skb
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
@@ -318,11 +333,14 @@
* @sk_write_space: callback to indicate there is buffer sending space available
* @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
* @sk_backlog_rcv: callback to process the backlog
+ * @sk_validate_xmit_skb: ptr to an optional validate function
* @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
* @sk_reuseport_cb: reuseport group container
+ * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
+ * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
*/
struct sock {
@@ -369,6 +387,7 @@
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
+ struct sk_buff *sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -397,12 +416,14 @@
struct sk_filter __rcu *sk_filter;
union {
struct socket_wq __rcu *sk_wq;
+ /* private: */
struct socket_wq *sk_wq_raw;
+ /* public: */
};
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
- struct dst_entry *sk_rx_dst;
+ struct dst_entry __rcu *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
@@ -415,6 +436,7 @@
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
+ struct sk_buff *sk_tx_skb_cache;
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
@@ -424,8 +446,8 @@
struct timer_list sk_timer;
__u32 sk_priority;
__u32 sk_mark;
- u32 sk_pacing_rate; /* bytes per second */
- u32 sk_max_pacing_rate;
+ unsigned long sk_pacing_rate; /* bytes per second */
+ unsigned long sk_max_pacing_rate;
struct page_frag sk_frag;
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
@@ -439,31 +461,15 @@
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
- unsigned int __sk_flags_offset[0];
-#ifdef __BIG_ENDIAN_BITFIELD
-#define SK_FL_PROTO_SHIFT 16
-#define SK_FL_PROTO_MASK 0x00ff0000
-
-#define SK_FL_TYPE_SHIFT 0
-#define SK_FL_TYPE_MASK 0x0000ffff
-#else
-#define SK_FL_PROTO_SHIFT 8
-#define SK_FL_PROTO_MASK 0x0000ff00
-
-#define SK_FL_TYPE_SHIFT 16
-#define SK_FL_TYPE_MASK 0xffff0000
-#endif
-
- unsigned int sk_padding : 1,
+ u8 sk_padding : 1,
sk_kern_sock : 1,
sk_no_check_tx : 1,
sk_no_check_rx : 1,
- sk_userlocks : 4,
- sk_protocol : 8,
- sk_type : 16;
-#define SK_PROTOCOL_MAX U8_MAX
- u16 sk_gso_max_segs;
+ sk_userlocks : 4;
u8 sk_pacing_shift;
+ u16 sk_type;
+ u16 sk_protocol;
+ u16 sk_gso_max_segs;
unsigned long sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
@@ -472,7 +478,7 @@
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
-#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT)
+#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC)
spinlock_t sk_peer_lock;
#else
/* sk_peer_lock is in the ANDROID_KABI_RESERVE(1) field below */
@@ -515,9 +521,12 @@
#endif
void (*sk_destruct)(struct sock *sk);
struct sock_reuseport __rcu *sk_reuseport_cb;
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_local_storage __rcu *sk_bpf_storage;
+#endif
struct rcu_head sk_rcu;
-#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT)
+#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC)
ANDROID_KABI_RESERVE(1);
#else
ANDROID_KABI_USE(1, spinlock_t sk_peer_lock);
@@ -529,6 +538,8 @@
ANDROID_KABI_RESERVE(6);
ANDROID_KABI_RESERVE(7);
ANDROID_KABI_RESERVE(8);
+
+ ANDROID_OEM_DATA(1);
};
enum sk_pacing {
@@ -537,10 +548,72 @@
SK_PACING_FQ = 2,
};
+/* flag bits in sk_user_data
+ *
+ * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might
+ * not be suitable for copying when cloning the socket. For instance,
+ * it can point to a reference counted object. sk_user_data bottom
+ * bit is set if pointer must not be copied.
+ *
+ * - SK_USER_DATA_BPF: Mark whether sk_user_data field is
+ * managed/owned by a BPF reuseport array. This bit should be set
+ * when sk_user_data's sk is added to the bpf's reuseport_array.
+ *
+ * - SK_USER_DATA_PSOCK: Mark whether pointer stored in
+ * sk_user_data points to psock type. This bit should be set
+ * when sk_user_data is assigned to a psock object.
+ */
+#define SK_USER_DATA_NOCOPY 1UL
+#define SK_USER_DATA_BPF 2UL
+#define SK_USER_DATA_PSOCK 4UL
+#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+ SK_USER_DATA_PSOCK)
+
+/**
+ * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
+ * @sk: socket
+ */
+static inline bool sk_user_data_is_nocopy(const struct sock *sk)
+{
+ return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY);
+}
+
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
-#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk)))
-#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr)
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ * only if all the argument @flags bits are set in sk_user_data,
+ * otherwise return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+ uintptr_t flags)
+{
+ uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+ WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+ if ((sk_user_data & flags) == flags)
+ return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+ return NULL;
+}
+
+#define rcu_dereference_sk_user_data(sk) \
+ __rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \
+({ \
+ uintptr_t __tmp1 = (uintptr_t)(ptr), \
+ __tmp2 = (uintptr_t)(flags); \
+ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \
+ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \
+ rcu_assign_pointer(__sk_user_data((sk)), \
+ __tmp1 | __tmp2); \
+})
+#define rcu_assign_sk_user_data(sk, ptr) \
+ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
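
The assign/dereference helpers above work because slab-allocated objects
are at least 8-byte aligned, so the bottom three bits of any pointer
stored in sk_user_data are free to carry the NOCOPY/BPF/PSOCK flags. A
minimal standalone sketch of the same low-bit tagging scheme (names and
the stand-in object are illustrative, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DATA_NOCOPY  1UL
#define DATA_BPF     2UL
#define DATA_PSOCK   4UL
#define DATA_PTRMASK (~(DATA_NOCOPY | DATA_BPF | DATA_PSOCK))

int main(void)
{
	static long object = 42;             /* stand-in for a psock/BPF object */
	uintptr_t word = (uintptr_t)&object; /* aligned, so low bits are zero */

	assert((word & ~DATA_PTRMASK) == 0);
	word |= DATA_NOCOPY | DATA_BPF;      /* tag, as the assign macro does */

	/* untag only when all requested flags are present, mirroring
	 * __rcu_dereference_sk_user_data_with_flags() */
	if ((word & DATA_BPF) == DATA_BPF)
		printf("%ld\n", *(long *)(word & DATA_PTRMASK));
	return 0;
}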
/*
* SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
@@ -820,7 +893,6 @@
SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
- SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
SOCK_MEMALLOC, /* VM depends on this socket for swapping */
SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
SOCK_FASYNC, /* fasync() active */
@@ -835,6 +907,8 @@
SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
SOCK_TXTIME,
+ SOCK_XDP, /* XDP is attached */
+ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -852,6 +926,15 @@
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
__clear_bit(flag, &sk->sk_flags);
+}
+
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
+ int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
}
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
@@ -885,17 +968,17 @@
static inline void sk_acceptq_removed(struct sock *sk)
{
- sk->sk_ack_backlog--;
+ WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}
static inline void sk_acceptq_added(struct sock *sk)
{
- sk->sk_ack_backlog++;
+ WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
- return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
+ return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}
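
sk_ack_backlog is still written only under the listener lock, but it can
be read locklessly (e.g. by socket diag and poll), so reads and writes
are annotated with READ_ONCE()/WRITE_ONCE() to rule out compiler load
and store tearing. A standalone model of the same pattern, using C11
relaxed atomics in place of the kernel's ONCE macros:

#include <stdatomic.h>
#include <stdbool.h>

struct listener {
	_Atomic unsigned int ack_backlog;   /* written under lock, read locklessly */
	unsigned int max_ack_backlog;
};

static void acceptq_added(struct listener *l)   /* writer side, lock held */
{
	unsigned int v = atomic_load_explicit(&l->ack_backlog,
					      memory_order_relaxed);
	atomic_store_explicit(&l->ack_backlog, v + 1, memory_order_relaxed);
}

static bool acceptq_is_full(struct listener *l) /* lockless reader */
{
	return atomic_load_explicit(&l->ack_backlog, memory_order_relaxed) >
	       l->max_ack_backlog;
}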
/*
@@ -903,12 +986,17 @@
*/
static inline int sk_stream_min_wspace(const struct sock *sk)
{
- return sk->sk_wmem_queued >> 1;
+ return READ_ONCE(sk->sk_wmem_queued) >> 1;
}
static inline int sk_stream_wspace(const struct sock *sk)
{
- return sk->sk_sndbuf - sk->sk_wmem_queued;
+ return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
+}
+
+static inline void sk_wmem_queued_add(struct sock *sk, int val)
+{
+ WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}
void sk_stream_write_space(struct sock *sk);
@@ -993,7 +1081,7 @@
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
- if (static_key_false(&rfs_needed)) {
+ if (static_branch_unlikely(&rfs_needed)) {
/* Reading sk->sk_rxhash might incur an expensive cache line
* miss.
*
@@ -1104,21 +1192,13 @@
void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
+ int optname, sockptr_t optval,
unsigned int optlen);
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
void (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
- int (*compat_setsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- unsigned int optlen);
- int (*compat_getsockopt)(struct sock *sk,
- int level,
- int optname, char __user *optval,
- int __user *option);
int (*compat_ioctl)(struct sock *sk,
unsigned int cmd, unsigned long arg);
#endif
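
Switching the proto setsockopt hook to sockptr_t is what lets the
compat_{set,get}sockopt methods go away: a sockptr_t carries either a
kernel or a user pointer, so one handler serves native, compat and
in-kernel callers alike. A simplified standalone sketch of the idea
(the real type lives in <linux/sockptr.h> and uses copy_from_user()
for the user-space case):

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef struct {
	union {
		void *kernel;
		void *user;          /* __user-tagged in the kernel */
	};
	bool is_kernel;
} sockptr_t;

static int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
	if (src.is_kernel) {
		memcpy(dst, src.kernel, size);
		return 0;
	}
	memcpy(dst, src.user, size); /* kernel: copy_from_user() */
	return 0;
}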
@@ -1130,7 +1210,9 @@
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
- struct sockaddr *uaddr, int addr_len);
+ struct sockaddr *addr, int addr_len);
+ int (*bind_add)(struct sock *sk,
+ struct sockaddr *addr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
@@ -1148,7 +1230,7 @@
unsigned int inuse_idx;
#endif
- bool (*stream_memory_free)(const struct sock *sk);
+ bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*stream_memory_read)(const struct sock *sk);
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
@@ -1230,19 +1312,29 @@
#define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */
-static inline bool sk_stream_memory_free(const struct sock *sk)
+static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
- if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+ if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
return false;
return sk->sk_prot->stream_memory_free ?
- sk->sk_prot->stream_memory_free(sk) : true;
+ sk->sk_prot->stream_memory_free(sk, wake) : true;
+}
+
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+ return __sk_stream_memory_free(sk, 0);
+}
+
+static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
+{
+ return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
+ __sk_stream_memory_free(sk, wake);
}
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
- return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
- sk_stream_memory_free(sk);
+ return __sk_stream_is_writeable(sk, 0);
}
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
@@ -1399,7 +1491,7 @@
/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
- long val = sk->sk_prot->sysctl_mem[index];
+ long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]);
#if PAGE_SIZE > SK_MEM_QUANTUM
val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
@@ -1422,19 +1514,23 @@
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_SEND);
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
skb_pfmemalloc(skb);
}
@@ -1478,11 +1574,18 @@
__sk_mem_reclaim(sk, 1 << 20);
}
+DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- sk->sk_wmem_queued -= skb->truesize;
+ sk_wmem_queued_add(sk, -skb->truesize);
sk_mem_uncharge(sk, skb->truesize);
+ if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
+ !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
+ skb_ext_reset(skb);
+ skb_zcopy_clear(skb, true);
+ sk->sk_tx_skb_cache = skb;
+ return;
+ }
__kfree_skb(skb);
}
@@ -1492,7 +1595,7 @@
sk->sk_lock.owned = 0;
/* The sk_lock has mutex_unlock() semantics: */
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}
}
@@ -1615,15 +1718,18 @@
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
+void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif
int sock_setsockopt(struct socket *sock, int level, int op,
- char __user *optval, unsigned int optlen);
+ sockptr_t optval, unsigned int optlen);
int sock_getsockopt(struct socket *sock, int level, int op,
char __user *optval, int __user *optlen);
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
@@ -1663,8 +1769,6 @@
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
-int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *);
-int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
@@ -1684,11 +1788,7 @@
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_sock_common_getsockopt(struct socket *sock, int level,
- int optname, char __user *optval, int __user *optlen);
-int compat_sock_common_setsockopt(struct socket *sock, int level,
- int optname, char __user *optval, unsigned int optlen);
+ sockptr_t optval, unsigned int optlen);
void sk_common_release(struct sock *sk);
@@ -1827,7 +1927,7 @@
{
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
- rcu_assign_pointer(sk->sk_wq, parent->wq);
+ rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -1856,10 +1956,13 @@
WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}
-static inline void sk_rethink_txhash(struct sock *sk)
+static inline bool sk_rethink_txhash(struct sock *sk)
{
- if (sk->sk_txhash)
+ if (sk->sk_txhash) {
sk_set_txhash(sk);
+ return true;
+ }
+ return false;
}
static inline struct dst_entry *
@@ -1882,11 +1985,9 @@
return dst;
}
-static inline void dst_negative_advice(struct sock *sk)
+static inline void __dst_negative_advice(struct sock *sk)
{
struct dst_entry *ndst, *dst = __sk_dst_get(sk);
-
- sk_rethink_txhash(sk);
if (dst && dst->ops->negative_advice) {
ndst = dst->ops->negative_advice(dst);
@@ -1897,6 +1998,12 @@
sk->sk_dst_pending_confirm = 0;
}
}
+}
+
+static inline void dst_negative_advice(struct sock *sk)
+{
+ sk_rethink_txhash(sk);
+ __dst_negative_advice(sk);
}
static inline void
@@ -1941,8 +2048,8 @@
static inline void sk_dst_confirm(struct sock *sk)
{
- if (!sk->sk_dst_pending_confirm)
- sk->sk_dst_pending_confirm = 1;
+ if (!READ_ONCE(sk->sk_dst_pending_confirm))
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
@@ -1952,10 +2059,10 @@
unsigned long now = jiffies;
/* avoid dirtying neighbour */
- if (n->confirmed != now)
- n->confirmed = now;
- if (sk && sk->sk_dst_pending_confirm)
- sk->sk_dst_pending_confirm = 0;
+ if (READ_ONCE(n->confirmed) != now)
+ WRITE_ONCE(n->confirmed, now);
+ if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
}
}
@@ -2020,7 +2127,7 @@
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
- sk->sk_wmem_queued += copy;
+ sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
return 0;
}
@@ -2029,7 +2136,7 @@
* sk_wmem_alloc_get - returns write allocations
* @sk: socket
*
- * Returns sk_wmem_alloc minus initial offset of one
+ * Return: sk_wmem_alloc minus initial offset of one
*/
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
@@ -2040,7 +2147,7 @@
* sk_rmem_alloc_get - returns read allocations
* @sk: socket
*
- * Returns sk_rmem_alloc
+ * Return: sk_rmem_alloc
*/
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
@@ -2051,7 +2158,7 @@
* sk_has_allocations - check if allocations are outstanding
* @sk: socket
*
- * Returns true if socket has write or read allocations
+ * Return: true if socket has write or read allocations
*/
static inline bool sk_has_allocations(const struct sock *sk)
{
@@ -2062,7 +2169,7 @@
* skwq_has_sleeper - check if there are any waiting processes
* @wq: struct socket_wq
*
- * Returns true if socket_wq has waiting processes
+ * Return: true if socket_wq has waiting processes
*
* The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
* barrier call. They were added due to the race found within the tcp code.
@@ -2101,18 +2208,12 @@
* @p: poll_table
*
* See the comments in the wq_has_sleeper function.
- *
- * Do not derive sock from filp->private_data here. An SMC socket establishes
- * an internal TCP socket that is used in the fallback case. All socket
- * operations on the SMC socket are then forwarded to the TCP socket. In case of
- * poll, the filp->private_data pointer references the SMC socket because the
- * TCP socket has no file assigned.
*/
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq->wait, p);
+ poll_wait(filp, &sock->wq.wait, p);
/* We need to be sure we are in sync with the
* socket flags modification.
*
@@ -2152,10 +2253,23 @@
sk_mem_charge(sk, skb->truesize);
}
+static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
+{
+ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb_orphan(skb);
+ skb->destructor = sock_efree;
+ skb->sk = sk;
+ return true;
+ }
+ return false;
+}
+
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
unsigned long expires);
void sk_stop_timer(struct sock *sk, struct timer_list *timer);
+
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
@@ -2174,8 +2288,13 @@
static inline int sock_error(struct sock *sk)
{
int err;
- if (likely(!sk->sk_err))
+
+ /* Avoid an atomic operation for the common case.
+ * This is racy since another cpu/thread can change sk_err under us.
+ */
+ if (likely(data_race(!sk->sk_err)))
return 0;
+
err = xchg(&sk->sk_err, 0);
return -err;
}
@@ -2235,10 +2354,14 @@
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
- sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
- sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF);
- }
+ u32 val;
+
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return;
+
+ val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
+
+ WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
@@ -2249,16 +2372,22 @@
* @sk: socket
*
* Use the per task page_frag instead of the per socket one for
- * optimization when we know that we're in the normal context and owns
+ * optimization when we know that we're in process context and own
* everything that's associated with %current.
*
- * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
- * inside other socket operations and end up recursing into sk_page_frag()
- * while it's already in use.
+ * Both direct reclaim and page faults can nest inside other
+ * socket operations and end up recursing into sk_page_frag()
+ * while it's already in use: explicitly avoid task page_frag
+ * usage if the caller is potentially doing any of them.
+ * This assumes that page fault handlers use the GFP_NOFS flags.
+ *
+ * Return: a per task page_frag if context allows that,
+ * otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
- if (gfpflags_normal_context(sk->sk_allocation))
+ if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
+ (__GFP_DIRECT_RECLAIM | __GFP_FS))
return &current->task_frag;
return &sk->sk_frag;
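
The open-coded mask test replaces gfpflags_normal_context(): the
per-task frag is safe only when the caller may block
(__GFP_DIRECT_RECLAIM), may enter filesystems (__GFP_FS is cleared in
the NOFS page-fault/reclaim contexts that could recurse here), and is
not memalloc-privileged. A standalone sketch of the predicate with
illustrative flag values (not the kernel's):

#include <stdbool.h>

#define X_DIRECT_RECLAIM 0x400u   /* illustrative bit values */
#define X_FS             0x80u
#define X_MEMALLOC       0x20000u

static bool use_task_frag(unsigned int gfp)
{
	unsigned int mask = X_DIRECT_RECLAIM | X_MEMALLOC | X_FS;

	/* blocking and FS entry allowed, MEMALLOC not set */
	return (gfp & mask) == (X_DIRECT_RECLAIM | X_FS);
}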
@@ -2266,16 +2395,12 @@
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr, unsigned int *sg_size,
- int first_coalesce);
-
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
static inline bool sock_writeable(const struct sock *sk)
{
- return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
+ return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
}
static inline gfp_t gfp_any(void)
@@ -2295,7 +2420,9 @@
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
- return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1;
+ int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len);
+
+ return v ?: 1;
}
/* Alas, with timeout socket operations are not restartable.
@@ -2314,7 +2441,7 @@
* using skb->cb[] would keep using it directly and utilize its
* alignment guarantee.
*/
-#define SOCK_SKB_CB_OFFSET ((FIELD_SIZEOF(struct sk_buff, cb) - \
+#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
sizeof(struct sock_skb_cb)))
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
@@ -2418,22 +2545,40 @@
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
/**
- * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
+ * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
* @sk: socket sending this packet
* @tsflags: timestamping flags to use
* @tx_flags: completed with instructions for time stamping
+ * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno)
*
* Note: callers should take care of initial ``*tx_flags`` value (usually 0)
*/
-static inline void sock_tx_timestamp(const struct sock *sk, __u16 tsflags,
- __u8 *tx_flags)
+static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
+ __u8 *tx_flags, __u32 *tskey)
{
- if (unlikely(tsflags))
+ if (unlikely(tsflags)) {
__sock_tx_timestamp(tsflags, tx_flags);
+ if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
+ tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
+ *tskey = sk->sk_tskey++;
+ }
if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
*tx_flags |= SKBTX_WIFI_STATUS;
}
+static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
+ __u8 *tx_flags)
+{
+ _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
+}
+
+static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
+{
+ _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
+ &skb_shinfo(skb)->tskey);
+}
+
+DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2445,6 +2590,12 @@
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
+ if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
+ !sk->sk_rx_skb_cache) {
+ sk->sk_rx_skb_cache = skb;
+ skb_orphan(skb);
+ return;
+ }
__kfree_skb(skb);
}
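
sk_rx_skb_cache (with sk_tx_skb_cache on the transmit side) is a
one-slot per-socket recycle cache gated by a static key: instead of
freeing an skb, park it in the empty slot so the next operation on the
same socket can reuse it without a round trip through the allocator. A
standalone model of the one-slot pattern:

#include <stdlib.h>

struct buf { char data[2048]; };

static struct buf *cache_slot;        /* models sk->sk_rx_skb_cache */

static void buf_free(struct buf *b)
{
	if (!cache_slot) {            /* slot empty: park instead of free */
		cache_slot = b;
		return;
	}
	free(b);
}

static struct buf *buf_alloc(void)
{
	struct buf *b = cache_slot;

	if (b) {                      /* fast path: reuse the parked buffer */
		cache_slot = NULL;
		return b;
	}
	return malloc(sizeof(*b));
}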
@@ -2460,16 +2611,14 @@
write_pnet(&sk->sk_net, net);
}
-static inline struct sock *skb_steal_sock(struct sk_buff *skb)
+static inline bool
+skb_sk_is_prefetched(struct sk_buff *skb)
{
- if (skb->sk) {
- struct sock *sk = skb->sk;
-
- skb->destructor = NULL;
- skb->sk = NULL;
- return sk;
- }
- return NULL;
+#ifdef CONFIG_INET
+ return skb->destructor == sock_pfree;
+#else
+ return false;
+#endif /* CONFIG_INET */
}
/* This helper checks if a socket is a full socket,
@@ -2480,8 +2629,38 @@
return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}
+static inline bool
+sk_is_refcounted(struct sock *sk)
+{
+ /* Only full sockets have sk->sk_flags. */
+ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
+}
+
+/**
+ * skb_steal_sock - steal a socket from an sk_buff
+ * @skb: sk_buff to steal the socket from
+ * @refcounted: is set to true if the socket is reference-counted
+ */
+static inline struct sock *
+skb_steal_sock(struct sk_buff *skb, bool *refcounted)
+{
+ if (skb->sk) {
+ struct sock *sk = skb->sk;
+
+ *refcounted = true;
+ if (skb_sk_is_prefetched(skb))
+ *refcounted = sk_is_refcounted(sk);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return sk;
+ }
+ *refcounted = false;
+ return NULL;
+}
+
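
Callers of skb_steal_sock() must now track whether the stolen socket
carries a reference: a socket prefetched by early demux or BPF may be
SOCK_RCU_FREE and must not be released with sock_put(). A hypothetical
caller sketch (example_deliver() is invented for illustration;
sock_put() is the real kernel helper):

static void example_deliver(struct sk_buff *skb)
{
	bool refcounted;
	struct sock *sk = skb_steal_sock(skb, &refcounted);

	if (!sk)
		return;          /* no prefetched socket: do a normal lookup */
	/* ... queue skb to sk ... */
	if (refcounted)
		sock_put(sk);    /* drop the reference we inherited */
}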
/* Checks if this SKB belongs to an HW offloaded socket
* and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
*/
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
struct net_device *dev)
@@ -2489,8 +2668,15 @@
#ifdef CONFIG_SOCK_VALIDATE_XMIT
struct sock *sk = skb->sk;
- if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb)
+ if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
skb = sk->sk_validate_xmit_skb(sk, dev, skb);
+#ifdef CONFIG_TLS_DEVICE
+ } else if (unlikely(skb->decrypted)) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+#endif
+ }
#endif
return skb;
@@ -2504,9 +2690,7 @@
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
-void sock_enable_timestamp(struct sock *sk, int flag);
-int sock_get_timestamp(struct sock *, struct timeval __user *);
-int sock_get_timestampns(struct sock *, struct timespec __user *);
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
@@ -2536,22 +2720,25 @@
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
+
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_wmem ? */
if (proto->sysctl_wmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
- return *proto->sysctl_wmem;
+ return READ_ONCE(*proto->sysctl_wmem);
}
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_rmem ? */
if (proto->sysctl_rmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
- return *proto->sysctl_rmem;
+ return READ_ONCE(*proto->sysctl_rmem);
}
/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
@@ -2560,9 +2747,9 @@
*/
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
- if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val)
+ if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val)
return;
- sk->sk_pacing_shift = val;
+ WRITE_ONCE(sk->sk_pacing_shift, val);
}
/* if a socket is bound to a device, check that the given device
@@ -2584,4 +2771,19 @@
return false;
}
+void sock_def_readable(struct sock *sk);
+
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
+void sock_enable_timestamps(struct sock *sk);
+void sock_no_linger(struct sock *sk);
+void sock_set_keepalive(struct sock *sk);
+void sock_set_priority(struct sock *sk, u32 priority);
+void sock_set_rcvbuf(struct sock *sk, int val);
+void sock_set_mark(struct sock *sk, u32 val);
+void sock_set_reuseaddr(struct sock *sk);
+void sock_set_reuseport(struct sock *sk);
+void sock_set_sndtimeo(struct sock *sk, s64 secs);
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
+
#endif /* _SOCK_H */
--
Gitblit v1.6.2