From 6778948f9de86c3cfaf36725a7c87dcff9ba247f Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 11 Dec 2023 08:20:59 +0000 Subject: [PATCH] kernel_5.10 no rt --- kernel/include/net/sock.h | 484 ++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 343 insertions(+), 141 deletions(-) diff --git a/kernel/include/net/sock.h b/kernel/include/net/sock.h index c080fc9..c604052 100644 --- a/kernel/include/net/sock.h +++ b/kernel/include/net/sock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -30,12 +31,6 @@ * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef _SOCK_H #define _SOCK_H @@ -64,6 +59,7 @@ #include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> +#include <linux/sockptr.h> #include <linux/atomic.h> #include <linux/refcount.h> @@ -71,9 +67,9 @@ #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> -#include <net/smc.h> #include <net/l3mdev.h> #include <linux/android_kabi.h> +#include <linux/android_vendor.h> /* * This structure really needs to be cleaned up. @@ -124,19 +120,26 @@ * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr + * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num + * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting + * @skc_ipv6only: socket is IPV6 only + * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket + * @skc_v6_daddr: IPV6 destination address + * @skc_v6_rcv_saddr: IPV6 source address + * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection @@ -144,16 +147,21 @@ * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings + * @skc_listener: connection request listener socket (aka rsk_listener) + * [union with @skc_flags] + * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row + * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets + * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) + * [union with @skc_incoming_cpu] + * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number + * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. */ struct sock_common { - /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned - * address on 64bit arches : cf INET_MATCH() - */ union { __addrpair skc_addrpair; struct { @@ -237,6 +245,8 @@ /* public: */ }; +struct bpf_local_storage; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -250,6 +260,7 @@ * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @sk_rx_skb_cache: cache copy of recently accessed RX skb * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -270,6 +281,8 @@ * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) + * @sk_route_forced_caps: static, forced route capabilities + * (set in tcp_init_sock()) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments @@ -308,6 +321,8 @@ * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit + * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] + * @sk_tx_skb_cache: cache copy of recently accessed TX skb * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup @@ -318,11 +333,14 @@ * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog + * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container + * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags */ struct sock { @@ -369,6 +387,7 @@ atomic_t sk_drops; int sk_rcvlowat; struct sk_buff_head sk_error_queue; + struct sk_buff *sk_rx_skb_cache; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -397,12 +416,14 @@ struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; + /* private: */ struct socket_wq *sk_wq_raw; + /* public: */ }; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct dst_entry *sk_rx_dst; + struct dst_entry __rcu *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; atomic_t sk_omem_alloc; int sk_sndbuf; @@ -415,6 +436,7 @@ struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; + struct sk_buff *sk_tx_skb_cache; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; @@ -424,8 +446,8 @@ struct timer_list sk_timer; __u32 sk_priority; __u32 sk_mark; - u32 sk_pacing_rate; /* bytes per second */ - u32 sk_max_pacing_rate; + unsigned long sk_pacing_rate; /* bytes per second */ + unsigned long sk_max_pacing_rate; struct page_frag sk_frag; netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; @@ -439,31 +461,15 @@ * Because of non atomicity rules, all * changes are protected by socket lock. */ - unsigned int __sk_flags_offset[0]; -#ifdef __BIG_ENDIAN_BITFIELD -#define SK_FL_PROTO_SHIFT 16 -#define SK_FL_PROTO_MASK 0x00ff0000 - -#define SK_FL_TYPE_SHIFT 0 -#define SK_FL_TYPE_MASK 0x0000ffff -#else -#define SK_FL_PROTO_SHIFT 8 -#define SK_FL_PROTO_MASK 0x0000ff00 - -#define SK_FL_TYPE_SHIFT 16 -#define SK_FL_TYPE_MASK 0xffff0000 -#endif - - unsigned int sk_padding : 1, + u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, - sk_userlocks : 4, - sk_protocol : 8, - sk_type : 16; -#define SK_PROTOCOL_MAX U8_MAX - u16 sk_gso_max_segs; + sk_userlocks : 4; u8 sk_pacing_shift; + u16 sk_type; + u16 sk_protocol; + u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; @@ -472,7 +478,7 @@ u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; -#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT) +#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) spinlock_t sk_peer_lock; #else /* sk_peer_lock is in the ANDROID_KABI_RESERVE(1) field below */ @@ -515,9 +521,12 @@ #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *sk_bpf_storage; +#endif struct rcu_head sk_rcu; -#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT) +#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) ANDROID_KABI_RESERVE(1); #else ANDROID_KABI_USE(1, spinlock_t sk_peer_lock); @@ -529,6 +538,8 @@ ANDROID_KABI_RESERVE(6); ANDROID_KABI_RESERVE(7); ANDROID_KABI_RESERVE(8); + + ANDROID_OEM_DATA(1); }; enum sk_pacing { @@ -537,10 +548,72 @@ SK_PACING_FQ = 2, }; +/* flag bits in sk_user_data + * + * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might + * not be suitable for copying when cloning the socket. For instance, + * it can point to a reference counted object. sk_user_data bottom + * bit is set if pointer must not be copied. + * + * - SK_USER_DATA_BPF: Mark whether sk_user_data field is + * managed/owned by a BPF reuseport array. This bit should be set + * when sk_user_data's sk is added to the bpf's reuseport_array. + * + * - SK_USER_DATA_PSOCK: Mark whether pointer stored in + * sk_user_data points to psock type. This bit should be set + * when sk_user_data is assigned to a psock object. + */ +#define SK_USER_DATA_NOCOPY 1UL +#define SK_USER_DATA_BPF 2UL +#define SK_USER_DATA_PSOCK 4UL +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ + SK_USER_DATA_PSOCK) + +/** + * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied + * @sk: socket + */ +static inline bool sk_user_data_is_nocopy(const struct sock *sk) +{ + return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); +} + #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) -#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) -#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr) +/** + * __rcu_dereference_sk_user_data_with_flags - return the pointer + * only if argument flags all has been set in sk_user_data. Otherwise + * return NULL + * + * @sk: socket + * @flags: flag bits + */ +static inline void * +__rcu_dereference_sk_user_data_with_flags(const struct sock *sk, + uintptr_t flags) +{ + uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); + + WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); + + if ((sk_user_data & flags) == flags) + return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + return NULL; +} + +#define rcu_dereference_sk_user_data(sk) \ + __rcu_dereference_sk_user_data_with_flags(sk, 0) +#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ +({ \ + uintptr_t __tmp1 = (uintptr_t)(ptr), \ + __tmp2 = (uintptr_t)(flags); \ + WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ + WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ + rcu_assign_pointer(__sk_user_data((sk)), \ + __tmp1 | __tmp2); \ +}) +#define rcu_assign_sk_user_data(sk, ptr) \ + __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK @@ -820,7 +893,6 @@ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ - SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ @@ -835,6 +907,8 @@ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, + SOCK_XDP, /* XDP is attached */ + SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -852,6 +926,15 @@ static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); +} + +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) +{ + if (valbool) + sock_set_flag(sk, bit); + else + sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) @@ -885,17 +968,17 @@ static inline void sk_acceptq_removed(struct sock *sk) { - sk->sk_ack_backlog--; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { - sk->sk_ack_backlog++; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } static inline bool sk_acceptq_is_full(const struct sock *sk) { - return sk->sk_ack_backlog > sk->sk_max_ack_backlog; + return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* @@ -903,12 +986,17 @@ */ static inline int sk_stream_min_wspace(const struct sock *sk) { - return sk->sk_wmem_queued >> 1; + return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { - return sk->sk_sndbuf - sk->sk_wmem_queued; + return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); +} + +static inline void sk_wmem_queued_add(struct sock *sk, int val) +{ + WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } void sk_stream_write_space(struct sock *sk); @@ -993,7 +1081,7 @@ static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - if (static_key_false(&rfs_needed)) { + if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * @@ -1104,21 +1192,13 @@ void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, - int optname, char __user *optval, + int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT - int (*compat_setsockopt)(struct sock *sk, - int level, - int optname, char __user *optval, - unsigned int optlen); - int (*compat_getsockopt)(struct sock *sk, - int level, - int optname, char __user *optval, - int __user *option); int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif @@ -1130,7 +1210,9 @@ int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, - struct sockaddr *uaddr, int addr_len); + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, + struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -1148,7 +1230,7 @@ unsigned int inuse_idx; #endif - bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); @@ -1230,19 +1312,29 @@ #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ -static inline bool sk_stream_memory_free(const struct sock *sk) +static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { - if (sk->sk_wmem_queued >= sk->sk_sndbuf) + if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? - sk->sk_prot->stream_memory_free(sk) : true; + sk->sk_prot->stream_memory_free(sk, wake) : true; +} + +static inline bool sk_stream_memory_free(const struct sock *sk) +{ + return __sk_stream_memory_free(sk, 0); +} + +static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) +{ + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && + __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { - return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && - sk_stream_memory_free(sk); + return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, @@ -1399,7 +1491,7 @@ /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { - long val = sk->sk_prot->sysctl_mem[index]; + long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]); #if PAGE_SIZE > SK_MEM_QUANTUM val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT; @@ -1422,19 +1514,23 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) { + int delta; + if (!sk_has_account(sk)) return true; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_SEND); + delta = size - sk->sk_forward_alloc; + return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { + int delta; + if (!sk_has_account(sk)) return true; - return size<= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV) || + delta = size - sk->sk_forward_alloc; + return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || skb_pfmemalloc(skb); } @@ -1478,11 +1574,18 @@ __sk_mem_reclaim(sk, 1 << 20); } +DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; + sk_wmem_queued_add(sk, -skb->truesize); sk_mem_uncharge(sk, skb->truesize); + if (static_branch_unlikely(&tcp_tx_skb_cache_key) && + !sk->sk_tx_skb_cache && !skb_cloned(skb)) { + skb_ext_reset(skb); + skb_zcopy_clear(skb, true); + sk->sk_tx_skb_cache = skb; + return; + } __kfree_skb(skb); } @@ -1492,7 +1595,7 @@ sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } } @@ -1615,15 +1718,18 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); +void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif int sock_setsockopt(struct socket *sock, int level, int op, - char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, @@ -1663,8 +1769,6 @@ int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); -int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *); -int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); @@ -1684,11 +1788,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen); -int compat_sock_common_getsockopt(struct socket *sock, int level, - int optname, char __user *optval, int __user *optlen); -int compat_sock_common_setsockopt(struct socket *sock, int level, - int optname, char __user *optval, unsigned int optlen); + sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); @@ -1827,7 +1927,7 @@ { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); - rcu_assign_pointer(sk->sk_wq, parent->wq); + rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; @@ -1856,10 +1956,13 @@ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } -static inline void sk_rethink_txhash(struct sock *sk) +static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) + if (sk->sk_txhash) { sk_set_txhash(sk); + return true; + } + return false; } static inline struct dst_entry * @@ -1882,11 +1985,9 @@ return dst; } -static inline void dst_negative_advice(struct sock *sk) +static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); - - sk_rethink_txhash(sk); if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); @@ -1897,6 +1998,12 @@ sk->sk_dst_pending_confirm = 0; } } +} + +static inline void dst_negative_advice(struct sock *sk) +{ + sk_rethink_txhash(sk); + __dst_negative_advice(sk); } static inline void @@ -1941,8 +2048,8 @@ static inline void sk_dst_confirm(struct sock *sk) { - if (!sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 1; + if (!READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) @@ -1952,10 +2059,10 @@ unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; - if (sk && sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 0; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); + if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); } } @@ -2020,7 +2127,7 @@ skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } @@ -2029,7 +2136,7 @@ * sk_wmem_alloc_get - returns write allocations * @sk: socket * - * Returns sk_wmem_alloc minus initial offset of one + * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { @@ -2040,7 +2147,7 @@ * sk_rmem_alloc_get - returns read allocations * @sk: socket * - * Returns sk_rmem_alloc + * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { @@ -2051,7 +2158,7 @@ * sk_has_allocations - check if allocations are outstanding * @sk: socket * - * Returns true if socket has write or read allocations + * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { @@ -2062,7 +2169,7 @@ * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * - * Returns true if socket_wq has waiting processes + * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. @@ -2101,18 +2208,12 @@ * @p: poll_table * * See the comments in the wq_has_sleeper function. - * - * Do not derive sock from filp->private_data here. An SMC socket establishes - * an internal TCP socket that is used in the fallback case. All socket - * operations on the SMC socket are then forwarded to the TCP socket. In case of - * poll, the filp->private_data pointer references the SMC socket because the - * TCP socket has no file assigned. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { if (!poll_does_not_wait(p)) { - poll_wait(filp, &sock->wq->wait, p); + poll_wait(filp, &sock->wq.wait, p); /* We need to be sure we are in sync with the * socket flags modification. * @@ -2152,10 +2253,23 @@ sk_mem_charge(sk, skb->truesize); } +static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) +{ + if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { + skb_orphan(skb); + skb->destructor = sock_efree; + skb->sk = sk; + return true; + } + return false; +} + void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); + +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, @@ -2174,8 +2288,13 @@ static inline int sock_error(struct sock *sk) { int err; - if (likely(!sk->sk_err)) + + /* Avoid an atomic operation for the common case. + * This is racy since another cpu/thread can change sk_err under us. + */ + if (likely(data_race(!sk->sk_err))) return 0; + err = xchg(&sk->sk_err, 0); return -err; } @@ -2235,10 +2354,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) { - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { - sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); - sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF); - } + u32 val; + + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return; + + val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); + + WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, @@ -2249,16 +2372,22 @@ * @sk: socket * * Use the per task page_frag instead of the per socket one for - * optimization when we know that we're in the normal context and owns + * optimization when we know that we're in process context and own * everything that's associated with %current. * - * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest - * inside other socket operations and end up recursing into sk_page_frag() - * while it's already in use. + * Both direct reclaim and page faults can nest inside other + * socket operations and end up recursing into sk_page_frag() + * while it's already in use: explicitly avoid task page_frag + * usage if the caller is potentially doing any of them. + * This assumes that page fault handlers use the GFP_NOFS flags. + * + * Return: a per task page_frag if context allows that, + * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (gfpflags_normal_context(sk->sk_allocation)) + if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + (__GFP_DIRECT_RECLAIM | __GFP_FS)) return ¤t->task_frag; return &sk->sk_frag; @@ -2266,16 +2395,12 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr, unsigned int *sg_size, - int first_coalesce); - /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); + return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) @@ -2295,7 +2420,9 @@ static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { - return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; + int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); + + return v ?: 1; } /* Alas, with timeout socket operations are not restartable. @@ -2314,7 +2441,7 @@ * using skb->cb[] would keep using it directly and utilize its * alignement guarantee. */ -#define SOCK_SKB_CB_OFFSET ((FIELD_SIZEOF(struct sk_buff, cb) - \ +#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb))) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ @@ -2418,22 +2545,40 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); /** - * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped + * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @tsflags: timestamping flags to use * @tx_flags: completed with instructions for time stamping + * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ -static inline void sock_tx_timestamp(const struct sock *sk, __u16 tsflags, - __u8 *tx_flags) +static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, + __u8 *tx_flags, __u32 *tskey) { - if (unlikely(tsflags)) + if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); + if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) + *tskey = sk->sk_tskey++; + } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } +static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, + __u8 *tx_flags) +{ + _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); +} + +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +{ + _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + &skb_shinfo(skb)->tskey); +} + +DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2445,6 +2590,12 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); + if (static_branch_unlikely(&tcp_rx_skb_cache_key) && + !sk->sk_rx_skb_cache) { + sk->sk_rx_skb_cache = skb; + skb_orphan(skb); + return; + } __kfree_skb(skb); } @@ -2460,16 +2611,14 @@ write_pnet(&sk->sk_net, net); } -static inline struct sock *skb_steal_sock(struct sk_buff *skb) +static inline bool +skb_sk_is_prefetched(struct sk_buff *skb) { - if (skb->sk) { - struct sock *sk = skb->sk; - - skb->destructor = NULL; - skb->sk = NULL; - return sk; - } - return NULL; +#ifdef CONFIG_INET + return skb->destructor == sock_pfree; +#else + return false; +#endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, @@ -2480,8 +2629,38 @@ return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } +static inline bool +sk_is_refcounted(struct sock *sk) +{ + /* Only full sockets have sk->sk_flags. */ + return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); +} + +/** + * skb_steal_sock - steal a socket from an sk_buff + * @skb: sk_buff to steal the socket from + * @refcounted: is set to true if the socket is reference-counted + */ +static inline struct sock * +skb_steal_sock(struct sk_buff *skb, bool *refcounted) +{ + if (skb->sk) { + struct sock *sk = skb->sk; + + *refcounted = true; + if (skb_sk_is_prefetched(skb)) + *refcounted = sk_is_refcounted(sk); + skb->destructor = NULL; + skb->sk = NULL; + return sk; + } + *refcounted = false; + return NULL; +} + /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) @@ -2489,8 +2668,15 @@ #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#ifdef CONFIG_TLS_DEVICE + } else if (unlikely(skb->decrypted)) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; +#endif + } #endif return skb; @@ -2504,9 +2690,7 @@ return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } -void sock_enable_timestamp(struct sock *sk, int flag); -int sock_get_timestamp(struct sock *, struct timeval __user *); -int sock_get_timestampns(struct sock *, struct timespec __user *); +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); @@ -2536,22 +2720,25 @@ extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +#define SKB_FRAG_PAGE_ORDER get_order(32768) +DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); + static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); - return *proto->sysctl_wmem; + return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); - return *proto->sysctl_rmem; + return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) @@ -2560,9 +2747,9 @@ */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { - if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val) + if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; - sk->sk_pacing_shift = val; + WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device @@ -2584,4 +2771,19 @@ return false; } +void sock_def_readable(struct sock *sk); + +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); +void sock_enable_timestamps(struct sock *sk); +void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); +void sock_set_mark(struct sock *sk, u32 val); +void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); + #endif /* _SOCK_H */ -- Gitblit v1.6.2