hc
2024-01-05 071106ecf68c401173c58808b1cf5f68cc50d390
kernel/include/net/sock.h
....@@ -1,3 +1,4 @@
1
+/* SPDX-License-Identifier: GPL-2.0-or-later */
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -30,12 +31,6 @@
3031 * respective headers and ipv4/v6, etc now
3132 * use private slabcaches for its socks
3233 * Pedro Hortas : New flags field for socket options
33
- *
34
- *
35
- * This program is free software; you can redistribute it and/or
36
- * modify it under the terms of the GNU General Public License
37
- * as published by the Free Software Foundation; either version
38
- * 2 of the License, or (at your option) any later version.
3934 */
4035 #ifndef _SOCK_H
4136 #define _SOCK_H
....@@ -64,6 +59,7 @@
6459 #include <linux/filter.h>
6560 #include <linux/rculist_nulls.h>
6661 #include <linux/poll.h>
62
+#include <linux/sockptr.h>
6763
6864 #include <linux/atomic.h>
6965 #include <linux/refcount.h>
....@@ -71,9 +67,9 @@
7167 #include <net/checksum.h>
7268 #include <net/tcp_states.h>
7369 #include <linux/net_tstamp.h>
74
-#include <net/smc.h>
7570 #include <net/l3mdev.h>
7671 #include <linux/android_kabi.h>
72
+#include <linux/android_vendor.h>
7773
7874 /*
7975 * This structure really needs to be cleaned up.
....@@ -124,19 +120,26 @@
124120 * struct sock_common - minimal network layer representation of sockets
125121 * @skc_daddr: Foreign IPv4 addr
126122 * @skc_rcv_saddr: Bound local IPv4 addr
123
+ * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
127124 * @skc_hash: hash value used with various protocol lookup tables
128125 * @skc_u16hashes: two u16 hash values used by UDP lookup tables
129126 * @skc_dport: placeholder for inet_dport/tw_dport
130127 * @skc_num: placeholder for inet_num/tw_num
128
+ * @skc_portpair: __u32 union of @skc_dport & @skc_num
131129 * @skc_family: network address family
132130 * @skc_state: Connection state
133131 * @skc_reuse: %SO_REUSEADDR setting
134132 * @skc_reuseport: %SO_REUSEPORT setting
133
+ * @skc_ipv6only: socket is IPV6 only
134
+ * @skc_net_refcnt: socket is using net ref counting
135135 * @skc_bound_dev_if: bound device index if != 0
136136 * @skc_bind_node: bind hash linkage for various protocol lookup tables
137137 * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
138138 * @skc_prot: protocol handlers inside a network family
139139 * @skc_net: reference to the network namespace of this socket
140
+ * @skc_v6_daddr: IPV6 destination address
141
+ * @skc_v6_rcv_saddr: IPV6 source address
142
+ * @skc_cookie: socket's cookie value
140143 * @skc_node: main hash linkage for various protocol lookup tables
141144 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
142145 * @skc_tx_queue_mapping: tx queue number for this connection
....@@ -144,16 +147,21 @@
144147 * @skc_flags: place holder for sk_flags
145148 * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
146149 * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
150
+ * @skc_listener: connection request listener socket (aka rsk_listener)
151
+ * [union with @skc_flags]
152
+ * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
153
+ * [union with @skc_flags]
147154 * @skc_incoming_cpu: record/match cpu processing incoming packets
155
+ * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
156
+ * [union with @skc_incoming_cpu]
157
+ * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
158
+ * [union with @skc_incoming_cpu]
148159 * @skc_refcnt: reference count
149160 *
150161 * This is the minimal network layer representation of sockets, the header
151162 * for struct sock and struct inet_timewait_sock.
152163 */
153164 struct sock_common {
154
- /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
155
- * address on 64bit arches : cf INET_MATCH()
156
- */
157165 union {
158166 __addrpair skc_addrpair;
159167 struct {
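
The kernel-doc above describes @skc_addrpair and @skc_portpair as single wider words overlaying the address and port pairs, so lookups can load and compare both halves in one operation. A minimal userspace sketch of that union-overlay idea (field names, ordering and endianness here are illustrative stand-ins, not the real layout):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: one 64-bit word overlaying two 32-bit addresses,
 * mirroring the skc_addrpair / skc_daddr / skc_rcv_saddr union shape. */
union addrpair {
	uint64_t pair;
	struct {
		uint32_t daddr;
		uint32_t rcv_saddr;
	};
};

int main(void)
{
	union addrpair a, b;

	a.daddr = 0x0a000001u;
	a.rcv_saddr = 0x0a000002u;
	b = a;

	/* A single 64-bit compare covers both addresses at once. */
	printf("pair=%llx match=%d\n", (unsigned long long)a.pair, a.pair == b.pair);
	return 0;
}
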
....@@ -237,6 +245,8 @@
237245 /* public: */
238246 };
239247
248
+struct bpf_local_storage;
249
+
240250 /**
241251 * struct sock - network layer representation of sockets
242252 * @__sk_common: shared layout with inet_timewait_sock
....@@ -250,6 +260,7 @@
250260 * @sk_dst_cache: destination cache
251261 * @sk_dst_pending_confirm: need to confirm neighbour
252262 * @sk_policy: flow policy
263
+ * @sk_rx_skb_cache: cache copy of recently accessed RX skb
253264 * @sk_receive_queue: incoming packets
254265 * @sk_wmem_alloc: transmit queue bytes committed
255266 * @sk_tsq_flags: TCP Small Queues flags
....@@ -270,6 +281,8 @@
270281 * @sk_no_check_rx: allow zero checksum in RX packets
271282 * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
272283 * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
284
+ * @sk_route_forced_caps: static, forced route capabilities
285
+ * (set in tcp_init_sock())
273286 * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
274287 * @sk_gso_max_size: Maximum GSO segment size to build
275288 * @sk_gso_max_segs: Maximum number of GSO segments
....@@ -304,10 +317,12 @@
304317 * @sk_tskey: counter to disambiguate concurrent tstamp requests
305318 * @sk_zckey: counter to order MSG_ZEROCOPY notifications
306319 * @sk_socket: Identd and reporting IO signals
307
- * @sk_user_data: RPC layer private data
320
+ * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
308321 * @sk_frag: cached page frag
309322 * @sk_peek_off: current peek_offset value
310323 * @sk_send_head: front of stuff to transmit
324
+ * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
325
+ * @sk_tx_skb_cache: cache copy of recently accessed TX skb
311326 * @sk_security: used by security modules
312327 * @sk_mark: generic packet mark
313328 * @sk_cgrp_data: cgroup data for this cgroup
....@@ -318,11 +333,14 @@
318333 * @sk_write_space: callback to indicate there is buffer space available for sending
319334 * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
320335 * @sk_backlog_rcv: callback to process the backlog
336
+ * @sk_validate_xmit_skb: ptr to an optional validate function
321337 * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
322338 * @sk_reuseport_cb: reuseport group container
339
+ * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
323340 * @sk_rcu: used during RCU grace period
324341 * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
325342 * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
343
+ * @sk_txtime_report_errors: set report errors mode for SO_TXTIME
326344 * @sk_txtime_unused: unused txtime flags
327345 */
328346 struct sock {
....@@ -369,6 +387,7 @@
369387 atomic_t sk_drops;
370388 int sk_rcvlowat;
371389 struct sk_buff_head sk_error_queue;
390
+ struct sk_buff *sk_rx_skb_cache;
372391 struct sk_buff_head sk_receive_queue;
373392 /*
374393 * The backlog queue is special, it is always used with
....@@ -397,12 +416,14 @@
397416 struct sk_filter __rcu *sk_filter;
398417 union {
399418 struct socket_wq __rcu *sk_wq;
419
+ /* private: */
400420 struct socket_wq *sk_wq_raw;
421
+ /* public: */
401422 };
402423 #ifdef CONFIG_XFRM
403424 struct xfrm_policy __rcu *sk_policy[2];
404425 #endif
405
- struct dst_entry *sk_rx_dst;
426
+ struct dst_entry __rcu *sk_rx_dst;
406427 struct dst_entry __rcu *sk_dst_cache;
407428 atomic_t sk_omem_alloc;
408429 int sk_sndbuf;
....@@ -415,6 +436,7 @@
415436 struct sk_buff *sk_send_head;
416437 struct rb_root tcp_rtx_queue;
417438 };
439
+ struct sk_buff *sk_tx_skb_cache;
418440 struct sk_buff_head sk_write_queue;
419441 __s32 sk_peek_off;
420442 int sk_write_pending;
....@@ -424,8 +446,8 @@
424446 struct timer_list sk_timer;
425447 __u32 sk_priority;
426448 __u32 sk_mark;
427
- u32 sk_pacing_rate; /* bytes per second */
428
- u32 sk_max_pacing_rate;
449
+ unsigned long sk_pacing_rate; /* bytes per second */
450
+ unsigned long sk_max_pacing_rate;
429451 struct page_frag sk_frag;
430452 netdev_features_t sk_route_caps;
431453 netdev_features_t sk_route_nocaps;
....@@ -439,31 +461,15 @@
439461 * Because of non atomicity rules, all
440462 * changes are protected by socket lock.
441463 */
442
- unsigned int __sk_flags_offset[0];
443
-#ifdef __BIG_ENDIAN_BITFIELD
444
-#define SK_FL_PROTO_SHIFT 16
445
-#define SK_FL_PROTO_MASK 0x00ff0000
446
-
447
-#define SK_FL_TYPE_SHIFT 0
448
-#define SK_FL_TYPE_MASK 0x0000ffff
449
-#else
450
-#define SK_FL_PROTO_SHIFT 8
451
-#define SK_FL_PROTO_MASK 0x0000ff00
452
-
453
-#define SK_FL_TYPE_SHIFT 16
454
-#define SK_FL_TYPE_MASK 0xffff0000
455
-#endif
456
-
457
- unsigned int sk_padding : 1,
464
+ u8 sk_padding : 1,
458465 sk_kern_sock : 1,
459466 sk_no_check_tx : 1,
460467 sk_no_check_rx : 1,
461
- sk_userlocks : 4,
462
- sk_protocol : 8,
463
- sk_type : 16;
464
-#define SK_PROTOCOL_MAX U8_MAX
465
- u16 sk_gso_max_segs;
468
+ sk_userlocks : 4;
466469 u8 sk_pacing_shift;
470
+ u16 sk_type;
471
+ u16 sk_protocol;
472
+ u16 sk_gso_max_segs;
467473 unsigned long sk_lingertime;
468474 struct proto *sk_prot_creator;
469475 rwlock_t sk_callback_lock;
....@@ -472,7 +478,7 @@
472478 u32 sk_ack_backlog;
473479 u32 sk_max_ack_backlog;
474480 kuid_t sk_uid;
475
-#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT)
481
+#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC)
476482 spinlock_t sk_peer_lock;
477483 #else
478484 /* sk_peer_lock is in the ANDROID_KABI_RESERVE(1) field below */
....@@ -515,9 +521,12 @@
515521 #endif
516522 void (*sk_destruct)(struct sock *sk);
517523 struct sock_reuseport __rcu *sk_reuseport_cb;
524
+#ifdef CONFIG_BPF_SYSCALL
525
+ struct bpf_local_storage __rcu *sk_bpf_storage;
526
+#endif
518527 struct rcu_head sk_rcu;
519528
520
-#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC) || IS_ENABLED(CONFIG_PREEMPT_RT)
529
+#if IS_ENABLED(CONFIG_DEBUG_SPINLOCK) || IS_ENABLED(CONFIG_DEBUG_LOCK_ALLOC)
521530 ANDROID_KABI_RESERVE(1);
522531 #else
523532 ANDROID_KABI_USE(1, spinlock_t sk_peer_lock);
....@@ -529,6 +538,8 @@
529538 ANDROID_KABI_RESERVE(6);
530539 ANDROID_KABI_RESERVE(7);
531540 ANDROID_KABI_RESERVE(8);
541
+
542
+ ANDROID_OEM_DATA(1);
532543 };
533544
534545 enum sk_pacing {
....@@ -537,10 +548,72 @@
537548 SK_PACING_FQ = 2,
538549 };
539550
551
+/* flag bits in sk_user_data
552
+ *
553
+ * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might
554
+ * not be suitable for copying when cloning the socket. For instance,
555
+ * it can point to a reference counted object. sk_user_data bottom
556
+ * bit is set if pointer must not be copied.
557
+ *
558
+ * - SK_USER_DATA_BPF: Mark whether sk_user_data field is
559
+ * managed/owned by a BPF reuseport array. This bit should be set
560
+ * when sk_user_data's sk is added to the bpf's reuseport_array.
561
+ *
562
+ * - SK_USER_DATA_PSOCK: Mark whether pointer stored in
563
+ * sk_user_data points to psock type. This bit should be set
564
+ * when sk_user_data is assigned to a psock object.
565
+ */
566
+#define SK_USER_DATA_NOCOPY 1UL
567
+#define SK_USER_DATA_BPF 2UL
568
+#define SK_USER_DATA_PSOCK 4UL
569
+#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
570
+ SK_USER_DATA_PSOCK)
571
+
572
+/**
573
+ * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
574
+ * @sk: socket
575
+ */
576
+static inline bool sk_user_data_is_nocopy(const struct sock *sk)
577
+{
578
+ return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY);
579
+}
580
+
540581 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
541582
542
-#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk)))
543
-#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr)
583
+/**
584
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
585
+ * only if argument flags all has been set in sk_user_data. Otherwise
586
+ * return NULL
587
+ *
588
+ * @sk: socket
589
+ * @flags: flag bits
590
+ */
591
+static inline void *
592
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
593
+ uintptr_t flags)
594
+{
595
+ uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
596
+
597
+ WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
598
+
599
+ if ((sk_user_data & flags) == flags)
600
+ return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
601
+ return NULL;
602
+}
603
+
604
+#define rcu_dereference_sk_user_data(sk) \
605
+ __rcu_dereference_sk_user_data_with_flags(sk, 0)
606
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \
607
+({ \
608
+ uintptr_t __tmp1 = (uintptr_t)(ptr), \
609
+ __tmp2 = (uintptr_t)(flags); \
610
+ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \
611
+ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \
612
+ rcu_assign_pointer(__sk_user_data((sk)), \
613
+ __tmp1 | __tmp2); \
614
+})
615
+#define rcu_assign_sk_user_data(sk, ptr) \
616
+ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
544617
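
The SK_USER_DATA_* block above is a tagged-pointer scheme: the pointed-to object is suitably aligned, so the low bits of sk_user_data are free to carry the NOCOPY/BPF/PSOCK flags, and SK_USER_DATA_PTRMASK strips them off before the pointer is used. A small standalone sketch of the same packing arithmetic (all names below are stand-ins):

#include <stdint.h>
#include <stdio.h>

#define NOCOPY  1UL
#define BPF     2UL
#define PSOCK   4UL
#define PTRMASK (~(NOCOPY | BPF | PSOCK))

int main(void)
{
	static _Alignas(8) int obj;	/* stands in for the pointed-to object */
	uintptr_t tagged = (uintptr_t)&obj | NOCOPY | BPF;	/* pointer + flag bits */
	int *p = (int *)(tagged & PTRMASK);	/* strip the flags to recover the pointer */

	printf("nocopy=%lu bpf=%lu psock=%lu recovered=%d\n",
	       (unsigned long)(tagged & NOCOPY),
	       (unsigned long)(tagged & BPF),
	       (unsigned long)(tagged & PSOCK),
	       p == &obj);
	return 0;
}
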
545618 /*
546619 * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK
....@@ -820,7 +893,6 @@
820893 SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
821894 SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
822895 SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
823
- SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
824896 SOCK_MEMALLOC, /* VM depends on this socket for swapping */
825897 SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */
826898 SOCK_FASYNC, /* fasync() active */
....@@ -835,6 +907,8 @@
835907 SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
836908 SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
837909 SOCK_TXTIME,
910
+ SOCK_XDP, /* XDP is attached */
911
+ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
838912 };
839913
840914 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
....@@ -852,6 +926,15 @@
852926 static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
853927 {
854928 __clear_bit(flag, &sk->sk_flags);
929
+}
930
+
931
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
932
+ int valbool)
933
+{
934
+ if (valbool)
935
+ sock_set_flag(sk, bit);
936
+ else
937
+ sock_reset_flag(sk, bit);
855938 }
856939
857940 static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
....@@ -885,17 +968,17 @@
885968
886969 static inline void sk_acceptq_removed(struct sock *sk)
887970 {
888
- sk->sk_ack_backlog--;
971
+ WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
889972 }
890973
891974 static inline void sk_acceptq_added(struct sock *sk)
892975 {
893
- sk->sk_ack_backlog++;
976
+ WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
894977 }
895978
896979 static inline bool sk_acceptq_is_full(const struct sock *sk)
897980 {
898
- return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
981
+ return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
899982 }
900983
901984 /*
....@@ -903,12 +986,17 @@
903986 */
904987 static inline int sk_stream_min_wspace(const struct sock *sk)
905988 {
906
- return sk->sk_wmem_queued >> 1;
989
+ return READ_ONCE(sk->sk_wmem_queued) >> 1;
907990 }
908991
909992 static inline int sk_stream_wspace(const struct sock *sk)
910993 {
911
- return sk->sk_sndbuf - sk->sk_wmem_queued;
994
+ return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
995
+}
996
+
997
+static inline void sk_wmem_queued_add(struct sock *sk, int val)
998
+{
999
+ WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
9121000 }
9131001
9141002 void sk_stream_write_space(struct sock *sk);
....@@ -993,7 +1081,7 @@
9931081 static inline void sock_rps_record_flow(const struct sock *sk)
9941082 {
9951083 #ifdef CONFIG_RPS
996
- if (static_key_false(&rfs_needed)) {
1084
+ if (static_branch_unlikely(&rfs_needed)) {
9971085 /* Reading sk->sk_rxhash might incur an expensive cache line
9981086 * miss.
9991087 *
....@@ -1004,8 +1092,12 @@
10041092 * OR an additional socket flag
10051093 * [1] : sk_state and sk_prot are in the same cache line.
10061094 */
1007
- if (sk->sk_state == TCP_ESTABLISHED)
1008
- sock_rps_record_flow_hash(sk->sk_rxhash);
1095
+ if (sk->sk_state == TCP_ESTABLISHED) {
1096
+ /* This READ_ONCE() is paired with the WRITE_ONCE()
1097
+ * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
1098
+ */
1099
+ sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
1100
+ }
10091101 }
10101102 #endif
10111103 }
....@@ -1014,15 +1106,19 @@
10141106 const struct sk_buff *skb)
10151107 {
10161108 #ifdef CONFIG_RPS
1017
- if (unlikely(sk->sk_rxhash != skb->hash))
1018
- sk->sk_rxhash = skb->hash;
1109
+ /* The following WRITE_ONCE() is paired with the READ_ONCE()
1110
+ * here, and another one in sock_rps_record_flow().
1111
+ */
1112
+ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
1113
+ WRITE_ONCE(sk->sk_rxhash, skb->hash);
10191114 #endif
10201115 }
10211116
10221117 static inline void sock_rps_reset_rxhash(struct sock *sk)
10231118 {
10241119 #ifdef CONFIG_RPS
1025
- sk->sk_rxhash = 0;
1120
+ /* Paired with READ_ONCE() in sock_rps_record_flow() */
1121
+ WRITE_ONCE(sk->sk_rxhash, 0);
10261122 #endif
10271123 }
10281124
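
The comments above document the convention that a lockless reader and its writers must both be annotated: the READ_ONCE() in sock_rps_record_flow() is paired with the WRITE_ONCE() in sock_rps_save_rxhash() and sock_rps_reset_rxhash(), so the compiler can neither tear nor refetch the access. A rough userspace analogue, using C11 relaxed atomics in place of the kernel macros:

#include <stdatomic.h>
#include <stdio.h>

/* sk_rxhash stand-in; relaxed atomics play the role of READ_ONCE/WRITE_ONCE. */
static _Atomic unsigned int rxhash;

static void save_rxhash(unsigned int skb_hash)
{
	if (atomic_load_explicit(&rxhash, memory_order_relaxed) != skb_hash)
		atomic_store_explicit(&rxhash, skb_hash, memory_order_relaxed);
}

static unsigned int record_flow(void)
{
	return atomic_load_explicit(&rxhash, memory_order_relaxed);
}

int main(void)
{
	save_rxhash(0xabcdu);
	printf("hash=%x\n", record_flow());
	return 0;
}
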
....@@ -1104,21 +1200,13 @@
11041200 void (*destroy)(struct sock *sk);
11051201 void (*shutdown)(struct sock *sk, int how);
11061202 int (*setsockopt)(struct sock *sk, int level,
1107
- int optname, char __user *optval,
1203
+ int optname, sockptr_t optval,
11081204 unsigned int optlen);
11091205 int (*getsockopt)(struct sock *sk, int level,
11101206 int optname, char __user *optval,
11111207 int __user *option);
11121208 void (*keepalive)(struct sock *sk, int valbool);
11131209 #ifdef CONFIG_COMPAT
1114
- int (*compat_setsockopt)(struct sock *sk,
1115
- int level,
1116
- int optname, char __user *optval,
1117
- unsigned int optlen);
1118
- int (*compat_getsockopt)(struct sock *sk,
1119
- int level,
1120
- int optname, char __user *optval,
1121
- int __user *option);
11221210 int (*compat_ioctl)(struct sock *sk,
11231211 unsigned int cmd, unsigned long arg);
11241212 #endif
....@@ -1130,7 +1218,9 @@
11301218 int (*sendpage)(struct sock *sk, struct page *page,
11311219 int offset, size_t size, int flags);
11321220 int (*bind)(struct sock *sk,
1133
- struct sockaddr *uaddr, int addr_len);
1221
+ struct sockaddr *addr, int addr_len);
1222
+ int (*bind_add)(struct sock *sk,
1223
+ struct sockaddr *addr, int addr_len);
11341224
11351225 int (*backlog_rcv) (struct sock *sk,
11361226 struct sk_buff *skb);
....@@ -1148,7 +1238,7 @@
11481238 unsigned int inuse_idx;
11491239 #endif
11501240
1151
- bool (*stream_memory_free)(const struct sock *sk);
1241
+ bool (*stream_memory_free)(const struct sock *sk, int wake);
11521242 bool (*stream_memory_read)(const struct sock *sk);
11531243 /* Memory pressure */
11541244 void (*enter_memory_pressure)(struct sock *sk);
....@@ -1158,6 +1248,7 @@
11581248 /*
11591249 * Pressure flag: try to collapse.
11601250 * Technical note: it is used by multiple contexts non atomically.
1251
+ * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
11611252 * All the __sk_mem_schedule() is of this nature: accounting
11621253 * is strict, actions are advisory and have some latency.
11631254 */
....@@ -1230,19 +1321,29 @@
12301321 #define sk_refcnt_debug_release(sk) do { } while (0)
12311322 #endif /* SOCK_REFCNT_DEBUG */
12321323
1233
-static inline bool sk_stream_memory_free(const struct sock *sk)
1324
+static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
12341325 {
1235
- if (sk->sk_wmem_queued >= sk->sk_sndbuf)
1326
+ if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
12361327 return false;
12371328
12381329 return sk->sk_prot->stream_memory_free ?
1239
- sk->sk_prot->stream_memory_free(sk) : true;
1330
+ sk->sk_prot->stream_memory_free(sk, wake) : true;
1331
+}
1332
+
1333
+static inline bool sk_stream_memory_free(const struct sock *sk)
1334
+{
1335
+ return __sk_stream_memory_free(sk, 0);
1336
+}
1337
+
1338
+static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
1339
+{
1340
+ return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
1341
+ __sk_stream_memory_free(sk, wake);
12401342 }
12411343
12421344 static inline bool sk_stream_is_writeable(const struct sock *sk)
12431345 {
1244
- return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
1245
- sk_stream_memory_free(sk);
1346
+ return __sk_stream_is_writeable(sk, 0);
12461347 }
12471348
12481349 static inline int sk_under_cgroup_hierarchy(struct sock *sk,
....@@ -1261,6 +1362,12 @@
12611362 return sk->sk_prot->memory_pressure != NULL;
12621363 }
12631364
1365
+static inline bool sk_under_global_memory_pressure(const struct sock *sk)
1366
+{
1367
+ return sk->sk_prot->memory_pressure &&
1368
+ !!READ_ONCE(*sk->sk_prot->memory_pressure);
1369
+}
1370
+
12641371 static inline bool sk_under_memory_pressure(const struct sock *sk)
12651372 {
12661373 if (!sk->sk_prot->memory_pressure)
....@@ -1270,7 +1377,7 @@
12701377 mem_cgroup_under_socket_pressure(sk->sk_memcg))
12711378 return true;
12721379
1273
- return !!*sk->sk_prot->memory_pressure;
1380
+ return !!READ_ONCE(*sk->sk_prot->memory_pressure);
12741381 }
12751382
12761383 static inline long
....@@ -1324,7 +1431,7 @@
13241431 {
13251432 if (!prot->memory_pressure)
13261433 return false;
1327
- return !!*prot->memory_pressure;
1434
+ return !!READ_ONCE(*prot->memory_pressure);
13281435 }
13291436
13301437
....@@ -1399,7 +1506,7 @@
13991506 /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
14001507 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
14011508 {
1402
- long val = sk->sk_prot->sysctl_mem[index];
1509
+ long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]);
14031510
14041511 #if PAGE_SIZE > SK_MEM_QUANTUM
14051512 val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
....@@ -1422,19 +1529,23 @@
14221529
14231530 static inline bool sk_wmem_schedule(struct sock *sk, int size)
14241531 {
1532
+ int delta;
1533
+
14251534 if (!sk_has_account(sk))
14261535 return true;
1427
- return size <= sk->sk_forward_alloc ||
1428
- __sk_mem_schedule(sk, size, SK_MEM_SEND);
1536
+ delta = size - sk->sk_forward_alloc;
1537
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
14291538 }
14301539
14311540 static inline bool
14321541 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
14331542 {
1543
+ int delta;
1544
+
14341545 if (!sk_has_account(sk))
14351546 return true;
1436
- return size<= sk->sk_forward_alloc ||
1437
- __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
1547
+ delta = size - sk->sk_forward_alloc;
1548
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
14381549 skb_pfmemalloc(skb);
14391550 }
14401551
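
The reworked sk_wmem_schedule()/sk_rmem_schedule() above charge only the shortfall: if sk_forward_alloc already covers part of the request, just the remaining delta is passed to __sk_mem_schedule(). A hedged userspace sketch of that accounting (mem_schedule() is a stand-in for __sk_mem_schedule() and ignores quantum rounding):

#include <stdbool.h>
#include <stdio.h>

static int forward_alloc = 3000;	/* bytes already reserved for this socket */

/* Stand-in for __sk_mem_schedule(): assume the global quota always has room. */
static bool mem_schedule(int bytes)
{
	forward_alloc += bytes;
	return true;
}

static bool wmem_schedule(int size)
{
	int delta = size - forward_alloc;	/* only the uncovered part is charged */

	return delta <= 0 || mem_schedule(delta);
}

int main(void)
{
	printf("ok=%d forward_alloc=%d\n", wmem_schedule(4096), forward_alloc);
	return 0;
}
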
....@@ -1478,11 +1589,18 @@
14781589 __sk_mem_reclaim(sk, 1 << 20);
14791590 }
14801591
1592
+DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
14811593 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
14821594 {
1483
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1484
- sk->sk_wmem_queued -= skb->truesize;
1595
+ sk_wmem_queued_add(sk, -skb->truesize);
14851596 sk_mem_uncharge(sk, skb->truesize);
1597
+ if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
1598
+ !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
1599
+ skb_ext_reset(skb);
1600
+ skb_zcopy_clear(skb, true);
1601
+ sk->sk_tx_skb_cache = skb;
1602
+ return;
1603
+ }
14861604 __kfree_skb(skb);
14871605 }
14881606
....@@ -1492,7 +1610,7 @@
14921610 sk->sk_lock.owned = 0;
14931611
14941612 /* The sk_lock has mutex_unlock() semantics: */
1495
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1613
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
14961614 }
14971615 }
14981616
....@@ -1615,15 +1733,18 @@
16151733 void sock_efree(struct sk_buff *skb);
16161734 #ifdef CONFIG_INET
16171735 void sock_edemux(struct sk_buff *skb);
1736
+void sock_pfree(struct sk_buff *skb);
16181737 #else
16191738 #define sock_edemux sock_efree
16201739 #endif
16211740
16221741 int sock_setsockopt(struct socket *sock, int level, int op,
1623
- char __user *optval, unsigned int optlen);
1742
+ sockptr_t optval, unsigned int optlen);
16241743
16251744 int sock_getsockopt(struct socket *sock, int level, int op,
16261745 char __user *optval, int __user *optlen);
1746
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
1747
+ bool timeval, bool time32);
16271748 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
16281749 int noblock, int *errcode);
16291750 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
....@@ -1663,8 +1784,6 @@
16631784 int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
16641785 int sock_no_listen(struct socket *, int);
16651786 int sock_no_shutdown(struct socket *, int);
1666
-int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *);
1667
-int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int);
16681787 int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
16691788 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
16701789 int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
....@@ -1684,11 +1803,7 @@
16841803 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
16851804 int flags);
16861805 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1687
- char __user *optval, unsigned int optlen);
1688
-int compat_sock_common_getsockopt(struct socket *sock, int level,
1689
- int optname, char __user *optval, int __user *optlen);
1690
-int compat_sock_common_setsockopt(struct socket *sock, int level,
1691
- int optname, char __user *optval, unsigned int optlen);
1806
+ sockptr_t optval, unsigned int optlen);
16921807
16931808 void sk_common_release(struct sock *sk);
16941809
....@@ -1696,7 +1811,12 @@
16961811 * Default socket callbacks and setup code
16971812 */
16981813
1699
-/* Initialise core socket variables */
1814
+/* Initialise core socket variables using an explicit uid. */
1815
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);
1816
+
1817
+/* Initialise core socket variables.
1818
+ * Assumes struct socket *sock is embedded in a struct socket_alloc.
1819
+ */
17001820 void sock_init_data(struct socket *sock, struct sock *sk);
17011821
17021822 /*
....@@ -1827,7 +1947,7 @@
18271947 {
18281948 WARN_ON(parent->sk);
18291949 write_lock_bh(&sk->sk_callback_lock);
1830
- rcu_assign_pointer(sk->sk_wq, parent->wq);
1950
+ rcu_assign_pointer(sk->sk_wq, &parent->wq);
18311951 parent->sk = sk;
18321952 sk_set_socket(sk, parent);
18331953 sk->sk_uid = SOCK_INODE(parent)->i_uid;
....@@ -1836,6 +1956,7 @@
18361956 }
18371957
18381958 kuid_t sock_i_uid(struct sock *sk);
1959
+unsigned long __sock_i_ino(struct sock *sk);
18391960 unsigned long sock_i_ino(struct sock *sk);
18401961
18411962 static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
....@@ -1856,10 +1977,13 @@
18561977 WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
18571978 }
18581979
1859
-static inline void sk_rethink_txhash(struct sock *sk)
1980
+static inline bool sk_rethink_txhash(struct sock *sk)
18601981 {
1861
- if (sk->sk_txhash)
1982
+ if (sk->sk_txhash) {
18621983 sk_set_txhash(sk);
1984
+ return true;
1985
+ }
1986
+ return false;
18631987 }
18641988
18651989 static inline struct dst_entry *
....@@ -1882,11 +2006,9 @@
18822006 return dst;
18832007 }
18842008
1885
-static inline void dst_negative_advice(struct sock *sk)
2009
+static inline void __dst_negative_advice(struct sock *sk)
18862010 {
18872011 struct dst_entry *ndst, *dst = __sk_dst_get(sk);
1888
-
1889
- sk_rethink_txhash(sk);
18902012
18912013 if (dst && dst->ops->negative_advice) {
18922014 ndst = dst->ops->negative_advice(dst);
....@@ -1897,6 +2019,12 @@
18972019 sk->sk_dst_pending_confirm = 0;
18982020 }
18992021 }
2022
+}
2023
+
2024
+static inline void dst_negative_advice(struct sock *sk)
2025
+{
2026
+ sk_rethink_txhash(sk);
2027
+ __dst_negative_advice(sk);
19002028 }
19012029
19022030 static inline void
....@@ -1941,8 +2069,8 @@
19412069
19422070 static inline void sk_dst_confirm(struct sock *sk)
19432071 {
1944
- if (!sk->sk_dst_pending_confirm)
1945
- sk->sk_dst_pending_confirm = 1;
2072
+ if (!READ_ONCE(sk->sk_dst_pending_confirm))
2073
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
19462074 }
19472075
19482076 static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
....@@ -1952,10 +2080,10 @@
19522080 unsigned long now = jiffies;
19532081
19542082 /* avoid dirtying neighbour */
1955
- if (n->confirmed != now)
1956
- n->confirmed = now;
1957
- if (sk && sk->sk_dst_pending_confirm)
1958
- sk->sk_dst_pending_confirm = 0;
2083
+ if (READ_ONCE(n->confirmed) != now)
2084
+ WRITE_ONCE(n->confirmed, now);
2085
+ if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
2086
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
19592087 }
19602088 }
19612089
....@@ -2020,7 +2148,7 @@
20202148 skb->len += copy;
20212149 skb->data_len += copy;
20222150 skb->truesize += copy;
2023
- sk->sk_wmem_queued += copy;
2151
+ sk_wmem_queued_add(sk, copy);
20242152 sk_mem_charge(sk, copy);
20252153 return 0;
20262154 }
....@@ -2029,7 +2157,7 @@
20292157 * sk_wmem_alloc_get - returns write allocations
20302158 * @sk: socket
20312159 *
2032
- * Returns sk_wmem_alloc minus initial offset of one
2160
+ * Return: sk_wmem_alloc minus initial offset of one
20332161 */
20342162 static inline int sk_wmem_alloc_get(const struct sock *sk)
20352163 {
....@@ -2040,7 +2168,7 @@
20402168 * sk_rmem_alloc_get - returns read allocations
20412169 * @sk: socket
20422170 *
2043
- * Returns sk_rmem_alloc
2171
+ * Return: sk_rmem_alloc
20442172 */
20452173 static inline int sk_rmem_alloc_get(const struct sock *sk)
20462174 {
....@@ -2051,7 +2179,7 @@
20512179 * sk_has_allocations - check if allocations are outstanding
20522180 * @sk: socket
20532181 *
2054
- * Returns true if socket has write or read allocations
2182
+ * Return: true if socket has write or read allocations
20552183 */
20562184 static inline bool sk_has_allocations(const struct sock *sk)
20572185 {
....@@ -2062,7 +2190,7 @@
20622190 * skwq_has_sleeper - check if there are any waiting processes
20632191 * @wq: struct socket_wq
20642192 *
2065
- * Returns true if socket_wq has waiting processes
2193
+ * Return: true if socket_wq has waiting processes
20662194 *
20672195 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
20682196 * barrier call. They were added due to the race found within the tcp code.
....@@ -2101,18 +2229,12 @@
21012229 * @p: poll_table
21022230 *
21032231 * See the comments in the wq_has_sleeper function.
2104
- *
2105
- * Do not derive sock from filp->private_data here. An SMC socket establishes
2106
- * an internal TCP socket that is used in the fallback case. All socket
2107
- * operations on the SMC socket are then forwarded to the TCP socket. In case of
2108
- * poll, the filp->private_data pointer references the SMC socket because the
2109
- * TCP socket has no file assigned.
21102232 */
21112233 static inline void sock_poll_wait(struct file *filp, struct socket *sock,
21122234 poll_table *p)
21132235 {
21142236 if (!poll_does_not_wait(p)) {
2115
- poll_wait(filp, &sock->wq->wait, p);
2237
+ poll_wait(filp, &sock->wq.wait, p);
21162238 /* We need to be sure we are in sync with the
21172239 * socket flags modification.
21182240 *
....@@ -2152,10 +2274,36 @@
21522274 sk_mem_charge(sk, skb->truesize);
21532275 }
21542276
2277
+static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
2278
+{
2279
+ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
2280
+ skb_orphan(skb);
2281
+ skb->destructor = sock_efree;
2282
+ skb->sk = sk;
2283
+ return true;
2284
+ }
2285
+ return false;
2286
+}
2287
+
2288
+static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
2289
+{
2290
+ skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
2291
+ if (skb) {
2292
+ if (sk_rmem_schedule(sk, skb, skb->truesize)) {
2293
+ skb_set_owner_r(skb, sk);
2294
+ return skb;
2295
+ }
2296
+ __kfree_skb(skb);
2297
+ }
2298
+ return NULL;
2299
+}
2300
+
21552301 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
21562302 unsigned long expires);
21572303
21582304 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
2305
+
2306
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);
21592307
21602308 int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
21612309 struct sk_buff *skb, unsigned int flags,
....@@ -2174,8 +2322,13 @@
21742322 static inline int sock_error(struct sock *sk)
21752323 {
21762324 int err;
2177
- if (likely(!sk->sk_err))
2325
+
2326
+ /* Avoid an atomic operation for the common case.
2327
+ * This is racy since another cpu/thread can change sk_err under us.
2328
+ */
2329
+ if (likely(data_race(!sk->sk_err)))
21782330 return 0;
2331
+
21792332 err = xchg(&sk->sk_err, 0);
21802333 return -err;
21812334 }
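
The comment above explains why sock_error() peeks at sk_err with a plain, data_race()-annotated load first and only falls back to the clearing xchg when an error is actually pending. A small userspace analogue of that fast-path-then-exchange idiom:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int sk_err;

/* Fast path: a relaxed peek avoids the read-modify-write when no error is
 * pending; the exchange both reads and clears the error when one is set. */
static int sock_error_like(void)
{
	if (!atomic_load_explicit(&sk_err, memory_order_relaxed))
		return 0;
	return -atomic_exchange(&sk_err, 0);
}

int main(void)
{
	atomic_store(&sk_err, 111);	/* 111 == ECONNREFUSED on Linux */
	printf("%d then %d\n", sock_error_like(), sock_error_like());
	return 0;
}
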
....@@ -2235,10 +2388,14 @@
22352388
22362389 static inline void sk_stream_moderate_sndbuf(struct sock *sk)
22372390 {
2238
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) {
2239
- sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
2240
- sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF);
2241
- }
2391
+ u32 val;
2392
+
2393
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
2394
+ return;
2395
+
2396
+ val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
2397
+
2398
+ WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
22422399 }
22432400
22442401 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
....@@ -2249,16 +2406,22 @@
22492406 * @sk: socket
22502407 *
22512408 * Use the per task page_frag instead of the per socket one for
2252
- * optimization when we know that we're in the normal context and owns
2409
+ * optimization when we know that we're in process context and own
22532410 * everything that's associated with %current.
22542411 *
2255
- * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
2256
- * inside other socket operations and end up recursing into sk_page_frag()
2257
- * while it's already in use.
2412
+ * Both direct reclaim and page faults can nest inside other
2413
+ * socket operations and end up recursing into sk_page_frag()
2414
+ * while it's already in use: explicitly avoid task page_frag
2415
+ * usage if the caller is potentially doing any of them.
2416
+ * This assumes that page fault handlers use the GFP_NOFS flags.
2417
+ *
2418
+ * Return: a per task page_frag if context allows that,
2419
+ * otherwise a per socket one.
22582420 */
22592421 static inline struct page_frag *sk_page_frag(struct sock *sk)
22602422 {
2261
- if (gfpflags_normal_context(sk->sk_allocation))
2423
+ if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) ==
2424
+ (__GFP_DIRECT_RECLAIM | __GFP_FS))
22622425 return &current->task_frag;
22632426
22642427 return &sk->sk_frag;
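
The sk_page_frag() comment above spells out the rule: the per-task page_frag may only be used for allocations that can direct-reclaim and enter the FS, and that are not drawing on the __GFP_MEMALLOC reserves. A toy sketch of that mask test (the flag values below are made up for illustration; the real ones live in gfp.h):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical bit values just for illustration. */
#define GFP_DIRECT_RECLAIM 0x1u
#define GFP_FS             0x2u
#define GFP_MEMALLOC       0x4u

static bool can_use_task_frag(unsigned int gfp)
{
	/* Must be allowed to sleep/reclaim and enter the FS, and must not be
	 * an emergency-reserve (memalloc) allocation. */
	return (gfp & (GFP_DIRECT_RECLAIM | GFP_MEMALLOC | GFP_FS)) ==
	       (GFP_DIRECT_RECLAIM | GFP_FS);
}

int main(void)
{
	printf("GFP_KERNEL-like: %d\n", can_use_task_frag(GFP_DIRECT_RECLAIM | GFP_FS));
	printf("GFP_ATOMIC-like: %d\n", can_use_task_frag(0));
	printf("GFP_NOFS-like:   %d\n", can_use_task_frag(GFP_DIRECT_RECLAIM));
	return 0;
}
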
....@@ -2266,16 +2429,12 @@
22662429
22672430 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
22682431
2269
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2270
- int sg_start, int *sg_curr, unsigned int *sg_size,
2271
- int first_coalesce);
2272
-
22732432 /*
22742433 * Default write policy as shown to user space via poll/select/SIGIO
22752434 */
22762435 static inline bool sock_writeable(const struct sock *sk)
22772436 {
2278
- return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
2437
+ return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
22792438 }
22802439
22812440 static inline gfp_t gfp_any(void)
....@@ -2295,7 +2454,9 @@
22952454
22962455 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
22972456 {
2298
- return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1;
2457
+ int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len);
2458
+
2459
+ return v ?: 1;
22992460 }
23002461
23012462 /* Alas, with timeout socket operations are not restartable.
....@@ -2314,7 +2475,7 @@
23142475 * using skb->cb[] would keep using it directly and utilize its
23152476 * alignment guarantee.
23162477 */
2317
-#define SOCK_SKB_CB_OFFSET ((FIELD_SIZEOF(struct sk_buff, cb) - \
2478
+#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
23182479 sizeof(struct sock_skb_cb)))
23192480
23202481 #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
....@@ -2411,29 +2572,47 @@
24112572 __sock_recv_ts_and_drops(msg, sk, skb);
24122573 else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
24132574 sock_write_timestamp(sk, skb->tstamp);
2414
- else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
2575
+ else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
24152576 sock_write_timestamp(sk, 0);
24162577 }
24172578
24182579 void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
24192580
24202581 /**
2421
- * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
2582
+ * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
24222583 * @sk: socket sending this packet
24232584 * @tsflags: timestamping flags to use
24242585 * @tx_flags: completed with instructions for time stamping
2586
+ * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno)
24252587 *
24262588 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
24272589 */
2428
-static inline void sock_tx_timestamp(const struct sock *sk, __u16 tsflags,
2429
- __u8 *tx_flags)
2590
+static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
2591
+ __u8 *tx_flags, __u32 *tskey)
24302592 {
2431
- if (unlikely(tsflags))
2593
+ if (unlikely(tsflags)) {
24322594 __sock_tx_timestamp(tsflags, tx_flags);
2595
+ if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
2596
+ tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
2597
+ *tskey = sk->sk_tskey++;
2598
+ }
24332599 if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
24342600 *tx_flags |= SKBTX_WIFI_STATUS;
24352601 }
24362602
2603
+static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
2604
+ __u8 *tx_flags)
2605
+{
2606
+ _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
2607
+}
2608
+
2609
+static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
2610
+{
2611
+ _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
2612
+ &skb_shinfo(skb)->tskey);
2613
+}
2614
+
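
_sock_tx_timestamp() above fills @tskey from sk_tskey when SOF_TIMESTAMPING_OPT_ID is requested, so userspace can correlate each send with its timestamp notification. A hedged userspace sketch of enabling that from an application (error handling is minimal; reading the MSG_ERRQUEUE notification, where the id comes back in ee_data, is omitted):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int val = SOF_TIMESTAMPING_TX_SOFTWARE |
		  SOF_TIMESTAMPING_SOFTWARE |
		  SOF_TIMESTAMPING_OPT_ID;

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
		perror("SO_TIMESTAMPING");
	else
		printf("tx timestamping with OPT_ID enabled on fd %d\n", fd);
	return 0;
}
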
2615
+DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
24372616 /**
24382617 * sk_eat_skb - Release a skb if it is no longer needed
24392618 * @sk: socket to eat this skb from
....@@ -2445,6 +2624,12 @@
24452624 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
24462625 {
24472626 __skb_unlink(skb, &sk->sk_receive_queue);
2627
+ if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
2628
+ !sk->sk_rx_skb_cache) {
2629
+ sk->sk_rx_skb_cache = skb;
2630
+ skb_orphan(skb);
2631
+ return;
2632
+ }
24482633 __kfree_skb(skb);
24492634 }
24502635
....@@ -2460,16 +2645,14 @@
24602645 write_pnet(&sk->sk_net, net);
24612646 }
24622647
2463
-static inline struct sock *skb_steal_sock(struct sk_buff *skb)
2648
+static inline bool
2649
+skb_sk_is_prefetched(struct sk_buff *skb)
24642650 {
2465
- if (skb->sk) {
2466
- struct sock *sk = skb->sk;
2467
-
2468
- skb->destructor = NULL;
2469
- skb->sk = NULL;
2470
- return sk;
2471
- }
2472
- return NULL;
2651
+#ifdef CONFIG_INET
2652
+ return skb->destructor == sock_pfree;
2653
+#else
2654
+ return false;
2655
+#endif /* CONFIG_INET */
24732656 }
24742657
24752658 /* This helper checks if a socket is a full socket,
....@@ -2480,8 +2663,38 @@
24802663 return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
24812664 }
24822665
2666
+static inline bool
2667
+sk_is_refcounted(struct sock *sk)
2668
+{
2669
+ /* Only full sockets have sk->sk_flags. */
2670
+ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
2671
+}
2672
+
2673
+/**
2674
+ * skb_steal_sock - steal a socket from an sk_buff
2675
+ * @skb: sk_buff to steal the socket from
2676
+ * @refcounted: is set to true if the socket is reference-counted
2677
+ */
2678
+static inline struct sock *
2679
+skb_steal_sock(struct sk_buff *skb, bool *refcounted)
2680
+{
2681
+ if (skb->sk) {
2682
+ struct sock *sk = skb->sk;
2683
+
2684
+ *refcounted = true;
2685
+ if (skb_sk_is_prefetched(skb))
2686
+ *refcounted = sk_is_refcounted(sk);
2687
+ skb->destructor = NULL;
2688
+ skb->sk = NULL;
2689
+ return sk;
2690
+ }
2691
+ *refcounted = false;
2692
+ return NULL;
2693
+}
2694
+
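
The @refcounted out-parameter of skb_steal_sock() tells the caller whether it inherited a reference along with the socket. A hedged sketch of the calling convention as a receive path might use it; deliver() is a hypothetical handler, and the fragment assumes the usual <net/sock.h> context:

static int deliver(struct sock *sk, struct sk_buff *skb);	/* hypothetical */

static int steal_and_deliver(struct sk_buff *skb)
{
	bool refcounted;
	struct sock *sk = skb_steal_sock(skb, &refcounted);
	int ret;

	if (!sk)
		return -ENOENT;		/* no prefetched socket: fall back to a normal lookup */

	ret = deliver(sk, skb);
	if (refcounted)
		sock_put(sk);		/* drop the reference we took over from the skb */
	return ret;
}
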
24832695 /* Checks if this SKB belongs to an HW offloaded socket
24842696 * and whether any SW fallbacks are required based on dev.
2697
+ * Check decrypted mark in case skb_orphan() cleared socket.
24852698 */
24862699 static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
24872700 struct net_device *dev)
....@@ -2489,8 +2702,15 @@
24892702 #ifdef CONFIG_SOCK_VALIDATE_XMIT
24902703 struct sock *sk = skb->sk;
24912704
2492
- if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb)
2705
+ if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
24932706 skb = sk->sk_validate_xmit_skb(sk, dev, skb);
2707
+#ifdef CONFIG_TLS_DEVICE
2708
+ } else if (unlikely(skb->decrypted)) {
2709
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
2710
+ kfree_skb(skb);
2711
+ skb = NULL;
2712
+#endif
2713
+ }
24942714 #endif
24952715
24962716 return skb;
....@@ -2504,9 +2724,7 @@
25042724 return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
25052725 }
25062726
2507
-void sock_enable_timestamp(struct sock *sk, int flag);
2508
-int sock_get_timestamp(struct sock *, struct timeval __user *);
2509
-int sock_get_timestampns(struct sock *, struct timespec __user *);
2727
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
25102728 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
25112729 int type);
25122730
....@@ -2536,22 +2754,25 @@
25362754 extern __u32 sysctl_wmem_default;
25372755 extern __u32 sysctl_rmem_default;
25382756
2757
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
2758
+DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2759
+
25392760 static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
25402761 {
25412762 /* Does this proto have per netns sysctl_wmem ? */
25422763 if (proto->sysctl_wmem_offset)
2543
- return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
2764
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
25442765
2545
- return *proto->sysctl_wmem;
2766
+ return READ_ONCE(*proto->sysctl_wmem);
25462767 }
25472768
25482769 static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
25492770 {
25502771 /* Does this proto have per netns sysctl_rmem ? */
25512772 if (proto->sysctl_rmem_offset)
2552
- return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
2773
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
25532774
2554
- return *proto->sysctl_rmem;
2775
+ return READ_ONCE(*proto->sysctl_rmem);
25552776 }
25562777
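
sk_get_wmem0()/sk_get_rmem0() above pick the per-netns sysctl when the proto records a byte offset into struct net, and fall back to the global value otherwise. A small userspace sketch of that offset-based lookup (struct and field names are stand-ins):

#include <stddef.h>
#include <stdio.h>

struct netns { int other; int sysctl_wmem; };
struct protolike { size_t sysctl_wmem_offset; int fallback_wmem; };

static int get_wmem0(const struct netns *net, const struct protolike *p)
{
	if (p->sysctl_wmem_offset)	/* per-netns value present */
		return *(const int *)((const char *)net + p->sysctl_wmem_offset);
	return p->fallback_wmem;	/* global default otherwise */
}

int main(void)
{
	struct netns ns = { .sysctl_wmem = 16384 };
	struct protolike tcp_like = { .sysctl_wmem_offset = offsetof(struct netns, sysctl_wmem) };
	struct protolike udp_like = { .fallback_wmem = 4096 };

	printf("%d %d\n", get_wmem0(&ns, &tcp_like), get_wmem0(&ns, &udp_like));
	return 0;
}
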
25572778 /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
....@@ -2560,9 +2781,9 @@
25602781 */
25612782 static inline void sk_pacing_shift_update(struct sock *sk, int val)
25622783 {
2563
- if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val)
2784
+ if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val)
25642785 return;
2565
- sk->sk_pacing_shift = val;
2786
+ WRITE_ONCE(sk->sk_pacing_shift, val);
25662787 }
25672788
25682789 /* if a socket is bound to a device, check that the given device
....@@ -2584,4 +2805,19 @@
25842805 return false;
25852806 }
25862807
2808
+void sock_def_readable(struct sock *sk);
2809
+
2810
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
2811
+void sock_enable_timestamps(struct sock *sk);
2812
+void sock_no_linger(struct sock *sk);
2813
+void sock_set_keepalive(struct sock *sk);
2814
+void sock_set_priority(struct sock *sk, u32 priority);
2815
+void sock_set_rcvbuf(struct sock *sk, int val);
2816
+void sock_set_mark(struct sock *sk, u32 val);
2817
+void sock_set_reuseaddr(struct sock *sk);
2818
+void sock_set_reuseport(struct sock *sk);
2819
+void sock_set_sndtimeo(struct sock *sk, s64 secs);
2820
+
2821
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
2822
+
25872823 #endif /* _SOCK_H */
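
The sock_set_*() helpers declared above let in-kernel users tune a socket without going through the setsockopt path. A hedged sketch of how a kernel-side caller might use them after sock_create_kern(); the surrounding function and its callers are hypothetical and error handling is trimmed:

static int make_kernel_tcp_sock(struct net *net, struct socket **res)
{
	int err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, res);

	if (err)
		return err;
	sock_set_reuseaddr((*res)->sk);
	sock_set_keepalive((*res)->sk);
	sock_set_sndtimeo((*res)->sk, 5);	/* 5 second send timeout */
	return 0;
}
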