From 6778948f9de86c3cfaf36725a7c87dcff9ba247f Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 08:20:59 +0000
Subject: [PATCH] kernel_5.10 no rt
---
kernel/net/core/sock.c | 1096 +++++++++++++++++++++++++++++++++-----------------------
1 files changed, 641 insertions(+), 455 deletions(-)
diff --git a/kernel/net/core/sock.c b/kernel/net/core/sock.c
index d52850e..0506590 100644
--- a/kernel/net/core/sock.c
+++ b/kernel/net/core/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,7 +6,6 @@
*
* Generic socket support routines. Memory allocators, socket lock/release
* handler for protocols to use and generic option handler.
- *
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -81,12 +81,6 @@
* Arnaldo C. Melo : cleanups, use skb_queue_purge
*
* To Fix:
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -119,6 +113,7 @@
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -137,8 +132,10 @@
#include <linux/filter.h>
#include <net/sock_reuseport.h>
+#include <net/bpf_sk_storage.h>
#include <trace/events/sock.h>
+#include <trace/hooks/sched.h>
#include <net/tcp.h>
#include <net/busy_poll.h>
@@ -335,14 +332,66 @@
}
EXPORT_SYMBOL(__sk_backlog_rcv);
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
- struct timeval tv;
+ struct __kernel_sock_timeval tv;
- if (optlen < sizeof(tv))
- return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
- return -EFAULT;
+ if (timeo == MAX_SCHEDULE_TIMEOUT) {
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ } else {
+ tv.tv_sec = timeo / HZ;
+ tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
+ }
+
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
+ *(struct old_timeval32 *)optval = tv32;
+ return sizeof(tv32);
+ }
+
+ if (old_timeval) {
+ struct __kernel_old_timeval old_tv;
+ old_tv.tv_sec = tv.tv_sec;
+ old_tv.tv_usec = tv.tv_usec;
+ *(struct __kernel_old_timeval *)optval = old_tv;
+ return sizeof(old_tv);
+ }
+
+ *(struct __kernel_sock_timeval *)optval = tv;
+ return sizeof(tv);
+}
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+ struct old_timeval32 tv32;
+
+ if (optlen < sizeof(tv32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
+ return -EFAULT;
+ tv.tv_sec = tv32.tv_sec;
+ tv.tv_usec = tv32.tv_usec;
+ } else if (old_timeval) {
+ struct __kernel_old_timeval old_tv;
+
+ if (optlen < sizeof(old_tv))
+ return -EINVAL;
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
+ return -EFAULT;
+ tv.tv_sec = old_tv.tv_sec;
+ tv.tv_usec = old_tv.tv_usec;
+ } else {
+ if (optlen < sizeof(tv))
+ return -EINVAL;
+ if (copy_from_sockptr(&tv, optval, sizeof(tv)))
+ return -EFAULT;
+ }
if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
return -EDOM;
@@ -360,21 +409,9 @@
*timeo_p = MAX_SCHEDULE_TIMEOUT;
if (tv.tv_sec == 0 && tv.tv_usec == 0)
return 0;
- if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
+ if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
+ *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
return 0;
-}
-
-static void sock_warn_obsolete_bsdism(const char *name)
-{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
}
static bool sock_needs_netstamp(const struct sock *sk)
@@ -472,8 +509,8 @@
rc = sk_backlog_rcv(sk, skb);
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
bh_unlock_sock(sk);
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
@@ -520,19 +557,55 @@
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_setbindtodevice(struct sock *sk, char __user *optval,
- int optlen)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+
+ /* Sorry... */
+ ret = -EPERM;
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
+ goto out;
+
+ ret = -EINVAL;
+ if (ifindex < 0)
+ goto out;
+
+ sk->sk_bound_dev_if = ifindex;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ sk_dst_reset(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
+{
+ int ret;
+
+ if (lock_sk)
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
+static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
int index;
-
- /* Sorry... */
- ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
- goto out;
ret = -EINVAL;
if (optlen < 0)
@@ -548,7 +621,7 @@
memset(devname, 0, sizeof(devname));
ret = -EFAULT;
- if (copy_from_user(devname, optval, optlen))
+ if (copy_from_sockptr(devname, optval, optlen))
goto out;
index = 0;
@@ -565,13 +638,7 @@
goto out;
}
- lock_sock(sk);
- sk->sk_bound_dev_if = index;
- sk_dst_reset(sk);
- release_sock(sk);
-
- ret = 0;
-
+ return sock_bindtoindex(sk, index, true);
out:
#endif
@@ -618,14 +685,6 @@
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
-{
- if (valbool)
- sock_set_flag(sk, bit);
- else
- sock_reset_flag(sk, bit);
-}
-
bool sk_mc_loop(struct sock *sk)
{
if (dev_recursion_level())
@@ -645,13 +704,133 @@
}
EXPORT_SYMBOL(sk_mc_loop);
+void sock_set_reuseaddr(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_lingertime = 0;
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+ lock_sock(sk);
+ sk->sk_priority = priority;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs)
+{
+ lock_sock(sk);
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ sk->sk_sndtimeo = secs * HZ;
+ else
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+ if (val) {
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+}
+
+void sock_enable_timestamps(struct sock *sk)
+{
+ lock_sock(sk);
+ __sock_set_timestamps(sk, true, false, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_enable_timestamps);
+
+void sock_set_keepalive(struct sock *sk)
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
+static void __sock_set_mark(struct sock *sk, u32 val)
+{
+ if (val != sk->sk_mark) {
+ sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
+}
+
+void sock_set_mark(struct sock *sk, u32 val)
+{
+ lock_sock(sk);
+ __sock_set_mark(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_mark);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock_txtime sk_txtime;
struct sock *sk = sock->sk;
@@ -670,7 +849,7 @@
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
valbool = val ? 1 : 0;
@@ -709,10 +888,15 @@
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
+ /* Ensure val * 2 fits into an int, to prevent max_t()
+ * from treating it as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -722,6 +906,12 @@
ret = -EPERM;
break;
}
+
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ if (val < 0)
+ val = 0;
goto set_sndbuf;
case SO_RCVBUF:
@@ -730,25 +920,7 @@
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
@@ -756,7 +928,12 @@
ret = -EPERM;
break;
}
- goto set_rcvbuf;
+
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
case SO_KEEPALIVE:
if (sk->sk_prot->keepalive)
@@ -785,7 +962,7 @@
ret = -EINVAL; /* 1003.1g */
break;
}
- if (copy_from_user(&ling, optval, sizeof(ling))) {
+ if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
ret = -EFAULT;
break;
}
@@ -803,7 +980,6 @@
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
case SO_PASSCRED:
@@ -813,22 +989,20 @@
clear_bit(SOCK_PASSCRED, &sock->flags);
break;
- case SO_TIMESTAMP:
- case SO_TIMESTAMPNS:
- if (valbool) {
- if (optname == SO_TIMESTAMP)
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- else
- sock_set_flag(sk, SOCK_RCVTSTAMPNS);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- }
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
break;
-
- case SO_TIMESTAMPING:
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ case SO_TIMESTAMPING_NEW:
+ case SO_TIMESTAMPING_OLD:
if (val & ~SOF_TIMESTAMPING_MASK) {
ret = -EINVAL;
break;
@@ -856,6 +1030,8 @@
}
sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
@@ -870,67 +1046,65 @@
if (sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
- case SO_RCVTIMEO:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
+ optlen, optname == SO_RCVTIMEO_OLD);
break;
- case SO_SNDTIMEO:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
+ optlen, optname == SO_SNDTIMEO_OLD);
break;
- case SO_ATTACH_FILTER:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
+ case SO_ATTACH_FILTER: {
+ struct sock_fprog fprog;
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
-
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_BPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_attach_bpf(ufd, sk);
}
break;
- case SO_ATTACH_REUSEPORT_CBPF:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
+ case SO_ATTACH_REUSEPORT_CBPF: {
+ struct sock_fprog fprog;
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
-
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_reuseport_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_reuseport_attach_bpf(ufd, sk);
}
+ break;
+
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
break;
case SO_DETACH_FILTER:
@@ -951,10 +1125,12 @@
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- else
- sk->sk_mark = val;
+ break;
+ }
+
+ __sock_set_mark(sk, val);
break;
case SO_RXQ_OVFL:
@@ -995,15 +1171,23 @@
#endif
case SO_MAX_PACING_RATE:
- if (val != ~0U)
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
+ sk->sk_max_pacing_rate = ulval;
+ sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
break;
-
+ }
case SO_INCOMING_CPU:
WRITE_ONCE(sk->sk_incoming_cpu, val);
break;
@@ -1015,7 +1199,10 @@
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (!((sk->sk_type == SOCK_STREAM &&
+ sk->sk_protocol == IPPROTO_TCP) ||
+ (sk->sk_type == SOCK_DGRAM &&
+ sk->sk_protocol == IPPROTO_UDP)))
ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
@@ -1029,23 +1216,35 @@
break;
case SO_TXTIME:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else if (optlen != sizeof(struct sock_txtime)) {
+ if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL;
- } else if (copy_from_user(&sk_txtime, optval,
+ break;
+ } else if (copy_from_sockptr(&sk_txtime, optval,
sizeof(struct sock_txtime))) {
ret = -EFAULT;
+ break;
} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL;
- } else {
- sock_valbool_flag(sk, SOCK_TXTIME, true);
- sk->sk_clockid = sk_txtime.clockid;
- sk->sk_txtime_deadline_mode =
- !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
- sk->sk_txtime_report_errors =
- !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
}
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safe guards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+
+ case SO_BINDTOIFINDEX:
+ ret = sock_bindtoindex_locked(sk, val);
break;
default:
@@ -1101,8 +1300,11 @@
union {
int val;
u64 val64;
+ unsigned long ulval;
struct linger ling;
- struct timeval tm;
+ struct old_timeval32 tm32;
+ struct __kernel_old_timeval tm;
+ struct __kernel_sock_timeval stm;
struct sock_txtime txtime;
} v;
@@ -1186,42 +1388,38 @@
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
- case SO_TIMESTAMP:
+ case SO_TIMESTAMP_OLD:
v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_TSTAMP_NEW) &&
!sock_flag(sk, SOCK_RCVTSTAMPNS);
break;
- case SO_TIMESTAMPNS:
- v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ case SO_TIMESTAMPNS_OLD:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
break;
- case SO_TIMESTAMPING:
+ case SO_TIMESTAMP_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
+ break;
+
+ case SO_TIMESTAMPNS_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
+ break;
+
+ case SO_TIMESTAMPING_OLD:
v.val = sk->sk_tsflags;
break;
- case SO_RCVTIMEO:
- lv = sizeof(struct timeval);
- if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
- }
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
break;
- case SO_SNDTIMEO:
- lv = sizeof(struct timeval);
- if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
- }
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
break;
case SO_RCVLOWAT:
@@ -1354,7 +1552,13 @@
#endif
case SO_MAX_PACING_RATE:
- v.val = sk->sk_max_pacing_rate;
+ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
+ lv = sizeof(v.ulval);
+ v.ulval = sk->sk_max_pacing_rate;
+ } else {
+ /* 32bit version */
+ v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
+ }
break;
case SO_INCOMING_CPU:
@@ -1405,6 +1609,10 @@
SOF_TXTIME_REPORT_ERRORS : 0;
break;
+ case SO_BINDTOIFINDEX:
+ v.val = sk->sk_bound_dev_if;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -1452,13 +1660,14 @@
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1584,6 +1793,10 @@
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_sk_storage_free(sk);
+#endif
+
if (atomic_read(&sk->sk_omem_alloc))
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
@@ -1670,112 +1883,121 @@
*/
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
- struct sock *newsk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct sk_filter *filter;
bool is_charged = true;
+ struct sock *newsk;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
- if (newsk != NULL) {
- struct sk_filter *filter;
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
+ if (!newsk)
+ goto out;
- sock_copy(newsk, sk);
+ sock_copy(newsk, sk);
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk->sk_prot_creator = prot;
- /* SANITY */
- if (likely(newsk->sk_net_refcnt))
- get_net(sock_net(newsk));
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_backlog.len = 0;
-
- atomic_set(&newsk->sk_rmem_alloc, 0);
- /*
- * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
- */
- refcount_set(&newsk->sk_wmem_alloc, 1);
- atomic_set(&newsk->sk_omem_alloc, 0);
- sk_init_common(newsk);
-
- newsk->sk_dst_cache = NULL;
- newsk->sk_dst_pending_confirm = 0;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- atomic_set(&newsk->sk_drops, 0);
- newsk->sk_send_head = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- atomic_set(&newsk->sk_zckey, 0);
-
- sock_reset_flag(newsk, SOCK_DONE);
-
- /* sk->sk_memcg will be populated at accept() time */
- newsk->sk_memcg = NULL;
-
- cgroup_sk_clone(&newsk->sk_cgrp_data);
-
- rcu_read_lock();
- filter = rcu_dereference(sk->sk_filter);
- if (filter != NULL)
- /* though it's an empty new sock, the charging may fail
- * if sysctl_optmem_max was changed between creation of
- * original socket and cloning
- */
- is_charged = sk_filter_charge(newsk, filter);
- RCU_INIT_POINTER(newsk->sk_filter, filter);
- rcu_read_unlock();
-
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* We need to make sure that we don't uncharge the new
- * socket if we couldn't charge it in the first place
- * as otherwise we uncharge the parent's filter.
- */
- if (!is_charged)
- RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
- RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-
- newsk->sk_err = 0;
- newsk->sk_err_soft = 0;
- newsk->sk_priority = 0;
- newsk->sk_incoming_cpu = raw_smp_processor_id();
- atomic64_set(&newsk->sk_cookie, 0);
- if (likely(newsk->sk_net_refcnt))
- sock_inuse_add(sock_net(newsk), 1);
-
- /*
- * Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
- */
- smp_wmb();
- refcount_set(&newsk->sk_refcnt, 2);
-
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
- */
- sk_refcnt_debug_inc(newsk);
- sk_set_socket(newsk, NULL);
- sk_tx_queue_clear(newsk);
- newsk->sk_wq = NULL;
-
- if (newsk->sk_prot->sockets_allocated)
- sk_sockets_allocated_inc(newsk);
-
- if (sock_needs_netstamp(sk) &&
- newsk->sk_flags & SK_FLAGS_TIMESTAMP)
- net_enable_timestamp();
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt)) {
+ get_net(sock_net(newsk));
+ sock_inuse_add(sock_net(newsk), 1);
}
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+
+ /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
+ refcount_set(&newsk->sk_wmem_alloc, 1);
+
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ atomic_set(&newsk->sk_drops, 0);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
+
+ sock_reset_flag(newsk, SOCK_DONE);
+
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
+
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ /* Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
}
@@ -1877,6 +2099,19 @@
}
EXPORT_SYMBOL(skb_set_owner_w);
+static bool can_skb_orphan_partial(const struct sk_buff *skb)
+{
+#ifdef CONFIG_TLS_DEVICE
+ /* Drivers depend on in-order delivery for crypto offload,
+ * partial orphan breaks out-of-order-OK logic.
+ */
+ if (skb->decrypted)
+ return false;
+#endif
+ return (skb->destructor == sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+}
+
/* This helper is used by netem, as it can hold packets in its
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
@@ -1888,20 +2123,10 @@
if (skb_is_tcp_pure_ack(skb))
return;
- if (skb->destructor == sock_wfree
-#ifdef CONFIG_INET
- || skb->destructor == tcp_wfree
-#endif
- ) {
- struct sock *sk = skb->sk;
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
- skb_orphan(skb);
- }
+ skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
@@ -1927,6 +2152,18 @@
sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
+
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
kuid_t sock_i_uid(struct sock *sk)
{
@@ -1956,8 +2193,10 @@
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -1981,7 +2220,7 @@
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -1999,8 +2238,10 @@
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -2025,7 +2266,7 @@
if (WARN_ON_ONCE(!mem))
return;
if (nullify)
- kzfree(mem);
+ kfree_sensitive(mem);
else
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
@@ -2058,7 +2299,7 @@
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2093,7 +2334,7 @@
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2139,7 +2380,7 @@
return -EINVAL;
sockc->mark = *(u32 *)CMSG_DATA(cmsg);
break;
- case SO_TIMESTAMPING:
+ case SO_TIMESTAMPING_OLD:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -2207,8 +2448,8 @@
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
* skb_page_frag_refill - check that a page_frag contains enough room
@@ -2233,7 +2474,8 @@
}
pfrag->offset = 0;
- if (SKB_FRAG_PAGE_ORDER) {
+ if (SKB_FRAG_PAGE_ORDER &&
+ !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP | __GFP_NOWARN |
@@ -2263,67 +2505,6 @@
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
-
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
- int first_coalesce)
-{
- int sg_curr = *sg_curr_index, use = 0, rc = 0;
- unsigned int size = *sg_curr_size;
- struct page_frag *pfrag;
- struct scatterlist *sge;
-
- len -= size;
- pfrag = sk_page_frag(sk);
-
- while (len > 0) {
- unsigned int orig_offset;
-
- if (!sk_page_frag_refill(sk, pfrag)) {
- rc = -ENOMEM;
- goto out;
- }
-
- use = min_t(int, len, pfrag->size - pfrag->offset);
-
- if (!sk_wmem_schedule(sk, use)) {
- rc = -ENOMEM;
- goto out;
- }
-
- sk_mem_charge(sk, use);
- size += use;
- orig_offset = pfrag->offset;
- pfrag->offset += use;
-
- sge = sg + sg_curr - 1;
- if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
- sge->offset + sge->length == orig_offset) {
- sge->length += use;
- } else {
- sge = sg + sg_curr;
- sg_unmark_end(sge);
- sg_set_page(sge, pfrag->page, use, orig_offset);
- get_page(pfrag->page);
- sg_curr++;
-
- if (sg_curr == MAX_SKB_FRAGS)
- sg_curr = 0;
-
- if (sg_curr == sg_start) {
- rc = -ENOSPC;
- break;
- }
- }
-
- len -= use;
- }
-out:
- *sg_curr_size = size;
- *sg_curr_index = sg_curr;
- return rc;
-}
-EXPORT_SYMBOL(sk_alloc_sg);
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
@@ -2358,7 +2539,7 @@
next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
cond_resched();
@@ -2614,20 +2795,6 @@
}
EXPORT_SYMBOL(sock_no_shutdown);
-int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_setsockopt);
-
-int sock_no_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_getsockopt);
-
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
@@ -2732,15 +2899,25 @@
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk)
+void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
+
+ if (skwq_has_sleeper(wq)) {
+ int done = 0;
+
+ trace_android_vh_do_wake_up_sync(&wq->wait, &done);
+ if (done)
+ goto out;
+
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
+ }
+
+out:
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
@@ -2754,7 +2931,7 @@
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
@@ -2795,6 +2972,13 @@
}
EXPORT_SYMBOL(sk_stop_timer);
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
+
void sock_init_data(struct socket *sock, struct sock *sk)
{
sk_init_common(sk);
@@ -2803,8 +2987,8 @@
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
@@ -2812,11 +2996,11 @@
if (sock) {
sk->sk_type = sock->type;
- sk->sk_wq = sock->wq;
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
- sk->sk_wq = NULL;
+ RCU_INIT_POINTER(sk->sk_wq, NULL);
sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
}
@@ -2859,18 +3043,18 @@
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
- sk->sk_max_pacing_rate = ~0U;
- sk->sk_pacing_rate = ~0U;
- sk->sk_pacing_shift = 10;
+ sk->sk_max_pacing_rate = ~0UL;
+ sk->sk_pacing_rate = ~0UL;
+ WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ * (Documentation/RCU/rculist_nulls.rst for details)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
@@ -2949,41 +3133,46 @@
}
EXPORT_SYMBOL(lock_sock_fast);
-int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32)
{
- struct timeval tv;
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sock_read_timestamp(sk));
- if (tv.tv_sec == -1)
- return -ENOENT;
- if (tv.tv_sec == 0) {
- ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);
- tv = ktime_to_timeval(kt);
- }
- return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestamp);
-
-int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct timespec ts;
-
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sock_read_timestamp(sk));
+ ts = ktime_to_timespec64(sock_read_timestamp(sk));
if (ts.tv_sec == -1)
return -ENOENT;
if (ts.tv_sec == 0) {
ktime_t kt = ktime_get_real();
sock_write_timestamp(sk, kt);
- ts = ktime_to_timespec(sk->sk_stamp);
+ ts = ktime_to_timespec64(kt);
}
- return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestampns);
-void sock_enable_timestamp(struct sock *sk, int flag)
+ if (timeval)
+ ts.tv_nsec /= 1000;
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+ if (time32)
+ return put_old_timespec32(&ts, userstamp);
+#endif
+#ifdef CONFIG_SPARC64
+ /* beware of padding in sparc64 timeval */
+ if (timeval && !in_compat_syscall()) {
+ struct __kernel_old_timeval __user tv = {
+ .tv_sec = ts.tv_sec,
+ .tv_usec = ts.tv_nsec,
+ };
+ if (copy_to_user(userstamp, &tv, sizeof(tv)))
+ return -EFAULT;
+ return 0;
+ }
+#endif
+ return put_timespec64(&ts, userstamp);
+}
+EXPORT_SYMBOL(sock_gettstamp);
+
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
if (!sock_flag(sk, flag)) {
unsigned long previous_flags = sk->sk_flags;
@@ -3052,20 +3241,6 @@
}
EXPORT_SYMBOL(sock_common_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_getsockopt != NULL)
- return sk->sk_prot->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_getsockopt);
-#endif
-
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -3085,7 +3260,7 @@
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
@@ -3093,27 +3268,13 @@
}
EXPORT_SYMBOL(sock_common_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_setsockopt);
-#endif
-
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);
/*
- * Observation: when sock_common_release is called, processes have
+ * Observation: when sk_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
@@ -3149,13 +3310,13 @@
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
- mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
- mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
- mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
@@ -3240,16 +3401,17 @@
core_initcall(net_inuse_init);
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
pr_err("PROTO_INUSE_NR exhausted\n");
- return;
+ return -ENOSPC;
}
set_bit(prot->inuse_idx, proto_inuse_idx);
+ return 0;
}
static void release_proto_idx(struct proto *prot)
@@ -3258,8 +3420,9 @@
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
{
+ return 0;
}
static inline void release_proto_idx(struct proto *prot)
@@ -3270,6 +3433,16 @@
{
}
#endif
+
+static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
+{
+ if (!twsk_prot)
+ return;
+ kfree(twsk_prot->twsk_slab_name);
+ twsk_prot->twsk_slab_name = NULL;
+ kmem_cache_destroy(twsk_prot->twsk_slab);
+ twsk_prot->twsk_slab = NULL;
+}
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
@@ -3308,6 +3481,8 @@
int proto_register(struct proto *prot, int alloc_slab)
{
+ int ret = -ENOBUFS;
+
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3339,25 +3514,32 @@
prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
- goto out_free_timewait_sock_slab_name;
+ goto out_free_timewait_sock_slab;
}
}
mutex_lock(&proto_list_mutex);
+ ret = assign_proto_idx(prot);
+ if (ret) {
+ mutex_unlock(&proto_list_mutex);
+ goto out_free_timewait_sock_slab;
+ }
list_add(&prot->node, &proto_list);
- assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
- return 0;
+ return ret;
-out_free_timewait_sock_slab_name:
- kfree(prot->twsk_prot->twsk_slab_name);
+out_free_timewait_sock_slab:
+ if (alloc_slab && prot->twsk_prot)
+ tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
- req_prot_cleanup(prot->rsk_prot);
+ if (alloc_slab) {
+ req_prot_cleanup(prot->rsk_prot);
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
out:
- return -ENOBUFS;
+ return ret;
}
EXPORT_SYMBOL(proto_register);
@@ -3372,12 +3554,7 @@
prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);
-
- if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(prot->twsk_prot->twsk_slab_name);
- prot->twsk_prot->twsk_slab = NULL;
- }
+ tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
@@ -3394,6 +3571,7 @@
#ifdef CONFIG_INET
if (family == AF_INET &&
protocol != IPPROTO_RAW &&
+ protocol < MAX_INET_PROTOS &&
!rcu_access_pointer(inet_protos[protocol]))
return -ENOENT;
#endif
@@ -3431,7 +3609,7 @@
return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
-static char *sock_prot_memory_pressure(struct proto *proto)
+static const char *sock_prot_memory_pressure(struct proto *proto)
{
return proto->memory_pressure != NULL ?
proto_memory_pressure(proto) ? "yes" : "no" : "NI";
@@ -3535,3 +3713,11 @@
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
--
Gitblit v1.6.2