From b22da3d8526a935aa31e086e63f60ff3246cb61c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 07:24:11 +0000
Subject: [PATCH] add stmac read MAC from EEPROM

---
 kernel/net/core/sock.c | 1102 +++++++++++++++++++++++++++++++++------------------
 1 files changed, 643 insertions(+), 459 deletions(-)

diff --git a/kernel/net/core/sock.c b/kernel/net/core/sock.c
index d52850e..4c630e7 100644
--- a/kernel/net/core/sock.c
+++ b/kernel/net/core/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
@@ -5,7 +6,6 @@
  *
  *		Generic socket support routines. Memory allocators, socket lock/release
  *		handler for protocols to use and generic option handler.
- *
  *
  *		Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -81,12 +81,6 @@
  *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
  *
  *	To Fix:
- *
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -119,6 +113,7 @@
 #include <linux/static_key.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 
@@ -137,8 +132,10 @@
 #include <linux/filter.h>
 #include <net/sock_reuseport.h>
+#include <net/bpf_sk_storage.h>
 
 #include <trace/events/sock.h>
+#include <trace/hooks/sched.h>
 
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
@@ -335,14 +332,66 @@
 }
 EXPORT_SYMBOL(__sk_backlog_rcv);
 
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 {
-	struct timeval tv;
+	struct __kernel_sock_timeval tv;
 
-	if (optlen < sizeof(tv))
-		return -EINVAL;
-	if (copy_from_user(&tv, optval, sizeof(tv)))
-		return -EFAULT;
+	if (timeo == MAX_SCHEDULE_TIMEOUT) {
+		tv.tv_sec = 0;
+		tv.tv_usec = 0;
+	} else {
+		tv.tv_sec = timeo / HZ;
+		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
+	}
+
+	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
+		*(struct old_timeval32 *)optval = tv32;
+		return sizeof(tv32);
+	}
+
+	if (old_timeval) {
+		struct __kernel_old_timeval old_tv;
+		old_tv.tv_sec = tv.tv_sec;
+		old_tv.tv_usec = tv.tv_usec;
+		*(struct __kernel_old_timeval *)optval = old_tv;
+		return sizeof(old_tv);
+	}
+
+	*(struct __kernel_sock_timeval *)optval = tv;
+	return sizeof(tv);
+}
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+			    bool old_timeval)
+{
+	struct __kernel_sock_timeval tv;
+
+	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
+		struct old_timeval32 tv32;
+
+		if (optlen < sizeof(tv32))
+			return -EINVAL;
+
+		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
+			return -EFAULT;
+		tv.tv_sec = tv32.tv_sec;
+		tv.tv_usec = tv32.tv_usec;
+	} else if (old_timeval) {
+		struct __kernel_old_timeval old_tv;
+
+		if (optlen < sizeof(old_tv))
+			return -EINVAL;
+		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
+			return -EFAULT;
+		tv.tv_sec = old_tv.tv_sec;
+		tv.tv_usec = old_tv.tv_usec;
+	} else {
+		if (optlen < sizeof(tv))
+			return -EINVAL;
+		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
+			return -EFAULT;
+	}
 
 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 		return -EDOM;
@@
-360,21 +409,9 @@ *timeo_p = MAX_SCHEDULE_TIMEOUT; if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; - if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) - *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) + *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); return 0; -} - -static void sock_warn_obsolete_bsdism(const char *name) -{ - static int warned; - static char warncomm[TASK_COMM_LEN]; - if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); - pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n", - warncomm, name); - warned++; - } } static bool sock_needs_netstamp(const struct sock *sk) @@ -472,8 +509,8 @@ rc = sk_backlog_rcv(sk, skb); - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); + } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); goto discard_and_relse; @@ -520,19 +557,55 @@ } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, - int optlen) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + + /* Sorry... */ + ret = -EPERM; + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) + goto out; + + ret = -EINVAL; + if (ifindex < 0) + goto out; + + sk->sk_bound_dev_if = ifindex; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + sk_dst_reset(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) +{ + int ret; + + if (lock_sk) + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + if (lock_sk) + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + +static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES struct net *net = sock_net(sk); char devname[IFNAMSIZ]; int index; - - /* Sorry... 
*/ - ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) - goto out; ret = -EINVAL; if (optlen < 0) @@ -548,7 +621,7 @@ memset(devname, 0, sizeof(devname)); ret = -EFAULT; - if (copy_from_user(devname, optval, optlen)) + if (copy_from_sockptr(devname, optval, optlen)) goto out; index = 0; @@ -565,13 +638,7 @@ goto out; } - lock_sock(sk); - sk->sk_bound_dev_if = index; - sk_dst_reset(sk); - release_sock(sk); - - ret = 0; - + return sock_bindtoindex(sk, index, true); out: #endif @@ -618,14 +685,6 @@ return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) -{ - if (valbool) - sock_set_flag(sk, bit); - else - sock_reset_flag(sk, bit); -} - bool sk_mc_loop(struct sock *sk) { if (dev_recursion_level()) @@ -645,13 +704,133 @@ } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. 
+ */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + +static void __sock_set_mark(struct sock *sk, u32 val) +{ + if (val != sk->sk_mark) { + sk->sk_mark = val; + sk_dst_reset(sk); + } +} + +void sock_set_mark(struct sock *sk, u32 val) +{ + lock_sock(sk); + __sock_set_mark(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_mark); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ int sock_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock_txtime sk_txtime; struct sock *sk = sock->sk; @@ -670,7 +849,7 @@ if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; @@ -709,10 +888,15 @@ * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: + /* Ensure val * 2 fits into an int, to prevent max_t() + * from treating it as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + WRITE_ONCE(sk->sk_sndbuf, + max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -722,6 +906,12 @@ ret = -EPERM; break; } + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). + */ + if (val < 0) + val = 0; goto set_sndbuf; case SO_RCVBUF: @@ -730,25 +920,7 @@ * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: @@ -756,7 +928,12 @@ ret = -EPERM; break; } - goto set_rcvbuf; + + /* No negative values (to prevent underflow, as val will be + * multiplied by 2). 
+ */ + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) @@ -785,7 +962,7 @@ ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling, optval, sizeof(ling))) { + if (copy_from_sockptr(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } @@ -803,7 +980,6 @@ break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("setsockopt"); break; case SO_PASSCRED: @@ -813,22 +989,20 @@ clear_bit(SOCK_PASSCRED, &sock->flags); break; - case SO_TIMESTAMP: - case SO_TIMESTAMPNS: - if (valbool) { - if (optname == SO_TIMESTAMP) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - } + case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); break; - - case SO_TIMESTAMPING: + case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; + case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; + case SO_TIMESTAMPNS_NEW: + __sock_set_timestamps(sk, valbool, true, true); + break; + case SO_TIMESTAMPING_NEW: + case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; break; @@ -856,6 +1030,8 @@ } sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); @@ -870,67 +1046,65 @@ if (sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else - sk->sk_rcvlowat = val ? : 1; + WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break; - case SO_RCVTIMEO: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); break; - case SO_SNDTIMEO: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); break; - case SO_ATTACH_FILTER: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; + case SO_ATTACH_FILTER: { + struct sock_fprog fprog; - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; - + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_BPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_attach_bpf(ufd, sk); } break; - case SO_ATTACH_REUSEPORT_CBPF: - ret = -EINVAL; - if (optlen == sizeof(struct sock_fprog)) { - struct sock_fprog fprog; + case SO_ATTACH_REUSEPORT_CBPF: { + struct sock_fprog fprog; - ret = -EFAULT; - if (copy_from_user(&fprog, optval, sizeof(fprog))) - break; - + ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); + if (!ret) ret = sk_reuseport_attach_filter(&fprog, sk); - } break; - + } case SO_ATTACH_REUSEPORT_EBPF: ret = -EINVAL; if (optlen == sizeof(u32)) { u32 ufd; ret = -EFAULT; - if (copy_from_user(&ufd, optval, sizeof(ufd))) + if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break; ret = sk_reuseport_attach_bpf(ufd, sk); } + break; + + case SO_DETACH_REUSEPORT_BPF: + ret = reuseport_detach_prog(sk); break; case SO_DETACH_FILTER: @@ -951,10 +1125,12 @@ 
clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else - sk->sk_mark = val; + break; + } + + __sock_set_mark(sk, val); break; case SO_RXQ_OVFL: @@ -995,15 +1171,23 @@ #endif case SO_MAX_PACING_RATE: - if (val != ~0U) + { + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; + + if (sizeof(ulval) != sizeof(val) && + optlen >= sizeof(ulval) && + copy_from_sockptr(&ulval, optval, sizeof(ulval))) { + ret = -EFAULT; + break; + } + if (ulval != ~0UL) cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); + sk->sk_max_pacing_rate = ulval; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); break; - + } case SO_INCOMING_CPU: WRITE_ONCE(sk->sk_incoming_cpu, val); break; @@ -1015,7 +1199,10 @@ case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; @@ -1029,23 +1216,35 @@ break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; - } else if (copy_from_user(&sk_txtime, optval, + break; + } else if (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. 
+ */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + + case SO_BINDTOIFINDEX: + ret = sock_bindtoindex_locked(sk, val); break; default: @@ -1101,8 +1300,11 @@ union { int val; u64 val64; + unsigned long ulval; struct linger ling; - struct timeval tm; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; struct sock_txtime txtime; } v; @@ -1186,42 +1388,38 @@ break; case SO_BSDCOMPAT: - sock_warn_obsolete_bsdism("getsockopt"); break; - case SO_TIMESTAMP: + case SO_TIMESTAMP_OLD: v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_TSTAMP_NEW) && !sock_flag(sk, SOCK_RCVTSTAMPNS); break; - case SO_TIMESTAMPNS: - v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + case SO_TIMESTAMPNS_OLD: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break; - case SO_TIMESTAMPING: + case SO_TIMESTAMP_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPNS_NEW: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); + break; + + case SO_TIMESTAMPING_OLD: v.val = sk->sk_tsflags; break; - case SO_RCVTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; - } + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); break; - case SO_SNDTIMEO: - lv = sizeof(struct timeval); - if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { - v.tm.tv_sec = 0; - v.tm.tv_usec = 0; - } else { - v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; - } + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); break; case SO_RCVLOWAT: @@ -1354,7 +1552,13 @@ #endif case SO_MAX_PACING_RATE: - v.val = sk->sk_max_pacing_rate; + if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { + lv = sizeof(v.ulval); + v.ulval = sk->sk_max_pacing_rate; + } else { + /* 32bit version */ + v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); + } break; case SO_INCOMING_CPU: @@ -1405,6 +1609,10 @@ SOF_TXTIME_REPORT_ERRORS : 0; break; + case SO_BINDTOIFINDEX: + v.val = sk->sk_bound_dev_if; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@@ -1452,13 +1660,14 @@ */ static void sock_copy(struct sock *nsk, const struct sock *osk) { + const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -1584,6 +1793,10 @@ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); +#ifdef CONFIG_BPF_SYSCALL + bpf_sk_storage_free(sk); +#endif + if (atomic_read(&sk->sk_omem_alloc)) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); @@ -1670,112 +1883,121 @@ */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { - struct sock *newsk; + struct proto *prot = READ_ONCE(sk->sk_prot); + struct sk_filter *filter; bool is_charged = true; + struct sock *newsk; - newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); - if (newsk != NULL) { - struct sk_filter *filter; + newsk = sk_prot_alloc(prot, priority, sk->sk_family); + if (!newsk) + goto out; - sock_copy(newsk, sk); + sock_copy(newsk, sk); - newsk->sk_prot_creator = sk->sk_prot; + newsk->sk_prot_creator = prot; - /* SANITY */ - if (likely(newsk->sk_net_refcnt)) - get_net(sock_net(newsk)); - sk_node_init(&newsk->sk_node); - sock_lock_init(newsk); - bh_lock_sock(newsk); - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_backlog.len = 0; - - atomic_set(&newsk->sk_rmem_alloc, 0); - /* - * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) - */ - refcount_set(&newsk->sk_wmem_alloc, 1); - atomic_set(&newsk->sk_omem_alloc, 0); - sk_init_common(newsk); - - newsk->sk_dst_cache = NULL; - newsk->sk_dst_pending_confirm = 0; - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - atomic_set(&newsk->sk_drops, 0); - newsk->sk_send_head = NULL; - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - atomic_set(&newsk->sk_zckey, 0); - - sock_reset_flag(newsk, SOCK_DONE); - - /* sk->sk_memcg will be populated at accept() time */ - newsk->sk_memcg = NULL; - - cgroup_sk_clone(&newsk->sk_cgrp_data); - - rcu_read_lock(); - filter = rcu_dereference(sk->sk_filter); - if (filter != NULL) - /* though it's an empty new sock, the charging may fail - * if sysctl_optmem_max was changed between creation of - * original socket and cloning - */ - is_charged = sk_filter_charge(newsk, filter); - RCU_INIT_POINTER(newsk->sk_filter, filter); - rcu_read_unlock(); - - if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* We need to make sure that we don't uncharge the new - * socket if we couldn't charge it in the first place - * as otherwise we uncharge the parent's filter. 
- */ - if (!is_charged) - RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } - RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - - newsk->sk_err = 0; - newsk->sk_err_soft = 0; - newsk->sk_priority = 0; - newsk->sk_incoming_cpu = raw_smp_processor_id(); - atomic64_set(&newsk->sk_cookie, 0); - if (likely(newsk->sk_net_refcnt)) - sock_inuse_add(sock_net(newsk), 1); - - /* - * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) - */ - smp_wmb(); - refcount_set(&newsk->sk_refcnt, 2); - - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy). - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. -acme - */ - sk_refcnt_debug_inc(newsk); - sk_set_socket(newsk, NULL); - sk_tx_queue_clear(newsk); - newsk->sk_wq = NULL; - - if (newsk->sk_prot->sockets_allocated) - sk_sockets_allocated_inc(newsk); - - if (sock_needs_netstamp(sk) && - newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); + /* SANITY */ + if (likely(newsk->sk_net_refcnt)) { + get_net(sock_net(newsk)); + sock_inuse_add(sock_net(newsk), 1); } + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + + /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ + refcount_set(&newsk->sk_wmem_alloc, 1); + + atomic_set(&newsk->sk_omem_alloc, 0); + sk_init_common(newsk); + + newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); + + sock_reset_flag(newsk, SOCK_DONE); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + + cgroup_sk_clone(&newsk->sk_cgrp_data); + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter != NULL) + /* though it's an empty new sock, the charging may fail + * if sysctl_optmem_max was changed between creation of + * original socket and cloning + */ + is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); + + if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); + + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } + + /* Clear sk_user_data if parent had the pointer tagged + * as not suitable for copying when cloning. 
+ */ + if (sk_user_data_is_nocopy(newsk)) + newsk->sk_user_data = NULL; + + newsk->sk_err = 0; + newsk->sk_err_soft = 0; + newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); + + /* Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.rst for details) + */ + smp_wmb(); + refcount_set(&newsk->sk_refcnt, 2); + + /* Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); + RCU_INIT_POINTER(newsk->sk_wq, NULL); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); out: return newsk; } @@ -1877,6 +2099,19 @@ } EXPORT_SYMBOL(skb_set_owner_w); +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ +#ifdef CONFIG_TLS_DEVICE + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb->decrypted) + return false; +#endif + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. @@ -1888,20 +2123,10 @@ if (skb_is_tcp_pure_ack(skb)) return; - if (skb->destructor == sock_wfree -#ifdef CONFIG_INET - || skb->destructor == tcp_wfree -#endif - ) { - struct sock *sk = skb->sk; + if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) + return; - if (refcount_inc_not_zero(&sk->sk_refcnt)) { - WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); - skb->destructor = sock_efree; - } - } else { - skb_orphan(skb); - } + skb_orphan(skb); } EXPORT_SYMBOL(skb_orphan_partial); @@ -1927,6 +2152,18 @@ sock_put(skb->sk); } EXPORT_SYMBOL(sock_efree); + +/* Buffer destructor for prefetch/receive path where reference count may + * not be held, e.g. for listen sockets. 
+ */ +#ifdef CONFIG_INET +void sock_pfree(struct sk_buff *skb) +{ + if (sk_is_refcounted(skb->sk)) + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_pfree); +#endif /* CONFIG_INET */ kuid_t sock_i_uid(struct sock *sk) { @@ -1956,8 +2193,10 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || + refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -1981,7 +2220,7 @@ /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - sysctl_optmem_max) + READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -1999,8 +2238,10 @@ */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. @@ -2025,7 +2266,7 @@ if (WARN_ON_ONCE(!mem)) return; if (nullify) - kzfree(mem); + kfree_sensitive(mem); else kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); @@ -2058,7 +2299,7 @@ break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; @@ -2093,7 +2334,7 @@ if (sk->sk_shutdown & SEND_SHUTDOWN) goto failure; - if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) + if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) break; sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -2139,7 +2380,7 @@ return -EINVAL; sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; - case SO_TIMESTAMPING: + case SO_TIMESTAMPING_OLD: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2207,8 +2448,8 @@ } } -/* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) +DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room @@ -2233,7 +2474,8 @@ } pfrag->offset = 0; - if (SKB_FRAG_PAGE_ORDER) { + if (SKB_FRAG_PAGE_ORDER && + !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | @@ -2263,67 +2505,6 @@ return false; } EXPORT_SYMBOL(sk_page_frag_refill); - -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && 
sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) @@ -2358,7 +2539,7 @@ next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); - skb->next = NULL; + skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); cond_resched(); @@ -2614,20 +2795,6 @@ } EXPORT_SYMBOL(sock_no_shutdown); -int sock_no_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_setsockopt); - -int sock_no_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(sock_no_getsockopt); - int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; @@ -2732,15 +2899,25 @@ rcu_read_unlock(); } -static void sock_def_readable(struct sock *sk) +void sock_def_readable(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) + + if (skwq_has_sleeper(wq)) { + int done = 0; + + trace_android_vh_do_wake_up_sync(&wq->wait, &done); + if (done) + goto out; + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); + } + +out: sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -2754,7 +2931,7 @@ /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | @@ -2795,6 +2972,13 @@ } EXPORT_SYMBOL(sk_stop_timer); +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) +{ + if (del_timer_sync(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer_sync); + void sock_init_data(struct socket *sock, struct sock *sk) { sk_init_common(sk); @@ -2803,8 +2987,8 @@ timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); @@ -2812,11 +2996,11 @@ if (sock) { sk->sk_type = sock->type; - sk->sk_wq = sock->wq; + RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; sk->sk_uid = SOCK_INODE(sock)->i_uid; } else { - sk->sk_wq = NULL; + RCU_INIT_POINTER(sk->sk_wq, NULL); sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); } @@ -2859,18 +3043,18 @@ #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif - sk->sk_max_pacing_rate = ~0U; - sk->sk_pacing_rate = ~0U; - sk->sk_pacing_shift = 10; + sk->sk_max_pacing_rate = ~0UL; + sk->sk_pacing_rate = ~0UL; + WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory - * (Documentation/RCU/rculist_nulls.txt for details) + * (Documentation/RCU/rculist_nulls.rst for details) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); @@ -2885,12 +3069,11 @@ if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); } EXPORT_SYMBOL(lock_sock_nested); @@ -2939,51 +3122,55 @@ __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); + spin_unlock_bh(&sk->sk_lock.slock); /* * The sk_lock has mutex_lock() semantics here: */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); - local_bh_enable(); return true; } EXPORT_SYMBOL(lock_sock_fast); -int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32) { - struct timeval tv; + struct sock *sk = sock->sk; + struct timespec64 ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sock_read_timestamp(sk)); - if (tv.tv_sec == -1) - return -ENOENT; - if (tv.tv_sec == 0) { - ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt); - tv = ktime_to_timeval(kt); - } - return copy_to_user(userstamp, &tv, sizeof(tv)) ? 
-EFAULT : 0; -} -EXPORT_SYMBOL(sock_get_timestamp); - -int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) -{ - struct timespec ts; - - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sock_read_timestamp(sk)); + ts = ktime_to_timespec64(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); sock_write_timestamp(sk, kt); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec64(kt); } - return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; -} -EXPORT_SYMBOL(sock_get_timestampns); -void sock_enable_timestamp(struct sock *sk, int flag) + if (timeval) + ts.tv_nsec /= 1000; + +#ifdef CONFIG_COMPAT_32BIT_TIME + if (time32) + return put_old_timespec32(&ts, userstamp); +#endif +#ifdef CONFIG_SPARC64 + /* beware of padding in sparc64 timeval */ + if (timeval && !in_compat_syscall()) { + struct __kernel_old_timeval __user tv = { + .tv_sec = ts.tv_sec, + .tv_usec = ts.tv_nsec, + }; + if (copy_to_user(userstamp, &tv, sizeof(tv))) + return -EFAULT; + return 0; + } +#endif + return put_timespec64(&ts, userstamp); +} +EXPORT_SYMBOL(sock_gettstamp); + +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; @@ -3052,20 +3239,6 @@ } EXPORT_SYMBOL(sock_common_getsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_getsockopt != NULL) - return sk->sk_prot->compat_getsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_getsockopt); -#endif - int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -3085,7 +3258,7 @@ * Set socket options on an inet socket. */ int sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -3093,27 +3266,13 @@ } EXPORT_SYMBOL(sock_common_setsockopt); -#ifdef CONFIG_COMPAT -int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - struct sock *sk = sock->sk; - - if (sk->sk_prot->compat_setsockopt != NULL) - return sk->sk_prot->compat_setsockopt(sk, level, optname, - optval, optlen); - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); -} -EXPORT_SYMBOL(compat_sock_common_setsockopt); -#endif - void sk_common_release(struct sock *sk) { if (sk->sk_prot->destroy) sk->sk_prot->destroy(sk); /* - * Observation: when sock_common_release is called, processes have + * Observation: when sk_common_release is called, processes have * no access to socket. But net still has. 
* Step one, detach it from networking: * @@ -3149,13 +3308,13 @@ memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); } @@ -3240,16 +3399,17 @@ core_initcall(net_inuse_init); -static void assign_proto_idx(struct proto *prot) +static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); - return; + return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; } static void release_proto_idx(struct proto *prot) @@ -3258,8 +3418,9 @@ clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -static inline void assign_proto_idx(struct proto *prot) +static inline int assign_proto_idx(struct proto *prot) { + return 0; } static inline void release_proto_idx(struct proto *prot) @@ -3270,6 +3431,16 @@ { } #endif + +static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) +{ + if (!twsk_prot) + return; + kfree(twsk_prot->twsk_slab_name); + twsk_prot->twsk_slab_name = NULL; + kmem_cache_destroy(twsk_prot->twsk_slab); + twsk_prot->twsk_slab = NULL; +} static void req_prot_cleanup(struct request_sock_ops *rsk_prot) { @@ -3308,6 +3479,8 @@ int proto_register(struct proto *prot, int alloc_slab) { + int ret = -ENOBUFS; + if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, @@ -3339,25 +3512,32 @@ prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) - goto out_free_timewait_sock_slab_name; + goto out_free_timewait_sock_slab; } } mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab; + } list_add(&prot->node, &proto_list); - assign_proto_idx(prot); mutex_unlock(&proto_list_mutex); - return 0; + return ret; -out_free_timewait_sock_slab_name: - kfree(prot->twsk_prot->twsk_slab_name); +out_free_timewait_sock_slab: + if (alloc_slab && prot->twsk_prot) + tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: - req_prot_cleanup(prot->rsk_prot); + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); - kmem_cache_destroy(prot->slab); - prot->slab = NULL; + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } out: - return -ENOBUFS; + return ret; } EXPORT_SYMBOL(proto_register); @@ -3372,12 +3552,7 @@ prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); - - if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { - kmem_cache_destroy(prot->twsk_prot->twsk_slab); - kfree(prot->twsk_prot->twsk_slab_name); - prot->twsk_prot->twsk_slab = NULL; - } + tw_prot_cleanup(prot->twsk_prot); } EXPORT_SYMBOL(proto_unregister); @@ -3394,6 +3569,7 @@ #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif @@ -3431,7 +3607,7 @@ return 
proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; } -static char *sock_prot_memory_pressure(struct proto *proto) +static const char *sock_prot_memory_pressure(struct proto *proto) { return proto->memory_pressure != NULL ? proto_memory_pressure(proto) ? "yes" : "no" : "NI"; @@ -3535,3 +3711,11 @@ } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); -- Gitblit v1.6.2