hc
2024-01-05 071106ecf68c401173c58808b1cf5f68cc50d390
kernel/net/ipv4/af_inet.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -58,11 +59,6 @@
5859 * Some other random speedups.
5960 * Cyrus Durgin : Cleaned up file for kmod hacks.
6061 * Andi Kleen : Fix inet_stream_connect TCP race.
61
- *
62
- * This program is free software; you can redistribute it and/or
63
- * modify it under the terms of the GNU General Public License
64
- * as published by the Free Software Foundation; either version
65
- * 2 of the License, or (at your option) any later version.
6662 */
6763
6864 #define pr_fmt(fmt) "IPv4: " fmt
....@@ -120,6 +116,7 @@
120116 #include <linux/mroute.h>
121117 #endif
122118 #include <net/l3mdev.h>
119
+#include <net/compat.h>
123120
124121 #include <trace/events/sock.h>
125122
....@@ -136,6 +133,10 @@
136133 struct inet_sock *inet = inet_sk(sk);
137134
138135 __skb_queue_purge(&sk->sk_receive_queue);
136
+ if (sk->sk_rx_skb_cache) {
137
+ __kfree_skb(sk->sk_rx_skb_cache);
138
+ sk->sk_rx_skb_cache = NULL;
139
+ }
139140 __skb_queue_purge(&sk->sk_error_queue);
140141
141142 sk_mem_reclaim(sk);
....@@ -156,8 +157,8 @@
156157 WARN_ON(sk->sk_forward_alloc);
157158
158159 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159
- dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160
- dst_release(sk->sk_rx_dst);
160
+ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
161
+ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
161162 sk_refcnt_debug_dec(sk);
162163 }
163164 EXPORT_SYMBOL(inet_sock_destruct);
....@@ -208,6 +209,7 @@
208209 if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
209210 goto out;
210211
212
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
211213 /* Really, if the socket is already in listen state
212214 * we can only allow the backlog to be adjusted.
213215 */
....@@ -218,7 +220,7 @@
218220 * because the socket was in TCP_LISTEN state previously but
219221 * was shutdown() rather than close().
220222 */
221
- tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
223
+ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
222224 if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
223225 (tcp_fastopen & TFO_SERVER_ENABLE) &&
224226 !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
....@@ -231,7 +233,6 @@
231233 goto out;
232234 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
233235 }
234
- sk->sk_max_ack_backlog = backlog;
235236 err = 0;
236237
237238 out:
....@@ -337,7 +338,7 @@
337338 inet->hdrincl = 1;
338339 }
339340
340
- if (net->ipv4.sysctl_ip_no_pmtu_disc)
341
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
341342 inet->pmtudisc = IP_PMTUDISC_DONT;
342343 else
343344 inet->pmtudisc = IP_PMTUDISC_WANT;
....@@ -410,6 +411,9 @@
410411 if (sk) {
411412 long timeout;
412413
414
+ if (!sk->sk_kern_sock)
415
+ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
416
+
413417 /* Applications forget to leave groups before exiting */
414418 ip_mc_drop_socket(sk);
415419
....@@ -450,12 +454,12 @@
450454 if (err)
451455 return err;
452456
453
- return __inet_bind(sk, uaddr, addr_len, false, true);
457
+ return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK);
454458 }
455459 EXPORT_SYMBOL(inet_bind);
456460
457461 int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
458
- bool force_bind_address_no_port, bool with_lock)
462
+ u32 flags)
459463 {
460464 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
461465 struct inet_sock *inet = inet_sk(sk);
....@@ -494,8 +498,12 @@
494498 goto out;
495499
496500 snum = ntohs(addr->sin_port);
501
+ err = -EPERM;
502
+ if (snum && inet_is_local_unbindable_port(net, snum))
503
+ goto out;
504
+
497505 err = -EACCES;
498
- if (snum && snum < inet_prot_sock(net) &&
506
+ if (snum && inet_port_requires_bind_service(net, snum) &&
499507 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
500508 goto out;
501509
....@@ -506,7 +514,7 @@
506514 * would be illegal to use them (multicast/broadcast) in
507515 * which case the sending device address is used.
508516 */
509
- if (with_lock)
517
+ if (flags & BIND_WITH_LOCK)
510518 lock_sock(sk);
511519
512520 /* Check these errors (active socket, double bind). */
....@@ -520,16 +528,18 @@
520528
521529 /* Make sure we are allowed to bind here. */
522530 if (snum || !(inet->bind_address_no_port ||
523
- force_bind_address_no_port)) {
531
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
524532 if (sk->sk_prot->get_port(sk, snum)) {
525533 inet->inet_saddr = inet->inet_rcv_saddr = 0;
526534 err = -EADDRINUSE;
527535 goto out_release_sock;
528536 }
529
- err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
530
- if (err) {
531
- inet->inet_saddr = inet->inet_rcv_saddr = 0;
532
- goto out_release_sock;
537
+ if (!(flags & BIND_FROM_BPF)) {
538
+ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
539
+ if (err) {
540
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
541
+ goto out_release_sock;
542
+ }
533543 }
534544 }
535545
....@@ -543,7 +553,7 @@
543553 sk_dst_reset(sk);
544554 err = 0;
545555 out_release_sock:
546
- if (with_lock)
556
+ if (flags & BIND_WITH_LOCK)
547557 release_sock(sk);
548558 out:
549559 return err;
....@@ -566,7 +576,7 @@
566576 return err;
567577 }
568578
569
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
579
+ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
570580 return -EAGAIN;
571581 return sk->sk_prot->connect(sk, uaddr, addr_len);
572582 }
....@@ -753,12 +763,11 @@
753763 }
754764 EXPORT_SYMBOL(inet_accept);
755765
756
-
757766 /*
758767 * This does both peername and sockname.
759768 */
760769 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
761
- int peer)
770
+ int peer)
762771 {
763772 struct sock *sk = sock->sk;
764773 struct inet_sock *inet = inet_sk(sk);
....@@ -779,23 +788,38 @@
779788 sin->sin_port = inet->inet_sport;
780789 sin->sin_addr.s_addr = addr;
781790 }
791
+ if (cgroup_bpf_enabled)
792
+ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
793
+ peer ? BPF_CGROUP_INET4_GETPEERNAME :
794
+ BPF_CGROUP_INET4_GETSOCKNAME,
795
+ NULL);
782796 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
783797 return sizeof(*sin);
784798 }
785799 EXPORT_SYMBOL(inet_getname);
786800
801
+int inet_send_prepare(struct sock *sk)
802
+{
803
+ sock_rps_record_flow(sk);
804
+
805
+ /* We may need to bind the socket. */
806
+ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind &&
807
+ inet_autobind(sk))
808
+ return -EAGAIN;
809
+
810
+ return 0;
811
+}
812
+EXPORT_SYMBOL_GPL(inet_send_prepare);
813
+
787814 int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
788815 {
789816 struct sock *sk = sock->sk;
790817
791
- sock_rps_record_flow(sk);
792
-
793
- /* We may need to bind the socket. */
794
- if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
795
- inet_autobind(sk))
818
+ if (unlikely(inet_send_prepare(sk)))
796819 return -EAGAIN;
797820
798
- return sk->sk_prot->sendmsg(sk, msg, size);
821
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
822
+ sk, msg, size);
799823 }
800824 EXPORT_SYMBOL(inet_sendmsg);
801825
....@@ -804,11 +828,7 @@
804828 {
805829 struct sock *sk = sock->sk;
806830
807
- sock_rps_record_flow(sk);
808
-
809
- /* We may need to bind the socket. */
810
- if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
811
- inet_autobind(sk))
831
+ if (unlikely(inet_send_prepare(sk)))
812832 return -EAGAIN;
813833
814834 if (sk->sk_prot->sendpage)
....@@ -817,6 +837,8 @@
817837 }
818838 EXPORT_SYMBOL(inet_sendpage);
819839
840
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
841
+ size_t, int, int, int *));
820842 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
821843 int flags)
822844 {
....@@ -827,8 +849,9 @@
827849 if (likely(!(flags & MSG_ERRQUEUE)))
828850 sock_rps_record_flow(sk);
829851
830
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
831
- flags & ~MSG_DONTWAIT, &addr_len);
852
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
853
+ sk, msg, size, flags & MSG_DONTWAIT,
854
+ flags & ~MSG_DONTWAIT, &addr_len);
832855 if (err >= 0)
833856 msg->msg_namelen = addr_len;
834857 return err;
....@@ -863,9 +886,9 @@
863886 err = -ENOTCONN;
864887 /* Hack to wake up other listeners, who can poll for
865888 EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
866
- /* fall through */
889
+ fallthrough;
867890 default:
868
- sk->sk_shutdown |= how;
891
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
869892 if (sk->sk_prot->shutdown)
870893 sk->sk_prot->shutdown(sk, how);
871894 break;
....@@ -877,7 +900,7 @@
877900 case TCP_LISTEN:
878901 if (!(how & RCV_SHUTDOWN))
879902 break;
880
- /* fall through */
903
+ fallthrough;
881904 case TCP_SYN_SENT:
882905 err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
883906 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
....@@ -911,12 +934,6 @@
911934 struct rtentry rt;
912935
913936 switch (cmd) {
914
- case SIOCGSTAMP:
915
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
916
- break;
917
- case SIOCGSTAMPNS:
918
- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
919
- break;
920937 case SIOCADDRT:
921938 case SIOCDELRT:
922939 if (copy_from_user(&rt, p, sizeof(struct rtentry)))
....@@ -965,17 +982,42 @@
965982 EXPORT_SYMBOL(inet_ioctl);
966983
967984 #ifdef CONFIG_COMPAT
985
+static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
986
+ struct compat_rtentry __user *ur)
987
+{
988
+ compat_uptr_t rtdev;
989
+ struct rtentry rt;
990
+
991
+ if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
992
+ 3 * sizeof(struct sockaddr)) ||
993
+ get_user(rt.rt_flags, &ur->rt_flags) ||
994
+ get_user(rt.rt_metric, &ur->rt_metric) ||
995
+ get_user(rt.rt_mtu, &ur->rt_mtu) ||
996
+ get_user(rt.rt_window, &ur->rt_window) ||
997
+ get_user(rt.rt_irtt, &ur->rt_irtt) ||
998
+ get_user(rtdev, &ur->rt_dev))
999
+ return -EFAULT;
1000
+
1001
+ rt.rt_dev = compat_ptr(rtdev);
1002
+ return ip_rt_ioctl(sock_net(sk), cmd, &rt);
1003
+}
1004
+
9681005 static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
9691006 {
1007
+ void __user *argp = compat_ptr(arg);
9701008 struct sock *sk = sock->sk;
971
- int err = -ENOIOCTLCMD;
9721009
973
- if (sk->sk_prot->compat_ioctl)
974
- err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
975
-
976
- return err;
1010
+ switch (cmd) {
1011
+ case SIOCADDRT:
1012
+ case SIOCDELRT:
1013
+ return inet_compat_routing_ioctl(sk, cmd, argp);
1014
+ default:
1015
+ if (!sk->sk_prot->compat_ioctl)
1016
+ return -ENOIOCTLCMD;
1017
+ return sk->sk_prot->compat_ioctl(sk, cmd, arg);
1018
+ }
9771019 }
978
-#endif
1020
+#endif /* CONFIG_COMPAT */
9791021
9801022 const struct proto_ops inet_stream_ops = {
9811023 .family = PF_INET,
....@@ -988,6 +1030,7 @@
9881030 .getname = inet_getname,
9891031 .poll = tcp_poll,
9901032 .ioctl = inet_ioctl,
1033
+ .gettstamp = sock_gettstamp,
9911034 .listen = inet_listen,
9921035 .shutdown = inet_shutdown,
9931036 .setsockopt = sock_common_setsockopt,
....@@ -1004,8 +1047,6 @@
10041047 .sendpage_locked = tcp_sendpage_locked,
10051048 .peek_len = tcp_peek_len,
10061049 #ifdef CONFIG_COMPAT
1007
- .compat_setsockopt = compat_sock_common_setsockopt,
1008
- .compat_getsockopt = compat_sock_common_getsockopt,
10091050 .compat_ioctl = inet_compat_ioctl,
10101051 #endif
10111052 .set_rcvlowat = tcp_set_rcvlowat,
....@@ -1023,6 +1064,7 @@
10231064 .getname = inet_getname,
10241065 .poll = udp_poll,
10251066 .ioctl = inet_ioctl,
1067
+ .gettstamp = sock_gettstamp,
10261068 .listen = sock_no_listen,
10271069 .shutdown = inet_shutdown,
10281070 .setsockopt = sock_common_setsockopt,
....@@ -1033,8 +1075,6 @@
10331075 .sendpage = inet_sendpage,
10341076 .set_peek_off = sk_set_peek_off,
10351077 #ifdef CONFIG_COMPAT
1036
- .compat_setsockopt = compat_sock_common_setsockopt,
1037
- .compat_getsockopt = compat_sock_common_getsockopt,
10381078 .compat_ioctl = inet_compat_ioctl,
10391079 #endif
10401080 };
....@@ -1055,6 +1095,7 @@
10551095 .getname = inet_getname,
10561096 .poll = datagram_poll,
10571097 .ioctl = inet_ioctl,
1098
+ .gettstamp = sock_gettstamp,
10581099 .listen = sock_no_listen,
10591100 .shutdown = inet_shutdown,
10601101 .setsockopt = sock_common_setsockopt,
....@@ -1064,8 +1105,6 @@
10641105 .mmap = sock_no_mmap,
10651106 .sendpage = inet_sendpage,
10661107 #ifdef CONFIG_COMPAT
1067
- .compat_setsockopt = compat_sock_common_setsockopt,
1068
- .compat_getsockopt = compat_sock_common_getsockopt,
10691108 .compat_ioctl = inet_compat_ioctl,
10701109 #endif
10711110 };
....@@ -1209,7 +1248,7 @@
12091248 if (new_saddr == old_saddr)
12101249 return 0;
12111250
1212
- if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
1251
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
12131252 pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
12141253 __func__, &old_saddr, &new_saddr);
12151254 }
....@@ -1264,7 +1303,7 @@
12641303 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
12651304 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
12661305 */
1267
- if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
1306
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
12681307 sk->sk_state != TCP_SYN_SENT ||
12691308 (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
12701309 (err = inet_sk_reselect_saddr(sk)) != 0)
....@@ -1388,6 +1427,15 @@
13881427 }
13891428 EXPORT_SYMBOL(inet_gso_segment);
13901429
1430
+static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
1431
+ netdev_features_t features)
1432
+{
1433
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
1434
+ return ERR_PTR(-EINVAL);
1435
+
1436
+ return inet_gso_segment(skb, features);
1437
+}
1438
+
13911439 struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
13921440 {
13931441 const struct net_offload *ops;
....@@ -1497,7 +1545,8 @@
14971545 skb_gro_pull(skb, sizeof(*iph));
14981546 skb_set_transport_header(skb, skb_gro_offset(skb));
14991547
1500
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
1548
+ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
1549
+ ops->callbacks.gro_receive, head, skb);
15011550
15021551 out_unlock:
15031552 rcu_read_unlock();
....@@ -1584,7 +1633,9 @@
15841633 * because any hdr with option will have been flushed in
15851634 * inet_gro_receive().
15861635 */
1587
- err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
1636
+ err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
1637
+ tcp4_gro_complete, udp4_gro_complete,
1638
+ skb, nhoff + sizeof(*iph));
15881639
15891640 out_unlock:
15901641 rcu_read_unlock();
....@@ -1678,12 +1729,7 @@
16781729 };
16791730 #endif
16801731
1681
-/* thinking of making this const? Don't.
1682
- * early_demux can change based on sysctl.
1683
- */
1684
-static struct net_protocol tcp_protocol = {
1685
- .early_demux = tcp_v4_early_demux,
1686
- .early_demux_handler = tcp_v4_early_demux,
1732
+static const struct net_protocol tcp_protocol = {
16871733 .handler = tcp_v4_rcv,
16881734 .err_handler = tcp_v4_err,
16891735 .no_policy = 1,
....@@ -1691,12 +1737,7 @@
16911737 .icmp_strict_tag_validation = 1,
16921738 };
16931739
1694
-/* thinking of making this const? Don't.
1695
- * early_demux can change based on sysctl.
1696
- */
1697
-static struct net_protocol udp_protocol = {
1698
- .early_demux = udp_v4_early_demux,
1699
- .early_demux_handler = udp_v4_early_demux,
1740
+static const struct net_protocol udp_protocol = {
17001741 .handler = udp_rcv,
17011742 .err_handler = udp_err,
17021743 .no_policy = 1,
....@@ -1772,6 +1813,10 @@
17721813 free_percpu(net->mib.net_statistics);
17731814 free_percpu(net->mib.ip_statistics);
17741815 free_percpu(net->mib.tcp_statistics);
1816
+#ifdef CONFIG_MPTCP
1817
+ /* allocated on demand, see mptcp_init_sock() */
1818
+ free_percpu(net->mib.mptcp_statistics);
1819
+#endif
17751820 }
17761821
17771822 static __net_initdata struct pernet_operations ipv4_mib_ops = {
....@@ -1810,6 +1855,7 @@
18101855 net->ipv4.sysctl_ip_early_demux = 1;
18111856 net->ipv4.sysctl_udp_early_demux = 1;
18121857 net->ipv4.sysctl_tcp_early_demux = 1;
1858
+ net->ipv4.sysctl_nexthop_compat_mode = 1;
18131859 #ifdef CONFIG_SYSCTL
18141860 net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
18151861 #endif
....@@ -1824,13 +1870,8 @@
18241870 return 0;
18251871 }
18261872
1827
-static __net_exit void inet_exit_net(struct net *net)
1828
-{
1829
-}
1830
-
18311873 static __net_initdata struct pernet_operations af_inet_ops = {
18321874 .init = inet_init_net,
1833
- .exit = inet_exit_net,
18341875 };
18351876
18361877 static int __init init_inet_pernet_ops(void)
....@@ -1855,7 +1896,7 @@
18551896
18561897 static const struct net_offload ipip_offload = {
18571898 .callbacks = {
1858
- .gso_segment = inet_gso_segment,
1899
+ .gso_segment = ipip_gso_segment,
18591900 .gro_receive = ipip_gro_receive,
18601901 .gro_complete = ipip_gro_complete,
18611902 },
....@@ -1894,7 +1935,7 @@
18941935 {
18951936 struct inet_protosw *q;
18961937 struct list_head *r;
1897
- int rc = -EINVAL;
1938
+ int rc;
18981939
18991940 sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
19001941
....@@ -1971,6 +2012,8 @@
19712012 /* Add UDP-Lite (RFC 3828) */
19722013 udplite4_register();
19732014
2015
+ raw_init();
2016
+
19742017 ping_init();
19752018
19762019 /*