2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_ipv4.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -7,18 +8,12 @@
78 *
89 * IPv4 specific functions
910 *
10
- *
1111 * code split from:
1212 * linux/ipv4/tcp.c
1313 * linux/ipv4/tcp_input.c
1414 * linux/ipv4/tcp_output.c
1515 *
1616 * See tcp.c for author information
17
- *
18
- * This program is free software; you can redistribute it and/or
19
- * modify it under the terms of the GNU General Public License
20
- * as published by the Free Software Foundation; either version
21
- * 2 of the License, or (at your option) any later version.
2217 */
2318
2419 /*
....@@ -62,7 +57,6 @@
6257 #include <linux/init.h>
6358 #include <linux/times.h>
6459 #include <linux/slab.h>
65
-#include <linux/locallock.h>
6660
6761 #include <net/net_namespace.h>
6862 #include <net/icmp.h>
....@@ -82,6 +76,7 @@
8276 #include <linux/proc_fs.h>
8377 #include <linux/seq_file.h>
8478 #include <linux/inetdevice.h>
79
+#include <linux/btf_ids.h>
8580
8681 #include <crypto/hash.h>
8782 #include <linux/scatterlist.h>
....@@ -111,10 +106,10 @@
111106
112107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
113108 {
109
+ int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
114110 const struct inet_timewait_sock *tw = inet_twsk(sktw);
115111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
116112 struct tcp_sock *tp = tcp_sk(sk);
117
- int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
118113
119114 if (reuse == 2) {
120115 /* Still does not detect *everything* that goes through
....@@ -127,11 +122,9 @@
127122 #if IS_ENABLED(CONFIG_IPV6)
128123 if (tw->tw_family == AF_INET6) {
129124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
130
- (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
131
- (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
125
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
132126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
133
- (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
134
- (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
127
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
135128 loopback = true;
136129 } else
137130 #endif
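The two removed open-coded checks above are exactly what ipv6_addr_v4mapped_loopback() now expresses: a v4-mapped address whose embedded IPv4 address begins with 127. A rough userspace sketch of the same test (illustrative only, not the kernel helper):

#include <stdbool.h>
#include <netinet/in.h>

/* True for ::ffff:127.x.y.z, i.e. a v4-mapped IPv4 loopback address. */
static bool v4mapped_loopback(const struct in6_addr *a)
{
	return IN6_IS_ADDR_V4MAPPED(a) && a->s6_addr[12] == 127;
}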
....@@ -329,6 +322,8 @@
329322 * if necessary.
330323 */
331324 tcp_set_state(sk, TCP_CLOSE);
325
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
326
+ inet_reset_saddr(sk);
332327 ip_rt_put(rt);
333328 sk->sk_route_caps = 0;
334329 inet->inet_dport = 0;
....@@ -411,6 +406,46 @@
411406 }
412407 EXPORT_SYMBOL(tcp_req_err);
413408
409
+/* TCP-LD (RFC 6069) logic */
410
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
411
+{
412
+ struct inet_connection_sock *icsk = inet_csk(sk);
413
+ struct tcp_sock *tp = tcp_sk(sk);
414
+ struct sk_buff *skb;
415
+ s32 remaining;
416
+ u32 delta_us;
417
+
418
+ if (sock_owned_by_user(sk))
419
+ return;
420
+
421
+ if (seq != tp->snd_una || !icsk->icsk_retransmits ||
422
+ !icsk->icsk_backoff)
423
+ return;
424
+
425
+ skb = tcp_rtx_queue_head(sk);
426
+ if (WARN_ON_ONCE(!skb))
427
+ return;
428
+
429
+ icsk->icsk_backoff--;
430
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
431
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
432
+
433
+ tcp_mstamp_refresh(tp);
434
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
435
+ remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
436
+
437
+ if (remaining > 0) {
438
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439
+ remaining, TCP_RTO_MAX);
440
+ } else {
441
+ /* RTO revert clocked out retransmission.
442
+ * Will retransmit now.
443
+ */
444
+ tcp_retransmit_timer(sk);
445
+ }
446
+}
447
+EXPORT_SYMBOL(tcp_ld_RTO_revert);
448
+
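tcp_ld_RTO_revert() factors out the RFC 6069 (TCP-LD) logic that previously lived inline in tcp_v4_err(): when an ICMP unreachable refers to the first unacknowledged sequence while the sender is backing off, one backoff step is undone and the retransmit timer is re-armed with whatever time remains, or an immediate retransmit is triggered. A small standalone sketch of that arithmetic, with illustrative values and names (not kernel code; the kernel additionally clamps the RTO to TCP_RTO_MAX):

#include <stdio.h>

int main(void)
{
	unsigned int base_rto_ms = 200; /* RTO derived from srtt, or TCP_TIMEOUT_INIT */
	unsigned int backoff = 3;       /* three RTO-driven retransmits so far */
	unsigned int elapsed_ms = 1100; /* time since the head rtx-queue skb was sent */

	backoff--;                                    /* icsk->icsk_backoff--; */
	unsigned int rto_ms = base_rto_ms << backoff; /* inet_csk_rto_backoff() */
	int remaining_ms = (int)rto_ms - (int)elapsed_ms;

	if (remaining_ms > 0)
		printf("re-arm retransmit timer: %d ms left\n", remaining_ms);
	else
		printf("revert already expired: retransmit immediately\n");
	return 0;
}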
414449 /*
415450 * This routine is called by the ICMP module when it gets some
416451 * sort of error condition. If err < 0 then the socket should
....@@ -427,43 +462,40 @@
427462 *
428463 */
429464
430
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
465
+int tcp_v4_err(struct sk_buff *skb, u32 info)
431466 {
432
- const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
433
- struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
434
- struct inet_connection_sock *icsk;
467
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
468
+ struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
435469 struct tcp_sock *tp;
436470 struct inet_sock *inet;
437
- const int type = icmp_hdr(icmp_skb)->type;
438
- const int code = icmp_hdr(icmp_skb)->code;
471
+ const int type = icmp_hdr(skb)->type;
472
+ const int code = icmp_hdr(skb)->code;
439473 struct sock *sk;
440
- struct sk_buff *skb;
441474 struct request_sock *fastopen;
442475 u32 seq, snd_una;
443
- s32 remaining;
444
- u32 delta_us;
445476 int err;
446
- struct net *net = dev_net(icmp_skb->dev);
477
+ struct net *net = dev_net(skb->dev);
447478
448479 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
449480 th->dest, iph->saddr, ntohs(th->source),
450
- inet_iif(icmp_skb), 0);
481
+ inet_iif(skb), 0);
451482 if (!sk) {
452483 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
453
- return;
484
+ return -ENOENT;
454485 }
455486 if (sk->sk_state == TCP_TIME_WAIT) {
456487 inet_twsk_put(inet_twsk(sk));
457
- return;
488
+ return 0;
458489 }
459490 seq = ntohl(th->seq);
460
- if (sk->sk_state == TCP_NEW_SYN_RECV)
461
- return tcp_req_err(sk, seq,
462
- type == ICMP_PARAMETERPROB ||
463
- type == ICMP_TIME_EXCEEDED ||
464
- (type == ICMP_DEST_UNREACH &&
465
- (code == ICMP_NET_UNREACH ||
466
- code == ICMP_HOST_UNREACH)));
491
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
492
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
493
+ type == ICMP_TIME_EXCEEDED ||
494
+ (type == ICMP_DEST_UNREACH &&
495
+ (code == ICMP_NET_UNREACH ||
496
+ code == ICMP_HOST_UNREACH)));
497
+ return 0;
498
+ }
467499
468500 bh_lock_sock(sk);
469501 /* If too many ICMPs get dropped on busy
....@@ -483,10 +515,9 @@
483515 goto out;
484516 }
485517
486
- icsk = inet_csk(sk);
487518 tp = tcp_sk(sk);
488519 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
489
- fastopen = tp->fastopen_rsk;
520
+ fastopen = rcu_dereference(tp->fastopen_rsk);
490521 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
491522 if (sk->sk_state != TCP_LISTEN &&
492523 !between(seq, snd_una, tp->snd_nxt)) {
....@@ -497,7 +528,7 @@
497528 switch (type) {
498529 case ICMP_REDIRECT:
499530 if (!sock_owned_by_user(sk))
500
- do_redirect(icmp_skb, sk);
531
+ do_redirect(skb, sk);
501532 goto out;
502533 case ICMP_SOURCE_QUENCH:
503534 /* Just silently ignore these. */
....@@ -528,40 +559,12 @@
528559 }
529560
530561 err = icmp_err_convert[code].errno;
531
- /* check if icmp_skb allows revert of backoff
532
- * (see draft-zimmermann-tcp-lcd) */
533
- if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
534
- break;
535
- if (seq != tp->snd_una || !icsk->icsk_retransmits ||
536
- !icsk->icsk_backoff || fastopen)
537
- break;
538
-
539
- if (sock_owned_by_user(sk))
540
- break;
541
-
542
- skb = tcp_rtx_queue_head(sk);
543
- if (WARN_ON_ONCE(!skb))
544
- break;
545
-
546
- icsk->icsk_backoff--;
547
- icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
548
- TCP_TIMEOUT_INIT;
549
- icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
550
-
551
- tcp_mstamp_refresh(tp);
552
- delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
553
- remaining = icsk->icsk_rto -
554
- usecs_to_jiffies(delta_us);
555
-
556
- if (remaining > 0) {
557
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
558
- remaining, TCP_RTO_MAX);
559
- } else {
560
- /* RTO revert clocked out retransmission.
561
- * Will retransmit now */
562
- tcp_retransmit_timer(sk);
563
- }
564
-
562
+ /* check if this ICMP message allows revert of backoff.
563
+ * (see RFC 6069)
564
+ */
565
+ if (!fastopen &&
566
+ (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
567
+ tcp_ld_RTO_revert(sk, seq);
565568 break;
566569 case ICMP_TIME_EXCEEDED:
567570 err = EHOSTUNREACH;
....@@ -574,10 +577,12 @@
574577 case TCP_SYN_SENT:
575578 case TCP_SYN_RECV:
576579 /* Only in fast or simultaneous open. If a fast open socket is
577
- * is already accepted it is treated as a connected one below.
580
+ * already accepted it is treated as a connected one below.
578581 */
579582 if (fastopen && !fastopen->sk)
580583 break;
584
+
585
+ ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
581586
582587 if (!sock_owned_by_user(sk)) {
583588 sk->sk_err = err;
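The ip_icmp_error() call added above queues the ICMP error on the socket's error queue (only when IP_RECVERR is enabled), so a connecting socket can retrieve the details from userspace. A hedged sketch of that retrieval, with error handling trimmed and the helper name made up:

#include <errno.h>
#include <netinet/in.h>
#include <linux/errqueue.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Returns the ICMP type of a queued error, 0 if none found, or -errno. */
static int read_queued_icmp_error(int fd)
{
	char data[256], cbuf[512];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return -errno;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
			struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
			return ee->ee_type;
		}
	}
	return 0;
}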
....@@ -618,6 +623,7 @@
618623 out:
619624 bh_unlock_sock(sk);
620625 sock_put(sk);
626
+ return 0;
621627 }
622628
623629 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
....@@ -638,7 +644,6 @@
638644 }
639645 EXPORT_SYMBOL(tcp_v4_send_check);
640646
641
-static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
642647 /*
643648 * This routine will send an RST to the other tcp.
644649 *
....@@ -669,8 +674,9 @@
669674 int genhash;
670675 struct sock *sk1 = NULL;
671676 #endif
672
- struct net *net;
677
+ u64 transmit_time = 0;
673678 struct sock *ctl_sk;
679
+ struct net *net;
674680
675681 /* Never send a reset in response to a reset. */
676682 if (th->rst)
....@@ -706,9 +712,21 @@
706712 rcu_read_lock();
707713 hash_location = tcp_parse_md5sig_option(th);
708714 if (sk && sk_fullsock(sk)) {
709
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
710
- &ip_hdr(skb)->saddr, AF_INET);
715
+ const union tcp_md5_addr *addr;
716
+ int l3index;
717
+
718
+ /* sdif set, means packet ingressed via a device
719
+ * in an L3 domain and inet_iif is set to it.
720
+ */
721
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
722
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
723
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
711724 } else if (hash_location) {
725
+ const union tcp_md5_addr *addr;
726
+ int sdif = tcp_v4_sdif(skb);
727
+ int dif = inet_iif(skb);
728
+ int l3index;
729
+
712730 /*
713731 * active side is lost. Try to find listening socket through
714732 * source port, and then find md5 key through listening socket.
....@@ -719,14 +737,17 @@
719737 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
720738 ip_hdr(skb)->saddr,
721739 th->source, ip_hdr(skb)->daddr,
722
- ntohs(th->source), inet_iif(skb),
723
- tcp_v4_sdif(skb));
740
+ ntohs(th->source), dif, sdif);
724741 /* don't send rst if it can't find key */
725742 if (!sk1)
726743 goto out;
727744
728
- key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
729
- &ip_hdr(skb)->saddr, AF_INET);
745
+ /* sdif set, means packet ingressed via a device
746
+ * in an L3 domain and dif is set to it.
747
+ */
748
+ l3index = sdif ? dif : 0;
749
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
750
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
730751 if (!key)
731752 goto out;
732753
....@@ -773,20 +794,23 @@
773794 arg.tos = ip_hdr(skb)->tos;
774795 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
775796 local_bh_disable();
776
- local_lock(tcp_sk_lock);
777
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
778
- if (sk)
797
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
798
+ if (sk) {
779799 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
780800 inet_twsk(sk)->tw_mark : sk->sk_mark;
801
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
802
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
803
+ transmit_time = tcp_transmit_time(sk);
804
+ }
781805 ip_send_unicast_reply(ctl_sk,
782806 skb, &TCP_SKB_CB(skb)->header.h4.opt,
783807 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
784
- &arg, arg.iov[0].iov_len);
808
+ &arg, arg.iov[0].iov_len,
809
+ transmit_time);
785810
786811 ctl_sk->sk_mark = 0;
787812 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
788813 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
789
- local_unlock(tcp_sk_lock);
790814 local_bh_enable();
791815
792816 #ifdef CONFIG_TCP_MD5SIG
....@@ -817,6 +841,7 @@
817841 struct net *net = sock_net(sk);
818842 struct ip_reply_arg arg;
819843 struct sock *ctl_sk;
844
+ u64 transmit_time;
820845
821846 memset(&rep.th, 0, sizeof(struct tcphdr));
822847 memset(&arg, 0, sizeof(arg));
....@@ -867,19 +892,20 @@
867892 arg.tos = tos;
868893 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
869894 local_bh_disable();
870
- local_lock(tcp_sk_lock);
871
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
872
- if (sk)
873
- ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
874
- inet_twsk(sk)->tw_mark : sk->sk_mark;
895
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
896
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
897
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
898
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
899
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
900
+ transmit_time = tcp_transmit_time(sk);
875901 ip_send_unicast_reply(ctl_sk,
876902 skb, &TCP_SKB_CB(skb)->header.h4.opt,
877903 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
878
- &arg, arg.iov[0].iov_len);
904
+ &arg, arg.iov[0].iov_len,
905
+ transmit_time);
879906
880907 ctl_sk->sk_mark = 0;
881908 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
882
- local_unlock(tcp_sk_lock);
883909 local_bh_enable();
884910 }
885911
....@@ -905,6 +931,9 @@
905931 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
906932 struct request_sock *req)
907933 {
934
+ const union tcp_md5_addr *addr;
935
+ int l3index;
936
+
908937 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
909938 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
910939 */
....@@ -916,14 +945,15 @@
916945 * exception of <SYN> segments, MUST be right-shifted by
917946 * Rcv.Wind.Shift bits:
918947 */
948
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
949
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
919950 tcp_v4_send_ack(sk, skb, seq,
920951 tcp_rsk(req)->rcv_nxt,
921952 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
922953 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
923
- req->ts_recent,
954
+ READ_ONCE(req->ts_recent),
924955 0,
925
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
926
- AF_INET),
956
+ tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
927957 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
928958 ip_hdr(skb)->tos);
929959 }
....@@ -937,26 +967,38 @@
937967 struct flowi *fl,
938968 struct request_sock *req,
939969 struct tcp_fastopen_cookie *foc,
940
- enum tcp_synack_type synack_type)
970
+ enum tcp_synack_type synack_type,
971
+ struct sk_buff *syn_skb)
941972 {
942973 const struct inet_request_sock *ireq = inet_rsk(req);
943974 struct flowi4 fl4;
944975 int err = -1;
945976 struct sk_buff *skb;
977
+ u8 tos;
946978
947979 /* First, grab a route. */
948980 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
949981 return -1;
950982
951
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
983
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
952984
953985 if (skb) {
954986 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
955987
988
+ tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
989
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
990
+ (inet_sk(sk)->tos & INET_ECN_MASK) :
991
+ inet_sk(sk)->tos;
992
+
993
+ if (!INET_ECN_is_capable(tos) &&
994
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
995
+ tos |= INET_ECN_ECT_0;
996
+
956997 rcu_read_lock();
957998 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
958999 ireq->ir_rmt_addr,
959
- rcu_dereference(ireq->ireq_opt));
1000
+ rcu_dereference(ireq->ireq_opt),
1001
+ tos);
9601002 rcu_read_unlock();
9611003 err = net_xmit_eval(err);
9621004 }
....@@ -979,10 +1021,27 @@
9791021 * We need to maintain these in the sk structure.
9801022 */
9811023
1024
+DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1025
+EXPORT_SYMBOL(tcp_md5_needed);
1026
+
1027
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1028
+{
1029
+ if (!old)
1030
+ return true;
1031
+
1032
+ /* l3index always overrides non-l3index */
1033
+ if (old->l3index && new->l3index == 0)
1034
+ return false;
1035
+ if (old->l3index == 0 && new->l3index)
1036
+ return true;
1037
+
1038
+ return old->prefixlen < new->prefixlen;
1039
+}
1040
+
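better_md5_match() above changes key selection from "longest prefix wins" to "L3-scoped key wins, then longest prefix": for a lookup with l3index 5, a key bound to that VRF with prefixlen 24 beats an unscoped /32 key. A tiny standalone restatement of that ordering, using demo types rather than the kernel structures:

#include <stdbool.h>

struct demo_key { int l3index; unsigned char prefixlen; };

/* Same ordering as better_md5_match(): VRF-scoped beats unscoped,
 * otherwise the longer prefix wins.
 */
static bool demo_better_match(const struct demo_key *old, const struct demo_key *cand)
{
	if (!old)
		return true;
	if (old->l3index && !cand->l3index)
		return false;
	if (!old->l3index && cand->l3index)
		return true;
	return old->prefixlen < cand->prefixlen;
}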
9821041 /* Find the Key structure for an address. */
983
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
984
- const union tcp_md5_addr *addr,
985
- int family)
1042
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1043
+ const union tcp_md5_addr *addr,
1044
+ int family)
9861045 {
9871046 const struct tcp_sock *tp = tcp_sk(sk);
9881047 struct tcp_md5sig_key *key;
....@@ -997,10 +1056,12 @@
9971056 if (!md5sig)
9981057 return NULL;
9991058
1000
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1059
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
1060
+ lockdep_sock_is_held(sk)) {
10011061 if (key->family != family)
10021062 continue;
1003
-
1063
+ if (key->l3index && key->l3index != l3index)
1064
+ continue;
10041065 if (family == AF_INET) {
10051066 mask = inet_make_mask(key->prefixlen);
10061067 match = (key->addr.a4.s_addr & mask) ==
....@@ -1014,17 +1075,17 @@
10141075 match = false;
10151076 }
10161077
1017
- if (match && (!best_match ||
1018
- key->prefixlen > best_match->prefixlen))
1078
+ if (match && better_md5_match(best_match, key))
10191079 best_match = key;
10201080 }
10211081 return best_match;
10221082 }
1023
-EXPORT_SYMBOL(tcp_md5_do_lookup);
1083
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
10241084
10251085 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
10261086 const union tcp_md5_addr *addr,
1027
- int family, u8 prefixlen)
1087
+ int family, u8 prefixlen,
1088
+ int l3index)
10281089 {
10291090 const struct tcp_sock *tp = tcp_sk(sk);
10301091 struct tcp_md5sig_key *key;
....@@ -1040,8 +1101,11 @@
10401101 if (family == AF_INET6)
10411102 size = sizeof(struct in6_addr);
10421103 #endif
1043
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1104
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105
+ lockdep_sock_is_held(sk)) {
10441106 if (key->family != family)
1107
+ continue;
1108
+ if (key->l3index != l3index)
10451109 continue;
10461110 if (!memcmp(&key->addr, addr, size) &&
10471111 key->prefixlen == prefixlen)
....@@ -1054,28 +1118,34 @@
10541118 const struct sock *addr_sk)
10551119 {
10561120 const union tcp_md5_addr *addr;
1121
+ int l3index;
10571122
1123
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124
+ addr_sk->sk_bound_dev_if);
10581125 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1059
- return tcp_md5_do_lookup(sk, addr, AF_INET);
1126
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
10601127 }
10611128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
10621129
10631130 /* This can be called on a newly created socket, from other files */
10641131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1065
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1066
- gfp_t gfp)
1132
+ int family, u8 prefixlen, int l3index,
1133
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
10671134 {
10681135 /* Add Key to the list */
10691136 struct tcp_md5sig_key *key;
10701137 struct tcp_sock *tp = tcp_sk(sk);
10711138 struct tcp_md5sig_info *md5sig;
10721139
1073
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1140
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
10741141 if (key) {
10751142 /* Pre-existing entry - just update that one.
10761143 * Note that the key might be used concurrently.
1144
+ * data_race() is telling kcsan that we do not care of
1145
+ * key mismatches, since changing MD5 key on live flows
1146
+ * can lead to packet drops.
10771147 */
1078
- memcpy(key->key, newkey, newkeylen);
1148
+ data_race(memcpy(key->key, newkey, newkeylen));
10791149
10801150 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
10811151 * Also note that a reader could catch new key->keylen value
....@@ -1111,6 +1181,7 @@
11111181 key->keylen = newkeylen;
11121182 key->family = family;
11131183 key->prefixlen = prefixlen;
1184
+ key->l3index = l3index;
11141185 memcpy(&key->addr, addr,
11151186 (family == AF_INET6) ? sizeof(struct in6_addr) :
11161187 sizeof(struct in_addr));
....@@ -1120,11 +1191,11 @@
11201191 EXPORT_SYMBOL(tcp_md5_do_add);
11211192
11221193 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1123
- u8 prefixlen)
1194
+ u8 prefixlen, int l3index)
11241195 {
11251196 struct tcp_md5sig_key *key;
11261197
1127
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1198
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
11281199 if (!key)
11291200 return -ENOENT;
11301201 hlist_del_rcu(&key->node);
....@@ -1151,16 +1222,18 @@
11511222 }
11521223
11531224 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1154
- char __user *optval, int optlen)
1225
+ sockptr_t optval, int optlen)
11551226 {
11561227 struct tcp_md5sig cmd;
11571228 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229
+ const union tcp_md5_addr *addr;
11581230 u8 prefixlen = 32;
1231
+ int l3index = 0;
11591232
11601233 if (optlen < sizeof(cmd))
11611234 return -EINVAL;
11621235
1163
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
1236
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
11641237 return -EFAULT;
11651238
11661239 if (sin->sin_family != AF_INET)
....@@ -1173,16 +1246,34 @@
11731246 return -EINVAL;
11741247 }
11751248
1249
+ if (optname == TCP_MD5SIG_EXT &&
1250
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251
+ struct net_device *dev;
1252
+
1253
+ rcu_read_lock();
1254
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255
+ if (dev && netif_is_l3_master(dev))
1256
+ l3index = dev->ifindex;
1257
+
1258
+ rcu_read_unlock();
1259
+
1260
+ /* ok to reference set/not set outside of rcu;
1261
+ * right now device MUST be an L3 master
1262
+ */
1263
+ if (!dev || !l3index)
1264
+ return -EINVAL;
1265
+ }
1266
+
1267
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268
+
11761269 if (!cmd.tcpm_keylen)
1177
- return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1178
- AF_INET, prefixlen);
1270
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
11791271
11801272 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
11811273 return -EINVAL;
11821274
1183
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1184
- AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1185
- GFP_KERNEL);
1275
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276
+ cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
11861277 }
11871278
11881279 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
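The TCP_MD5SIG_FLAG_IFINDEX handling above lets userspace scope an MD5 key to an L3 master (VRF) device; anything other than an l3mdev ifindex is rejected with -EINVAL. A sketch of installing such a key, assuming a uapi that exposes TCP_MD5SIG_EXT, TCP_MD5SIG_FLAG_IFINDEX and the tcpm_ifindex member (linux/tcp.h does); the helper name is made up:

#include <string.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/tcp.h>

static int add_md5_key_for_vrf(int fd, const struct sockaddr_in *peer,
			       const char *vrf_name, const char *key)
{
	struct tcp_md5sig md5 = { 0 };

	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_flags   = TCP_MD5SIG_FLAG_IFINDEX;
	md5.tcpm_ifindex = if_nametoindex(vrf_name); /* must be an L3 master */
	md5.tcpm_keylen  = strlen(key);              /* <= TCP_MD5SIG_MAXKEYLEN */
	memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
}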
....@@ -1292,7 +1383,8 @@
12921383
12931384 /* Called with rcu_read_lock() */
12941385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1295
- const struct sk_buff *skb)
1386
+ const struct sk_buff *skb,
1387
+ int dif, int sdif)
12961388 {
12971389 #ifdef CONFIG_TCP_MD5SIG
12981390 /*
....@@ -1307,11 +1399,17 @@
13071399 struct tcp_md5sig_key *hash_expected;
13081400 const struct iphdr *iph = ip_hdr(skb);
13091401 const struct tcphdr *th = tcp_hdr(skb);
1310
- int genhash;
1402
+ const union tcp_md5_addr *addr;
13111403 unsigned char newhash[16];
1404
+ int genhash, l3index;
13121405
1313
- hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1314
- AF_INET);
1406
+ /* sdif set, means packet ingressed via a device
1407
+ * in an L3 domain and dif is set to the l3mdev
1408
+ */
1409
+ l3index = sdif ? dif : 0;
1410
+
1411
+ addr = (union tcp_md5_addr *)&iph->saddr;
1412
+ hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
13151413 hash_location = tcp_parse_md5sig_option(th);
13161414
13171415 /* We've parsed the options - do we have a hash? */
....@@ -1337,11 +1435,11 @@
13371435
13381436 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
13391437 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1340
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1438
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
13411439 &iph->saddr, ntohs(th->source),
13421440 &iph->daddr, ntohs(th->dest),
13431441 genhash ? " tcp_v4_calc_md5_hash failed"
1344
- : "");
1442
+ : "", l3index);
13451443 return true;
13461444 }
13471445 return false;
....@@ -1378,7 +1476,7 @@
13781476 .syn_ack_timeout = tcp_syn_ack_timeout,
13791477 };
13801478
1381
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1479
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
13821480 .mss_clamp = TCP_MSS_DEFAULT,
13831481 #ifdef CONFIG_TCP_MD5SIG
13841482 .req_md5_lookup = tcp_v4_md5_lookup,
....@@ -1421,11 +1519,14 @@
14211519 bool *own_req)
14221520 {
14231521 struct inet_request_sock *ireq;
1522
+ bool found_dup_sk = false;
14241523 struct inet_sock *newinet;
14251524 struct tcp_sock *newtp;
14261525 struct sock *newsk;
14271526 #ifdef CONFIG_TCP_MD5SIG
1527
+ const union tcp_md5_addr *addr;
14281528 struct tcp_md5sig_key *key;
1529
+ int l3index;
14291530 #endif
14301531 struct ip_options_rcu *inet_opt;
14311532
....@@ -1456,6 +1557,12 @@
14561557 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
14571558 newinet->inet_id = prandom_u32();
14581559
1560
+ /* Set ToS of the new socket based upon the value of incoming SYN.
1561
+ * ECT bits are set later in tcp_init_transfer().
1562
+ */
1563
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1564
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1565
+
14591566 if (!dst) {
14601567 dst = inet_csk_route_child_sock(sk, newsk, req);
14611568 if (!dst)
....@@ -1473,9 +1580,10 @@
14731580 tcp_initialize_rcv_mss(newsk);
14741581
14751582 #ifdef CONFIG_TCP_MD5SIG
1583
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
14761584 /* Copy over the MD5 key from the original socket */
1477
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1478
- AF_INET);
1585
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1586
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
14791587 if (key) {
14801588 /*
14811589 * We're using one, so create a matching key
....@@ -1483,20 +1591,30 @@
14831591 * memory, then we end up not copying the key
14841592 * across. Shucks.
14851593 */
1486
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1487
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1594
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1595
+ key->key, key->keylen, GFP_ATOMIC);
14881596 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
14891597 }
14901598 #endif
14911599
14921600 if (__inet_inherit_port(sk, newsk) < 0)
14931601 goto put_and_exit;
1494
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1602
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1603
+ &found_dup_sk);
14951604 if (likely(*own_req)) {
14961605 tcp_move_syn(newtp, req);
14971606 ireq->ireq_opt = NULL;
14981607 } else {
14991608 newinet->inet_opt = NULL;
1609
+
1610
+ if (!req_unhash && found_dup_sk) {
1611
+ /* This code path should only be executed in the
1612
+ * syncookie case only
1613
+ */
1614
+ bh_unlock_sock(newsk);
1615
+ sock_put(newsk);
1616
+ newsk = NULL;
1617
+ }
15001618 }
15011619 return newsk;
15021620
....@@ -1526,6 +1644,21 @@
15261644 return sk;
15271645 }
15281646
1647
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1648
+ struct tcphdr *th, u32 *cookie)
1649
+{
1650
+ u16 mss = 0;
1651
+#ifdef CONFIG_SYN_COOKIES
1652
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1653
+ &tcp_request_sock_ipv4_ops, sk, th);
1654
+ if (mss) {
1655
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1656
+ tcp_synq_overflow(sk);
1657
+ }
1658
+#endif
1659
+ return mss;
1660
+}
1661
+
15291662 /* The socket must have it's spinlock held when we get
15301663 * here, unless it is a TCP_LISTEN socket.
15311664 *
....@@ -1539,15 +1672,18 @@
15391672 struct sock *rsk;
15401673
15411674 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1542
- struct dst_entry *dst = sk->sk_rx_dst;
1675
+ struct dst_entry *dst;
1676
+
1677
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
1678
+ lockdep_sock_is_held(sk));
15431679
15441680 sock_rps_save_rxhash(sk, skb);
15451681 sk_mark_napi_id(sk, skb);
15461682 if (dst) {
15471683 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
15481684 !dst->ops->check(dst, 0)) {
1685
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
15491686 dst_release(dst);
1550
- sk->sk_rx_dst = NULL;
15511687 }
15521688 }
15531689 tcp_rcv_established(sk, skb);
....@@ -1622,7 +1758,7 @@
16221758 skb->sk = sk;
16231759 skb->destructor = sock_edemux;
16241760 if (sk_fullsock(sk)) {
1625
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1761
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
16261762
16271763 if (dst)
16281764 dst = dst_check(dst, 0);
....@@ -1636,13 +1772,16 @@
16361772
16371773 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
16381774 {
1639
- u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1640
-
1641
- /* Only socket owner can try to collapse/prune rx queues
1642
- * to reduce memory overhead, so add a little headroom here.
1643
- * Few sockets backlog are possibly concurrently non empty.
1644
- */
1645
- limit += 64*1024;
1775
+ u32 limit, tail_gso_size, tail_gso_segs;
1776
+ struct skb_shared_info *shinfo;
1777
+ const struct tcphdr *th;
1778
+ struct tcphdr *thtail;
1779
+ struct sk_buff *tail;
1780
+ unsigned int hdrlen;
1781
+ bool fragstolen;
1782
+ u32 gso_segs;
1783
+ u32 gso_size;
1784
+ int delta;
16461785
16471786 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
16481787 * we can fix skb->truesize to its real value to avoid future drops.
....@@ -1651,6 +1790,98 @@
16511790 * (if cooked by drivers without copybreak feature).
16521791 */
16531792 skb_condense(skb);
1793
+
1794
+ skb_dst_drop(skb);
1795
+
1796
+ if (unlikely(tcp_checksum_complete(skb))) {
1797
+ bh_unlock_sock(sk);
1798
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1799
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1800
+ return true;
1801
+ }
1802
+
1803
+ /* Attempt coalescing to last skb in backlog, even if we are
1804
+ * above the limits.
1805
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1806
+ */
1807
+ th = (const struct tcphdr *)skb->data;
1808
+ hdrlen = th->doff * 4;
1809
+
1810
+ tail = sk->sk_backlog.tail;
1811
+ if (!tail)
1812
+ goto no_coalesce;
1813
+ thtail = (struct tcphdr *)tail->data;
1814
+
1815
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1816
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1817
+ ((TCP_SKB_CB(tail)->tcp_flags |
1818
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1819
+ !((TCP_SKB_CB(tail)->tcp_flags &
1820
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1821
+ ((TCP_SKB_CB(tail)->tcp_flags ^
1822
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1823
+#ifdef CONFIG_TLS_DEVICE
1824
+ tail->decrypted != skb->decrypted ||
1825
+#endif
1826
+ thtail->doff != th->doff ||
1827
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1828
+ goto no_coalesce;
1829
+
1830
+ __skb_pull(skb, hdrlen);
1831
+
1832
+ shinfo = skb_shinfo(skb);
1833
+ gso_size = shinfo->gso_size ?: skb->len;
1834
+ gso_segs = shinfo->gso_segs ?: 1;
1835
+
1836
+ shinfo = skb_shinfo(tail);
1837
+ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1838
+ tail_gso_segs = shinfo->gso_segs ?: 1;
1839
+
1840
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1841
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1842
+
1843
+ if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1844
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1845
+ thtail->window = th->window;
1846
+ }
1847
+
1848
+ /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1849
+ * thtail->fin, so that the fast path in tcp_rcv_established()
1850
+ * is not entered if we append a packet with a FIN.
1851
+ * SYN, RST, URG are not present.
1852
+ * ACK is set on both packets.
1853
+ * PSH : we do not really care in TCP stack,
1854
+ * at least for 'GRO' packets.
1855
+ */
1856
+ thtail->fin |= th->fin;
1857
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1858
+
1859
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
1860
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
1861
+ tail->tstamp = skb->tstamp;
1862
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1863
+ }
1864
+
1865
+ /* Not as strict as GRO. We only need to carry mss max value */
1866
+ shinfo->gso_size = max(gso_size, tail_gso_size);
1867
+ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1868
+
1869
+ sk->sk_backlog.len += delta;
1870
+ __NET_INC_STATS(sock_net(sk),
1871
+ LINUX_MIB_TCPBACKLOGCOALESCE);
1872
+ kfree_skb_partial(skb, fragstolen);
1873
+ return false;
1874
+ }
1875
+ __skb_push(skb, hdrlen);
1876
+
1877
+no_coalesce:
1878
+ limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1879
+
1880
+ /* Only socket owner can try to collapse/prune rx queues
1881
+ * to reduce memory overhead, so add a little headroom here.
1882
+ * Few sockets backlog are possibly concurrently non empty.
1883
+ */
1884
+ limit += 64 * 1024;
16541885
16551886 if (unlikely(sk_add_backlog(sk, skb, limit))) {
16561887 bh_unlock_sock(sk);
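The backlog admission limit above changes from sk_rcvbuf + sk_sndbuf (plus 64 KB of headroom) to sk_rcvbuf + sk_sndbuf/2 (plus the same headroom), with READ_ONCE() since both buffer sizes can change underneath. Rough arithmetic with illustrative buffer values:

#include <stdio.h>

int main(void)
{
	unsigned int rcvbuf = 131072; /* e.g. a default SO_RCVBUF */
	unsigned int sndbuf = 87040;  /* e.g. a default SO_SNDBUF */

	unsigned int old_limit = rcvbuf + sndbuf + 64 * 1024;
	unsigned int new_limit = rcvbuf + (sndbuf >> 1) + 64 * 1024;

	printf("old backlog limit: %u bytes\n", old_limit);
	printf("new backlog limit: %u bytes\n", new_limit);
	return 0;
}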
....@@ -1704,7 +1935,9 @@
17041935 int tcp_v4_rcv(struct sk_buff *skb)
17051936 {
17061937 struct net *net = dev_net(skb->dev);
1938
+ struct sk_buff *skb_to_free;
17071939 int sdif = inet_sdif(skb);
1940
+ int dif = inet_iif(skb);
17081941 const struct iphdr *iph;
17091942 const struct tcphdr *th;
17101943 bool refcounted;
....@@ -1753,7 +1986,8 @@
17531986 struct sock *nsk;
17541987
17551988 sk = req->rsk_listener;
1756
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1989
+ if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
1990
+ tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
17571991 sk_drops_add(sk, skb);
17581992 reqsk_put(req);
17591993 goto discard_it;
....@@ -1792,6 +2026,7 @@
17922026 }
17932027 goto discard_and_relse;
17942028 }
2029
+ nf_reset_ct(skb);
17952030 if (nsk == sk) {
17962031 reqsk_put(req);
17972032 tcp_v4_restore_cb(skb);
....@@ -1811,10 +2046,10 @@
18112046 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
18122047 goto discard_and_relse;
18132048
1814
- if (tcp_v4_inbound_md5_hash(sk, skb))
2049
+ if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
18152050 goto discard_and_relse;
18162051
1817
- nf_reset(skb);
2052
+ nf_reset_ct(skb);
18182053
18192054 if (tcp_filter(sk, skb))
18202055 goto discard_and_relse;
....@@ -1835,11 +2070,17 @@
18352070 tcp_segs_in(tcp_sk(sk), skb);
18362071 ret = 0;
18372072 if (!sock_owned_by_user(sk)) {
2073
+ skb_to_free = sk->sk_rx_skb_cache;
2074
+ sk->sk_rx_skb_cache = NULL;
18382075 ret = tcp_v4_do_rcv(sk, skb);
1839
- } else if (tcp_add_backlog(sk, skb)) {
1840
- goto discard_and_relse;
2076
+ } else {
2077
+ if (tcp_add_backlog(sk, skb))
2078
+ goto discard_and_relse;
2079
+ skb_to_free = NULL;
18412080 }
18422081 bh_unlock_sock(sk);
2082
+ if (skb_to_free)
2083
+ __kfree_skb(skb_to_free);
18432084
18442085 put_and_return:
18452086 if (refcounted)
....@@ -1903,7 +2144,7 @@
19032144 }
19042145 }
19052146 /* to ACK */
1906
- /* fall through */
2147
+ fallthrough;
19072148 case TCP_TW_ACK:
19082149 tcp_v4_timewait_ack(sk, skb);
19092150 break;
....@@ -1927,7 +2168,7 @@
19272168 struct dst_entry *dst = skb_dst(skb);
19282169
19292170 if (dst && dst_hold_safe(dst)) {
1930
- sk->sk_rx_dst = dst;
2171
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
19312172 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
19322173 }
19332174 }
....@@ -1945,10 +2186,6 @@
19452186 .getsockopt = ip_getsockopt,
19462187 .addr2sockaddr = inet_csk_addr2sockaddr,
19472188 .sockaddr_len = sizeof(struct sockaddr_in),
1948
-#ifdef CONFIG_COMPAT
1949
- .compat_setsockopt = compat_ip_setsockopt,
1950
- .compat_getsockopt = compat_ip_getsockopt,
1951
-#endif
19522189 .mtu_reduced = tcp_v4_mtu_reduced,
19532190 };
19542191 EXPORT_SYMBOL(ipv4_specific);
....@@ -2013,7 +2250,7 @@
20132250 if (inet_csk(sk)->icsk_bind_hash)
20142251 inet_put_port(sk);
20152252
2016
- BUG_ON(tp->fastopen_rsk);
2253
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
20172254
20182255 /* If socket is aborted during connect operation */
20192256 tcp_free_fastopen_req(tp);
....@@ -2034,12 +2271,17 @@
20342271 */
20352272 static void *listening_get_next(struct seq_file *seq, void *cur)
20362273 {
2037
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2274
+ struct tcp_seq_afinfo *afinfo;
20382275 struct tcp_iter_state *st = seq->private;
20392276 struct net *net = seq_file_net(seq);
20402277 struct inet_listen_hashbucket *ilb;
20412278 struct hlist_nulls_node *node;
20422279 struct sock *sk = cur;
2280
+
2281
+ if (st->bpf_seq_afinfo)
2282
+ afinfo = st->bpf_seq_afinfo;
2283
+ else
2284
+ afinfo = PDE_DATA(file_inode(seq->file));
20432285
20442286 if (!sk) {
20452287 get_head:
....@@ -2058,7 +2300,8 @@
20582300 sk_nulls_for_each_from(sk, node) {
20592301 if (!net_eq(sock_net(sk), net))
20602302 continue;
2061
- if (sk->sk_family == afinfo->family)
2303
+ if (afinfo->family == AF_UNSPEC ||
2304
+ sk->sk_family == afinfo->family)
20622305 return sk;
20632306 }
20642307 spin_unlock(&ilb->lock);
....@@ -2095,10 +2338,15 @@
20952338 */
20962339 static void *established_get_first(struct seq_file *seq)
20972340 {
2098
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2341
+ struct tcp_seq_afinfo *afinfo;
20992342 struct tcp_iter_state *st = seq->private;
21002343 struct net *net = seq_file_net(seq);
21012344 void *rc = NULL;
2345
+
2346
+ if (st->bpf_seq_afinfo)
2347
+ afinfo = st->bpf_seq_afinfo;
2348
+ else
2349
+ afinfo = PDE_DATA(file_inode(seq->file));
21022350
21032351 st->offset = 0;
21042352 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
....@@ -2112,7 +2360,8 @@
21122360
21132361 spin_lock_bh(lock);
21142362 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2115
- if (sk->sk_family != afinfo->family ||
2363
+ if ((afinfo->family != AF_UNSPEC &&
2364
+ sk->sk_family != afinfo->family) ||
21162365 !net_eq(sock_net(sk), net)) {
21172366 continue;
21182367 }
....@@ -2127,11 +2376,16 @@
21272376
21282377 static void *established_get_next(struct seq_file *seq, void *cur)
21292378 {
2130
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2379
+ struct tcp_seq_afinfo *afinfo;
21312380 struct sock *sk = cur;
21322381 struct hlist_nulls_node *node;
21332382 struct tcp_iter_state *st = seq->private;
21342383 struct net *net = seq_file_net(seq);
2384
+
2385
+ if (st->bpf_seq_afinfo)
2386
+ afinfo = st->bpf_seq_afinfo;
2387
+ else
2388
+ afinfo = PDE_DATA(file_inode(seq->file));
21352389
21362390 ++st->num;
21372391 ++st->offset;
....@@ -2139,7 +2393,8 @@
21392393 sk = sk_nulls_next(sk);
21402394
21412395 sk_nulls_for_each_from(sk, node) {
2142
- if (sk->sk_family == afinfo->family &&
2396
+ if ((afinfo->family == AF_UNSPEC ||
2397
+ sk->sk_family == afinfo->family) &&
21432398 net_eq(sock_net(sk), net))
21442399 return sk;
21452400 }
....@@ -2200,7 +2455,7 @@
22002455 break;
22012456 st->bucket = 0;
22022457 st->state = TCP_SEQ_STATE_ESTABLISHED;
2203
- /* Fallthrough */
2458
+ fallthrough;
22042459 case TCP_SEQ_STATE_ESTABLISHED:
22052460 if (st->bucket > tcp_hashinfo.ehash_mask)
22062461 break;
....@@ -2344,7 +2599,7 @@
23442599
23452600 state = inet_sk_state_load(sk);
23462601 if (state == TCP_LISTEN)
2347
- rx_queue = sk->sk_ack_backlog;
2602
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
23482603 else
23492604 /* Because we don't lock the socket,
23502605 * we might find a transient negative value.
....@@ -2366,7 +2621,7 @@
23662621 refcount_read(&sk->sk_refcnt), sk,
23672622 jiffies_to_clock_t(icsk->icsk_rto),
23682623 jiffies_to_clock_t(icsk->icsk_ack.ato),
2369
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2624
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
23702625 tp->snd_cwnd,
23712626 state == TCP_LISTEN ?
23722627 fastopenq->max_qlen :
....@@ -2418,6 +2673,74 @@
24182673 seq_pad(seq, '\n');
24192674 return 0;
24202675 }
2676
+
2677
+#ifdef CONFIG_BPF_SYSCALL
2678
+struct bpf_iter__tcp {
2679
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
2680
+ __bpf_md_ptr(struct sock_common *, sk_common);
2681
+ uid_t uid __aligned(8);
2682
+};
2683
+
2684
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2685
+ struct sock_common *sk_common, uid_t uid)
2686
+{
2687
+ struct bpf_iter__tcp ctx;
2688
+
2689
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
2690
+ ctx.meta = meta;
2691
+ ctx.sk_common = sk_common;
2692
+ ctx.uid = uid;
2693
+ return bpf_iter_run_prog(prog, &ctx);
2694
+}
2695
+
2696
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2697
+{
2698
+ struct bpf_iter_meta meta;
2699
+ struct bpf_prog *prog;
2700
+ struct sock *sk = v;
2701
+ uid_t uid;
2702
+
2703
+ if (v == SEQ_START_TOKEN)
2704
+ return 0;
2705
+
2706
+ if (sk->sk_state == TCP_TIME_WAIT) {
2707
+ uid = 0;
2708
+ } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2709
+ const struct request_sock *req = v;
2710
+
2711
+ uid = from_kuid_munged(seq_user_ns(seq),
2712
+ sock_i_uid(req->rsk_listener));
2713
+ } else {
2714
+ uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2715
+ }
2716
+
2717
+ meta.seq = seq;
2718
+ prog = bpf_iter_get_info(&meta, false);
2719
+ return tcp_prog_seq_show(prog, &meta, v, uid);
2720
+}
2721
+
2722
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2723
+{
2724
+ struct bpf_iter_meta meta;
2725
+ struct bpf_prog *prog;
2726
+
2727
+ if (!v) {
2728
+ meta.seq = seq;
2729
+ prog = bpf_iter_get_info(&meta, true);
2730
+ if (prog)
2731
+ (void)tcp_prog_seq_show(prog, &meta, v, 0);
2732
+ }
2733
+
2734
+ tcp_seq_stop(seq, v);
2735
+}
2736
+
2737
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
2738
+ .show = bpf_iter_tcp_seq_show,
2739
+ .start = tcp_seq_start,
2740
+ .next = tcp_seq_next,
2741
+ .stop = bpf_iter_tcp_seq_stop,
2742
+};
2743
+#endif
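bpf_iter_tcp_seq_show() above hands each socket to a BPF program together with the owning uid, using the bpf_iter__tcp context defined earlier in this patch. A minimal sketch of the BPF side, assuming a BTF-enabled kernel, vmlinux.h and libbpf's SEC()/BPF_SEQ_PRINTF conveniences (on older trees BPF_SEQ_PRINTF lives in the selftests' bpf_iter.h); program and file names are made up:

/* bpf_iter_tcp.bpf.c -- hypothetical consumer, not part of this patch. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define AF_INET 2 /* not emitted into vmlinux.h */

char LICENSE[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp4(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!skc || skc->skc_family != AF_INET)
		return 0;

	BPF_SEQ_PRINTF(seq, "%pI4:%u -> %pI4 uid %u\n",
		       &skc->skc_rcv_saddr, skc->skc_num,
		       &skc->skc_daddr, ctx->uid);
	return 0;
}

Such an object can then be pinned with "bpftool iter pin bpf_iter_tcp.bpf.o /sys/fs/bpf/tcp_dump" and read back with cat, which drives the seq_ops registered above.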
24212744
24222745 static const struct seq_operations tcp4_seq_ops = {
24232746 .show = tcp4_seq_show,
....@@ -2499,10 +2822,6 @@
24992822 .rsk_prot = &tcp_request_sock_ops,
25002823 .h.hashinfo = &tcp_hashinfo,
25012824 .no_autobind = true,
2502
-#ifdef CONFIG_COMPAT
2503
- .compat_setsockopt = compat_tcp_setsockopt,
2504
- .compat_getsockopt = compat_tcp_getsockopt,
2505
-#endif
25062825 .diag_destroy = tcp_abort,
25072826 };
25082827 EXPORT_SYMBOL(tcp_prot);
....@@ -2512,7 +2831,8 @@
25122831 int cpu;
25132832
25142833 if (net->ipv4.tcp_congestion_control)
2515
- module_put(net->ipv4.tcp_congestion_control->owner);
2834
+ bpf_module_put(net->ipv4.tcp_congestion_control,
2835
+ net->ipv4.tcp_congestion_control->owner);
25162836
25172837 for_each_possible_cpu(cpu)
25182838 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
....@@ -2551,6 +2871,7 @@
25512871 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
25522872 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
25532873 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2874
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
25542875
25552876 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
25562877 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
....@@ -2566,12 +2887,13 @@
25662887 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
25672888 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
25682889 net->ipv4.sysctl_tcp_tw_reuse = 2;
2890
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
25692891
25702892 cnt = tcp_hashinfo.ehash_mask + 1;
2571
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2893
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
25722894 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
25732895
2574
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2896
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
25752897 net->ipv4.sysctl_tcp_sack = 1;
25762898 net->ipv4.sysctl_tcp_window_scaling = 1;
25772899 net->ipv4.sysctl_tcp_timestamps = 1;
....@@ -2590,8 +2912,8 @@
25902912 * which are too large can cause TCP streams to be bursty.
25912913 */
25922914 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2593
- /* Default TSQ limit of four TSO segments */
2594
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2915
+ /* Default TSQ limit of 16 TSO segments */
2916
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
25952917 /* rfc5961 challenge ack rate limiting */
25962918 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
25972919 net->ipv4.sysctl_tcp_min_tso_segs = 2;
....@@ -2609,15 +2931,17 @@
26092931 sizeof(init_net.ipv4.sysctl_tcp_wmem));
26102932 }
26112933 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2934
+ net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
26122935 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
26132936 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
26142937 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2615
- net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2938
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
26162939 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
26172940
26182941 /* Reno is always built in */
26192942 if (!net_eq(net, &init_net) &&
2620
- try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2943
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2944
+ init_net.ipv4.tcp_congestion_control->owner))
26212945 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
26222946 else
26232947 net->ipv4.tcp_congestion_control = &tcp_reno;
....@@ -2645,8 +2969,68 @@
26452969 .exit_batch = tcp_sk_exit_batch,
26462970 };
26472971
2972
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2973
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2974
+ struct sock_common *sk_common, uid_t uid)
2975
+
2976
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2977
+{
2978
+ struct tcp_iter_state *st = priv_data;
2979
+ struct tcp_seq_afinfo *afinfo;
2980
+ int ret;
2981
+
2982
+ afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2983
+ if (!afinfo)
2984
+ return -ENOMEM;
2985
+
2986
+ afinfo->family = AF_UNSPEC;
2987
+ st->bpf_seq_afinfo = afinfo;
2988
+ ret = bpf_iter_init_seq_net(priv_data, aux);
2989
+ if (ret)
2990
+ kfree(afinfo);
2991
+ return ret;
2992
+}
2993
+
2994
+static void bpf_iter_fini_tcp(void *priv_data)
2995
+{
2996
+ struct tcp_iter_state *st = priv_data;
2997
+
2998
+ kfree(st->bpf_seq_afinfo);
2999
+ bpf_iter_fini_seq_net(priv_data);
3000
+}
3001
+
3002
+static const struct bpf_iter_seq_info tcp_seq_info = {
3003
+ .seq_ops = &bpf_iter_tcp_seq_ops,
3004
+ .init_seq_private = bpf_iter_init_tcp,
3005
+ .fini_seq_private = bpf_iter_fini_tcp,
3006
+ .seq_priv_size = sizeof(struct tcp_iter_state),
3007
+};
3008
+
3009
+static struct bpf_iter_reg tcp_reg_info = {
3010
+ .target = "tcp",
3011
+ .ctx_arg_info_size = 1,
3012
+ .ctx_arg_info = {
3013
+ { offsetof(struct bpf_iter__tcp, sk_common),
3014
+ PTR_TO_BTF_ID_OR_NULL },
3015
+ },
3016
+ .seq_info = &tcp_seq_info,
3017
+};
3018
+
3019
+static void __init bpf_iter_register(void)
3020
+{
3021
+ tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3022
+ if (bpf_iter_reg_target(&tcp_reg_info))
3023
+ pr_warn("Warning: could not register bpf iterator tcp\n");
3024
+}
3025
+
3026
+#endif
3027
+
26483028 void __init tcp_v4_init(void)
26493029 {
26503030 if (register_pernet_subsys(&tcp_sk_ops))
26513031 panic("Failed to create the TCP control socket.\n");
3032
+
3033
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3034
+ bpf_iter_register();
3035
+#endif
26523036 }