2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/ipv4/tcp_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
@@ -7,18 +8,12 @@
  *
  *		IPv4 specific functions
  *
- *
  *		code split from:
  *		linux/ipv4/tcp.c
  *		linux/ipv4/tcp_input.c
  *		linux/ipv4/tcp_output.c
  *
  *		See tcp.c for author information
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
  */
 
 /*
@@ -81,6 +76,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
 
 #include <crypto/hash.h>
 #include <linux/scatterlist.h>
@@ -110,10 +106,10 @@
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 {
+	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 	struct tcp_sock *tp = tcp_sk(sk);
-	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 
 	if (reuse == 2) {
 		/* Still does not detect *everything* that goes through
@@ -126,11 +122,9 @@
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == AF_INET6) {
 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
-			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
-			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
-			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
-			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 				loopback = true;
 		} else
 #endif
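Reviewer note: the new ipv6_addr_v4mapped_loopback() helper folds the two open-coded checks above into one predicate. A minimal user-space sketch of what it tests (the removed code checked exactly this, s6_addr[12] == 127 on a v4-mapped address):

	#include <stdbool.h>
	#include <netinet/in.h>

	/* sketch: true for ::ffff:127.x.y.z, i.e. a v4-mapped loopback */
	static bool v4mapped_loopback(const struct in6_addr *a)
	{
		return IN6_IS_ADDR_V4MAPPED(a) && a->s6_addr[12] == 127;
	}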
@@ -328,6 +322,8 @@
 	 * if necessary.
 	 */
 	tcp_set_state(sk, TCP_CLOSE);
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
 	ip_rt_put(rt);
 	sk->sk_route_caps = 0;
 	inet->inet_dport = 0;
@@ -410,6 +406,46 @@
 }
 EXPORT_SYMBOL(tcp_req_err);
 
+/* TCP-LD (RFC 6069) logic */
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	s32 remaining;
+	u32 delta_us;
+
+	if (sock_owned_by_user(sk))
+		return;
+
+	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
+	    !icsk->icsk_backoff)
+		return;
+
+	skb = tcp_rtx_queue_head(sk);
+	if (WARN_ON_ONCE(!skb))
+		return;
+
+	icsk->icsk_backoff--;
+	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
+	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+	tcp_mstamp_refresh(tp);
+	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
+	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
+
+	if (remaining > 0) {
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  remaining, TCP_RTO_MAX);
+	} else {
+		/* RTO revert clocked out retransmission.
+		 * Will retransmit now.
+		 */
+		tcp_retransmit_timer(sk);
+	}
+}
+EXPORT_SYMBOL(tcp_ld_RTO_revert);
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
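Reviewer note: tcp_ld_RTO_revert() undoes one step of exponential backoff and re-arms the timer from what remains of the reverted RTO. A hedged stand-alone sketch of that arithmetic; rto_backoff() is a hypothetical stand-in for inet_csk_rto_backoff(), with values in milliseconds:

	#include <stdio.h>

	#define RTO_MAX_MS 120000U	/* TCP_RTO_MAX is 120 s */

	/* scale the base RTO by the current backoff count, capped at RTO_MAX */
	static unsigned int rto_backoff(unsigned int base_ms, unsigned int backoff)
	{
		unsigned long long rto = (unsigned long long)base_ms << backoff;

		return rto > RTO_MAX_MS ? RTO_MAX_MS : (unsigned int)rto;
	}

	int main(void)
	{
		unsigned int base_ms = 200, backoff = 4;

		printf("before revert: %u ms\n", rto_backoff(base_ms, backoff));
		backoff--;	/* the icsk->icsk_backoff-- step above */
		printf("after revert:  %u ms\n", rto_backoff(base_ms, backoff));
		return 0;
	}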
@@ -426,43 +462,40 @@
  *
  */
 
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *skb, u32 info)
 {
-	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
-	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
-	struct inet_connection_sock *icsk;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 	struct tcp_sock *tp;
 	struct inet_sock *inet;
-	const int type = icmp_hdr(icmp_skb)->type;
-	const int code = icmp_hdr(icmp_skb)->code;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
 	struct sock *sk;
-	struct sk_buff *skb;
 	struct request_sock *fastopen;
 	u32 seq, snd_una;
-	s32 remaining;
-	u32 delta_us;
 	int err;
-	struct net *net = dev_net(icmp_skb->dev);
+	struct net *net = dev_net(skb->dev);
 
 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 				       th->dest, iph->saddr, ntohs(th->source),
-				       inet_iif(icmp_skb), 0);
+				       inet_iif(skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq,
-				   type == ICMP_PARAMETERPROB ||
-				   type == ICMP_TIME_EXCEEDED ||
-				   (type == ICMP_DEST_UNREACH &&
-				    (code == ICMP_NET_UNREACH ||
-				     code == ICMP_HOST_UNREACH)));
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+				     type == ICMP_TIME_EXCEEDED ||
+				     (type == ICMP_DEST_UNREACH &&
+				      (code == ICMP_NET_UNREACH ||
+				       code == ICMP_HOST_UNREACH)));
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -482,10 +515,9 @@
 		goto out;
 	}
 
-	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
-	fastopen = tp->fastopen_rsk;
+	fastopen = rcu_dereference(tp->fastopen_rsk);
 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 	if (sk->sk_state != TCP_LISTEN &&
 	    !between(seq, snd_una, tp->snd_nxt)) {
@@ -496,7 +528,7 @@
 	switch (type) {
 	case ICMP_REDIRECT:
 		if (!sock_owned_by_user(sk))
-			do_redirect(icmp_skb, sk);
+			do_redirect(skb, sk);
 		goto out;
 	case ICMP_SOURCE_QUENCH:
 		/* Just silently ignore these. */
@@ -527,40 +559,12 @@
 		}
 
 		err = icmp_err_convert[code].errno;
-		/* check if icmp_skb allows revert of backoff
-		 * (see draft-zimmermann-tcp-lcd) */
-		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
-			break;
-		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
-		    !icsk->icsk_backoff || fastopen)
-			break;
-
-		if (sock_owned_by_user(sk))
-			break;
-
-		skb = tcp_rtx_queue_head(sk);
-		if (WARN_ON_ONCE(!skb))
-			break;
-
-		icsk->icsk_backoff--;
-		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
-					       TCP_TIMEOUT_INIT;
-		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
-
-		tcp_mstamp_refresh(tp);
-		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
-		remaining = icsk->icsk_rto -
-			    usecs_to_jiffies(delta_us);
-
-		if (remaining > 0) {
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  remaining, TCP_RTO_MAX);
-		} else {
-			/* RTO revert clocked out retransmission.
-			 * Will retransmit now */
-			tcp_retransmit_timer(sk);
-		}
-
+		/* check if this ICMP message allows revert of backoff.
+		 * (see RFC 6069)
+		 */
+		if (!fastopen &&
+		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
+			tcp_ld_RTO_revert(sk, seq);
 		break;
 	case ICMP_TIME_EXCEEDED:
 		err = EHOSTUNREACH;
@@ -573,10 +577,12 @@
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:
 		/* Only in fast or simultaneous open. If a fast open socket is
-		 * is already accepted it is treated as a connected one below.
+		 * already accepted it is treated as a connected one below.
 		 */
 		if (fastopen && !fastopen->sk)
 			break;
+
+		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
@@ -617,6 +623,7 @@
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -667,8 +674,9 @@
 	int genhash;
 	struct sock *sk1 = NULL;
 #endif
-	struct net *net;
+	u64 transmit_time = 0;
 	struct sock *ctl_sk;
+	struct net *net;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -704,9 +712,21 @@
 	rcu_read_lock();
 	hash_location = tcp_parse_md5sig_option(th);
 	if (sk && sk_fullsock(sk)) {
-		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
-					&ip_hdr(skb)->saddr, AF_INET);
+		const union tcp_md5_addr *addr;
+		int l3index;
+
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and inet_iif is set to it.
+		 */
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 	} else if (hash_location) {
+		const union tcp_md5_addr *addr;
+		int sdif = tcp_v4_sdif(skb);
+		int dif = inet_iif(skb);
+		int l3index;
+
 		/*
 		 * active side is lost. Try to find listening socket through
 		 * source port, and then find md5 key through listening socket.
@@ -717,14 +737,17 @@
 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 					     ip_hdr(skb)->saddr,
 					     th->source, ip_hdr(skb)->daddr,
-					     ntohs(th->source), inet_iif(skb),
-					     tcp_v4_sdif(skb));
+					     ntohs(th->source), dif, sdif);
 		/* don't send rst if it can't find key */
 		if (!sk1)
 			goto out;
 
-		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
-					&ip_hdr(skb)->saddr, AF_INET);
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and dif is set to it.
+		 */
+		l3index = sdif ? dif : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 		if (!key)
 			goto out;
 
@@ -771,14 +794,19 @@
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
-	if (sk)
+	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_priority : sk->sk_priority;
+		transmit_time = tcp_transmit_time(sk);
+	}
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
-			      &arg, arg.iov[0].iov_len);
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
@@ -813,6 +841,7 @@
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
 	struct sock *ctl_sk;
+	u64 transmit_time;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -863,14 +892,17 @@
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
-	if (sk)
-		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
-				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_priority : sk->sk_priority;
+	transmit_time = tcp_transmit_time(sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
-			      &arg, arg.iov[0].iov_len);
+			      &arg, arg.iov[0].iov_len,
+			      transmit_time);
 
 	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
@@ -899,6 +931,9 @@
 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
+	const union tcp_md5_addr *addr;
+	int l3index;
+
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 	 */
@@ -910,14 +945,15 @@
 	 * exception of <SYN> segments, MUST be right-shifted by
 	 * Rcv.Wind.Shift bits:
 	 */
+	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
-			req->ts_recent,
+			READ_ONCE(req->ts_recent),
 			0,
-			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
-					  AF_INET),
+			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos);
 }
@@ -931,26 +967,38 @@
 				      struct flowi *fl,
 				      struct request_sock *req,
 				      struct tcp_fastopen_cookie *foc,
-				      enum tcp_synack_type synack_type)
+				      enum tcp_synack_type synack_type,
+				      struct sk_buff *syn_skb)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct flowi4 fl4;
 	int err = -1;
 	struct sk_buff *skb;
+	u8 tos;
 
 	/* First, grab a route. */
 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
-	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
+		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+				(inet_sk(sk)->tos & INET_ECN_MASK) :
+				inet_sk(sk)->tos;
+
+		if (!INET_ECN_is_capable(tos) &&
+		    tcp_bpf_ca_needs_ecn((struct sock *)req))
+			tos |= INET_ECN_ECT_0;
+
 		rcu_read_lock();
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    rcu_dereference(ireq->ireq_opt));
+					    rcu_dereference(ireq->ireq_opt),
+					    tos);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -973,10 +1021,27 @@
  * We need to maintain these in the sk structure.
  */
 
+DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
+EXPORT_SYMBOL(tcp_md5_needed);
+
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+	if (!old)
+		return true;
+
+	/* l3index always overrides non-l3index */
+	if (old->l3index && new->l3index == 0)
+		return false;
+	if (old->l3index == 0 && new->l3index)
+		return true;
+
+	return old->prefixlen < new->prefixlen;
+}
+
 /* Find the Key structure for an address.  */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
-					 const union tcp_md5_addr *addr,
-					 int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+					   const union tcp_md5_addr *addr,
+					   int family)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
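Reviewer note: better_md5_match() changes key selection from plain longest-prefix to a two-level rule — a key bound to an L3 domain (l3index != 0) always beats an unbound key, and prefix length only breaks ties. A hedged stand-alone restatement (simplified struct, not the kernel's tcp_md5sig_key):

	#include <stdbool.h>

	struct md5_key {	/* hypothetical stand-in for tcp_md5sig_key */
		int l3index;
		unsigned char prefixlen;
	};

	static bool better_match(const struct md5_key *old, const struct md5_key *new)
	{
		if (!old)
			return true;
		if (old->l3index && !new->l3index)	/* bound key always wins */
			return false;
		if (!old->l3index && new->l3index)
			return true;
		return old->prefixlen < new->prefixlen;	/* longest prefix last */
	}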
@@ -991,10 +1056,12 @@
 	if (!md5sig)
 		return NULL;
 
-	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+	hlist_for_each_entry_rcu(key, &md5sig->head, node,
+				 lockdep_sock_is_held(sk)) {
 		if (key->family != family)
 			continue;
-
+		if (key->l3index && key->l3index != l3index)
+			continue;
 		if (family == AF_INET) {
 			mask = inet_make_mask(key->prefixlen);
 			match = (key->addr.a4.s_addr & mask) ==
@@ -1008,17 +1075,17 @@
 			match = false;
 		}
 
-		if (match && (!best_match ||
-			      key->prefixlen > best_match->prefixlen))
+		if (match && better_md5_match(best_match, key))
 			best_match = key;
 	}
 	return best_match;
 }
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
 
 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
 						      const union tcp_md5_addr *addr,
-						      int family, u8 prefixlen)
+						      int family, u8 prefixlen,
+						      int l3index)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
@@ -1034,8 +1101,11 @@
 	if (family == AF_INET6)
 		size = sizeof(struct in6_addr);
 #endif
-	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+	hlist_for_each_entry_rcu(key, &md5sig->head, node,
+				 lockdep_sock_is_held(sk)) {
 		if (key->family != family)
+			continue;
+		if (key->l3index != l3index)
 			continue;
 		if (!memcmp(&key->addr, addr, size) &&
 		    key->prefixlen == prefixlen)
@@ -1048,28 +1118,34 @@
 					 const struct sock *addr_sk)
 {
 	const union tcp_md5_addr *addr;
+	int l3index;
 
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						 addr_sk->sk_bound_dev_if);
 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
-	return tcp_md5_do_lookup(sk, addr, AF_INET);
+	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 }
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 
 /* This can be called on a newly created socket, from other files */
 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
-		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
-		   gfp_t gfp)
+		   int family, u8 prefixlen, int l3index,
+		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
 {
 	/* Add Key to the list */
 	struct tcp_md5sig_key *key;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_info *md5sig;
 
-	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
 	if (key) {
 		/* Pre-existing entry - just update that one.
 		 * Note that the key might be used concurrently.
+		 * data_race() is telling kcsan that we do not care of
+		 * key mismatches, since changing MD5 key on live flows
+		 * can lead to packet drops.
 		 */
-		memcpy(key->key, newkey, newkeylen);
+		data_race(memcpy(key->key, newkey, newkeylen));
 
 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
 		 * Also note that a reader could catch new key->keylen value
@@ -1105,6 +1181,7 @@
 	key->keylen = newkeylen;
 	key->family = family;
 	key->prefixlen = prefixlen;
+	key->l3index = l3index;
 	memcpy(&key->addr, addr,
 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
 				      sizeof(struct in_addr));
@@ -1114,11 +1191,11 @@
 EXPORT_SYMBOL(tcp_md5_do_add);
 
 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
-		   u8 prefixlen)
+		   u8 prefixlen, int l3index)
 {
 	struct tcp_md5sig_key *key;
 
-	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
 	if (!key)
 		return -ENOENT;
 	hlist_del_rcu(&key->node);
@@ -1145,16 +1222,18 @@
 }
 
 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
-				 char __user *optval, int optlen)
+				 sockptr_t optval, int optlen)
 {
 	struct tcp_md5sig cmd;
 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+	const union tcp_md5_addr *addr;
 	u8 prefixlen = 32;
+	int l3index = 0;
 
 	if (optlen < sizeof(cmd))
 		return -EINVAL;
 
-	if (copy_from_user(&cmd, optval, sizeof(cmd)))
+	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
 		return -EFAULT;
 
 	if (sin->sin_family != AF_INET)
@@ -1167,16 +1246,34 @@
 		return -EINVAL;
 	}
 
+	if (optname == TCP_MD5SIG_EXT &&
+	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+		if (dev && netif_is_l3_master(dev))
+			l3index = dev->ifindex;
+
+		rcu_read_unlock();
+
+		/* ok to reference set/not set outside of rcu;
+		 * right now device MUST be an L3 master
+		 */
+		if (!dev || !l3index)
+			return -EINVAL;
+	}
+
+	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
 	if (!cmd.tcpm_keylen)
-		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
-				      AF_INET, prefixlen);
+		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
 
 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 		return -EINVAL;
 
-	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
-			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
-			      GFP_KERNEL);
+	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 }
 
 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
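Reviewer note: with the TCP_MD5SIG_FLAG_IFINDEX handling above, user space can scope an MD5 key to an L3 master (VRF) device. A hedged usage sketch — the VRF name "vrf-blue" is illustrative, and the kernel rejects the call if the ifindex is not an L3 master:

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <net/if.h>

	static int install_vrf_md5_key(int fd, const struct sockaddr_in *peer,
				       const char *key, const char *vrf)
	{
		struct tcp_md5sig md5;

		memset(&md5, 0, sizeof(md5));
		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
		md5.tcpm_flags = TCP_MD5SIG_FLAG_IFINDEX;
		md5.tcpm_ifindex = if_nametoindex(vrf);	/* must be an L3 master */
		md5.tcpm_keylen = strlen(key);		/* <= TCP_MD5SIG_MAXKEYLEN */
		memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT,
				  &md5, sizeof(md5));
	}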
@@ -1286,7 +1383,8 @@
 
 /* Called with rcu_read_lock() */
 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
-				    const struct sk_buff *skb)
+				    const struct sk_buff *skb,
+				    int dif, int sdif)
 {
 #ifdef CONFIG_TCP_MD5SIG
 	/*
@@ -1301,11 +1399,17 @@
 	struct tcp_md5sig_key *hash_expected;
 	const struct iphdr *iph = ip_hdr(skb);
 	const struct tcphdr *th = tcp_hdr(skb);
-	int genhash;
+	const union tcp_md5_addr *addr;
 	unsigned char newhash[16];
+	int genhash, l3index;
 
-	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
-					  AF_INET);
+	/* sdif set, means packet ingressed via a device
+	 * in an L3 domain and dif is set to the l3mdev
+	 */
+	l3index = sdif ? dif : 0;
+
+	addr = (union tcp_md5_addr *)&iph->saddr;
+	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 	hash_location = tcp_parse_md5sig_option(th);
 
 	/* We've parsed the options - do we have a hash? */
@@ -1331,11 +1435,11 @@
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
-		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
 				     &iph->saddr, ntohs(th->source),
 				     &iph->daddr, ntohs(th->dest),
 				     genhash ? " tcp_v4_calc_md5_hash failed"
-					     : "");
+					     : "", l3index);
 		return true;
 	}
 	return false;
@@ -1372,7 +1476,7 @@
 	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.mss_clamp	=	TCP_MSS_DEFAULT,
 #ifdef CONFIG_TCP_MD5SIG
 	.req_md5_lookup	=	tcp_v4_md5_lookup,
@@ -1415,11 +1519,14 @@
 				  bool *own_req)
 {
 	struct inet_request_sock *ireq;
+	bool found_dup_sk = false;
 	struct inet_sock *newinet;
 	struct tcp_sock *newtp;
 	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+	const union tcp_md5_addr *addr;
 	struct tcp_md5sig_key *key;
+	int l3index;
 #endif
 	struct ip_options_rcu *inet_opt;
14251532
....@@ -1450,6 +1557,12 @@
14501557 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
14511558 newinet->inet_id = prandom_u32();
14521559
1560
+ /* Set ToS of the new socket based upon the value of incoming SYN.
1561
+ * ECT bits are set later in tcp_init_transfer().
1562
+ */
1563
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1564
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1565
+
14531566 if (!dst) {
14541567 dst = inet_csk_route_child_sock(sk, newsk, req);
14551568 if (!dst)
....@@ -1467,9 +1580,10 @@
14671580 tcp_initialize_rcv_mss(newsk);
14681581
14691582 #ifdef CONFIG_TCP_MD5SIG
1583
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
14701584 /* Copy over the MD5 key from the original socket */
1471
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1472
- AF_INET);
1585
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1586
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
14731587 if (key) {
14741588 /*
14751589 * We're using one, so create a matching key
....@@ -1477,20 +1591,30 @@
14771591 * memory, then we end up not copying the key
14781592 * across. Shucks.
14791593 */
1480
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1481
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1594
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1595
+ key->key, key->keylen, GFP_ATOMIC);
14821596 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
14831597 }
14841598 #endif
14851599
14861600 if (__inet_inherit_port(sk, newsk) < 0)
14871601 goto put_and_exit;
1488
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1602
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1603
+ &found_dup_sk);
14891604 if (likely(*own_req)) {
14901605 tcp_move_syn(newtp, req);
14911606 ireq->ireq_opt = NULL;
14921607 } else {
14931608 newinet->inet_opt = NULL;
1609
+
1610
+ if (!req_unhash && found_dup_sk) {
1611
+ /* This code path should only be executed in the
1612
+ * syncookie case only
1613
+ */
1614
+ bh_unlock_sock(newsk);
1615
+ sock_put(newsk);
1616
+ newsk = NULL;
1617
+ }
14941618 }
14951619 return newsk;
14961620
@@ -1520,6 +1644,21 @@
 	return sk;
 }
 
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+			 struct tcphdr *th, u32 *cookie)
+{
+	u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+				    &tcp_request_sock_ipv4_ops, sk, th);
+	if (mss) {
+		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
+		tcp_synq_overflow(sk);
+	}
+#endif
+	return mss;
+}
+
 /* The socket must have it's spinlock held when we get
  * here, unless it is a TCP_LISTEN socket.
  *
@@ -1533,15 +1672,18 @@
 	struct sock *rsk;
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
-		struct dst_entry *dst = sk->sk_rx_dst;
+		struct dst_entry *dst;
+
+		dst = rcu_dereference_protected(sk->sk_rx_dst,
+						lockdep_sock_is_held(sk));
 
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    !dst->ops->check(dst, 0)) {
+				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
 				dst_release(dst);
-				sk->sk_rx_dst = NULL;
 			}
 		}
 		tcp_rcv_established(sk, skb);
@@ -1616,7 +1758,7 @@
 		skb->sk = sk;
 		skb->destructor = sock_edemux;
 		if (sk_fullsock(sk)) {
-			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
 
 			if (dst)
 				dst = dst_check(dst, 0);
@@ -1630,13 +1772,16 @@
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	u32 limit, tail_gso_size, tail_gso_segs;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	u32 gso_size;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1645,6 +1790,98 @@
 	 * (if cooked by drivers without copybreak feature).
 	 */
 	skb_condense(skb);
+
+	skb_dst_drop(skb);
+
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
+	    !((TCP_SKB_CB(tail)->tcp_flags &
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+
+	shinfo = skb_shinfo(skb);
+	gso_size = shinfo->gso_size ?: skb->len;
+	gso_segs = shinfo->gso_segs ?: 1;
+
+	shinfo = skb_shinfo(tail);
+	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+	tail_gso_segs = shinfo->gso_segs ?: 1;
+
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+			thtail->window = th->window;
+		}
+
+		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
+		 * thtail->fin, so that the fast path in tcp_rcv_established()
+		 * is not entered if we append a packet with a FIN.
+		 * SYN, RST, URG are not present.
+		 * ACK is set on both packets.
+		 * PSH : we do not really care in TCP stack,
+		 *       at least for 'GRO' packets.
+		 */
+		thtail->fin |= th->fin;
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		shinfo->gso_size = max(gso_size, tail_gso_size);
+		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64 * 1024;
 
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
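Reviewer note: the coalescing fast path above only merges a segment into the backlog tail when the two are byte-contiguous and their headers are compatible. A hedged stand-alone restatement of that eligibility test (simplified fields, not the kernel's TCP_SKB_CB layout):

	#include <stdbool.h>
	#include <stdint.h>

	#define TCPHDR_SYN 0x02
	#define TCPHDR_RST 0x04
	#define TCPHDR_ACK 0x10
	#define TCPHDR_URG 0x20
	#define TCPHDR_ECE 0x40
	#define TCPHDR_CWR 0x80

	struct seg {		/* hypothetical stand-in for an sk_buff */
		uint32_t seq, end_seq;
		uint8_t tcp_flags;
		uint8_t ip_dsfield;
	};

	static bool can_coalesce(const struct seg *tail, const struct seg *skb)
	{
		if (tail->end_seq != skb->seq)		/* must be contiguous */
			return false;
		if (tail->ip_dsfield != skb->ip_dsfield)	/* same TOS/ECN marks */
			return false;
		if ((tail->tcp_flags | skb->tcp_flags) &
		    (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG))
			return false;			/* no special segments */
		if (!((tail->tcp_flags & skb->tcp_flags) & TCPHDR_ACK))
			return false;			/* both must carry ACK */
		/* ECE/CWR must agree between the two segments */
		return !((tail->tcp_flags ^ skb->tcp_flags) &
			 (TCPHDR_ECE | TCPHDR_CWR));
	}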
@@ -1698,7 +1935,9 @@
 int tcp_v4_rcv(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct sk_buff *skb_to_free;
 	int sdif = inet_sdif(skb);
+	int dif = inet_iif(skb);
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	bool refcounted;
@@ -1747,7 +1986,8 @@
 		struct sock *nsk;
 
 		sk = req->rsk_listener;
-		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+		if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
+			     tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
 			sk_drops_add(sk, skb);
 			reqsk_put(req);
 			goto discard_it;
@@ -1786,6 +2026,7 @@
 			}
 			goto discard_and_relse;
 		}
+		nf_reset_ct(skb);
 		if (nsk == sk) {
 			reqsk_put(req);
 			tcp_v4_restore_cb(skb);
@@ -1805,10 +2046,10 @@
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
 
-	if (tcp_v4_inbound_md5_hash(sk, skb))
+	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
 		goto discard_and_relse;
 
-	nf_reset(skb);
+	nf_reset_ct(skb);
 
 	if (tcp_filter(sk, skb))
 		goto discard_and_relse;
@@ -1829,11 +2070,17 @@
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
+		skb_to_free = sk->sk_rx_skb_cache;
+		sk->sk_rx_skb_cache = NULL;
 		ret = tcp_v4_do_rcv(sk, skb);
-	} else if (tcp_add_backlog(sk, skb)) {
-		goto discard_and_relse;
+	} else {
+		if (tcp_add_backlog(sk, skb))
+			goto discard_and_relse;
+		skb_to_free = NULL;
 	}
 	bh_unlock_sock(sk);
+	if (skb_to_free)
+		__kfree_skb(skb_to_free);
 
 put_and_return:
 	if (refcounted)
@@ -1897,7 +2144,7 @@
 		}
 	}
 		/* to ACK */
-		/* fall through */
+		fallthrough;
 	case TCP_TW_ACK:
 		tcp_v4_timewait_ack(sk, skb);
 		break;
@@ -1921,7 +2168,7 @@
 	struct dst_entry *dst = skb_dst(skb);
 
 	if (dst && dst_hold_safe(dst)) {
-		sk->sk_rx_dst = dst;
+		rcu_assign_pointer(sk->sk_rx_dst, dst);
 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
 	}
 }
@@ -1939,10 +2186,6 @@
 	.getsockopt	   = ip_getsockopt,
 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
 	.sockaddr_len	   = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_ip_setsockopt,
-	.compat_getsockopt = compat_ip_getsockopt,
-#endif
 	.mtu_reduced	   = tcp_v4_mtu_reduced,
 };
 EXPORT_SYMBOL(ipv4_specific);
@@ -2007,7 +2250,7 @@
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
 
-	BUG_ON(tp->fastopen_rsk);
+	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
@@ -2028,12 +2271,17 @@
  */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct inet_listen_hashbucket *ilb;
 	struct hlist_nulls_node *node;
 	struct sock *sk = cur;
+
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
 
 	if (!sk) {
 get_head:
@@ -2052,7 +2300,8 @@
 	sk_nulls_for_each_from(sk, node) {
 		if (!net_eq(sock_net(sk), net))
 			continue;
-		if (sk->sk_family == afinfo->family)
+		if (afinfo->family == AF_UNSPEC ||
+		    sk->sk_family == afinfo->family)
 			return sk;
 	}
 	spin_unlock(&ilb->lock);
@@ -2089,10 +2338,15 @@
  */
 static void *established_get_first(struct seq_file *seq)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
+
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
 
 	st->offset = 0;
 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
@@ -2106,7 +2360,8 @@
 
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
-			if (sk->sk_family != afinfo->family ||
+			if ((afinfo->family != AF_UNSPEC &&
+			     sk->sk_family != afinfo->family) ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
 			}
@@ -2121,11 +2376,16 @@
 
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+	struct tcp_seq_afinfo *afinfo;
 	struct sock *sk = cur;
 	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
+
+	if (st->bpf_seq_afinfo)
+		afinfo = st->bpf_seq_afinfo;
+	else
+		afinfo = PDE_DATA(file_inode(seq->file));
 
 	++st->num;
 	++st->offset;
@@ -2133,7 +2393,8 @@
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == afinfo->family &&
+		if ((afinfo->family == AF_UNSPEC ||
+		     sk->sk_family == afinfo->family) &&
 		    net_eq(sock_net(sk), net))
 			return sk;
 	}
@@ -2194,7 +2455,7 @@
 			break;
 		st->bucket = 0;
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
-		/* Fallthrough */
+		fallthrough;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (st->bucket > tcp_hashinfo.ehash_mask)
 			break;
@@ -2338,7 +2599,7 @@
 
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
-		rx_queue = sk->sk_ack_backlog;
+		rx_queue = READ_ONCE(sk->sk_ack_backlog);
 	else
 		/* Because we don't lock the socket,
 		 * we might find a transient negative value.
@@ -2360,7 +2621,7 @@
 		refcount_read(&sk->sk_refcnt), sk,
 		jiffies_to_clock_t(icsk->icsk_rto),
 		jiffies_to_clock_t(icsk->icsk_ack.ato),
-		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
 		tp->snd_cwnd,
 		state == TCP_LISTEN ?
 		    fastopenq->max_qlen :
@@ -2412,6 +2673,74 @@
 	seq_pad(seq, '\n');
 	return 0;
 }
+
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__tcp {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct sock_common *, sk_common);
+	uid_t uid __aligned(8);
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+			     struct sock_common *sk_common, uid_t uid)
+{
+	struct bpf_iter__tcp ctx;
+
+	meta->seq_num--;  /* skip SEQ_START_TOKEN */
+	ctx.meta = meta;
+	ctx.sk_common = sk_common;
+	ctx.uid = uid;
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	struct sock *sk = v;
+	uid_t uid;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		uid = 0;
+	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		const struct request_sock *req = v;
+
+		uid = from_kuid_munged(seq_user_ns(seq),
+				       sock_i_uid(req->rsk_listener));
+	} else {
+		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	}
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, false);
+	return tcp_prog_seq_show(prog, &meta, v, uid);
+}
+
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	if (!v) {
+		meta.seq = seq;
+		prog = bpf_iter_get_info(&meta, true);
+		if (prog)
+			(void)tcp_prog_seq_show(prog, &meta, v, 0);
+	}
+
+	tcp_seq_stop(seq, v);
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+	.show		= bpf_iter_tcp_seq_show,
+	.start		= tcp_seq_start,
+	.next		= tcp_seq_next,
+	.stop		= bpf_iter_tcp_seq_stop,
+};
+#endif
 
 static const struct seq_operations tcp4_seq_ops = {
 	.show		= tcp4_seq_show,
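Reviewer note: the seq_ops above back the "tcp" BPF iterator target registered at the bottom of this patch. A hedged sketch of a program that could attach to it; field names follow the struct bpf_iter__tcp added here, BPF_SEQ_PRINTF comes from libbpf's bpf_tracing.h, and details may vary by kernel/libbpf version:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	SEC("iter/tcp")
	int dump_tcp(struct bpf_iter__tcp *ctx)
	{
		struct sock_common *skc = ctx->sk_common;
		struct seq_file *seq = ctx->meta->seq;

		if (!skc)
			return 0;

		/* one line per socket visited by the iterator */
		BPF_SEQ_PRINTF(seq, "family=%d state=%d uid=%u\n",
			       skc->skc_family, skc->skc_state, ctx->uid);
		return 0;
	}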
@@ -2493,10 +2822,6 @@
 	.rsk_prot		= &tcp_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
 	.no_autobind		= true,
-#ifdef CONFIG_COMPAT
-	.compat_setsockopt	= compat_tcp_setsockopt,
-	.compat_getsockopt	= compat_tcp_getsockopt,
-#endif
 	.diag_destroy		= tcp_abort,
 };
 EXPORT_SYMBOL(tcp_prot);
@@ -2506,7 +2831,8 @@
 	int cpu;
 
 	if (net->ipv4.tcp_congestion_control)
-		module_put(net->ipv4.tcp_congestion_control->owner);
+		bpf_module_put(net->ipv4.tcp_congestion_control,
+			       net->ipv4.tcp_congestion_control->owner);
 
 	for_each_possible_cpu(cpu)
 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
@@ -2545,6 +2871,7 @@
 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
 
 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -2560,12 +2887,13 @@
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 2;
+	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
-	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
 
-	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
 	net->ipv4.sysctl_tcp_sack = 1;
 	net->ipv4.sysctl_tcp_window_scaling = 1;
 	net->ipv4.sysctl_tcp_timestamps = 1;
@@ -2584,8 +2912,8 @@
 	 * which are too large can cause TCP streams to be bursty.
 	 */
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
-	/* Default TSQ limit of four TSO segments */
-	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+	/* Default TSQ limit of 16 TSO segments */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
 	/* rfc5961 challenge ack rate limiting */
 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
@@ -2603,15 +2931,17 @@
 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
 	}
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
-	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
 
 	/* Reno is always built in */
 	if (!net_eq(net, &init_net) &&
-	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+			       init_net.ipv4.tcp_congestion_control->owner))
 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
 	else
 		net->ipv4.tcp_congestion_control = &tcp_reno;
@@ -2639,8 +2969,68 @@
 	.exit_batch = tcp_sk_exit_batch,
 };
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+		     struct sock_common *sk_common, uid_t uid)
+
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+	struct tcp_iter_state *st = priv_data;
+	struct tcp_seq_afinfo *afinfo;
+	int ret;
+
+	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
+	if (!afinfo)
+		return -ENOMEM;
+
+	afinfo->family = AF_UNSPEC;
+	st->bpf_seq_afinfo = afinfo;
+	ret = bpf_iter_init_seq_net(priv_data, aux);
+	if (ret)
+		kfree(afinfo);
+	return ret;
+}
+
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+	struct tcp_iter_state *st = priv_data;
+
+	kfree(st->bpf_seq_afinfo);
+	bpf_iter_fini_seq_net(priv_data);
+}
+
+static const struct bpf_iter_seq_info tcp_seq_info = {
+	.seq_ops		= &bpf_iter_tcp_seq_ops,
+	.init_seq_private	= bpf_iter_init_tcp,
+	.fini_seq_private	= bpf_iter_fini_tcp,
+	.seq_priv_size		= sizeof(struct tcp_iter_state),
+};
+
+static struct bpf_iter_reg tcp_reg_info = {
+	.target			= "tcp",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__tcp, sk_common),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &tcp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
+	if (bpf_iter_reg_target(&tcp_reg_info))
+		pr_warn("Warning: could not register bpf iterator tcp\n");
+}
+
+#endif
+
 void __init tcp_v4_init(void)
 {
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+	bpf_iter_register();
+#endif
 }