From 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Fri, 10 May 2024 07:44:59 +0000
Subject: [PATCH] gmac get mac form eeprom
---
kernel/net/ipv4/tcp_ipv4.c | 732 ++++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 558 insertions(+), 174 deletions(-)
diff --git a/kernel/net/ipv4/tcp_ipv4.c b/kernel/net/ipv4/tcp_ipv4.c
index 6a530a0..45939b6 100644
--- a/kernel/net/ipv4/tcp_ipv4.c
+++ b/kernel/net/ipv4/tcp_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,18 +8,12 @@
*
* IPv4 specific functions
*
- *
* code split from:
* linux/ipv4/tcp.c
* linux/ipv4/tcp_input.c
* linux/ipv4/tcp_output.c
*
* See tcp.c for author information
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -62,7 +57,6 @@
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
-#include <linux/locallock.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
@@ -82,6 +76,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
@@ -111,10 +106,10 @@
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
+ int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
- int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
if (reuse == 2) {
/* Still does not detect *everything* that goes through
@@ -127,11 +122,9 @@
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
- (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
- (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
loopback = true;
} else
#endif
@@ -329,6 +322,8 @@
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
@@ -411,6 +406,46 @@
}
EXPORT_SYMBOL(tcp_req_err);
+/* TCP-LD (RFC 6069) logic */
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ s32 remaining;
+ u32 delta_us;
+
+ if (sock_owned_by_user(sk))
+ return;
+
+ if (seq != tp->snd_una || !icsk->icsk_retransmits ||
+ !icsk->icsk_backoff)
+ return;
+
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
+ return;
+
+ icsk->icsk_backoff--;
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
+
+ tcp_mstamp_refresh(tp);
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
+ remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
+
+ if (remaining > 0) {
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ remaining, TCP_RTO_MAX);
+ } else {
+ /* RTO revert clocked out retransmission.
+ * Will retransmit now.
+ */
+ tcp_retransmit_timer(sk);
+ }
+}
+EXPORT_SYMBOL(tcp_ld_RTO_revert);
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -427,43 +462,40 @@
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *skb, u32 info)
{
- const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
- struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
- struct inet_connection_sock *icsk;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
struct tcp_sock *tp;
struct inet_sock *inet;
- const int type = icmp_hdr(icmp_skb)->type;
- const int code = icmp_hdr(icmp_skb)->code;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
struct sock *sk;
- struct sk_buff *skb;
struct request_sock *fastopen;
u32 seq, snd_una;
- s32 remaining;
- u32 delta_us;
int err;
- struct net *net = dev_net(icmp_skb->dev);
+ struct net *net = dev_net(skb->dev);
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
th->dest, iph->saddr, ntohs(th->source),
- inet_iif(icmp_skb), 0);
+ inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -483,10 +515,9 @@
goto out;
}
- icsk = inet_csk(sk);
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -497,7 +528,7 @@
switch (type) {
case ICMP_REDIRECT:
if (!sock_owned_by_user(sk))
- do_redirect(icmp_skb, sk);
+ do_redirect(skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
@@ -528,40 +559,12 @@
}
err = icmp_err_convert[code].errno;
- /* check if icmp_skb allows revert of backoff
- * (see draft-zimmermann-tcp-lcd) */
- if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
- break;
- if (seq != tp->snd_una || !icsk->icsk_retransmits ||
- !icsk->icsk_backoff || fastopen)
- break;
-
- if (sock_owned_by_user(sk))
- break;
-
- skb = tcp_rtx_queue_head(sk);
- if (WARN_ON_ONCE(!skb))
- break;
-
- icsk->icsk_backoff--;
- icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
- TCP_TIMEOUT_INIT;
- icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
-
- tcp_mstamp_refresh(tp);
- delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
- remaining = icsk->icsk_rto -
- usecs_to_jiffies(delta_us);
-
- if (remaining > 0) {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- remaining, TCP_RTO_MAX);
- } else {
- /* RTO revert clocked out retransmission.
- * Will retransmit now */
- tcp_retransmit_timer(sk);
- }
-
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen &&
+ (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
+ tcp_ld_RTO_revert(sk, seq);
break;
case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH;
@@ -574,10 +577,12 @@
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
+
+ ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
if (!sock_owned_by_user(sk)) {
sk->sk_err = err;
@@ -618,6 +623,7 @@
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -638,7 +644,6 @@
}
EXPORT_SYMBOL(tcp_v4_send_check);
-static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
/*
* This routine will send an RST to the other tcp.
*
@@ -669,8 +674,9 @@
int genhash;
struct sock *sk1 = NULL;
#endif
- struct net *net;
+ u64 transmit_time = 0;
struct sock *ctl_sk;
+ struct net *net;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -706,9 +712,21 @@
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
} else if (hash_location) {
+ const union tcp_md5_addr *addr;
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -719,14 +737,17 @@
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb),
- tcp_v4_sdif(skb));
+ ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
if (!sk1)
goto out;
- key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = sdif ? dif : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
if (!key)
goto out;
@@ -773,20 +794,23 @@
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- local_lock(tcp_sk_lock);
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
+ transmit_time = tcp_transmit_time(sk);
+ }
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
- local_unlock(tcp_sk_lock);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
@@ -817,6 +841,7 @@
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
struct sock *ctl_sk;
+ u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -867,19 +892,20 @@
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- local_lock(tcp_sk_lock);
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
- ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
+ transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
- local_unlock(tcp_sk_lock);
local_bh_enable();
}
@@ -905,6 +931,9 @@
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ const union tcp_md5_addr *addr;
+ int l3index;
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
@@ -916,14 +945,15 @@
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
- req->ts_recent,
+ READ_ONCE(req->ts_recent),
0,
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
- AF_INET),
+ tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
}
@@ -937,26 +967,38 @@
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (inet_sk(sk)->tos & INET_ECN_MASK) :
+ inet_sk(sk)->tos;
+
+ if (!INET_ECN_is_capable(tos) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tos |= INET_ECN_ECT_0;
+
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -979,10 +1021,27 @@
* We need to maintain these in the sk structure.
*/
+DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
+EXPORT_SYMBOL(tcp_md5_needed);
+
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+ if (!old)
+ return true;
+
+ /* l3index always overrides non-l3index */
+ if (old->l3index && new->l3index == 0)
+ return false;
+ if (old->l3index == 0 && new->l3index)
+ return true;
+
+ return old->prefixlen < new->prefixlen;
+}
+
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
- const union tcp_md5_addr *addr,
- int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_md5_addr *addr,
+ int family)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -997,10 +1056,12 @@
if (!md5sig)
return NULL;
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
-
+ if (key->l3index && key->l3index != l3index)
+ continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
match = (key->addr.a4.s_addr & mask) ==
@@ -1014,17 +1075,17 @@
match = false;
}
- if (match && (!best_match ||
- key->prefixlen > best_match->prefixlen))
+ if (match && better_md5_match(best_match, key))
best_match = key;
}
return best_match;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
- int family, u8 prefixlen)
+ int family, u8 prefixlen,
+ int l3index)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1040,8 +1101,11 @@
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
+ continue;
+ if (key->l3index != l3index)
continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
@@ -1054,28 +1118,34 @@
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
+ int l3index;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
- return tcp_md5_do_lookup(sk, addr, AF_INET);
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
- gfp_t gfp)
+ int family, u8 prefixlen, int l3index,
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (key) {
/* Pre-existing entry - just update that one.
* Note that the key might be used concurrently.
+ * data_race() is telling kcsan that we do not care of
+ * key mismatches, since changing MD5 key on live flows
+ * can lead to packet drops.
*/
- memcpy(key->key, newkey, newkeylen);
+ data_race(memcpy(key->key, newkey, newkeylen));
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
* Also note that a reader could catch new key->keylen value
@@ -1111,6 +1181,7 @@
key->keylen = newkeylen;
key->family = family;
key->prefixlen = prefixlen;
+ key->l3index = l3index;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -1120,11 +1191,11 @@
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
- u8 prefixlen)
+ u8 prefixlen, int l3index)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1151,16 +1222,18 @@
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ const union tcp_md5_addr *addr;
u8 prefixlen = 32;
+ int l3index = 0;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin->sin_family != AF_INET)
@@ -1173,16 +1246,34 @@
return -EINVAL;
}
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
if (!cmd.tcpm_keylen)
- return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen);
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
- GFP_KERNEL);
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+ cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
@@ -1292,7 +1383,8 @@
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
/*
@@ -1307,11 +1399,17 @@
struct tcp_md5sig_key *hash_expected;
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
+ const union tcp_md5_addr *addr;
unsigned char newhash[16];
+ int genhash, l3index;
- hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
- AF_INET);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ addr = (union tcp_md5_addr *)&iph->saddr;
+ hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
@@ -1337,11 +1435,11 @@
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
&iph->saddr, ntohs(th->source),
&iph->daddr, ntohs(th->dest),
genhash ? " tcp_v4_calc_md5_hash failed"
- : "");
+ : "", l3index);
return true;
}
return false;
@@ -1378,7 +1476,7 @@
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
@@ -1421,11 +1519,14 @@
bool *own_req)
{
struct inet_request_sock *ireq;
+ bool found_dup_sk = false;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+ const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct ip_options_rcu *inet_opt;
@@ -1456,6 +1557,12 @@
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
@@ -1473,9 +1580,10 @@
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET);
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
if (key) {
/*
* We're using one, so create a matching key
@@ -1483,20 +1591,30 @@
* memory, then we end up not copying the key
* across. Shucks.
*/
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
+ key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (likely(*own_req)) {
tcp_move_syn(newtp, req);
ireq->ireq_opt = NULL;
} else {
newinet->inet_opt = NULL;
+
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should only be executed in the
+ * syncookie case only
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
}
return newsk;
@@ -1526,6 +1644,21 @@
return sk;
}
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1539,15 +1672,18 @@
struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
- sk->sk_rx_dst = NULL;
}
}
tcp_rcv_established(sk, skb);
@@ -1622,7 +1758,7 @@
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
@@ -1636,13 +1772,16 @@
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
- u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
- /* Only socket owner can try to collapse/prune rx queues
- * to reduce memory overhead, so add a little headroom here.
- * Few sockets backlog are possibly concurrently non empty.
- */
- limit += 64*1024;
+ u32 limit, tail_gso_size, tail_gso_segs;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ u32 gso_size;
+ int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1651,6 +1790,98 @@
* (if cooked by drivers without copybreak feature).
*/
skb_condense(skb);
+
+ skb_dst_drop(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
+ !((TCP_SKB_CB(tail)->tcp_flags &
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+ tail->decrypted != skb->decrypted ||
+#endif
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+
+ shinfo = skb_shinfo(skb);
+ gso_size = shinfo->gso_size ?: skb->len;
+ gso_segs = shinfo->gso_segs ?: 1;
+
+ shinfo = skb_shinfo(tail);
+ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+ tail_gso_segs = shinfo->gso_segs ?: 1;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+ thtail->window = th->window;
+ }
+
+ /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
+ * thtail->fin, so that the fast path in tcp_rcv_established()
+ * is not entered if we append a packet with a FIN.
+ * SYN, RST, URG are not present.
+ * ACK is set on both packets.
+ * PSH : we do not really care in TCP stack,
+ * at least for 'GRO' packets.
+ */
+ thtail->fin |= th->fin;
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ shinfo->gso_size = max(gso_size, tail_gso_size);
+ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64 * 1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -1704,7 +1935,9 @@
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1753,7 +1986,8 @@
struct sock *nsk;
sk = req->rsk_listener;
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ if (unlikely(!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb) ||
+ tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -1792,6 +2026,7 @@
}
goto discard_and_relse;
}
+ nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
@@ -1811,10 +2046,10 @@
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- if (tcp_v4_inbound_md5_hash(sk, skb))
+ if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
@@ -1835,11 +2070,17 @@
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
+ skb_to_free = sk->sk_rx_skb_cache;
+ sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
- } else if (tcp_add_backlog(sk, skb)) {
- goto discard_and_relse;
+ } else {
+ if (tcp_add_backlog(sk, skb))
+ goto discard_and_relse;
+ skb_to_free = NULL;
}
bh_unlock_sock(sk);
+ if (skb_to_free)
+ __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
@@ -1903,7 +2144,7 @@
}
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
@@ -1927,7 +2168,7 @@
struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) {
- sk->sk_rx_dst = dst;
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
}
@@ -1945,10 +2186,6 @@
.getsockopt = ip_getsockopt,
.addr2sockaddr = inet_csk_addr2sockaddr,
.sockaddr_len = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
@@ -2013,7 +2250,7 @@
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- BUG_ON(tp->fastopen_rsk);
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
@@ -2034,12 +2271,17 @@
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+ struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
struct inet_listen_hashbucket *ilb;
struct hlist_nulls_node *node;
struct sock *sk = cur;
+
+ if (st->bpf_seq_afinfo)
+ afinfo = st->bpf_seq_afinfo;
+ else
+ afinfo = PDE_DATA(file_inode(seq->file));
if (!sk) {
get_head:
@@ -2058,7 +2300,8 @@
sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_family == afinfo->family)
+ if (afinfo->family == AF_UNSPEC ||
+ sk->sk_family == afinfo->family)
return sk;
}
spin_unlock(&ilb->lock);
@@ -2095,10 +2338,15 @@
*/
static void *established_get_first(struct seq_file *seq)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+ struct tcp_seq_afinfo *afinfo;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
void *rc = NULL;
+
+ if (st->bpf_seq_afinfo)
+ afinfo = st->bpf_seq_afinfo;
+ else
+ afinfo = PDE_DATA(file_inode(seq->file));
st->offset = 0;
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
@@ -2112,7 +2360,8 @@
spin_lock_bh(lock);
sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
- if (sk->sk_family != afinfo->family ||
+ if ((afinfo->family != AF_UNSPEC &&
+ sk->sk_family != afinfo->family) ||
!net_eq(sock_net(sk), net)) {
continue;
}
@@ -2127,11 +2376,16 @@
static void *established_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+ struct tcp_seq_afinfo *afinfo;
struct sock *sk = cur;
struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
+
+ if (st->bpf_seq_afinfo)
+ afinfo = st->bpf_seq_afinfo;
+ else
+ afinfo = PDE_DATA(file_inode(seq->file));
++st->num;
++st->offset;
@@ -2139,7 +2393,8 @@
sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
- if (sk->sk_family == afinfo->family &&
+ if ((afinfo->family == AF_UNSPEC ||
+ sk->sk_family == afinfo->family) &&
net_eq(sock_net(sk), net))
return sk;
}
@@ -2200,7 +2455,7 @@
break;
st->bucket = 0;
st->state = TCP_SEQ_STATE_ESTABLISHED;
- /* Fallthrough */
+ fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
@@ -2344,7 +2599,7 @@
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
- rx_queue = sk->sk_ack_backlog;
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
@@ -2366,7 +2621,7 @@
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
tp->snd_cwnd,
state == TCP_LISTEN ?
fastopenq->max_qlen :
@@ -2418,6 +2673,74 @@
seq_pad(seq, '\n');
return 0;
}
+
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__tcp {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct sock_common *, sk_common);
+ uid_t uid __aligned(8);
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+{
+ struct bpf_iter__tcp ctx;
+
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
+ ctx.meta = meta;
+ ctx.sk_common = sk_common;
+ ctx.uid = uid;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ struct sock *sk = v;
+ uid_t uid;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ uid = 0;
+ } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ const struct request_sock *req = v;
+
+ uid = from_kuid_munged(seq_user_ns(seq),
+ sock_i_uid(req->rsk_listener));
+ } else {
+ uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+ }
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ return tcp_prog_seq_show(prog, &meta, v, uid);
+}
+
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)tcp_prog_seq_show(prog, &meta, v, 0);
+ }
+
+ tcp_seq_stop(seq, v);
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+ .show = bpf_iter_tcp_seq_show,
+ .start = tcp_seq_start,
+ .next = tcp_seq_next,
+ .stop = bpf_iter_tcp_seq_stop,
+};
+#endif
static const struct seq_operations tcp4_seq_ops = {
.show = tcp4_seq_show,
@@ -2499,10 +2822,6 @@
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
.diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
@@ -2512,7 +2831,8 @@
int cpu;
if (net->ipv4.tcp_congestion_control)
- module_put(net->ipv4.tcp_congestion_control->owner);
+ bpf_module_put(net->ipv4.tcp_congestion_control,
+ net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
@@ -2551,6 +2871,7 @@
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -2566,12 +2887,13 @@
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -2590,8 +2912,8 @@
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+ /* Default TSQ limit of 16 TSO segments */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
@@ -2609,15 +2931,17 @@
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+ net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
- net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
- try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+ init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
@@ -2645,8 +2969,68 @@
.exit_batch = tcp_sk_exit_batch,
};
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct tcp_iter_state *st = priv_data;
+ struct tcp_seq_afinfo *afinfo;
+ int ret;
+
+ afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
+ if (!afinfo)
+ return -ENOMEM;
+
+ afinfo->family = AF_UNSPEC;
+ st->bpf_seq_afinfo = afinfo;
+ ret = bpf_iter_init_seq_net(priv_data, aux);
+ if (ret)
+ kfree(afinfo);
+ return ret;
+}
+
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+ struct tcp_iter_state *st = priv_data;
+
+ kfree(st->bpf_seq_afinfo);
+ bpf_iter_fini_seq_net(priv_data);
+}
+
+static const struct bpf_iter_seq_info tcp_seq_info = {
+ .seq_ops = &bpf_iter_tcp_seq_ops,
+ .init_seq_private = bpf_iter_init_tcp,
+ .fini_seq_private = bpf_iter_fini_tcp,
+ .seq_priv_size = sizeof(struct tcp_iter_state),
+};
+
+static struct bpf_iter_reg tcp_reg_info = {
+ .target = "tcp",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__tcp, sk_common),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &tcp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+ tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
+ if (bpf_iter_reg_target(&tcp_reg_info))
+ pr_warn("Warning: could not register bpf iterator tcp\n");
+}
+
+#endif
+
void __init tcp_v4_init(void)
{
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_register();
+#endif
}
--
Gitblit v1.6.2