From 95099d4622f8cb224d94e314c7a8e0df60b13f87 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 08:38:01 +0000
Subject: [PATCH] enable docker ppp
---
kernel/drivers/net/tun.c | 774 +++++++++++++++++++++++++++++++++++------------------------
1 files changed, 460 insertions(+), 314 deletions(-)
diff --git a/kernel/drivers/net/tun.c b/kernel/drivers/net/tun.c
index d5bb972..67ce7b7 100644
--- a/kernel/drivers/net/tun.c
+++ b/kernel/drivers/net/tun.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TUN - Universal TUN/TAP device driver.
* Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*
* $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
*/
@@ -71,6 +62,7 @@
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
+#include <net/ip_tunnels.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
@@ -92,36 +84,6 @@
static void tun_default_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd);
-/* Uncomment to enable debugging */
-/* #define TUN_DEBUG 1 */
-
-#ifdef TUN_DEBUG
-static int debug;
-
-#define tun_debug(level, tun, fmt, args...) \
-do { \
- if (tun->debug) \
- netdev_printk(level, tun->dev, fmt, ##args); \
-} while (0)
-#define DBG1(level, fmt, args...) \
-do { \
- if (debug == 2) \
- printk(level fmt, ##args); \
-} while (0)
-#else
-#define tun_debug(level, tun, fmt, args...) \
-do { \
- if (0) \
- netdev_printk(level, tun->dev, fmt, ##args); \
-} while (0)
-#define DBG1(level, fmt, args...) \
-do { \
- if (0) \
- printk(level fmt, ##args); \
-} while (0)
-#endif
-
-#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
/* TUN device flags */
@@ -154,10 +116,10 @@
#define TUN_FLOW_EXPIRE (3 * HZ)
struct tun_pcpu_stats {
- u64 rx_packets;
- u64 rx_bytes;
- u64 tx_packets;
- u64 tx_bytes;
+ u64_stats_t rx_packets;
+ u64_stats_t rx_bytes;
+ u64_stats_t tx_packets;
+ u64_stats_t tx_bytes;
struct u64_stats_sync syncp;
u32 rx_dropped;
u32 tx_dropped;
@@ -178,7 +140,6 @@
struct tun_file {
struct sock sk;
struct socket socket;
- struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -197,6 +158,11 @@
struct xdp_rxq_info xdp_rxq;
};
+struct tun_page {
+ struct page *page;
+ int count;
+};
+
struct tun_flow_entry {
struct hlist_node hash_link;
struct rcu_head rcu;
@@ -205,7 +171,7 @@
u32 rxhash;
u32 rps_rxhash;
int queue_index;
- unsigned long updated;
+ unsigned long updated ____cacheline_aligned_in_smp;
};
#define TUN_NUM_FLOW_ENTRIES 1024
@@ -239,9 +205,7 @@
struct sock_fprog fprog;
/* protected by rtnl lock */
bool filter_attached;
-#ifdef TUN_DEBUG
- int debug;
-#endif
+ u32 msg_enable;
spinlock_t lock;
struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
struct timer_list flow_gc_timer;
@@ -256,6 +220,9 @@
struct tun_prog __rcu *steering_prog;
struct tun_prog __rcu *filter_prog;
struct ethtool_link_ksettings link_ksettings;
+ /* init args */
+ struct file *file;
+ struct ifreq *ifr;
};
struct veth {
@@ -263,23 +230,8 @@
__be16 h_vlan_TCI;
};
-bool tun_is_xdp_frame(void *ptr)
-{
- return (unsigned long)ptr & TUN_XDP_FLAG;
-}
-EXPORT_SYMBOL(tun_is_xdp_frame);
-
-void *tun_xdp_to_ptr(void *ptr)
-{
- return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
-}
-EXPORT_SYMBOL(tun_xdp_to_ptr);
-
-void *tun_ptr_to_xdp(void *ptr)
-{
- return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
-}
-EXPORT_SYMBOL(tun_ptr_to_xdp);
+static void tun_flow_init(struct tun_struct *tun);
+static void tun_flow_uninit(struct tun_struct *tun);
static int tun_napi_receive(struct napi_struct *napi, int budget)
{
@@ -331,6 +283,12 @@
NAPI_POLL_WEIGHT);
napi_enable(&tfile->napi);
}
+}
+
+static void tun_napi_enable(struct tun_file *tfile)
+{
+ if (tfile->napi_enabled)
+ napi_enable(&tfile->napi);
}
static void tun_napi_disable(struct tun_file *tfile)
@@ -437,8 +395,9 @@
struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e) {
- tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
- rxhash, queue_index);
+ netif_info(tun, tx_queued, tun->dev,
+ "create flow: hash %u index %u\n",
+ rxhash, queue_index);
e->updated = jiffies;
e->rxhash = rxhash;
e->rps_rxhash = 0;
@@ -452,8 +411,8 @@
static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
- tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
- e->rxhash, e->queue_index);
+ netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
+ e->rxhash, e->queue_index);
hlist_del_rcu(&e->hash_link);
kfree_rcu(e, rcu);
--tun->flow_count;
@@ -499,8 +458,6 @@
unsigned long count = 0;
int i;
- tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
-
spin_lock(&tun->lock);
for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
struct tun_flow_entry *e;
@@ -533,18 +490,17 @@
unsigned long delay = tun->ageing_time;
u16 queue_index = tfile->queue_index;
- if (!rxhash)
- return;
- else
- head = &tun->flows[tun_hashfn(rxhash)];
+ head = &tun->flows[tun_hashfn(rxhash)];
rcu_read_lock();
e = tun_flow_find(head, rxhash);
if (likely(e)) {
/* TODO: keep queueing to old queue until it's empty? */
- e->queue_index = queue_index;
- e->updated = jiffies;
+ if (READ_ONCE(e->queue_index) != queue_index)
+ WRITE_ONCE(e->queue_index, queue_index);
+ if (e->updated != jiffies)
+ e->updated = jiffies;
sock_rps_record_flow_hash(e->rps_rxhash);
} else {
spin_lock_bh(&tun->lock);
@@ -561,8 +517,7 @@
rcu_read_unlock();
}
-/**
- * Save the hash received in the stack receive path and update the
+/* Save the hash received in the stack receive path and update the
* flow_hash table accordingly.
*/
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
@@ -571,12 +526,11 @@
e->rps_rxhash = hash;
}
-/* We try to identify a flow through its rxhash first. The reason that
+/* We try to identify a flow through its rxhash. The reason that
* we do not check rxq no. is because some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
* the userspace application move between processors, we may get a
- * different rxq no. here. If we could not get rxhash, then we would
- * hope the rxq no. may help here.
+ * different rxq no. here.
*/
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
@@ -587,18 +541,13 @@
numqueues = READ_ONCE(tun->numqueues);
txq = __skb_get_hash_symmetric(skb);
- if (txq) {
- e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
- if (e) {
- tun_flow_save_rps_rxhash(e, txq);
- txq = e->queue_index;
- } else
- /* use multiply and shift instead of expensive divide */
- txq = ((u64)txq * numqueues) >> 32;
- } else if (likely(skb_rx_queue_recorded(skb))) {
- txq = skb_get_rx_queue(skb);
- while (unlikely(txq >= numqueues))
- txq -= numqueues;
+ e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
+ if (e) {
+ tun_flow_save_rps_rxhash(e, txq);
+ txq = e->queue_index;
+ } else {
+ /* use multiply and shift instead of expensive divide */
+ txq = ((u64)txq * numqueues) >> 32;
}
return txq;
@@ -622,8 +571,7 @@
}
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
struct tun_struct *tun = netdev_priv(dev);
u16 ret;
@@ -704,7 +652,8 @@
tun = rtnl_dereference(tfile->tun);
if (tun && clean) {
- tun_napi_disable(tfile);
+ if (!tfile->detached)
+ tun_napi_disable(tfile);
tun_napi_del(tfile);
}
@@ -723,8 +672,10 @@
if (clean) {
RCU_INIT_POINTER(tfile->tun, NULL);
sock_put(&tfile->sk);
- } else
+ } else {
tun_disable_queue(tun, tfile);
+ tun_napi_disable(tfile);
+ }
synchronize_net();
tun_flow_delete_by_queue(tun, tun->numqueues + 1);
@@ -747,7 +698,6 @@
if (tun)
xdp_rxq_info_unreg(&tfile->xdp_rxq);
ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
- sock_put(&tfile->sk);
}
}
@@ -763,6 +713,9 @@
if (dev)
netdev_state_change(dev);
rtnl_unlock();
+
+ if (clean)
+ sock_put(&tfile->sk);
}
static void tun_detach_all(struct net_device *dev)
@@ -797,6 +750,7 @@
sock_put(&tfile->sk);
}
list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
+ tun_napi_del(tfile);
tun_enable_queue(tfile);
tun_queue_purge(tfile);
xdp_rxq_info_unreg(&tfile->xdp_rxq);
@@ -877,10 +831,14 @@
if (tfile->detached) {
tun_enable_queue(tfile);
+ tun_napi_enable(tfile);
} else {
sock_hold(&tfile->sk);
tun_napi_init(tun, tfile, napi, napi_frags);
}
+
+ if (rtnl_dereference(tun->xdp_prog))
+ sock_set_flag(&tfile->sk, SOCK_XDP);
/* device is allowed to go away first, so no need to hold extra
* refcnt.
@@ -1025,6 +983,49 @@
static const struct ethtool_ops tun_ethtool_ops;
+static int tun_net_init(struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct ifreq *ifr = tun->ifr;
+ int err;
+
+ tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
+ if (!tun->pcpu_stats)
+ return -ENOMEM;
+
+ spin_lock_init(&tun->lock);
+
+ err = security_tun_dev_alloc_security(&tun->security);
+ if (err < 0) {
+ free_percpu(tun->pcpu_stats);
+ return err;
+ }
+
+ tun_flow_init(tun);
+
+ dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
+ TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ dev->features = dev->hw_features | NETIF_F_LLTX;
+ dev->vlan_features = dev->features &
+ ~(NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
+
+ tun->flags = (tun->flags & ~TUN_FEATURES) |
+ (ifr->ifr_flags & TUN_FEATURES);
+
+ INIT_LIST_HEAD(&tun->disabled);
+ err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
+ ifr->ifr_flags & IFF_NAPI_FRAGS, false);
+ if (err < 0) {
+ tun_flow_uninit(tun);
+ security_tun_dev_free_security(tun->security);
+ free_percpu(tun->pcpu_stats);
+ return err;
+ }
+ return 0;
+}
+
/* Net device detach from fd. */
static void tun_net_uninit(struct net_device *dev)
{
@@ -1050,20 +1051,17 @@
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+ if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
+ struct tun_flow_entry *e;
__u32 rxhash;
rxhash = __skb_get_hash_symmetric(skb);
- if (rxhash) {
- struct tun_flow_entry *e;
- e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)],
- rxhash);
- if (e)
- tun_flow_save_rps_rxhash(e, rxhash);
- }
+ e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
+ if (e)
+ tun_flow_save_rps_rxhash(e, rxhash);
}
#endif
}
@@ -1099,9 +1097,7 @@
if (!rcu_dereference(tun->steering_prog))
tun_automq_xmit(tun, skb);
- tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
-
- BUG_ON(!tfile);
+ netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
/* Drop if the filter does not like it.
* This is a noop if the filter is disabled.
@@ -1127,7 +1123,7 @@
*/
skb_orphan(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
if (ptr_ring_produce(&tfile->tx_ring, skb))
goto drop;
@@ -1194,10 +1190,10 @@
p = per_cpu_ptr(tun->pcpu_stats, i);
do {
start = u64_stats_fetch_begin(&p->syncp);
- rxpackets = p->rx_packets;
- rxbytes = p->rx_bytes;
- txpackets = p->tx_packets;
- txbytes = p->tx_bytes;
+ rxpackets = u64_stats_read(&p->rx_packets);
+ rxbytes = u64_stats_read(&p->rx_bytes);
+ txpackets = u64_stats_read(&p->tx_packets);
+ txbytes = u64_stats_read(&p->tx_bytes);
} while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += rxpackets;
@@ -1219,24 +1215,28 @@
struct netlink_ext_ack *extack)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile;
struct bpf_prog *old_prog;
+ int i;
old_prog = rtnl_dereference(tun->xdp_prog);
rcu_assign_pointer(tun->xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
- return 0;
-}
-
-static u32 tun_xdp_query(struct net_device *dev)
-{
- struct tun_struct *tun = netdev_priv(dev);
- const struct bpf_prog *xdp_prog;
-
- xdp_prog = rtnl_dereference(tun->xdp_prog);
- if (xdp_prog)
- return xdp_prog->aux->id;
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+ list_for_each_entry(tfile, &tun->disabled, next) {
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
return 0;
}
@@ -1246,15 +1246,28 @@
switch (xdp->command) {
case XDP_SETUP_PROG:
return tun_xdp_set(dev, xdp->prog, xdp->extack);
- case XDP_QUERY_PROG:
- xdp->prog_id = tun_xdp_query(dev);
- return 0;
default:
return -EINVAL;
}
}
+static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ if (new_carrier) {
+ struct tun_struct *tun = netdev_priv(dev);
+
+ if (!tun->numqueues)
+ return -EPERM;
+
+ netif_carrier_on(dev);
+ } else {
+ netif_carrier_off(dev);
+ }
+ return 0;
+}
+
static const struct net_device_ops tun_netdev_ops = {
+ .ndo_init = tun_net_init,
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
.ndo_stop = tun_net_close,
@@ -1263,6 +1276,7 @@
.ndo_select_queue = tun_select_queue,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static void __tun_xdp_flush_tfile(struct tun_file *tfile)
@@ -1325,7 +1339,7 @@
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
- struct xdp_frame *frame = convert_to_xdp_frame(xdp);
+ struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
if (unlikely(!frame))
return -EOVERFLOW;
@@ -1334,6 +1348,7 @@
}
static const struct net_device_ops tap_netdev_ops = {
+ .ndo_init = tun_net_init,
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
.ndo_stop = tun_net_close,
@@ -1348,6 +1363,7 @@
.ndo_get_stats64 = tun_net_get_stats64,
.ndo_bpf = tun_xdp,
.ndo_xdp_xmit = tun_xdp_xmit,
+ .ndo_change_carrier = tun_net_change_carrier,
};
static void tun_flow_init(struct tun_struct *tun)
@@ -1373,13 +1389,14 @@
#define MAX_MTU 65535
/* Initialize net device. */
-static void tun_net_init(struct net_device *dev)
+static void tun_net_initialize(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
switch (tun->flags & TUN_TYPE_MASK) {
case IFF_TUN:
dev->netdev_ops = &tun_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
/* Point-to-Point TUN Device */
dev->hard_header_len = 0;
@@ -1429,8 +1446,6 @@
sk = tfile->socket.sk;
- tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
-
poll_wait(file, sk_sleep(sk), wait);
if (!ptr_ring_empty(&tfile->tx_ring))
@@ -1462,7 +1477,8 @@
int err;
int i;
- if (it->nr_segs > MAX_SKB_FRAGS + 1)
+ if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
+ len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
return ERR_PTR(-EMSGSIZE);
local_bh_disable();
@@ -1481,23 +1497,22 @@
skb->truesize += skb->data_len;
for (i = 1; i < it->nr_segs; i++) {
- struct page_frag *pfrag = ¤t->task_frag;
size_t fragsz = it->iov[i].iov_len;
+ struct page *page;
+ void *frag;
if (fragsz == 0 || fragsz > PAGE_SIZE) {
err = -EINVAL;
goto free;
}
-
- if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) {
+ frag = netdev_alloc_frag(fragsz);
+ if (!frag) {
err = -ENOMEM;
goto free;
}
-
- skb_fill_page_desc(skb, i - 1, pfrag->page,
- pfrag->offset, fragsz);
- page_ref_inc(pfrag->page);
- pfrag->offset += fragsz;
+ page = virt_to_head_page(frag);
+ skb_fill_page_desc(skb, i - 1, page,
+ frag - page_address(page), fragsz);
}
return skb;
@@ -1596,6 +1611,57 @@
return true;
}
+static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
+ struct page_frag *alloc_frag, char *buf,
+ int buflen, int len, int pad)
+{
+ struct sk_buff *skb = build_skb(buf, buflen);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, pad);
+ skb_put(skb, len);
+ skb_set_owner_w(skb, tfile->socket.sk);
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
+static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp, u32 act)
+{
+ int err;
+
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ if (err)
+ return err;
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+ if (err < 0)
+ return err;
+ break;
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ break;
+ }
+
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1603,18 +1669,17 @@
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = ¤t->task_frag;
- struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog)
- pad += TUN_HEADROOM;
+ pad += XDP_PACKET_HEADROOM;
buflen += SKB_DATA_ALIGN(len + pad);
rcu_read_unlock();
@@ -1633,17 +1698,19 @@
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
+ pad);
+ }
+
+ *skb_xdp = 0;
local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1651,67 +1718,36 @@
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
+ xdp.frame_sz = buflen;
+
act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- switch (act) {
- case XDP_REDIRECT:
+ if (act == XDP_REDIRECT || act == XDP_TX) {
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_TX:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
- if (tun_xdp_tx(tun->dev, &xdp) < 0)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
- goto err_xdp;
}
+ err = tun_xdp_act(tun, xdp_prog, &xdp, act);
+ if (err < 0) {
+ if (act == XDP_REDIRECT || act == XDP_TX)
+ put_page(alloc_frag->page);
+ goto out;
+ }
+
+ if (err == XDP_REDIRECT)
+ xdp_do_flush();
+ if (err != XDP_PASS)
+ goto out;
+
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
-
- skb = build_skb(buf, buflen);
- if (!skb) {
- rcu_read_unlock();
- local_bh_enable();
- return ERR_PTR(-ENOMEM);
- }
-
- skb_reserve(skb, pad - delta);
- skb_put(skb, len);
- skb_set_owner_w(skb, tfile->socket.sk);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
-
rcu_read_unlock();
local_bh_enable();
- return skb;
+ return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
-err_redirect:
- put_page(alloc_frag->page);
-err_xdp:
+out:
rcu_read_unlock();
local_bh_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
@@ -1902,7 +1938,8 @@
}
skb_reset_network_header(skb);
- skb_probe_transport_header(skb, 0);
+ skb_probe_transport_header(skb);
+ skb_record_rx_queue(skb, tfile->queue_index);
if (skb_xdp) {
struct bpf_prog *xdp_prog;
@@ -1947,20 +1984,29 @@
/* Exercise flow dissector code path. */
skb_push(skb, ETH_HLEN);
- headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+ headlen = eth_get_headlen(tun->dev, skb->data,
+ skb_headlen(skb));
if (unlikely(headlen > skb_headlen(skb))) {
+ WARN_ON_ONCE(1);
+ err = -ENOMEM;
this_cpu_inc(tun->pcpu_stats->rx_dropped);
+napi_busy:
napi_free_frags(&tfile->napi);
rcu_read_unlock();
mutex_unlock(&tfile->napi_mutex);
- WARN_ON(1);
- return -ENOMEM;
+ return err;
}
- local_bh_disable();
- napi_gro_frags(&tfile->napi);
- local_bh_enable();
+ if (likely(napi_schedule_prep(&tfile->napi))) {
+ local_bh_disable();
+ napi_gro_frags(&tfile->napi);
+ napi_complete(&tfile->napi);
+ local_bh_enable();
+ } else {
+ err = -EBUSY;
+ goto napi_busy;
+ }
mutex_unlock(&tfile->napi_mutex);
} else if (tfile->napi_enabled) {
struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
@@ -1984,8 +2030,8 @@
stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp);
- stats->rx_packets++;
- stats->rx_bytes += len;
+ u64_stats_inc(&stats->rx_packets);
+ u64_stats_add(&stats->rx_bytes, len);
u64_stats_update_end(&stats->syncp);
put_cpu_ptr(stats);
@@ -2041,8 +2087,8 @@
stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp);
- stats->tx_packets++;
- stats->tx_bytes += ret;
+ u64_stats_inc(&stats->tx_packets);
+ u64_stats_add(&stats->tx_bytes, ret);
u64_stats_update_end(&stats->syncp);
put_cpu_ptr(tun->pcpu_stats);
@@ -2136,8 +2182,8 @@
/* caller is in process context, */
stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp);
- stats->tx_packets++;
- stats->tx_bytes += skb->len + vlan_hlen;
+ u64_stats_inc(&stats->tx_packets);
+ u64_stats_add(&stats->tx_bytes, skb->len + vlan_hlen);
u64_stats_update_end(&stats->syncp);
put_cpu_ptr(tun->pcpu_stats);
@@ -2158,7 +2204,7 @@
goto out;
}
- add_wait_queue(&tfile->wq.wait, &wait);
+ add_wait_queue(&tfile->socket.wq.wait, &wait);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -2178,7 +2224,7 @@
}
__set_current_state(TASK_RUNNING);
- remove_wait_queue(&tfile->wq.wait, &wait);
+ remove_wait_queue(&tfile->socket.wq.wait, &wait);
out:
*err = error;
@@ -2191,8 +2237,6 @@
{
ssize_t ret;
int err;
-
- tun_debug(KERN_INFO, tun, "tun_do_read\n");
if (!iov_iter_count(to)) {
tun_ptr_free(ptr);
@@ -2284,7 +2328,9 @@
struct tun_struct *tun = netdev_priv(dev);
BUG_ON(!(list_empty(&tun->disabled)));
+
free_percpu(tun->pcpu_stats);
+
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
__tun_set_ebpf(tun, &tun->steering_prog, NULL);
@@ -2400,18 +2446,160 @@
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static void tun_put_page(struct tun_page *tpage)
+{
+ if (tpage->page)
+ __page_frag_cache_drain(tpage->page, tpage->count);
+}
+
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp, int *flush,
+ struct tun_page *tpage)
+{
+ unsigned int datasize = xdp->data_end - xdp->data;
+ struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+ struct virtio_net_hdr *gso = &hdr->gso;
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = hdr->buflen;
+ int err = 0;
+ bool skb_xdp = false;
+ struct page *page;
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+ xdp->frame_sz = buflen;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ err = tun_xdp_act(tun, xdp_prog, xdp, act);
+ if (err < 0) {
+ put_page(virt_to_head_page(xdp->data));
+ return err;
+ }
+
+ switch (err) {
+ case XDP_REDIRECT:
+ *flush = true;
+ fallthrough;
+ case XDP_TX:
+ return 0;
+ case XDP_PASS:
+ break;
+ default:
+ page = virt_to_head_page(xdp->data);
+ if (tpage->page == page) {
+ ++tpage->count;
+ } else {
+ tun_put_page(tpage);
+ tpage->page = page;
+ tpage->count = 1;
+ }
+ return 0;
+ }
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb);
+ skb_record_rx_queue(skb, tfile->queue_index);
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
+ !tfile->detached)
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ netif_receive_skb(skb);
+
+ /* No need for get_cpu_ptr() here since this function is
+ * always called with bh disabled
+ */
+ stats = this_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_inc(&stats->rx_packets);
+ u64_stats_add(&stats->rx_bytes, datasize);
+ u64_stats_update_end(&stats->syncp);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
+ struct xdp_buff *xdp;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
+ ctl && ctl->type == TUN_MSG_PTR) {
+ struct tun_page tpage;
+ int n = ctl->num;
+ int flush = 0;
+
+ memset(&tpage, 0, sizeof(tpage));
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ xdp = &((struct xdp_buff *)ctl->ptr)[i];
+ tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
+ }
+
+ if (flush)
+ xdp_do_flush();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ tun_put_page(&tpage);
+
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}
@@ -2636,9 +2824,6 @@
if (!dev)
return -ENOMEM;
- err = dev_get_valid_name(net, dev, name);
- if (err < 0)
- goto err_free_dev;
dev_net_set(dev, net);
dev->rtnl_link_ops = &tun_link_ops;
@@ -2657,41 +2842,16 @@
tun->rx_batched = 0;
RCU_INIT_POINTER(tun->steering_prog, NULL);
- tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
- if (!tun->pcpu_stats) {
- err = -ENOMEM;
- goto err_free_dev;
- }
+ tun->ifr = ifr;
+ tun->file = file;
- spin_lock_init(&tun->lock);
-
- err = security_tun_dev_alloc_security(&tun->security);
- if (err < 0)
- goto err_free_stat;
-
- tun_net_init(dev);
- tun_flow_init(tun);
-
- dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
- TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX;
- dev->features = dev->hw_features | NETIF_F_LLTX;
- dev->vlan_features = dev->features &
- ~(NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX);
-
- tun->flags = (tun->flags & ~TUN_FEATURES) |
- (ifr->ifr_flags & TUN_FEATURES);
-
- INIT_LIST_HEAD(&tun->disabled);
- err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
- ifr->ifr_flags & IFF_NAPI_FRAGS, false);
- if (err < 0)
- goto err_free_flow;
+ tun_net_initialize(dev);
err = register_netdevice(tun->dev);
- if (err < 0)
- goto err_detach;
+ if (err < 0) {
+ free_netdev(dev);
+ return err;
+ }
/* free_netdev() won't check refcnt, to aovid race
* with dev_put() we need publish tun after registration.
*/
@@ -2699,8 +2859,6 @@
}
netif_carrier_on(tun->dev);
-
- tun_debug(KERN_INFO, tun, "tun_set_iff\n");
/* Make sure persistent devices do not get stuck in
* xoff state.
@@ -2710,27 +2868,10 @@
strcpy(ifr->ifr_name, tun->dev->name);
return 0;
-
-err_detach:
- tun_detach_all(dev);
- /* register_netdevice() already called tun_free_netdev() */
- goto err_free_dev;
-
-err_free_flow:
- tun_flow_uninit(tun);
- security_tun_dev_free_security(tun->security);
-err_free_stat:
- free_percpu(tun->pcpu_stats);
-err_free_dev:
- free_netdev(dev);
- return err;
}
-static void tun_get_iff(struct net *net, struct tun_struct *tun,
- struct ifreq *ifr)
+static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
{
- tun_debug(KERN_INFO, tun, "tun_get_iff\n");
-
strcpy(ifr->ifr_name, tun->dev->name);
ifr->ifr_flags = tun_flags(tun);
@@ -2857,7 +2998,7 @@
return ret;
}
-static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
void __user *data)
{
struct bpf_prog *prog;
@@ -2923,12 +3064,12 @@
struct net *net = sock_net(&tfile->sk);
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
+ unsigned int ifindex, carrier;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
int sndbuf;
int vnet_hdr_sz;
- unsigned int ifindex;
int le;
int ret;
bool do_notify = false;
@@ -2993,12 +3134,13 @@
if (!tun)
goto unlock;
- tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
+ netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
+ net = dev_net(tun->dev);
ret = 0;
switch (cmd) {
case TUNGETIFF:
- tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+ tun_get_iff(tun, &ifr);
if (tfile->detached)
ifr.ifr_flags |= IFF_DETACH_QUEUE;
@@ -3013,8 +3155,8 @@
/* Disable/Enable checksum */
/* [unimplemented] */
- tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
- arg ? "disabled" : "enabled");
+ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
+ arg ? "disabled" : "enabled");
break;
case TUNSETPERSIST:
@@ -3032,8 +3174,8 @@
do_notify = true;
}
- tun_debug(KERN_INFO, tun, "persist %s\n",
- arg ? "enabled" : "disabled");
+ netif_info(tun, drv, tun->dev, "persist %s\n",
+ arg ? "enabled" : "disabled");
break;
case TUNSETOWNER:
@@ -3045,8 +3187,8 @@
}
tun->owner = owner;
do_notify = true;
- tun_debug(KERN_INFO, tun, "owner set to %u\n",
- from_kuid(&init_user_ns, tun->owner));
+ netif_info(tun, drv, tun->dev, "owner set to %u\n",
+ from_kuid(&init_user_ns, tun->owner));
break;
case TUNSETGROUP:
@@ -3058,30 +3200,29 @@
}
tun->group = group;
do_notify = true;
- tun_debug(KERN_INFO, tun, "group set to %u\n",
- from_kgid(&init_user_ns, tun->group));
+ netif_info(tun, drv, tun->dev, "group set to %u\n",
+ from_kgid(&init_user_ns, tun->group));
break;
case TUNSETLINK:
/* Only allow setting the type when the interface is down */
if (tun->dev->flags & IFF_UP) {
- tun_debug(KERN_INFO, tun,
- "Linktype set failed because interface is up\n");
+ netif_info(tun, drv, tun->dev,
+ "Linktype set failed because interface is up\n");
ret = -EBUSY;
} else {
tun->dev->type = (int) arg;
tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
- tun_debug(KERN_INFO, tun, "linktype set to %d\n",
- tun->dev->type);
+ netif_info(tun, drv, tun->dev, "linktype set to %d\n",
+ tun->dev->type);
ret = 0;
}
break;
-#ifdef TUN_DEBUG
case TUNSETDEBUG:
- tun->debug = arg;
+ tun->msg_enable = (u32)arg;
break;
-#endif
+
case TUNSETOFFLOAD:
ret = set_offload(tun, arg);
break;
@@ -3096,18 +3237,14 @@
case SIOCGIFHWADDR:
/* Get hw address */
- memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
- ifr.ifr_hwaddr.sa_family = tun->dev->type;
+ dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
if (copy_to_user(argp, &ifr, ifreq_len))
ret = -EFAULT;
break;
case SIOCSIFHWADDR:
/* Set hw address */
- tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
- ifr.ifr_hwaddr.sa_data);
-
- ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
+ ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
break;
case TUNGETSNDBUF:
@@ -3213,6 +3350,21 @@
ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
break;
+ case TUNSETCARRIER:
+ ret = -EFAULT;
+ if (copy_from_user(&carrier, argp, sizeof(carrier)))
+ goto unlock;
+
+ ret = tun_net_change_carrier(tun->dev, (bool)carrier);
+ break;
+
+ case TUNGETDEVNETNS:
+ ret = -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ goto unlock;
+ ret = open_related_ns(&net->ns, get_net_ns);
+ break;
+
default:
ret = -EINVAL;
break;
@@ -3286,8 +3438,6 @@
struct net *net = current->nsproxy->net_ns;
struct tun_file *tfile;
- DBG1(KERN_INFO, "tunX: tun_chr_open\n");
-
tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
&tun_proto, 0);
if (!tfile)
@@ -3302,8 +3452,7 @@
tfile->flags = 0;
tfile->ifindex = 0;
- init_waitqueue_head(&tfile->wq.wait);
- RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+ init_waitqueue_head(&tfile->socket.wq.wait);
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
@@ -3342,7 +3491,7 @@
rtnl_lock();
tun = tun_get(tfile);
if (tun)
- tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+ tun_get_iff(tun, &ifr);
rtnl_unlock();
if (tun)
@@ -3428,20 +3577,16 @@
static u32 tun_get_msglevel(struct net_device *dev)
{
-#ifdef TUN_DEBUG
struct tun_struct *tun = netdev_priv(dev);
- return tun->debug;
-#else
- return -EOPNOTSUPP;
-#endif
+
+ return tun->msg_enable;
}
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
-#ifdef TUN_DEBUG
struct tun_struct *tun = netdev_priv(dev);
- tun->debug = value;
-#endif
+
+ tun->msg_enable = value;
}
static int tun_get_coalesce(struct net_device *dev,
@@ -3468,6 +3613,7 @@
}
static const struct ethtool_ops tun_ethtool_ops = {
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
.get_drvinfo = tun_get_drvinfo,
.get_msglevel = tun_get_msglevel,
.set_msglevel = tun_set_msglevel,
--
Gitblit v1.6.2