From 04dd17822334871b23ea2862f7798fb0e0007777 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Sat, 11 May 2024 08:53:19 +0000 Subject: [PATCH] change otg to host mode --- kernel/drivers/net/tun.c | 778 +++++++++++++++++++++++++++++++++++------------------------ 1 files changed, 462 insertions(+), 316 deletions(-) diff --git a/kernel/drivers/net/tun.c b/kernel/drivers/net/tun.c index d5bb972..191bf0d 100644 --- a/kernel/drivers/net/tun.c +++ b/kernel/drivers/net/tun.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ @@ -71,6 +62,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> +#include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> @@ -92,36 +84,6 @@ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); -/* Uncomment to enable debugging */ -/* #define TUN_DEBUG 1 */ - -#ifdef TUN_DEBUG -static int debug; - -#define tun_debug(level, tun, fmt, args...) \ -do { \ - if (tun->debug) \ - netdev_printk(level, tun->dev, fmt, ##args); \ -} while (0) -#define DBG1(level, fmt, args...) \ -do { \ - if (debug == 2) \ - printk(level fmt, ##args); \ -} while (0) -#else -#define tun_debug(level, tun, fmt, args...) \ -do { \ - if (0) \ - netdev_printk(level, tun->dev, fmt, ##args); \ -} while (0) -#define DBG1(level, fmt, args...) \ -do { \ - if (0) \ - printk(level fmt, ##args); \ -} while (0) -#endif - -#define TUN_HEADROOM 256 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ @@ -154,10 +116,10 @@ #define TUN_FLOW_EXPIRE (3 * HZ) struct tun_pcpu_stats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; + u64_stats_t rx_packets; + u64_stats_t rx_bytes; + u64_stats_t tx_packets; + u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; @@ -178,7 +140,6 @@ struct tun_file { struct sock sk; struct socket socket; - struct socket_wq wq; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ @@ -197,6 +158,11 @@ struct xdp_rxq_info xdp_rxq; }; +struct tun_page { + struct page *page; + int count; +}; + struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; @@ -205,7 +171,7 @@ u32 rxhash; u32 rps_rxhash; int queue_index; - unsigned long updated; + unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 @@ -239,9 +205,7 @@ struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; -#ifdef TUN_DEBUG - int debug; -#endif + u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; @@ -256,6 +220,9 @@ struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; + /* init args */ + struct file *file; + struct ifreq *ifr; }; struct veth { @@ -263,23 +230,8 @@ __be16 h_vlan_TCI; }; -bool tun_is_xdp_frame(void *ptr) -{ - return (unsigned long)ptr & TUN_XDP_FLAG; -} -EXPORT_SYMBOL(tun_is_xdp_frame); - -void *tun_xdp_to_ptr(void *ptr) -{ - return (void *)((unsigned long)ptr | TUN_XDP_FLAG); -} -EXPORT_SYMBOL(tun_xdp_to_ptr); - -void *tun_ptr_to_xdp(void *ptr) -{ - return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); -} -EXPORT_SYMBOL(tun_ptr_to_xdp); +static void tun_flow_init(struct tun_struct *tun); +static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { @@ -331,6 +283,12 @@ NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); } +} + +static void tun_napi_enable(struct tun_file *tfile) +{ + if (tfile->napi_enabled) + napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) @@ -437,8 +395,9 @@ struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", - rxhash, queue_index); + netif_info(tun, tx_queued, tun->dev, + "create flow: hash %u index %u\n", + rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; @@ -452,8 +411,8 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", - e->rxhash, e->queue_index); + netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", + e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; @@ -499,8 +458,6 @@ unsigned long count = 0; int i; - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); - spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; @@ -533,18 +490,17 @@ unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; - if (!rxhash) - return; - else - head = &tun->flows[tun_hashfn(rxhash)]; + head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ - e->queue_index = queue_index; - e->updated = jiffies; + if (READ_ONCE(e->queue_index) != queue_index) + WRITE_ONCE(e->queue_index, queue_index); + if (e->updated != jiffies) + e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); @@ -561,8 +517,7 @@ rcu_read_unlock(); } -/** - * Save the hash received in the stack receive path and update the +/* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) @@ -571,12 +526,11 @@ e->rps_rxhash = hash; } -/* We try to identify a flow through its rxhash first. The reason that +/* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a - * different rxq no. here. If we could not get rxhash, then we would - * hope the rxq no. may help here. + * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { @@ -587,18 +541,13 @@ numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); - if (txq) { - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); - if (e) { - tun_flow_save_rps_rxhash(e, txq); - txq = e->queue_index; - } else - /* use multiply and shift instead of expensive divide */ - txq = ((u64)txq * numqueues) >> 32; - } else if (likely(skb_rx_queue_recorded(skb))) { - txq = skb_get_rx_queue(skb); - while (unlikely(txq >= numqueues)) - txq -= numqueues; + e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); + if (e) { + tun_flow_save_rps_rxhash(e, txq); + txq = e->queue_index; + } else { + /* use multiply and shift instead of expensive divide */ + txq = ((u64)txq * numqueues) >> 32; } return txq; @@ -622,8 +571,7 @@ } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback) + struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; @@ -704,7 +652,8 @@ tun = rtnl_dereference(tfile->tun); if (tun && clean) { - tun_napi_disable(tfile); + if (!tfile->detached) + tun_napi_disable(tfile); tun_napi_del(tfile); } @@ -723,8 +672,10 @@ if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); - } else + } else { tun_disable_queue(tun, tfile); + tun_napi_disable(tfile); + } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); @@ -747,7 +698,6 @@ if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); - sock_put(&tfile->sk); } } @@ -763,6 +713,9 @@ if (dev) netdev_state_change(dev); rtnl_unlock(); + + if (clean) + sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) @@ -797,6 +750,7 @@ sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { + tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); @@ -877,10 +831,14 @@ if (tfile->detached) { tun_enable_queue(tfile); + tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } + + if (rtnl_dereference(tun->xdp_prog)) + sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. @@ -1025,6 +983,49 @@ static const struct ethtool_ops tun_ethtool_ops; +static int tun_net_init(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct ifreq *ifr = tun->ifr; + int err; + + tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); + if (!tun->pcpu_stats) + return -ENOMEM; + + spin_lock_init(&tun->lock); + + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) { + free_percpu(tun->pcpu_stats); + return err; + } + + tun_flow_init(tun); + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | + TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + dev->features = dev->hw_features | NETIF_F_LLTX; + dev->vlan_features = dev->features & + ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + + INIT_LIST_HEAD(&tun->disabled); + err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS, false); + if (err < 0) { + tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); + free_percpu(tun->pcpu_stats); + return err; + } + return 0; +} + /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { @@ -1050,20 +1051,17 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS - if (tun->numqueues == 1 && static_key_false(&rps_needed)) { + if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ + struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); - if (rxhash) { - struct tun_flow_entry *e; - e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], - rxhash); - if (e) - tun_flow_save_rps_rxhash(e, rxhash); - } + e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); + if (e) + tun_flow_save_rps_rxhash(e, rxhash); } #endif } @@ -1099,9 +1097,7 @@ if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); - tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); - - BUG_ON(!tfile); + netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. @@ -1127,7 +1123,7 @@ */ skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; @@ -1194,10 +1190,10 @@ p = per_cpu_ptr(tun->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); - rxpackets = p->rx_packets; - rxbytes = p->rx_bytes; - txpackets = p->tx_packets; - txbytes = p->tx_bytes; + rxpackets = u64_stats_read(&p->rx_packets); + rxbytes = u64_stats_read(&p->rx_bytes); + txpackets = u64_stats_read(&p->tx_packets); + txbytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; @@ -1219,24 +1215,28 @@ struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile; struct bpf_prog *old_prog; + int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); - return 0; -} - -static u32 tun_xdp_query(struct net_device *dev) -{ - struct tun_struct *tun = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = rtnl_dereference(tun->xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + if (prog) + sock_set_flag(&tfile->sk, SOCK_XDP); + else + sock_reset_flag(&tfile->sk, SOCK_XDP); + } + list_for_each_entry(tfile, &tun->disabled, next) { + if (prog) + sock_set_flag(&tfile->sk, SOCK_XDP); + else + sock_reset_flag(&tfile->sk, SOCK_XDP); + } return 0; } @@ -1246,15 +1246,28 @@ switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = tun_xdp_query(dev); - return 0; default: return -EINVAL; } } +static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) +{ + if (new_carrier) { + struct tun_struct *tun = netdev_priv(dev); + + if (!tun->numqueues) + return -EPERM; + + netif_carrier_on(dev); + } else { + netif_carrier_off(dev); + } + return 0; +} + static const struct net_device_ops tun_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1263,6 +1276,7 @@ .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, + .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) @@ -1325,7 +1339,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { - struct xdp_frame *frame = convert_to_xdp_frame(xdp); + struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; @@ -1334,6 +1348,7 @@ } static const struct net_device_ops tap_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1348,6 +1363,7 @@ .ndo_get_stats64 = tun_net_get_stats64, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, + .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) @@ -1373,13 +1389,14 @@ #define MAX_MTU 65535 /* Initialize net device. */ -static void tun_net_init(struct net_device *dev) +static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; @@ -1429,8 +1446,6 @@ sk = tfile->socket.sk; - tun_debug(KERN_INFO, tun, "tun_chr_poll\n"); - poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) @@ -1462,7 +1477,8 @@ int err; int i; - if (it->nr_segs > MAX_SKB_FRAGS + 1) + if (it->nr_segs > MAX_SKB_FRAGS + 1 || + len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); @@ -1481,23 +1497,22 @@ skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - struct page_frag *pfrag = ¤t->task_frag; size_t fragsz = it->iov[i].iov_len; + struct page *page; + void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } - - if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) { + frag = netdev_alloc_frag(fragsz); + if (!frag) { err = -ENOMEM; goto free; } - - skb_fill_page_desc(skb, i - 1, pfrag->page, - pfrag->offset, fragsz); - page_ref_inc(pfrag->page); - pfrag->offset += fragsz; + page = virt_to_head_page(frag); + skb_fill_page_desc(skb, i - 1, page, + frag - page_address(page), fragsz); } return skb; @@ -1589,11 +1604,62 @@ if (zerocopy) return false; - if (SKB_DATA_ALIGN(len + TUN_RX_PAD) + + if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; +} + +static struct sk_buff *__tun_build_skb(struct tun_file *tfile, + struct page_frag *alloc_frag, char *buf, + int buflen, int len, int pad) +{ + struct sk_buff *skb = build_skb(buf, buflen); + + if (!skb) + return ERR_PTR(-ENOMEM); + + skb_reserve(skb, pad); + skb_put(skb, len); + skb_set_owner_w(skb, tfile->socket.sk); + + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + + return skb; +} + +static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + struct xdp_buff *xdp, u32 act) +{ + int err; + + switch (act) { + case XDP_REDIRECT: + err = xdp_do_redirect(tun->dev, xdp, xdp_prog); + if (err) + return err; + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); + if (err < 0) + return err; + break; + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(tun->dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + this_cpu_inc(tun->pcpu_stats->rx_dropped); + break; + } + + return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, @@ -1603,18 +1669,17 @@ int len, int *skb_xdp) { struct page_frag *alloc_frag = ¤t->task_frag; - struct sk_buff *skb; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - unsigned int delta = 0; char *buf; size_t copied; - int err, pad = TUN_RX_PAD; + int pad = TUN_RX_PAD; + int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) - pad += TUN_HEADROOM; + pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); @@ -1633,17 +1698,19 @@ * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ - if (hdr->gso_type || !xdp_prog) + if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; - else - *skb_xdp = 0; + return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, + pad); + } + + *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); - if (xdp_prog && !*skb_xdp) { + if (xdp_prog) { struct xdp_buff xdp; - void *orig_data; u32 act; xdp.data_hard_start = buf; @@ -1651,67 +1718,36 @@ xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &tfile->xdp_rxq; - orig_data = xdp.data; + xdp.frame_sz = buflen; + act = bpf_prog_run_xdp(xdp_prog, &xdp); - - switch (act) { - case XDP_REDIRECT: + if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; - err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); - xdp_do_flush_map(); - if (err) - goto err_redirect; - rcu_read_unlock(); - local_bh_enable(); - return NULL; - case XDP_TX: - get_page(alloc_frag->page); - alloc_frag->offset += buflen; - if (tun_xdp_tx(tun->dev, &xdp) < 0) - goto err_redirect; - rcu_read_unlock(); - local_bh_enable(); - return NULL; - case XDP_PASS: - delta = orig_data - xdp.data; - len = xdp.data_end - xdp.data; - break; - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(tun->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - goto err_xdp; } + err = tun_xdp_act(tun, xdp_prog, &xdp, act); + if (err < 0) { + if (act == XDP_REDIRECT || act == XDP_TX) + put_page(alloc_frag->page); + goto out; + } + + if (err == XDP_REDIRECT) + xdp_do_flush(); + if (err != XDP_PASS) + goto out; + + pad = xdp.data - xdp.data_hard_start; + len = xdp.data_end - xdp.data; } - - skb = build_skb(buf, buflen); - if (!skb) { - rcu_read_unlock(); - local_bh_enable(); - return ERR_PTR(-ENOMEM); - } - - skb_reserve(skb, pad - delta); - skb_put(skb, len); - skb_set_owner_w(skb, tfile->socket.sk); - get_page(alloc_frag->page); - alloc_frag->offset += buflen; - rcu_read_unlock(); local_bh_enable(); - return skb; + return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); -err_redirect: - put_page(alloc_frag->page); -err_xdp: +out: rcu_read_unlock(); local_bh_enable(); - this_cpu_inc(tun->pcpu_stats->rx_dropped); return NULL; } @@ -1902,7 +1938,8 @@ } skb_reset_network_header(skb); - skb_probe_transport_header(skb, 0); + skb_probe_transport_header(skb); + skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; @@ -1947,20 +1984,29 @@ /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); - headlen = eth_get_headlen(skb->data, skb_headlen(skb)); + headlen = eth_get_headlen(tun->dev, skb->data, + skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { + WARN_ON_ONCE(1); + err = -ENOMEM; this_cpu_inc(tun->pcpu_stats->rx_dropped); +napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); - WARN_ON(1); - return -ENOMEM; + return err; } - local_bh_disable(); - napi_gro_frags(&tfile->napi); - local_bh_enable(); + if (likely(napi_schedule_prep(&tfile->napi))) { + local_bh_disable(); + napi_gro_frags(&tfile->napi); + napi_complete(&tfile->napi); + local_bh_enable(); + } else { + err = -EBUSY; + goto napi_busy; + } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; @@ -1984,8 +2030,8 @@ stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += len; + u64_stats_inc(&stats->rx_packets); + u64_stats_add(&stats->rx_bytes, len); u64_stats_update_end(&stats->syncp); put_cpu_ptr(stats); @@ -2041,8 +2087,8 @@ stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += ret; + u64_stats_inc(&stats->tx_packets); + u64_stats_add(&stats->tx_bytes, ret); u64_stats_update_end(&stats->syncp); put_cpu_ptr(tun->pcpu_stats); @@ -2136,8 +2182,8 @@ /* caller is in process context, */ stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += skb->len + vlan_hlen; + u64_stats_inc(&stats->tx_packets); + u64_stats_add(&stats->tx_bytes, skb->len + vlan_hlen); u64_stats_update_end(&stats->syncp); put_cpu_ptr(tun->pcpu_stats); @@ -2158,7 +2204,7 @@ goto out; } - add_wait_queue(&tfile->wq.wait, &wait); + add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); @@ -2178,7 +2224,7 @@ } __set_current_state(TASK_RUNNING); - remove_wait_queue(&tfile->wq.wait, &wait); + remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; @@ -2191,8 +2237,6 @@ { ssize_t ret; int err; - - tun_debug(KERN_INFO, tun, "tun_do_read\n"); if (!iov_iter_count(to)) { tun_ptr_free(ptr); @@ -2284,7 +2328,9 @@ struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); + free_percpu(tun->pcpu_stats); + tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); @@ -2400,18 +2446,160 @@ kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } +static void tun_put_page(struct tun_page *tpage) +{ + if (tpage->page) + __page_frag_cache_drain(tpage->page, tpage->count); +} + +static int tun_xdp_one(struct tun_struct *tun, + struct tun_file *tfile, + struct xdp_buff *xdp, int *flush, + struct tun_page *tpage) +{ + unsigned int datasize = xdp->data_end - xdp->data; + struct tun_xdp_hdr *hdr = xdp->data_hard_start; + struct virtio_net_hdr *gso = &hdr->gso; + struct tun_pcpu_stats *stats; + struct bpf_prog *xdp_prog; + struct sk_buff *skb = NULL; + u32 rxhash = 0, act; + int buflen = hdr->buflen; + int err = 0; + bool skb_xdp = false; + struct page *page; + + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog) { + if (gso->gso_type) { + skb_xdp = true; + goto build; + } + xdp_set_data_meta_invalid(xdp); + xdp->rxq = &tfile->xdp_rxq; + xdp->frame_sz = buflen; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + err = tun_xdp_act(tun, xdp_prog, xdp, act); + if (err < 0) { + put_page(virt_to_head_page(xdp->data)); + return err; + } + + switch (err) { + case XDP_REDIRECT: + *flush = true; + fallthrough; + case XDP_TX: + return 0; + case XDP_PASS: + break; + default: + page = virt_to_head_page(xdp->data); + if (tpage->page == page) { + ++tpage->count; + } else { + tun_put_page(tpage); + tpage->page = page; + tpage->count = 1; + } + return 0; + } + } + +build: + skb = build_skb(xdp->data_hard_start, buflen); + if (!skb) { + err = -ENOMEM; + goto out; + } + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + skb_put(skb, xdp->data_end - xdp->data); + + if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); + kfree_skb(skb); + err = -EINVAL; + goto out; + } + + skb->protocol = eth_type_trans(skb, tun->dev); + skb_reset_network_header(skb); + skb_probe_transport_header(skb); + skb_record_rx_queue(skb, tfile->queue_index); + + if (skb_xdp) { + err = do_xdp_generic(xdp_prog, skb); + if (err != XDP_PASS) + goto out; + } + + if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && + !tfile->detached) + rxhash = __skb_get_hash_symmetric(skb); + + netif_receive_skb(skb); + + /* No need for get_cpu_ptr() here since this function is + * always called with bh disabled + */ + stats = this_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + u64_stats_inc(&stats->rx_packets); + u64_stats_add(&stats->rx_bytes, datasize); + u64_stats_update_end(&stats->syncp); + + if (rxhash) + tun_flow_update(tun, rxhash, tfile); + +out: + return err; +} + static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { - int ret; + int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); + struct tun_msg_ctl *ctl = m->msg_control; + struct xdp_buff *xdp; if (!tun) return -EBADFD; - ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, + if (m->msg_controllen == sizeof(struct tun_msg_ctl) && + ctl && ctl->type == TUN_MSG_PTR) { + struct tun_page tpage; + int n = ctl->num; + int flush = 0; + + memset(&tpage, 0, sizeof(tpage)); + + local_bh_disable(); + rcu_read_lock(); + + for (i = 0; i < n; i++) { + xdp = &((struct xdp_buff *)ctl->ptr)[i]; + tun_xdp_one(tun, tfile, xdp, &flush, &tpage); + } + + if (flush) + xdp_do_flush(); + + rcu_read_unlock(); + local_bh_enable(); + + tun_put_page(&tpage); + + ret = total_len; + goto out; + } + + ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); +out: tun_put(tun); return ret; } @@ -2636,9 +2824,6 @@ if (!dev) return -ENOMEM; - err = dev_get_valid_name(net, dev, name); - if (err < 0) - goto err_free_dev; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; @@ -2657,41 +2842,16 @@ tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); - tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); - if (!tun->pcpu_stats) { - err = -ENOMEM; - goto err_free_dev; - } + tun->ifr = ifr; + tun->file = file; - spin_lock_init(&tun->lock); - - err = security_tun_dev_alloc_security(&tun->security); - if (err < 0) - goto err_free_stat; - - tun_net_init(dev); - tun_flow_init(tun); - - dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | - TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features | NETIF_F_LLTX; - dev->vlan_features = dev->features & - ~(NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); - - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - - INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI, - ifr->ifr_flags & IFF_NAPI_FRAGS, false); - if (err < 0) - goto err_free_flow; + tun_net_initialize(dev); err = register_netdevice(tun->dev); - if (err < 0) - goto err_detach; + if (err < 0) { + free_netdev(dev); + return err; + } /* free_netdev() won't check refcnt, to aovid race * with dev_put() we need publish tun after registration. */ @@ -2699,8 +2859,6 @@ } netif_carrier_on(tun->dev); - - tun_debug(KERN_INFO, tun, "tun_set_iff\n"); /* Make sure persistent devices do not get stuck in * xoff state. @@ -2710,27 +2868,10 @@ strcpy(ifr->ifr_name, tun->dev->name); return 0; - -err_detach: - tun_detach_all(dev); - /* register_netdevice() already called tun_free_netdev() */ - goto err_free_dev; - -err_free_flow: - tun_flow_uninit(tun); - security_tun_dev_free_security(tun->security); -err_free_stat: - free_percpu(tun->pcpu_stats); -err_free_dev: - free_netdev(dev); - return err; } -static void tun_get_iff(struct net *net, struct tun_struct *tun, - struct ifreq *ifr) +static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { - tun_debug(KERN_INFO, tun, "tun_get_iff\n"); - strcpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); @@ -2857,7 +2998,7 @@ return ret; } -static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p, +static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; @@ -2923,12 +3064,12 @@ struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; + unsigned int ifindex, carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int sndbuf; int vnet_hdr_sz; - unsigned int ifindex; int le; int ret; bool do_notify = false; @@ -2993,12 +3134,13 @@ if (!tun) goto unlock; - tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd); + netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); + net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: - tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; @@ -3013,8 +3155,8 @@ /* Disable/Enable checksum */ /* [unimplemented] */ - tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n", - arg ? "disabled" : "enabled"); + netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", + arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: @@ -3032,8 +3174,8 @@ do_notify = true; } - tun_debug(KERN_INFO, tun, "persist %s\n", - arg ? "enabled" : "disabled"); + netif_info(tun, drv, tun->dev, "persist %s\n", + arg ? "enabled" : "disabled"); break; case TUNSETOWNER: @@ -3045,8 +3187,8 @@ } tun->owner = owner; do_notify = true; - tun_debug(KERN_INFO, tun, "owner set to %u\n", - from_kuid(&init_user_ns, tun->owner)); + netif_info(tun, drv, tun->dev, "owner set to %u\n", + from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: @@ -3058,30 +3200,29 @@ } tun->group = group; do_notify = true; - tun_debug(KERN_INFO, tun, "group set to %u\n", - from_kgid(&init_user_ns, tun->group)); + netif_info(tun, drv, tun->dev, "group set to %u\n", + from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { - tun_debug(KERN_INFO, tun, - "Linktype set failed because interface is up\n"); + netif_info(tun, drv, tun->dev, + "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); - tun_debug(KERN_INFO, tun, "linktype set to %d\n", - tun->dev->type); + netif_info(tun, drv, tun->dev, "linktype set to %d\n", + tun->dev->type); ret = 0; } break; -#ifdef TUN_DEBUG case TUNSETDEBUG: - tun->debug = arg; + tun->msg_enable = (u32)arg; break; -#endif + case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; @@ -3096,18 +3237,14 @@ case SIOCGIFHWADDR: /* Get hw address */ - memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN); - ifr.ifr_hwaddr.sa_family = tun->dev->type; + dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ - tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n", - ifr.ifr_hwaddr.sa_data); - - ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); + ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: @@ -3213,6 +3350,21 @@ ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; + case TUNSETCARRIER: + ret = -EFAULT; + if (copy_from_user(&carrier, argp, sizeof(carrier))) + goto unlock; + + ret = tun_net_change_carrier(tun->dev, (bool)carrier); + break; + + case TUNGETDEVNETNS: + ret = -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + goto unlock; + ret = open_related_ns(&net->ns, get_net_ns); + break; + default: ret = -EINVAL; break; @@ -3286,8 +3438,6 @@ struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; - DBG1(KERN_INFO, "tunX: tun_chr_open\n"); - tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) @@ -3302,13 +3452,12 @@ tfile->flags = 0; tfile->ifindex = 0; - init_waitqueue_head(&tfile->wq.wait); - RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq); + init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; - sock_init_data(&tfile->socket, &tfile->sk); + sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; @@ -3342,7 +3491,7 @@ rtnl_lock(); tun = tun_get(tfile); if (tun) - tun_get_iff(current->nsproxy->net_ns, tun, &ifr); + tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) @@ -3428,20 +3577,16 @@ static u32 tun_get_msglevel(struct net_device *dev) { -#ifdef TUN_DEBUG struct tun_struct *tun = netdev_priv(dev); - return tun->debug; -#else - return -EOPNOTSUPP; -#endif + + return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { -#ifdef TUN_DEBUG struct tun_struct *tun = netdev_priv(dev); - tun->debug = value; -#endif + + tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, @@ -3468,6 +3613,7 @@ } static const struct ethtool_ops tun_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, -- Gitblit v1.6.2