From 9370bb92b2d16684ee45cf24e879c93c509162da Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Thu, 19 Dec 2024 01:47:39 +0000
Subject: [PATCH] add wifi6 8852be driver
---
kernel/net/core/dev.c | 2577 ++++++++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 2,006 insertions(+), 571 deletions(-)
diff --git a/kernel/net/core/dev.c b/kernel/net/core/dev.c
index 27d414d..c34511b 100644
--- a/kernel/net/core/dev.c
+++ b/kernel/net/core/dev.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3 Protocol independent device support routines.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* Derived from the non IP parts of dev.c 1.0.19
* Authors: Ross Biro
@@ -102,6 +98,7 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
@@ -132,7 +129,6 @@
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
-#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
@@ -146,11 +142,15 @@
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
+#include <linux/indirect_call_wrapper.h>
+#include <net/devlink.h>
+#include <linux/pm_runtime.h>
+#include <linux/prandom.h>
+#include <trace/hooks/net.h>
#include "net-sysfs.h"
#define MAX_GRO_SKBS 8
-#define MAX_NEST_DEV 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -164,6 +164,9 @@
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info);
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
@@ -230,6 +233,128 @@
#endif
}
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
+ const char *name)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
+ if (!name_node)
+ return NULL;
+ INIT_HLIST_NODE(&name_node->hlist);
+ name_node->dev = dev;
+ name_node->name = name;
+ return name_node;
+}
+
+static struct netdev_name_node *
+netdev_name_node_head_alloc(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = netdev_name_node_alloc(dev, dev->name);
+ if (!name_node)
+ return NULL;
+ INIT_LIST_HEAD(&name_node->list);
+ return name_node;
+}
+
+static void netdev_name_node_free(struct netdev_name_node *name_node)
+{
+ kfree(name_node);
+}
+
+static void netdev_name_node_add(struct net *net,
+ struct netdev_name_node *name_node)
+{
+ hlist_add_head_rcu(&name_node->hlist,
+ dev_name_hash(net, name_node->name));
+}
+
+static void netdev_name_node_del(struct netdev_name_node *name_node)
+{
+ hlist_del_rcu(&name_node->hlist);
+}
+
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry_rcu(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (name_node)
+ return -EEXIST;
+ name_node = netdev_name_node_alloc(dev, name);
+ if (!name_node)
+ return -ENOMEM;
+ netdev_name_node_add(net, name_node);
+ /* The node that holds dev->name acts as a head of per-device list. */
+ list_add_tail(&name_node->list, &dev->name_node->list);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_create);
+
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ list_del(&name_node->list);
+ netdev_name_node_del(name_node);
+ kfree(name_node->name);
+ netdev_name_node_free(name_node);
+}
+
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (!name_node)
+ return -ENOENT;
+ /* lookup might have found our primary name or a name belonging
+ * to another device.
+ */
+ if (name_node == dev->name_node || name_node->dev != dev)
+ return -EINVAL;
+
+ __netdev_name_node_alt_destroy(name_node);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_destroy);
+
+static void netdev_name_node_alt_flush(struct net_device *dev)
+{
+ struct netdev_name_node *name_node, *tmp;
+
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
+ __netdev_name_node_alt_destroy(name_node);
+}
+
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
@@ -239,7 +364,7 @@
write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
@@ -257,7 +382,7 @@
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
- hlist_del_rcu(&dev->name_hlist);
+ netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
@@ -355,6 +480,7 @@
unsigned short dev_type)
{
}
+
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
@@ -385,6 +511,12 @@
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
+ struct list_head vendor_pt = { .next = NULL, };
+
+ trace_android_vh_ptype_head(pt, &vendor_pt);
+ if (vendor_pt.next)
+ return vendor_pt.next;
+
if (pt->type == htons(ETH_P_ALL))
return pt->dev ? &pt->dev->ptype_all : &ptype_all;
else
@@ -735,14 +867,10 @@
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *node_name;
- hlist_for_each_entry(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
-
- return NULL;
+ node_name = netdev_name_node_lookup(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
@@ -760,14 +888,10 @@
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *node_name;
- hlist_for_each_entry_rcu(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
-
- return NULL;
+ node_name = netdev_name_node_lookup_rcu(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
@@ -1015,7 +1139,7 @@
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -1078,6 +1202,18 @@
return -ENOMEM;
for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
+ list_for_each_entry(name_node, &d->name_node->list, list) {
+ if (!sscanf(name_node->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
+ set_bit(i, inuse);
+ }
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
@@ -1138,8 +1274,8 @@
}
EXPORT_SYMBOL(dev_alloc_name);
-int dev_get_valid_name(struct net *net, struct net_device *dev,
- const char *name)
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
+ const char *name)
{
BUG_ON(!net);
@@ -1155,7 +1291,6 @@
return 0;
}
-EXPORT_SYMBOL(dev_get_valid_name);
/**
* dev_change_name - change name of a device
@@ -1229,13 +1364,13 @@
netdev_adjacent_rename_links(dev, oldname);
write_lock_bh(&dev_base_lock);
- hlist_del_rcu(&dev->name_hlist);
+ netdev_name_node_del(dev->name_node);
write_unlock_bh(&dev_base_lock);
synchronize_rcu();
write_lock_bh(&dev_base_lock);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ netdev_name_node_add(net, dev->name_node);
write_unlock_bh(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
@@ -1285,8 +1420,8 @@
}
mutex_lock(&ifalias_mutex);
- rcu_swap_protected(dev->ifalias, new_alias,
- mutex_is_locked(&ifalias_mutex));
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
mutex_unlock(&ifalias_mutex);
if (new_alias)
@@ -1372,15 +1507,20 @@
}
EXPORT_SYMBOL(netdev_notify_peers);
-static int __dev_open(struct net_device *dev)
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
- if (!netif_device_present(dev))
- return -ENODEV;
+ if (!netif_device_present(dev)) {
+ /* may be detached because parent is runtime-suspended */
+ if (dev->dev.parent)
+ pm_runtime_resume(dev->dev.parent);
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ }
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
@@ -1388,7 +1528,7 @@
*/
netpoll_poll_disable(dev);
- ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
ret = notifier_to_errno(ret);
if (ret)
return ret;
@@ -1417,7 +1557,8 @@
/**
* dev_open - prepare an interface for use.
- * @dev: device to open
+ * @dev: device to open
+ * @extack: netlink extended ack
*
* Takes a device from down to up state. The device's private open
* function is invoked and then the multicast lists are loaded. Finally
@@ -1427,14 +1568,14 @@
* Calling this function on an active interface is a nop. On a failure
* a negative errno code is returned.
*/
-int dev_open(struct net_device *dev)
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
if (dev->flags & IFF_UP)
return 0;
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
if (ret < 0)
return ret;
@@ -1596,6 +1737,7 @@
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ N(PRE_CHANGEADDR)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -1610,6 +1752,62 @@
};
return nb->notifier_call(nb, val, &info);
+}
+
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ int err;
+
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ return 0;
+}
+
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+}
+
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ err = call_netdevice_register_notifiers(nb, dev);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ for_each_netdev_continue_reverse(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+ return err;
+}
+
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+
+ for_each_netdev(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
}
static int dev_boot_phase = 1;
@@ -1630,8 +1828,6 @@
int register_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
- struct net_device *last;
struct net *net;
int err;
@@ -1644,17 +1840,9 @@
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
- for_each_netdev(net, dev) {
- err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
- err = notifier_to_errno(err);
- if (err)
- goto rollback;
-
- if (!(dev->flags & IFF_UP))
- continue;
-
- call_netdevice_notifier(nb, NETDEV_UP, dev);
- }
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err)
+ goto rollback;
}
unlock:
@@ -1663,22 +1851,9 @@
return err;
rollback:
- last = dev;
- for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev == last)
- goto outroll;
+ for_each_net_continue_reverse(net)
+ call_netdevice_unregister_net_notifiers(nb, net);
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
- }
-
-outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
@@ -1700,7 +1875,6 @@
int unregister_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
struct net *net;
int err;
@@ -1711,22 +1885,147 @@
if (err)
goto unlock;
- for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
- }
+ for_each_net(net)
+ call_netdevice_unregister_net_notifiers(nb, net);
+
unlock:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+static int __register_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb,
+ bool ignore_call_fail)
+{
+ int err;
+
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
+ if (err)
+ return err;
+ if (dev_boot_phase)
+ return 0;
+
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err && !ignore_call_fail)
+ goto chain_unregister;
+
+ return 0;
+
+chain_unregister:
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ return err;
+}
+
+static int __unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ if (err)
+ return err;
+
+ call_netdevice_unregister_net_notifiers(nb, net);
+ return 0;
+}
+
+/**
+ * register_netdevice_notifier_net - register a per-netns network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_lock();
+ err = __register_netdevice_notifier_net(net, nb, false);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_net);
+
+/**
+ * unregister_netdevice_notifier_net - unregister a per-netns
+ * network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_lock();
+ err = __unregister_netdevice_notifier_net(net, nb);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+
+int register_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_lock();
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
+ if (!err) {
+ nn->nb = nb;
+ list_add(&nn->list, &dev->net_notifier_list);
+ }
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
+
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_lock();
+ list_del(&nn->list);
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
+
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
+ struct net *net)
+{
+ struct netdev_net_notifier *nn;
+
+ list_for_each_entry(nn, &dev->net_notifier_list, list) {
+ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
+ __register_netdevice_notifier_net(net, nn->nb, true);
+ }
+}
/**
* call_netdevice_notifiers_info - call all network notifier blocks
@@ -1740,8 +2039,31 @@
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info)
{
+ struct net *net = dev_net(info->dev);
+ int ret;
+
ASSERT_RTNL();
+
+ /* Run per-netns notifier block chain first, then run the global one.
+ * Hopefully, one day, the global one is going to be removed after
+ * all notifier block registrators get converted to be per-netns.
+ */
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
+ if (ret & NOTIFY_STOP_MASK)
+ return ret;
return raw_notifier_call_chain(&netdev_chain, val, info);
+}
+
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ .extack = extack,
+ };
+
+ return call_netdevice_notifiers_info(val, &info);
}
/**
@@ -1755,11 +2077,7 @@
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- struct netdev_notifier_info info = {
- .dev = dev,
- };
-
- return call_netdevice_notifiers_info(val, &info);
+ return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
@@ -1987,6 +2305,17 @@
return false;
}
+/**
+ * dev_nit_active - return true if any network interface taps are in use
+ *
+ * @dev: network device to check for the presence of taps
+ */
+bool dev_nit_active(struct net_device *dev)
+{
+ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+}
+EXPORT_SYMBOL_GPL(dev_nit_active);
+
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
@@ -2002,6 +2331,9 @@
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->ignore_outgoing)
+ continue;
+
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
@@ -2302,6 +2634,8 @@
struct xps_map *map, *new_map;
bool active = false;
unsigned int nr_ids;
+
+ WARN_ON_ONCE(index >= dev->num_tx_queues);
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
@@ -2744,7 +3078,7 @@
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
- if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ if (!netif_xmit_stopped(txq)) {
struct Qdisc *q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
@@ -2792,8 +3126,10 @@
{
if (in_irq() || irqs_disabled())
__dev_kfree_skb_irq(skb, reason);
+ else if (unlikely(reason == SKB_REASON_DROPPED))
+ kfree_skb(skb);
else
- dev_kfree_skb(skb);
+ consume_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
@@ -2881,12 +3217,10 @@
else
name = netdev_name(dev);
}
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
- "gso_type=%d ip_summed=%d\n",
+ skb_dump(KERN_WARNING, skb, false);
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
- skb_shinfo(skb)->gso_type, skb->ip_summed);
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
@@ -2916,18 +3250,19 @@
}
offset = skb_checksum_start_offset(skb);
- BUG_ON(offset >= skb_headlen(skb));
+ ret = -EINVAL;
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
+ goto out;
+
csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
- BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb)))
+ goto out;
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__sum16))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
- }
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
+ if (ret)
+ goto out;
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
@@ -2962,12 +3297,11 @@
ret = -EINVAL;
goto out;
}
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__le32))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
- }
+
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
+ if (ret)
+ goto out;
+
crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
skb->len - start, ~(__u32)0,
crc32c_csum_stub));
@@ -2993,7 +3327,7 @@
type = eth->h_proto;
}
- return __vlan_get_protocol(skb, type, depth);
+ return vlan_get_protocol_and_depth(skb, type, depth);
}
/**
@@ -3052,7 +3386,7 @@
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -3081,7 +3415,7 @@
features &= ~NETIF_F_GSO_PARTIAL;
}
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
@@ -3092,7 +3426,7 @@
segs = skb_mac_gso_segment(skb, features);
- if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
skb_warn_bad_offload(skb);
return segs;
@@ -3101,10 +3435,11 @@
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
-void netdev_rx_csum_fault(struct net_device *dev)
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ skb_dump(KERN_ERR, skb, true);
dump_stack();
}
}
@@ -3154,10 +3489,9 @@
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
- int tmp;
__be16 type;
- type = skb_network_protocol(skb, &tmp);
+ type = skb_network_protocol(skb, NULL);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
@@ -3254,10 +3588,11 @@
unsigned int len;
int rc;
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
+ PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
@@ -3274,7 +3609,7 @@
while (skb) {
struct sk_buff *next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
@@ -3305,7 +3640,7 @@
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
{
- if (unlikely(skb->csum_not_inet))
+ if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
@@ -3374,7 +3709,7 @@
for (; skb != NULL; skb = next) {
next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
@@ -3405,7 +3740,7 @@
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
- if (shinfo->gso_size) {
+ if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
u16 gso_segs = shinfo->gso_segs;
@@ -3449,13 +3784,9 @@
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- } else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ if (likely(!netif_xmit_frozen_or_stopped(txq)))
qdisc_run(q);
- }
if (unlikely(to_free))
kfree_skb_list(to_free);
@@ -3551,7 +3882,8 @@
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
WARN_ON(!skb_dst(skb));
skb_dst_force(skb);
netif_rx_ni(skb);
@@ -3570,6 +3902,7 @@
return skb;
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
+ qdisc_skb_cb(skb)->mru = 0;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
@@ -3670,23 +4003,21 @@
}
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
}
EXPORT_SYMBOL(dev_pick_tx_cpu_id);
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
@@ -3710,10 +4041,11 @@
return queue_index;
}
+EXPORT_SYMBOL(netdev_pick_tx);
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
- struct sk_buff *skb,
- struct net_device *sb_dev)
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ struct net_device *sb_dev)
{
int queue_index = 0;
@@ -3728,10 +4060,9 @@
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
- __netdev_pick_tx);
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
else
- queue_index = __netdev_pick_tx(dev, skb, sb_dev);
+ queue_index = netdev_pick_tx(dev, skb, sb_dev);
queue_index = netdev_cap_txqueue(dev, queue_index);
}
@@ -3775,6 +4106,7 @@
bool again = false;
skb_reset_mac_header(skb);
+ skb_assert_len(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
@@ -3805,7 +4137,7 @@
else
skb_dst_force(skb);
- txq = netdev_pick_tx(dev, skb, sb_dev);
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
@@ -3840,6 +4172,7 @@
if (!skb)
goto out;
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
@@ -3887,7 +4220,7 @@
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
@@ -3905,6 +4238,7 @@
skb_set_queue_mapping(skb, queue_id);
txq = skb_get_tx_queue(dev, skb);
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
local_bh_disable();
@@ -3916,17 +4250,13 @@
dev_xmit_recursion_dec();
local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
return ret;
drop:
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
-EXPORT_SYMBOL(dev_direct_xmit);
+EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -3944,6 +4274,8 @@
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
+int gro_normal_batch __read_mostly = 8;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -3961,9 +4293,9 @@
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
@@ -4058,8 +4390,10 @@
u32 next_cpu;
u32 ident;
- /* First check into global flow table if there is a match */
- ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+ /* First check into global flow table if there is a match.
+ * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
+ */
+ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
if ((ident ^ hash) & ~rps_cpu_mask)
goto try_rps;
@@ -4194,7 +4528,7 @@
struct softnet_data *sd;
unsigned int old_flow, new_flow;
- if (qlen < (netdev_max_backlog >> 1))
+ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4242,7 +4576,7 @@
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4312,7 +4646,7 @@
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_is_tc_redirected(skb))
+ if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
@@ -4344,6 +4678,11 @@
xdp->data_meta = xdp->data;
xdp->data_end = xdp->data + hlen;
xdp->data_hard_start = skb->data - skb_headroom(skb);
+
+ /* SKB "head" area always have tailroom for skb_shared_info */
+ xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
+ xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
@@ -4367,14 +4706,11 @@
skb_reset_network_header(skb);
}
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
- * pckt.
- */
- off = orig_data_end - xdp->data_end;
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
- skb->len -= off;
-
+ skb->len += off; /* positive on grow, negative on shrink */
}
/* check if XDP changed eth hdr such SKB needs update */
@@ -4397,10 +4733,10 @@
break;
default:
bpf_warn_invalid_xdp_action(act);
- /* fall through */
+ fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, xdp_prog, act);
- /* fall through */
+ fallthrough;
case XDP_DROP:
do_drop:
kfree_skb(skb);
@@ -4420,7 +4756,7 @@
bool free_skb = true;
int cpu, rc;
- txq = netdev_pick_tx(dev, skb, NULL);
+ txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
@@ -4434,7 +4770,6 @@
kfree_skb(skb);
}
}
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
@@ -4472,12 +4807,12 @@
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
@@ -4520,9 +4855,14 @@
int netif_rx(struct sk_buff *skb)
{
+ int ret;
+
trace_netif_rx_entry(skb);
- return netif_rx_internal(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(netif_rx);
@@ -4537,10 +4877,26 @@
if (local_softirq_pending())
do_softirq();
preempt_enable();
+ trace_netif_rx_ni_exit(err);
return err;
}
EXPORT_SYMBOL(netif_rx_ni);
+
+int netif_rx_any_context(struct sk_buff *skb)
+{
+ /*
+ * If invoked from contexts which do not invoke bottom half
+ * processing either at return from interrupt or when softrqs are
+ * reenabled, use netif_rx_ni() which invokes bottomhalf processing
+ * directly.
+ */
+ if (in_interrupt())
+ return netif_rx(skb);
+ else
+ return netif_rx_ni(skb);
+}
+EXPORT_SYMBOL(netif_rx_any_context);
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
@@ -4583,25 +4939,43 @@
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
+ rcu_read_lock();
+
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
- if (!(q->flags & TCQ_F_NOLOCK)) {
- root_lock = qdisc_lock(q);
- spin_lock(root_lock);
- }
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ /* There is a synchronize_net() between
+ * STATE_DEACTIVATED flag being set and
+ * qdisc_reset()/some_qdisc_is_busy() in
+ * dev_deactivate(), so we can safely bail out
+ * early here to avoid data race between
+ * qdisc_deactivate() and some_qdisc_is_busy()
+ * for lockless qdisc.
+ */
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ continue;
+ }
+
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
}
+
+ rcu_read_unlock();
}
xfrm_dev_backlog(sd);
@@ -4616,7 +4990,7 @@
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
+ struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4636,10 +5010,12 @@
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_skb_cb(skb)->mru = 0;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
+ &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -4659,11 +5035,13 @@
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
- skb_do_redirect(skb);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
return NULL;
- case TC_ACT_REINSERT:
- /* this does not scrub the packet, and updates stats on error */
- skb_tc_reinsert(skb, &cl_res);
+ case TC_ACT_CONSUMED:
return NULL;
default:
break;
@@ -4763,7 +5141,6 @@
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
-#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
@@ -4777,7 +5154,6 @@
rcu_read_unlock();
return ingress_retval;
}
-#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
}
@@ -4792,7 +5168,7 @@
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
+ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
@@ -4852,7 +5228,12 @@
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ bool another = false;
+
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
if (!skb)
goto out;
@@ -4860,7 +5241,7 @@
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -4897,14 +5278,42 @@
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
- if (skb_vlan_tag_get_id(skb))
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
+check_vlan_id:
+ if (skb_vlan_tag_get_id(skb)) {
+ /* Vlan id is non 0 and vlan_do_receive() above couldn't
+ * find vlan device.
+ */
skb->pkt_type = PACKET_OTHERHOST;
+ } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ /* Outer header is 802.1P with vlan 0, inner header is
+ * 802.1Q or 802.1AD and vlan_do_receive() above could
+ * not find vlan dev for vlan id 0.
+ */
+ __vlan_hwaccel_clear_tag(skb);
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto out;
+ if (vlan_do_receive(&skb))
+ /* After stripping off 802.1P header with vlan 0
+ * vlan dev is found for inner header.
+ */
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ else
+ /* We have stripped outer 802.1P vlan 0 header.
+ * But could not find vlan dev.
+ * check again for vlan id to set OTHERHOST.
+ */
+ goto check_vlan_id;
+ }
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
@@ -4960,7 +5369,8 @@
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
+ skb->dev, pt_prev, orig_dev);
return ret;
}
@@ -4970,7 +5380,7 @@
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
- * Caller must also take care of handling if (page_is_)pfmemalloc.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -5002,7 +5412,8 @@
if (list_empty(head))
return;
if (pt_prev->list_func != NULL)
- pt_prev->list_func(head, pt_prev, orig_dev);
+ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
+ ip_list_rcv, head, pt_prev, orig_dev);
else
list_for_each_entry_safe(skb, next, head, list) {
skb_list_del_init(skb);
@@ -5113,6 +5524,25 @@
struct bpf_prog *new = xdp->prog;
int ret = 0;
+ if (new) {
+ u32 i;
+
+ mutex_lock(&new->aux->used_maps_mutex);
+
+ /* generic XDP does not work with DEVMAPs that can
+ * have a bpf_prog installed on an entry
+ */
+ for (i = 0; i < new->aux->used_map_cnt; i++) {
+ if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
+ cpu_map_prog_allowed(new->aux->used_maps[i])) {
+ mutex_unlock(&new->aux->used_maps_mutex);
+ return -EINVAL;
+ }
+ }
+
+ mutex_unlock(&new->aux->used_maps_mutex);
+ }
+
switch (xdp->command) {
case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);
@@ -5128,10 +5558,6 @@
}
break;
- case XDP_QUERY_PROG:
- xdp->prog_id = old ? old->aux->id : 0;
- break;
-
default:
ret = -EINVAL;
break;
@@ -5144,14 +5570,14 @@
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -5174,7 +5600,7 @@
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
@@ -5183,7 +5609,7 @@
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
list_for_each_entry_safe(skb, next, head, list) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -5217,9 +5643,14 @@
*/
int netif_receive_skb(struct sk_buff *skb)
{
+ int ret;
+
trace_netif_receive_skb_entry(skb);
- return netif_receive_skb_internal(skb);
+ ret = netif_receive_skb_internal(skb);
+ trace_netif_receive_skb_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -5239,13 +5670,16 @@
if (list_empty(head))
return;
- list_for_each_entry(skb, head, list)
- trace_netif_receive_skb_list_entry(skb);
+ if (trace_netif_receive_skb_list_entry_enabled()) {
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ }
netif_receive_skb_list_internal(head);
+ trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
-DEFINE_PER_CPU(struct work_struct, flush_works);
+static DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
@@ -5278,23 +5712,89 @@
local_bh_enable();
}
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
+
+ local_irq_disable();
+ rps_lock(sd);
+
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
+ */
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ rps_unlock(sd);
+ local_irq_enable();
+
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
+ */
+ return true;
+}
+
static void flush_all_backlogs(void)
{
+ static cpumask_t flush_cpus;
unsigned int cpu;
+
+ /* since we are under rtnl lock protection we can use static data
+ * for the cpumask and avoid allocating on stack the possibly
+ * large mask
+ */
+ ASSERT_RTNL();
get_online_cpus();
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
+ cpumask_clear(&flush_cpus);
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+ cpumask_set_cpu(cpu, &flush_cpus);
+ }
+ }
- for_each_online_cpu(cpu)
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in unregister_netdevice_many() will take care of
+ * them
+ */
+ for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
put_online_cpus();
}
-static int napi_gro_complete(struct sk_buff *skb)
+/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
+static void gro_normal_list(struct napi_struct *napi)
+{
+ if (!napi->rx_count)
+ return;
+ netif_receive_skb_list_internal(&napi->rx_list);
+ INIT_LIST_HEAD(&napi->rx_list);
+ napi->rx_count = 0;
+}
+
+/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
+ * pass the whole batch up to the stack.
+ */
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
+{
+ list_add_tail(&skb->list, &napi->rx_list);
+ napi->rx_count += segs;
+ if (napi->rx_count >= gro_normal_batch)
+ gro_normal_list(napi);
+}
+
+INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
+INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
+static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
@@ -5313,7 +5813,9 @@
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
- err = ptype->callbacks.gro_complete(skb, 0);
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
break;
}
rcu_read_unlock();
@@ -5325,7 +5827,8 @@
}
out:
- return netif_receive_skb_internal(skb);
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
+ return NET_RX_SUCCESS;
}
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
@@ -5337,9 +5840,8 @@
list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
- list_del(&skb->list);
- skb->next = NULL;
- napi_gro_complete(skb);
+ skb_list_del_init(skb);
+ napi_gro_complete(napi, skb);
napi->gro_hash[index].count--;
}
@@ -5353,11 +5855,13 @@
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
- u32 i;
+ unsigned long bitmask = napi->gro_bitmask;
+ unsigned int i, base = ~0U;
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- if (test_bit(i, &napi->gro_bitmask))
- __napi_gro_flush_chain(napi, i, flush_old);
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __napi_gro_flush_chain(napi, base, flush_old);
}
}
EXPORT_SYMBOL(napi_gro_flush);
@@ -5382,7 +5886,9 @@
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
+ if (skb_vlan_tag_present(p))
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
@@ -5392,13 +5898,26 @@
diffs = memcmp(skb_mac_header(p),
skb_mac_header(skb),
maclen);
+
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (!diffs) {
+ struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+ }
+#endif
+
NAPI_GRO_CB(p)->same_flow = !diffs;
}
return head;
}
-static void skb_gro_reset_offset(struct sk_buff *skb)
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
@@ -5407,10 +5926,9 @@
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
- pinfo->nr_frags &&
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0)) &&
- (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
@@ -5429,7 +5947,7 @@
skb->data_len -= grow;
skb->tail += grow;
- pinfo->frags[0].page_offset += grow;
+ skb_frag_off_add(&pinfo->frags[0], grow);
skb_frag_size_sub(&pinfo->frags[0], grow);
if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
@@ -5439,7 +5957,7 @@
}
}
-static void gro_flush_oldest(struct list_head *head)
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
{
struct sk_buff *oldest;
@@ -5454,11 +5972,14 @@
/* Do not adjust napi->gro_hash[].count, caller is adding a new
* SKB to the chain.
*/
- list_del(&oldest->list);
- oldest->next = NULL;
- napi_gro_complete(oldest);
+ skb_list_del_init(oldest);
+ napi_gro_complete(napi, oldest);
}
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
+ struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
+ struct sk_buff *));
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@ -5508,7 +6029,9 @@
NAPI_GRO_CB(skb)->csum_valid = 0;
}
- pp = ptype->callbacks.gro_receive(gro_head, skb);
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ gro_head, skb);
break;
}
rcu_read_unlock();
@@ -5516,7 +6039,7 @@
if (&ptype->list == head)
goto normal;
- if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+ if (PTR_ERR(pp) == -EINPROGRESS) {
ret = GRO_CONSUMED;
goto ok;
}
@@ -5525,9 +6048,8 @@
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
- list_del(&pp->list);
- pp->next = NULL;
- napi_gro_complete(pp);
+ skb_list_del_init(pp);
+ napi_gro_complete(napi, pp);
napi->gro_hash[hash].count--;
}
@@ -5538,7 +6060,7 @@
goto normal;
if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(gro_head);
+ gro_flush_oldest(napi, gro_head);
} else {
napi->gro_hash[hash].count++;
}
@@ -5598,17 +6120,19 @@
static void napi_skb_free_stolen_head(struct sk_buff *skb)
{
+ nf_reset_ct(skb);
skb_dst_drop(skb);
- secpath_reset(skb);
+ skb_ext_put(skb);
kmem_cache_free(skbuff_head_cache, skb);
}
-static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb_internal(skb))
- ret = GRO_DROP;
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -5633,12 +6157,17 @@
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ gro_result_t ret;
+
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, 0);
- return napi_skb_finish(dev_gro_receive(napi, skb), skb);
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(napi_gro_receive);
@@ -5651,7 +6180,7 @@
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb->dev = napi->dev;
skb->skb_iif = 0;
@@ -5661,7 +6190,8 @@
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- secpath_reset(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
napi->skb = skb;
}
@@ -5690,8 +6220,8 @@
case GRO_HELD:
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
- if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
- ret = GRO_DROP;
+ if (ret == GRO_NORMAL)
+ gro_normal_one(napi, skb, 1);
break;
case GRO_DROP:
@@ -5726,7 +6256,7 @@
napi->skb = NULL;
skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, hlen);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
@@ -5756,6 +6286,7 @@
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
+ gro_result_t ret;
struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
@@ -5763,7 +6294,10 @@
trace_napi_gro_frags_entry(skb);
- return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(napi_gro_frags);
@@ -5779,10 +6313,11 @@
/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
+ netdev_rx_csum_fault(skb->dev, skb);
}
NAPI_GRO_CB(skb)->csum = wsum;
@@ -5849,7 +6384,7 @@
net_rps_action_and_irq_enable(sd);
}
- napi->weight = dev_rx_weight;
+ napi->weight = READ_ONCE(dev_rx_weight);
while (again) {
struct sk_buff *skb;
@@ -5859,7 +6394,8 @@
rcu_read_unlock();
input_queue_head_incr(sd);
if (++work >= quota)
- goto state_changed;
+ return work;
+
}
local_irq_disable();
@@ -5882,10 +6418,6 @@
rps_unlock(sd);
local_irq_enable();
}
-
-state_changed:
- napi_gro_flush(napi, false);
- sd->current_napi = NULL;
return work;
}
@@ -5912,7 +6444,7 @@
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
@@ -5961,7 +6493,8 @@
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags, val, new;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -5973,28 +6506,31 @@
NAPIF_STATE_IN_BUSY_POLL)))
return false;
+ if (work_done) {
+ if (n->gro_bitmask)
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ if (timeout)
+ ret = false;
+ }
if (n->gro_bitmask) {
- unsigned long timeout = 0;
-
- if (work_done)
- timeout = n->dev->gro_flush_timeout;
-
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
- if (timeout)
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
- HRTIMER_MODE_REL_PINNED);
}
- if (unlikely(!list_empty(&n->poll_list))) {
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+ gro_normal_list(n);
+
+ if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
list_del_init(&n->poll_list);
- sd->current_napi = NULL;
local_irq_restore(flags);
}
@@ -6018,7 +6554,10 @@
return false;
}
- return true;
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -6061,10 +6600,19 @@
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ /* We can't gro_normal_list() here, because napi->poll() might have
+ * rearmed the napi (napi_complete_done()) in which case it could
+ * already be running on another CPU.
+ */
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET)
+ if (rc == BUSY_POLL_BUDGET) {
+ /* As the whole budget was spent, we still own the napi so can
+ * safely handle the rx_list.
+ */
+ gro_normal_list(napi);
__napi_schedule(napi);
+ }
local_bh_enable();
}
@@ -6109,6 +6657,7 @@
}
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ gro_normal_list(napi);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
@@ -6142,8 +6691,7 @@
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
@@ -6164,20 +6712,14 @@
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
-
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6188,7 +6730,7 @@
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
+ if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
@@ -6209,15 +6751,21 @@
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
+ INIT_LIST_HEAD(&napi->rx_list);
+ napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
- pr_err_once("netif_napi_add() called with weight %d on device %s\n",
- weight, dev->name);
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
+ weight);
napi->weight = weight;
napi->dev = dev;
#ifdef CONFIG_NETPOLL
@@ -6260,26 +6808,19 @@
}
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ napi_hash_del(napi);
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
}
-EXPORT_SYMBOL(netif_napi_del);
-
-struct napi_struct *get_current_napi_context(void)
-{
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
-
- return sd->current_napi;
-}
-EXPORT_SYMBOL(get_current_napi_context);
+EXPORT_SYMBOL(__netif_napi_del);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
@@ -6300,14 +6841,13 @@
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
-
- sd->current_napi = n;
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
}
- WARN_ON_ONCE(work > weight);
+ if (unlikely(work > weight))
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
if (likely(work < weight))
goto out_unlock;
@@ -6328,6 +6868,8 @@
*/
napi_gro_flush(n, HZ >= 1000);
}
+
+ gro_normal_list(n);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -6350,8 +6892,8 @@
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(netdev_budget_usecs);
- int budget = netdev_budget;
+ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
+ int budget = READ_ONCE(netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
@@ -6401,6 +6943,9 @@
/* upper master flag, there can only be one master device per list */
bool master;
+ /* lookup ignore flag */
+ bool ignore;
+
/* counter for the number of times this device was added to us */
u16 ref_nr;
@@ -6423,9 +6968,10 @@
return NULL;
}
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
- struct net_device *dev = data;
+ struct net_device *dev = (struct net_device *)priv->data;
return upper_dev == dev;
}
@@ -6442,10 +6988,14 @@
bool netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
ASSERT_RTNL();
- return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
- upper_dev);
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
@@ -6462,8 +7012,12 @@
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev)
{
- return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
- upper_dev);
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
@@ -6505,6 +7059,22 @@
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master) && !upper->ignore)
+ return upper->dev;
+ return NULL;
+}
/**
* netdev_has_any_lower_dev - Check if device is linked to some device
@@ -6556,8 +7126,9 @@
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-static struct net_device *netdev_next_upper_dev(struct net_device *dev,
- struct list_head **iter)
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
{
struct netdev_adjacent *upper;
@@ -6567,6 +7138,7 @@
return NULL;
*iter = &upper->list;
+ *ignore = upper->ignore;
return upper->dev;
}
@@ -6588,30 +7160,33 @@
return upper->dev;
}
-static int netdev_walk_all_upper_dev(struct net_device *dev,
- int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
+ bool ignore;
now = dev;
iter = &dev->adj_list.upper;
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
- udev = netdev_next_upper_dev(now, &iter);
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
if (!udev)
break;
+ if (ignore)
+ continue;
next = udev;
niter = &udev->adj_list.upper;
@@ -6636,8 +7211,8 @@
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -6648,7 +7223,7 @@
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -6680,6 +7255,20 @@
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
+
+static bool __netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = (void *)upper_dev,
+ };
+
+ ASSERT_RTNL();
+
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
+ &priv);
+}
/**
* netdev_lower_get_next_private - Get the next ->private from the
@@ -6777,10 +7366,27 @@
return lower->dev;
}
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+ *ignore = lower->ignore;
+
+ return lower->dev;
+}
+
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -6791,7 +7397,7 @@
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -6824,8 +7430,57 @@
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
+ if (!ldev)
+ break;
+ if (ignore)
+ continue;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
@@ -6837,17 +7492,21 @@
return lower->dev;
}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
static u8 __netdev_upper_depth(struct net_device *dev)
{
struct net_device *udev;
struct list_head *iter;
u8 max_depth = 0;
+ bool ignore;
for (iter = &dev->adj_list.upper,
- udev = netdev_next_upper_dev(dev, &iter);
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
udev;
- udev = netdev_next_upper_dev(dev, &iter)) {
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
if (max_depth < udev->upper_level)
max_depth = udev->upper_level;
}
@@ -6860,11 +7519,14 @@
struct net_device *ldev;
struct list_head *iter;
u8 max_depth = 0;
+ bool ignore;
for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev(dev, &iter);
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
ldev;
- ldev = netdev_next_lower_dev(dev, &iter)) {
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
if (max_depth < ldev->lower_level)
max_depth = ldev->lower_level;
}
@@ -6872,22 +7534,34 @@
return max_depth;
}
-static int __netdev_update_upper_level(struct net_device *dev, void *data)
+static int __netdev_update_upper_level(struct net_device *dev,
+ struct netdev_nested_priv *__unused)
{
dev->upper_level = __netdev_upper_depth(dev) + 1;
return 0;
}
-static int __netdev_update_lower_level(struct net_device *dev, void *data)
+static int __netdev_update_lower_level(struct net_device *dev,
+ struct netdev_nested_priv *priv)
{
dev->lower_level = __netdev_lower_depth(dev) + 1;
+
+#ifdef CONFIG_LOCKDEP
+ if (!priv)
+ return 0;
+
+ if (priv->flags & NESTED_SYNC_IMM)
+ dev->nested_level = dev->lower_level - 1;
+ if (priv->flags & NESTED_SYNC_TODO)
+ net_unlink_todo(dev);
+#endif
return 0;
}
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -6898,7 +7572,7 @@
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -7028,6 +7702,7 @@
adj->master = master;
adj->ref_nr = 1;
adj->private = private;
+ adj->ignore = false;
dev_hold(adj_dev);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
@@ -7157,6 +7832,7 @@
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
+ struct netdev_nested_priv *priv,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_changeupper_info changeupper_info = {
@@ -7178,17 +7854,17 @@
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (netdev_has_upper_dev(upper_dev, dev))
+ if (__netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
return -EMLINK;
if (!master) {
- if (netdev_has_upper_dev(dev, upper_dev))
+ if (__netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
} else {
- master_dev = netdev_master_upper_dev_get(dev);
+ master_dev = __netdev_master_upper_dev_get(dev);
if (master_dev)
return master_dev == upper_dev ? -EEXIST : -EBUSY;
}
@@ -7211,10 +7887,11 @@
goto rollback;
__netdev_update_upper_level(dev, NULL);
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
- __netdev_update_lower_level(upper_dev, NULL);
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
return 0;
@@ -7239,8 +7916,13 @@
struct net_device *upper_dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, false,
- NULL, NULL, extack);
+ NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -7263,21 +7945,19 @@
void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, true,
- upper_priv, upper_info, extack);
+ upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
-/**
- * netdev_upper_dev_unlink - Removes a link to upper device
- * @dev: device
- * @upper_dev: new upper device
- *
- * Removes a link to device which is upper to this one. The caller must hold
- * the RTNL lock.
- */
-void netdev_upper_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
+static void __netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
struct netdev_notifier_changeupper_info changeupper_info = {
.info = {
@@ -7300,12 +7980,126 @@
&changeupper_info.info);
__netdev_update_upper_level(dev, NULL);
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
- __netdev_update_lower_level(upper_dev, NULL);
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
+}
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
+ struct net_device *lower_dev,
+ bool val)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
+ if (adj)
+ adj->ignore = val;
+
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
+ if (adj)
+ adj->ignore = val;
+}
+
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
+}
+
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
+}
+
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+ int err;
+
+ if (!new_dev)
+ return 0;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_disable(dev, old_dev);
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
+ extack);
+ if (err) {
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
+
+void netdev_adjacent_change_commit(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ if (!new_dev || !old_dev)
+ return;
+
+ if (new_dev == old_dev)
+ return;
+
+ netdev_adjacent_dev_enable(dev, old_dev);
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
+
+void netdev_adjacent_change_abort(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+
+ if (!new_dev)
+ return;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+
+ __netdev_upper_dev_unlink(new_dev, dev, &priv);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
/**
* netdev_bonding_info_change - Dispatch event about slave change
@@ -7328,6 +8122,29 @@
&info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
+
+/**
+ * netdev_get_xmit_slave - Get the xmit slave of master device
+ * @dev: device
+ * @skb: The packet
+ * @all_slaves: assume all the slaves are active
+ *
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ * %NULL is returned if no slave is found.
+ */
+
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
+ struct sk_buff *skb,
+ bool all_slaves)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_xmit_slave)
+ return NULL;
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
+}
+EXPORT_SYMBOL(netdev_get_xmit_slave);
static void netdev_adjacent_add_links(struct net_device *dev)
{
@@ -7419,25 +8236,6 @@
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);
-
-int dev_get_nest_level(struct net_device *dev)
-{
- struct net_device *lower = NULL;
- struct list_head *iter;
- int max_nest = -1;
- int nest;
-
- ASSERT_RTNL();
-
- netdev_for_each_lower_dev(dev, lower, iter) {
- nest = dev_get_nest_level(lower);
- if (max_nest < nest)
- max_nest = nest;
- }
-
- return max_nest + 1;
-}
-EXPORT_SYMBOL(dev_get_nest_level);
/**
* netdev_lower_change - Dispatch event about lower device state change
@@ -7665,7 +8463,8 @@
}
EXPORT_SYMBOL(dev_get_flags);
-int __dev_change_flags(struct net_device *dev, unsigned int flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
unsigned int old_flags = dev->flags;
int ret;
@@ -7702,7 +8501,7 @@
if (old_flags & IFF_UP)
__dev_close(dev);
else
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
}
if ((flags ^ dev->gflags) & IFF_PROMISC) {
@@ -7762,16 +8561,18 @@
* dev_change_flags - change device settings
* @dev: device
* @flags: device state flags
+ * @extack: netlink extended ack
*
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
-int dev_change_flags(struct net_device *dev, unsigned int flags)
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
- ret = __dev_change_flags(dev, flags);
+ ret = __dev_change_flags(dev, flags, extack);
if (ret < 0)
return ret;
@@ -7914,13 +8715,36 @@
EXPORT_SYMBOL(dev_set_group);
/**
+ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ */
+int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_pre_changeaddr_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .dev_addr = addr,
+ };
+ int rc;
+
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
+ return notifier_to_errno(rc);
+}
+EXPORT_SYMBOL(dev_pre_changeaddr_notify);
+
+/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
+ * @extack: netlink extended ack
*
* Change the hardware (MAC) address of the device
*/
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
@@ -7931,6 +8755,9 @@
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
+ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ if (err)
+ return err;
err = ops->ndo_set_mac_address(dev, sa);
if (err)
return err;
@@ -7940,6 +8767,48 @@
return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
+
+static DECLARE_RWSEM(dev_addr_sem);
+
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ ret = dev_set_mac_address(dev, sa, extack);
+ up_write(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_get_mac_address);
/**
* dev_change_carrier - Change device carrier
@@ -7990,12 +8859,80 @@
char *name, size_t len)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
- if (!ops->ndo_get_phys_port_name)
- return -EOPNOTSUPP;
- return ops->ndo_get_phys_port_name(dev, name, len);
+ if (ops->ndo_get_phys_port_name) {
+ err = ops->ndo_get_phys_port_name(dev, name, len);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+ return devlink_compat_phys_port_name_get(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);
+
+/**
+ * dev_get_port_parent_id - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
+ *
+ * Get the devices's port parent identifier
+ */
+int dev_get_port_parent_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid,
+ bool recurse)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct netdev_phys_item_id first = { };
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err;
+
+ if (ops->ndo_get_port_parent_id) {
+ err = ops->ndo_get_port_parent_id(dev, ppid);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+
+ err = devlink_compat_switch_id_get(dev, ppid);
+ if (!err || err != -EOPNOTSUPP)
+ return err;
+
+ if (!recurse)
+ return -EOPNOTSUPP;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = dev_get_port_parent_id(lower_dev, ppid, recurse);
+ if (err)
+ break;
+ if (!first.id_len)
+ first = *ppid;
+ else if (memcmp(&first, ppid, sizeof(*ppid)))
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(dev_get_port_parent_id);
+
+/**
+ * netdev_port_same_parent_id - Indicate if two network devices have
+ * the same port parent identifier
+ * @a: first network device
+ * @b: second network device
+ */
+bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
+{
+ struct netdev_phys_item_id a_id = { };
+ struct netdev_phys_item_id b_id = { };
+
+ if (dev_get_port_parent_id(a, &a_id, true) ||
+ dev_get_port_parent_id(b, &b_id, true))
+ return false;
+
+ return netdev_phys_item_id_same(&a_id, &b_id);
+}
+EXPORT_SYMBOL(netdev_port_same_parent_id);
/**
* dev_change_proto_down - update protocol port state information
@@ -8017,67 +8954,495 @@
}
EXPORT_SYMBOL(dev_change_proto_down);
-u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- enum bpf_netdev_command cmd)
+/**
+ * dev_change_proto_down_generic - generic implementation for
+ * ndo_change_proto_down that sets carrier according to
+ * proto_down.
+ *
+ * @dev: device
+ * @proto_down: new value
+ */
+int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
{
- struct netdev_bpf xdp;
+ if (proto_down)
+ netif_carrier_off(dev);
+ else
+ netif_carrier_on(dev);
+ dev->proto_down = proto_down;
+ return 0;
+}
+EXPORT_SYMBOL(dev_change_proto_down_generic);
- if (!bpf_op)
- return 0;
+/**
+ * dev_change_proto_down_reason - proto down reason
+ *
+ * @dev: device
+ * @mask: proto down mask
+ * @value: proto down value
+ */
+void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
+ u32 value)
+{
+ int b;
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = cmd;
+ if (!mask) {
+ dev->proto_down_reason = value;
+ } else {
+ for_each_set_bit(b, &mask, 32) {
+ if (value & (1 << b))
+ dev->proto_down_reason |= BIT(b);
+ else
+ dev->proto_down_reason &= ~BIT(b);
+ }
+ }
+}
+EXPORT_SYMBOL(dev_change_proto_down_reason);
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
+struct bpf_xdp_link {
+ struct bpf_link link;
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
+ int flags;
+};
- return xdp.prog_id;
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
+{
+ if (flags & XDP_FLAGS_HW_MODE)
+ return XDP_MODE_HW;
+ if (flags & XDP_FLAGS_DRV_MODE)
+ return XDP_MODE_DRV;
+ if (flags & XDP_FLAGS_SKB_MODE)
+ return XDP_MODE_SKB;
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
}
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
- struct netlink_ext_ack *extack, u32 flags,
- struct bpf_prog *prog)
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ switch (mode) {
+ case XDP_MODE_SKB:
+ return generic_xdp_install;
+ case XDP_MODE_DRV:
+ case XDP_MODE_HW:
+ return dev->netdev_ops->ndo_bpf;
+ default:
+ return NULL;
+ };
+}
+
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ return dev->xdp_state[mode].link;
+}
+
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
+
+ if (link)
+ return link->link.prog;
+ return dev->xdp_state[mode].prog;
+}
+
+static u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
+
+ return prog ? prog->aux->id : 0;
+}
+
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_xdp_link *link)
+{
+ dev->xdp_state[mode].link = link;
+ dev->xdp_state[mode].prog = NULL;
+}
+
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_prog *prog)
+{
+ dev->xdp_state[mode].link = NULL;
+ dev->xdp_state[mode].prog = prog;
+}
+
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
+ u32 flags, struct bpf_prog *prog)
{
struct netdev_bpf xdp;
+ int err;
memset(&xdp, 0, sizeof(xdp));
- if (flags & XDP_FLAGS_HW_MODE)
- xdp.command = XDP_SETUP_PROG_HW;
- else
- xdp.command = XDP_SETUP_PROG;
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
xdp.flags = flags;
xdp.prog = prog;
- return bpf_op(dev, &xdp);
+ /* Drivers assume refcnt is already incremented (i.e, prog pointer is
+ * "moved" into driver), so they don't increment it on their own, but
+ * they do decrement refcnt when program is detached or replaced.
+ * Given net_device also owns link/prog, we need to bump refcnt here
+ * to prevent drivers from underflowing it.
+ */
+ if (prog)
+ bpf_prog_inc(prog);
+ err = bpf_op(dev, &xdp);
+ if (err) {
+ if (prog)
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ if (mode != XDP_MODE_HW)
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
+
+ return 0;
}
static void dev_xdp_uninstall(struct net_device *dev)
{
- struct netdev_bpf xdp;
- bpf_op_t ndo_bpf;
+ struct bpf_xdp_link *link;
+ struct bpf_prog *prog;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
- /* Remove generic XDP */
- WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+ ASSERT_RTNL();
- /* Remove from the driver */
- ndo_bpf = dev->netdev_ops->ndo_bpf;
- if (!ndo_bpf)
- return;
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
+ prog = dev_xdp_prog(dev, mode);
+ if (!prog)
+ continue;
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
- WARN_ON(ndo_bpf(dev, &xdp));
- if (xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op)
+ continue;
- /* Remove HW offload */
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG_HW;
- if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+
+ /* auto-detach link from net device */
+ link = dev_xdp_link(dev, mode);
+ if (link)
+ link->dev = NULL;
+ else
+ bpf_prog_put(prog);
+
+ dev_xdp_set_link(dev, mode, NULL);
+ }
+}
+
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog, u32 flags)
+{
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
+ struct bpf_prog *cur_prog;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err;
+
+ ASSERT_RTNL();
+
+ /* either link or prog attachment, never both */
+ if (link && (new_prog || old_prog))
+ return -EINVAL;
+ /* link supports only XDP mode flags */
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
+ return -EINVAL;
+ }
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
+ return -EINVAL;
+ }
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
+ return -EINVAL;
+ }
+
+ mode = dev_xdp_mode(dev, flags);
+ /* can't replace attached link */
+ if (dev_xdp_link(dev, mode)) {
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
+ return -EBUSY;
+ }
+
+ cur_prog = dev_xdp_prog(dev, mode);
+ /* can't replace attached prog with link */
+ if (link && cur_prog) {
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
+ return -EBUSY;
+ }
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+
+ /* put effective new program into new_prog */
+ if (link)
+ new_prog = link->link.prog;
+
+ if (new_prog) {
+ bool offload = mode == XDP_MODE_HW;
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
+
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
+ NL_SET_ERR_MSG(extack, "XDP program already attached");
+ return -EBUSY;
+ }
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
+ return -EEXIST;
+ }
+ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
+ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ }
+
+ /* don't call drivers if the effective program didn't change */
+ if (new_prog != cur_prog) {
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op) {
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
+ return -EOPNOTSUPP;
+ }
+
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
+ if (err)
+ return err;
+ }
+
+ if (link)
+ dev_xdp_set_link(dev, mode, link);
+ else
+ dev_xdp_set_prog(dev, mode, new_prog);
+ if (cur_prog)
+ bpf_prog_put(cur_prog);
+
+ return 0;
+}
+
+static int dev_xdp_attach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
+}
+
+static int dev_xdp_detach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ mode = dev_xdp_mode(dev, link->flags);
+ if (dev_xdp_link(dev, mode) != link)
+ return -EINVAL;
+
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+ dev_xdp_set_link(dev, mode, NULL);
+ return 0;
+}
+
+static void bpf_xdp_link_release(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ rtnl_lock();
+
+ /* if racing with net_device's tear down, xdp_link->dev might be
+ * already NULL, in which case link was already auto-detached
+ */
+ if (xdp_link->dev) {
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ xdp_link->dev = NULL;
+ }
+
+ rtnl_unlock();
+}
+
+static int bpf_xdp_link_detach(struct bpf_link *link)
+{
+ bpf_xdp_link_release(link);
+ return 0;
+}
+
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ kfree(xdp_link);
+}
+
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+}
+
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
+
+ info->xdp.ifindex = ifindex;
+ return 0;
+}
+
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err = 0;
+
+ rtnl_lock();
+
+ /* link might have been auto-released already, so fail */
+ if (!xdp_link->dev) {
+ err = -ENOLINK;
+ goto out_unlock;
+ }
+
+ if (old_prog && link->prog != old_prog) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ old_prog = link->prog;
+ if (old_prog->type != new_prog->type ||
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (old_prog == new_prog) {
+ /* no-op, don't disturb drivers */
+ bpf_prog_put(new_prog);
+ goto out_unlock;
+ }
+
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
+ xdp_link->flags, new_prog);
+ if (err)
+ goto out_unlock;
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static const struct bpf_link_ops bpf_xdp_link_lops = {
+ .release = bpf_xdp_link_release,
+ .dealloc = bpf_xdp_link_dealloc,
+ .detach = bpf_xdp_link_detach,
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
+ .fill_link_info = bpf_xdp_link_fill_link_info,
+ .update_prog = bpf_xdp_link_update,
+};
+
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct bpf_xdp_link *link;
+ struct net_device *dev;
+ int err, fd;
+
+ rtnl_lock();
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
+ link->dev = dev;
+ link->flags = attr->link_create.flags;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto unlock;
+ }
+
+ err = dev_xdp_attach_link(dev, NULL, link);
+ rtnl_unlock();
+
+ if (err) {
+ link->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out_put_dev;
+ }
+
+ fd = bpf_link_settle(&link_primer);
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
+ dev_put(dev);
+ return fd;
+
+unlock:
+ rtnl_unlock();
+
+out_put_dev:
+ dev_put(dev);
+ return err;
}
/**
@@ -8085,56 +9450,44 @@
* @dev: device
* @extack: netlink extended ack
* @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
* @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+ int fd, int expected_fd, u32 flags)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- enum bpf_netdev_command query;
- struct bpf_prog *prog = NULL;
- bpf_op_t bpf_op, bpf_chk;
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
int err;
ASSERT_RTNL();
- query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
-
- bpf_op = bpf_chk = ops->ndo_bpf;
- if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
- return -EOPNOTSUPP;
- if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
- bpf_op = generic_xdp_install;
- if (bpf_op == bpf_chk)
- bpf_chk = generic_xdp_install;
-
if (fd >= 0) {
- if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
- __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
- return -EEXIST;
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_query(dev, bpf_op, query))
- return -EBUSY;
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(new_prog))
+ return PTR_ERR(new_prog);
+ }
- prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
- bpf_op == ops->ndo_bpf);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (!(flags & XDP_FLAGS_HW_MODE) &&
- bpf_prog_is_dev_bound(prog->aux)) {
- NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
- bpf_prog_put(prog);
- return -EINVAL;
+ if (expected_fd >= 0) {
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(old_prog)) {
+ err = PTR_ERR(old_prog);
+ old_prog = NULL;
+ goto err_out;
}
}
- err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
- if (err < 0 && prog)
- bpf_prog_put(prog);
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
+err_out:
+ if (err && new_prog)
+ bpf_prog_put(new_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
return err;
}
@@ -8166,103 +9519,6 @@
{
list_add_tail(&dev->todo_list, &net_todo_list);
dev_net(dev)->dev_unreg_count++;
-}
-
-static void rollback_registered_many(struct list_head *head)
-{
- struct net_device *dev, *tmp;
- LIST_HEAD(close_head);
-
- BUG_ON(dev_boot_phase);
- ASSERT_RTNL();
-
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
- /* Some devices call without registering
- * for initialization unwind. Remove those
- * devices and proceed with the remaining.
- */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
- dev->name, dev);
-
- WARN_ON(1);
- list_del(&dev->unreg_list);
- continue;
- }
- dev->dismantle = true;
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
- }
-
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head, true);
-
- list_for_each_entry(dev, head, unreg_list) {
- /* And unlink it from device chain. */
- unlist_netdevice(dev);
-
- dev->reg_state = NETREG_UNREGISTERING;
- }
- flush_all_backlogs();
-
- synchronize_net();
-
- list_for_each_entry(dev, head, unreg_list) {
- struct sk_buff *skb = NULL;
-
- /* Shutdown queueing discipline. */
- dev_shutdown(dev);
-
- dev_xdp_uninstall(dev);
-
- /* Notify protocols, that we are about to destroy
- * this device. They should clean all the things.
- */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
-
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
- GFP_KERNEL, NULL, 0);
-
- /*
- * Flush the unicast and multicast chains
- */
- dev_uc_flush(dev);
- dev_mc_flush(dev);
-
- if (dev->netdev_ops->ndo_uninit)
- dev->netdev_ops->ndo_uninit(dev);
-
- if (skb)
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
-
- /* Notifier chain MUST detach us all upper devices. */
- WARN_ON(netdev_has_any_upper_dev(dev));
- WARN_ON(netdev_has_any_lower_dev(dev));
-
- /* Remove entries from kobject tree */
- netdev_unregister_kobject(dev);
-#ifdef CONFIG_XPS
- /* Remove XPS queueing entries */
- netif_reset_xps_queues_gt(dev, 0);
-#endif
- }
-
- synchronize_net();
-
- list_for_each_entry(dev, head, unreg_list)
- dev_put(dev);
-}
-
-static void rollback_registered(struct net_device *dev)
-{
- LIST_HEAD(single);
-
- list_add(&dev->unreg_list, &single);
- rollback_registered_many(&single);
- list_del(&single);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -8410,7 +9666,7 @@
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -8533,6 +9789,11 @@
netif_dormant_on(dev);
else
netif_dormant_off(dev);
+
+ if (rootdev->operstate == IF_OPER_TESTING)
+ netif_testing_on(dev);
+ else
+ netif_testing_off(dev);
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev);
@@ -8674,11 +9935,20 @@
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ dev->name_node = netdev_name_node_head_alloc(dev);
+ if (!dev->name_node)
goto out;
/* Init, if this function is available */
@@ -8687,7 +9957,7 @@
if (ret) {
if (ret > 0)
ret = -EIO;
- goto out;
+ goto err_free_name;
}
}
@@ -8709,7 +9979,7 @@
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
- dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
if (dev->netdev_ops->ndo_udp_tunnel_add) {
@@ -8787,17 +10057,10 @@
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
- rollback_registered(dev);
- rcu_barrier();
-
- dev->reg_state = NETREG_UNREGISTERED;
- /* We should put the kobject that hold in
- * netdev_unregister_kobject(), otherwise
- * the net device cannot be freed when
- * driver calls free_netdev(), because the
- * kobject is being hold.
- */
- kobject_put(&dev->dev.kobj);
+ /* Expect explicit free_netdev() on failure */
+ dev->needs_free_netdev = false;
+ unregister_netdevice_queue(dev, NULL);
+ goto out;
}
/*
* Prevent userspace races by waiting until the network
@@ -8815,6 +10078,8 @@
dev->netdev_ops->ndo_uninit(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
+err_free_name:
+ netdev_name_node_free(dev->name_node);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
@@ -8898,6 +10163,8 @@
}
EXPORT_SYMBOL(netdev_refcnt_read);
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
/**
* netdev_wait_allrefs - wait until all references are gone.
* @dev: target net_device
@@ -8913,7 +10180,7 @@
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
+ int wait = 0, refcnt;
linkwatch_forget_dev(dev);
@@ -8947,7 +10214,13 @@
rebroadcast_time = jiffies;
}
- msleep(250);
+ if (!wait) {
+ rcu_barrier();
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
refcnt = netdev_refcnt_read(dev);
@@ -8986,6 +10259,19 @@
void netdev_run_todo(void)
{
struct list_head list;
+#ifdef CONFIG_LOCKDEP
+ struct list_head unlink_list;
+
+ list_replace_init(&net_unlink_list, &unlink_list);
+
+ while (!list_empty(&unlink_list)) {
+ struct net_device *dev = list_first_entry(&unlink_list,
+ struct net_device,
+ unlink_list);
+ list_del_init(&dev->unlink_list);
+ dev->nested_level = dev->lower_level - 1;
+ }
+#endif
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
@@ -9019,9 +10305,7 @@
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
-#if IS_ENABLED(CONFIG_DECNET)
- WARN_ON(dev->dn_ptr);
-#endif
+
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
@@ -9096,6 +10380,40 @@
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
+
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
@@ -9198,6 +10516,10 @@
dev->gso_max_segs = GSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
+#ifdef CONFIG_LOCKDEP
+ dev->nested_level = 0;
+ INIT_LIST_HEAD(&dev->unlink_list);
+#endif
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -9207,6 +10529,7 @@
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+ INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
@@ -9264,6 +10587,17 @@
struct napi_struct *p, *n;
might_sleep();
+
+ /* When called immediately after register_netdevice() failed the unwind
+ * handling may still be dismantling the device. Handle that case by
+ * deferring the free.
+ */
+ if (dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+ dev->needs_free_netdev = true;
+ return;
+ }
+
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -9277,6 +10611,8 @@
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+ free_percpu(dev->xdp_bulkq);
+ dev->xdp_bulkq = NULL;
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
@@ -9328,9 +10664,10 @@
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
- rollback_registered(dev);
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ unregister_netdevice_many(&single);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
@@ -9344,14 +10681,100 @@
*/
void unregister_netdevice_many(struct list_head *head)
{
- struct net_device *dev;
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
- if (!list_empty(head)) {
- rollback_registered_many(head);
- list_for_each_entry(dev, head, unreg_list)
- net_set_todo(dev);
- list_del(head);
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ if (list_empty(head))
+ return;
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
+
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+
+ dev->reg_state = NETREG_UNREGISTERING;
+ }
+ flush_all_backlogs();
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ dev_xdp_uninstall(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ dev_put(dev);
+ net_set_todo(dev);
+ }
+
+ list_del(head);
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -9390,6 +10813,7 @@
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
+ struct net *net_old = dev_net(dev);
int err, new_nsid, new_ifindex;
ASSERT_RTNL();
@@ -9405,7 +10829,7 @@
/* Get out if there is nothing todo */
err = 0;
- if (net_eq(dev_net(dev), net))
+ if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
@@ -9466,6 +10890,9 @@
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
+ /* Move per-net netdevice notifiers that are following the netdevice */
+ move_netdevice_notifiers_dev_net(dev, net);
+
/* Actually switch the network namespace */
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
@@ -9476,6 +10903,12 @@
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
+ WARN_ON(err);
+
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
/* Add the device back in the hashes */
@@ -9608,7 +11041,7 @@
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+ 8 * sizeof_field(struct napi_struct, gro_bitmask));
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
@@ -9620,6 +11053,8 @@
net->dev_index_head = netdev_create_hash();
if (net->dev_index_head == NULL)
goto err_idx;
+
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
return 0;
@@ -9742,7 +11177,7 @@
continue;
/* Leave virtual devices for the generic cleanup */
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
--
Gitblit v1.6.2