2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/net/core/dev.c
....@@ -1,10 +1,6 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * NET3 Protocol independent device support routines.
3
- *
4
- * This program is free software; you can redistribute it and/or
5
- * modify it under the terms of the GNU General Public License
6
- * as published by the Free Software Foundation; either version
7
- * 2 of the License, or (at your option) any later version.
84 *
95 * Derived from the non IP parts of dev.c 1.0.19
106 * Authors: Ross Biro
....@@ -102,6 +98,7 @@
10298 #include <net/busy_poll.h>
10399 #include <linux/rtnetlink.h>
104100 #include <linux/stat.h>
101
+#include <net/dsa.h>
105102 #include <net/dst.h>
106103 #include <net/dst_metadata.h>
107104 #include <net/pkt_sched.h>
....@@ -132,7 +129,6 @@
132129 #include <trace/events/napi.h>
133130 #include <trace/events/net.h>
134131 #include <trace/events/skb.h>
135
-#include <linux/pci.h>
136132 #include <linux/inetdevice.h>
137133 #include <linux/cpu_rmap.h>
138134 #include <linux/static_key.h>
....@@ -146,11 +142,15 @@
146142 #include <linux/sctp.h>
147143 #include <net/udp_tunnel.h>
148144 #include <linux/net_namespace.h>
145
+#include <linux/indirect_call_wrapper.h>
146
+#include <net/devlink.h>
147
+#include <linux/pm_runtime.h>
148
+#include <linux/prandom.h>
149
+#include <trace/hooks/net.h>
149150
150151 #include "net-sysfs.h"
151152
152153 #define MAX_GRO_SKBS 8
153
-#define MAX_NEST_DEV 8
154154
155155 /* This should be increased if a protocol with a bigger head is added. */
156156 #define GRO_MAX_HEAD (MAX_HEADER + 128)
....@@ -164,6 +164,9 @@
164164 static int netif_rx_internal(struct sk_buff *skb);
165165 static int call_netdevice_notifiers_info(unsigned long val,
166166 struct netdev_notifier_info *info);
167
+static int call_netdevice_notifiers_extack(unsigned long val,
168
+ struct net_device *dev,
169
+ struct netlink_ext_ack *extack);
167170 static struct napi_struct *napi_by_id(unsigned int napi_id);
168171
169172 /*
....@@ -219,15 +222,137 @@
219222 static inline void rps_lock(struct softnet_data *sd)
220223 {
221224 #ifdef CONFIG_RPS
222
- raw_spin_lock(&sd->input_pkt_queue.raw_lock);
225
+ spin_lock(&sd->input_pkt_queue.lock);
223226 #endif
224227 }
225228
226229 static inline void rps_unlock(struct softnet_data *sd)
227230 {
228231 #ifdef CONFIG_RPS
229
- raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
232
+ spin_unlock(&sd->input_pkt_queue.lock);
230233 #endif
234
+}
235
+
236
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
237
+ const char *name)
238
+{
239
+ struct netdev_name_node *name_node;
240
+
241
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
242
+ if (!name_node)
243
+ return NULL;
244
+ INIT_HLIST_NODE(&name_node->hlist);
245
+ name_node->dev = dev;
246
+ name_node->name = name;
247
+ return name_node;
248
+}
249
+
250
+static struct netdev_name_node *
251
+netdev_name_node_head_alloc(struct net_device *dev)
252
+{
253
+ struct netdev_name_node *name_node;
254
+
255
+ name_node = netdev_name_node_alloc(dev, dev->name);
256
+ if (!name_node)
257
+ return NULL;
258
+ INIT_LIST_HEAD(&name_node->list);
259
+ return name_node;
260
+}
261
+
262
+static void netdev_name_node_free(struct netdev_name_node *name_node)
263
+{
264
+ kfree(name_node);
265
+}
266
+
267
+static void netdev_name_node_add(struct net *net,
268
+ struct netdev_name_node *name_node)
269
+{
270
+ hlist_add_head_rcu(&name_node->hlist,
271
+ dev_name_hash(net, name_node->name));
272
+}
273
+
274
+static void netdev_name_node_del(struct netdev_name_node *name_node)
275
+{
276
+ hlist_del_rcu(&name_node->hlist);
277
+}
278
+
279
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
280
+ const char *name)
281
+{
282
+ struct hlist_head *head = dev_name_hash(net, name);
283
+ struct netdev_name_node *name_node;
284
+
285
+ hlist_for_each_entry(name_node, head, hlist)
286
+ if (!strcmp(name_node->name, name))
287
+ return name_node;
288
+ return NULL;
289
+}
290
+
291
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
292
+ const char *name)
293
+{
294
+ struct hlist_head *head = dev_name_hash(net, name);
295
+ struct netdev_name_node *name_node;
296
+
297
+ hlist_for_each_entry_rcu(name_node, head, hlist)
298
+ if (!strcmp(name_node->name, name))
299
+ return name_node;
300
+ return NULL;
301
+}
302
+
303
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
304
+{
305
+ struct netdev_name_node *name_node;
306
+ struct net *net = dev_net(dev);
307
+
308
+ name_node = netdev_name_node_lookup(net, name);
309
+ if (name_node)
310
+ return -EEXIST;
311
+ name_node = netdev_name_node_alloc(dev, name);
312
+ if (!name_node)
313
+ return -ENOMEM;
314
+ netdev_name_node_add(net, name_node);
315
+ /* The node that holds dev->name acts as a head of per-device list. */
316
+ list_add_tail(&name_node->list, &dev->name_node->list);
317
+
318
+ return 0;
319
+}
320
+EXPORT_SYMBOL(netdev_name_node_alt_create);
321
+
322
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
323
+{
324
+ list_del(&name_node->list);
325
+ netdev_name_node_del(name_node);
326
+ kfree(name_node->name);
327
+ netdev_name_node_free(name_node);
328
+}
329
+
330
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
331
+{
332
+ struct netdev_name_node *name_node;
333
+ struct net *net = dev_net(dev);
334
+
335
+ name_node = netdev_name_node_lookup(net, name);
336
+ if (!name_node)
337
+ return -ENOENT;
338
+ /* lookup might have found our primary name or a name belonging
339
+ * to another device.
340
+ */
341
+ if (name_node == dev->name_node || name_node->dev != dev)
342
+ return -EINVAL;
343
+
344
+ __netdev_name_node_alt_destroy(name_node);
345
+
346
+ return 0;
347
+}
348
+EXPORT_SYMBOL(netdev_name_node_alt_destroy);
349
+
350
+static void netdev_name_node_alt_flush(struct net_device *dev)
351
+{
352
+ struct netdev_name_node *name_node, *tmp;
353
+
354
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
355
+ __netdev_name_node_alt_destroy(name_node);
231356 }
232357
233358 /* Device list insertion */
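A minimal usage sketch of the alternative-name helpers added in the hunk above (hypothetical caller; "eth0-alt" is illustrative, and RTNL is assumed to be held as it is for the rtnetlink paths that use these helpers). Note that, as shown above, __netdev_name_node_alt_destroy() kfree()s the stored name, so ownership of the heap-allocated buffer passes to the name node once create succeeds:

    /* hypothetical snippet inside an RTNL-held function returning int */
    char *alt = kstrdup("eth0-alt", GFP_KERNEL);    /* name string is illustrative */
    int err;

    if (!alt)
        return -ENOMEM;
    err = netdev_name_node_alt_create(dev, alt);
    if (err)
        kfree(alt);    /* -EEXIST/-ENOMEM: the node did not take ownership */
    /* ... later, lookup is by string, so a literal is fine here ... */
    err = netdev_name_node_alt_destroy(dev, "eth0-alt");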
....@@ -239,7 +364,7 @@
239364
240365 write_lock_bh(&dev_base_lock);
241366 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
242
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
367
+ netdev_name_node_add(net, dev->name_node);
243368 hlist_add_head_rcu(&dev->index_hlist,
244369 dev_index_hash(net, dev->ifindex));
245370 write_unlock_bh(&dev_base_lock);
....@@ -257,7 +382,7 @@
257382 /* Unlink dev from the device chain */
258383 write_lock_bh(&dev_base_lock);
259384 list_del_rcu(&dev->dev_list);
260
- hlist_del_rcu(&dev->name_hlist);
385
+ netdev_name_node_del(dev->name_node);
261386 hlist_del_rcu(&dev->index_hlist);
262387 write_unlock_bh(&dev_base_lock);
263388
....@@ -355,6 +480,7 @@
355480 unsigned short dev_type)
356481 {
357482 }
483
+
358484 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
359485 {
360486 }
....@@ -385,6 +511,12 @@
385511
386512 static inline struct list_head *ptype_head(const struct packet_type *pt)
387513 {
514
+ struct list_head vendor_pt = { .next = NULL, };
515
+
516
+ trace_android_vh_ptype_head(pt, &vendor_pt);
517
+ if (vendor_pt.next)
518
+ return vendor_pt.next;
519
+
388520 if (pt->type == htons(ETH_P_ALL))
389521 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
390522 else
....@@ -735,14 +867,10 @@
735867
736868 struct net_device *__dev_get_by_name(struct net *net, const char *name)
737869 {
738
- struct net_device *dev;
739
- struct hlist_head *head = dev_name_hash(net, name);
870
+ struct netdev_name_node *node_name;
740871
741
- hlist_for_each_entry(dev, head, name_hlist)
742
- if (!strncmp(dev->name, name, IFNAMSIZ))
743
- return dev;
744
-
745
- return NULL;
872
+ node_name = netdev_name_node_lookup(net, name);
873
+ return node_name ? node_name->dev : NULL;
746874 }
747875 EXPORT_SYMBOL(__dev_get_by_name);
748876
....@@ -760,14 +888,10 @@
760888
761889 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
762890 {
763
- struct net_device *dev;
764
- struct hlist_head *head = dev_name_hash(net, name);
891
+ struct netdev_name_node *node_name;
765892
766
- hlist_for_each_entry_rcu(dev, head, name_hlist)
767
- if (!strncmp(dev->name, name, IFNAMSIZ))
768
- return dev;
769
-
770
- return NULL;
893
+ node_name = netdev_name_node_lookup_rcu(net, name);
894
+ return node_name ? node_name->dev : NULL;
771895 }
772896 EXPORT_SYMBOL(dev_get_by_name_rcu);
773897
....@@ -1015,7 +1139,7 @@
10151139 * @name: name string
10161140 *
10171141 * Network device names need to be valid file names to
1018
- * to allow sysfs to work. We also disallow any kind of
1142
+ * allow sysfs to work. We also disallow any kind of
10191143 * whitespace.
10201144 */
10211145 bool dev_valid_name(const char *name)
....@@ -1078,6 +1202,18 @@
10781202 return -ENOMEM;
10791203
10801204 for_each_netdev(net, d) {
1205
+ struct netdev_name_node *name_node;
1206
+ list_for_each_entry(name_node, &d->name_node->list, list) {
1207
+ if (!sscanf(name_node->name, name, &i))
1208
+ continue;
1209
+ if (i < 0 || i >= max_netdevices)
1210
+ continue;
1211
+
1212
+ /* avoid cases where sscanf is not exact inverse of printf */
1213
+ snprintf(buf, IFNAMSIZ, name, i);
1214
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
1215
+ set_bit(i, inuse);
1216
+ }
10811217 if (!sscanf(d->name, name, &i))
10821218 continue;
10831219 if (i < 0 || i >= max_netdevices)
....@@ -1138,8 +1274,8 @@
11381274 }
11391275 EXPORT_SYMBOL(dev_alloc_name);
11401276
1141
-int dev_get_valid_name(struct net *net, struct net_device *dev,
1142
- const char *name)
1277
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
1278
+ const char *name)
11431279 {
11441280 BUG_ON(!net);
11451281
....@@ -1155,7 +1291,6 @@
11551291
11561292 return 0;
11571293 }
1158
-EXPORT_SYMBOL(dev_get_valid_name);
11591294
11601295 /**
11611296 * dev_change_name - change name of a device
....@@ -1229,13 +1364,13 @@
12291364 netdev_adjacent_rename_links(dev, oldname);
12301365
12311366 write_lock_bh(&dev_base_lock);
1232
- hlist_del_rcu(&dev->name_hlist);
1367
+ netdev_name_node_del(dev->name_node);
12331368 write_unlock_bh(&dev_base_lock);
12341369
12351370 synchronize_rcu();
12361371
12371372 write_lock_bh(&dev_base_lock);
1238
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1373
+ netdev_name_node_add(net, dev->name_node);
12391374 write_unlock_bh(&dev_base_lock);
12401375
12411376 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
....@@ -1285,8 +1420,8 @@
12851420 }
12861421
12871422 mutex_lock(&ifalias_mutex);
1288
- rcu_swap_protected(dev->ifalias, new_alias,
1289
- mutex_is_locked(&ifalias_mutex));
1423
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1424
+ mutex_is_locked(&ifalias_mutex));
12901425 mutex_unlock(&ifalias_mutex);
12911426
12921427 if (new_alias)
....@@ -1372,15 +1507,20 @@
13721507 }
13731508 EXPORT_SYMBOL(netdev_notify_peers);
13741509
1375
-static int __dev_open(struct net_device *dev)
1510
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
13761511 {
13771512 const struct net_device_ops *ops = dev->netdev_ops;
13781513 int ret;
13791514
13801515 ASSERT_RTNL();
13811516
1382
- if (!netif_device_present(dev))
1383
- return -ENODEV;
1517
+ if (!netif_device_present(dev)) {
1518
+ /* may be detached because parent is runtime-suspended */
1519
+ if (dev->dev.parent)
1520
+ pm_runtime_resume(dev->dev.parent);
1521
+ if (!netif_device_present(dev))
1522
+ return -ENODEV;
1523
+ }
13841524
13851525 /* Block netpoll from trying to do any rx path servicing.
13861526 * If we don't do this there is a chance ndo_poll_controller
....@@ -1388,7 +1528,7 @@
13881528 */
13891529 netpoll_poll_disable(dev);
13901530
1391
- ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1531
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
13921532 ret = notifier_to_errno(ret);
13931533 if (ret)
13941534 return ret;
....@@ -1417,7 +1557,8 @@
14171557
14181558 /**
14191559 * dev_open - prepare an interface for use.
1420
- * @dev: device to open
1560
+ * @dev: device to open
1561
+ * @extack: netlink extended ack
14211562 *
14221563 * Takes a device from down to up state. The device's private open
14231564 * function is invoked and then the multicast lists are loaded. Finally
....@@ -1427,14 +1568,14 @@
14271568 * Calling this function on an active interface is a nop. On a failure
14281569 * a negative errno code is returned.
14291570 */
1430
-int dev_open(struct net_device *dev)
1571
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
14311572 {
14321573 int ret;
14331574
14341575 if (dev->flags & IFF_UP)
14351576 return 0;
14361577
1437
- ret = __dev_open(dev);
1578
+ ret = __dev_open(dev, extack);
14381579 if (ret < 0)
14391580 return ret;
14401581
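With the hunks above, dev_open() now takes a struct netlink_ext_ack * so the NETDEV_PRE_UP notifier chain can report a failure reason back to the caller. A hypothetical caller sketch under the new signature (RTNL held; passing NULL simply disables extended-ack reporting — the warning message is illustrative, not from the patch):

    /* hypothetical snippet; extack may come from a netlink request or be NULL */
    struct netlink_ext_ack *extack = NULL;
    int err;

    err = dev_open(dev, extack);
    if (err)
        netdev_warn(dev, "dev_open failed: %d\n", err);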
....@@ -1596,6 +1737,7 @@
15961737 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
15971738 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
15981739 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1740
+ N(PRE_CHANGEADDR)
15991741 }
16001742 #undef N
16011743 return "UNKNOWN_NETDEV_EVENT";
....@@ -1610,6 +1752,62 @@
16101752 };
16111753
16121754 return nb->notifier_call(nb, val, &info);
1755
+}
1756
+
1757
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
1758
+ struct net_device *dev)
1759
+{
1760
+ int err;
1761
+
1762
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1763
+ err = notifier_to_errno(err);
1764
+ if (err)
1765
+ return err;
1766
+
1767
+ if (!(dev->flags & IFF_UP))
1768
+ return 0;
1769
+
1770
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
1771
+ return 0;
1772
+}
1773
+
1774
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1775
+ struct net_device *dev)
1776
+{
1777
+ if (dev->flags & IFF_UP) {
1778
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1779
+ dev);
1780
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1781
+ }
1782
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1783
+}
1784
+
1785
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1786
+ struct net *net)
1787
+{
1788
+ struct net_device *dev;
1789
+ int err;
1790
+
1791
+ for_each_netdev(net, dev) {
1792
+ err = call_netdevice_register_notifiers(nb, dev);
1793
+ if (err)
1794
+ goto rollback;
1795
+ }
1796
+ return 0;
1797
+
1798
+rollback:
1799
+ for_each_netdev_continue_reverse(net, dev)
1800
+ call_netdevice_unregister_notifiers(nb, dev);
1801
+ return err;
1802
+}
1803
+
1804
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1805
+ struct net *net)
1806
+{
1807
+ struct net_device *dev;
1808
+
1809
+ for_each_netdev(net, dev)
1810
+ call_netdevice_unregister_notifiers(nb, dev);
16131811 }
16141812
16151813 static int dev_boot_phase = 1;
....@@ -1630,8 +1828,6 @@
16301828
16311829 int register_netdevice_notifier(struct notifier_block *nb)
16321830 {
1633
- struct net_device *dev;
1634
- struct net_device *last;
16351831 struct net *net;
16361832 int err;
16371833
....@@ -1644,17 +1840,9 @@
16441840 if (dev_boot_phase)
16451841 goto unlock;
16461842 for_each_net(net) {
1647
- for_each_netdev(net, dev) {
1648
- err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1649
- err = notifier_to_errno(err);
1650
- if (err)
1651
- goto rollback;
1652
-
1653
- if (!(dev->flags & IFF_UP))
1654
- continue;
1655
-
1656
- call_netdevice_notifier(nb, NETDEV_UP, dev);
1657
- }
1843
+ err = call_netdevice_register_net_notifiers(nb, net);
1844
+ if (err)
1845
+ goto rollback;
16581846 }
16591847
16601848 unlock:
....@@ -1663,22 +1851,9 @@
16631851 return err;
16641852
16651853 rollback:
1666
- last = dev;
1667
- for_each_net(net) {
1668
- for_each_netdev(net, dev) {
1669
- if (dev == last)
1670
- goto outroll;
1854
+ for_each_net_continue_reverse(net)
1855
+ call_netdevice_unregister_net_notifiers(nb, net);
16711856
1672
- if (dev->flags & IFF_UP) {
1673
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1674
- dev);
1675
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1676
- }
1677
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1678
- }
1679
- }
1680
-
1681
-outroll:
16821857 raw_notifier_chain_unregister(&netdev_chain, nb);
16831858 goto unlock;
16841859 }
....@@ -1700,7 +1875,6 @@
17001875
17011876 int unregister_netdevice_notifier(struct notifier_block *nb)
17021877 {
1703
- struct net_device *dev;
17041878 struct net *net;
17051879 int err;
17061880
....@@ -1711,22 +1885,147 @@
17111885 if (err)
17121886 goto unlock;
17131887
1714
- for_each_net(net) {
1715
- for_each_netdev(net, dev) {
1716
- if (dev->flags & IFF_UP) {
1717
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1718
- dev);
1719
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1720
- }
1721
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1722
- }
1723
- }
1888
+ for_each_net(net)
1889
+ call_netdevice_unregister_net_notifiers(nb, net);
1890
+
17241891 unlock:
17251892 rtnl_unlock();
17261893 up_write(&pernet_ops_rwsem);
17271894 return err;
17281895 }
17291896 EXPORT_SYMBOL(unregister_netdevice_notifier);
1897
+
1898
+static int __register_netdevice_notifier_net(struct net *net,
1899
+ struct notifier_block *nb,
1900
+ bool ignore_call_fail)
1901
+{
1902
+ int err;
1903
+
1904
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
1905
+ if (err)
1906
+ return err;
1907
+ if (dev_boot_phase)
1908
+ return 0;
1909
+
1910
+ err = call_netdevice_register_net_notifiers(nb, net);
1911
+ if (err && !ignore_call_fail)
1912
+ goto chain_unregister;
1913
+
1914
+ return 0;
1915
+
1916
+chain_unregister:
1917
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
1918
+ return err;
1919
+}
1920
+
1921
+static int __unregister_netdevice_notifier_net(struct net *net,
1922
+ struct notifier_block *nb)
1923
+{
1924
+ int err;
1925
+
1926
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1927
+ if (err)
1928
+ return err;
1929
+
1930
+ call_netdevice_unregister_net_notifiers(nb, net);
1931
+ return 0;
1932
+}
1933
+
1934
+/**
1935
+ * register_netdevice_notifier_net - register a per-netns network notifier block
1936
+ * @net: network namespace
1937
+ * @nb: notifier
1938
+ *
1939
+ * Register a notifier to be called when network device events occur.
1940
+ * The notifier passed is linked into the kernel structures and must
1941
+ * not be reused until it has been unregistered. A negative errno code
1942
+ * is returned on a failure.
1943
+ *
1944
+ * When registered all registration and up events are replayed
1945
+ * to the new notifier to allow device to have a race free
1946
+ * view of the network device list.
1947
+ */
1948
+
1949
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1950
+{
1951
+ int err;
1952
+
1953
+ rtnl_lock();
1954
+ err = __register_netdevice_notifier_net(net, nb, false);
1955
+ rtnl_unlock();
1956
+ return err;
1957
+}
1958
+EXPORT_SYMBOL(register_netdevice_notifier_net);
1959
+
1960
+/**
1961
+ * unregister_netdevice_notifier_net - unregister a per-netns
1962
+ * network notifier block
1963
+ * @net: network namespace
1964
+ * @nb: notifier
1965
+ *
1966
+ * Unregister a notifier previously registered by
1967
+ * register_netdevice_notifier(). The notifier is unlinked into the
1968
+ * kernel structures and may then be reused. A negative errno code
1969
+ * is returned on a failure.
1970
+ *
1971
+ * After unregistering unregister and down device events are synthesized
1972
+ * for all devices on the device list to the removed notifier to remove
1973
+ * the need for special case cleanup code.
1974
+ */
1975
+
1976
+int unregister_netdevice_notifier_net(struct net *net,
1977
+ struct notifier_block *nb)
1978
+{
1979
+ int err;
1980
+
1981
+ rtnl_lock();
1982
+ err = __unregister_netdevice_notifier_net(net, nb);
1983
+ rtnl_unlock();
1984
+ return err;
1985
+}
1986
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1987
+
1988
+int register_netdevice_notifier_dev_net(struct net_device *dev,
1989
+ struct notifier_block *nb,
1990
+ struct netdev_net_notifier *nn)
1991
+{
1992
+ int err;
1993
+
1994
+ rtnl_lock();
1995
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1996
+ if (!err) {
1997
+ nn->nb = nb;
1998
+ list_add(&nn->list, &dev->net_notifier_list);
1999
+ }
2000
+ rtnl_unlock();
2001
+ return err;
2002
+}
2003
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
2004
+
2005
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
2006
+ struct notifier_block *nb,
2007
+ struct netdev_net_notifier *nn)
2008
+{
2009
+ int err;
2010
+
2011
+ rtnl_lock();
2012
+ list_del(&nn->list);
2013
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2014
+ rtnl_unlock();
2015
+ return err;
2016
+}
2017
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2018
+
2019
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2020
+ struct net *net)
2021
+{
2022
+ struct netdev_net_notifier *nn;
2023
+
2024
+ list_for_each_entry(nn, &dev->net_notifier_list, list) {
2025
+ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2026
+ __register_netdevice_notifier_net(net, nn->nb, true);
2027
+ }
2028
+}
17302029
17312030 /**
17322031 * call_netdevice_notifiers_info - call all network notifier blocks
....@@ -1740,8 +2039,31 @@
17402039 static int call_netdevice_notifiers_info(unsigned long val,
17412040 struct netdev_notifier_info *info)
17422041 {
2042
+ struct net *net = dev_net(info->dev);
2043
+ int ret;
2044
+
17432045 ASSERT_RTNL();
2046
+
2047
+ /* Run per-netns notifier block chain first, then run the global one.
2048
+ * Hopefully, one day, the global one is going to be removed after
2049
+ * all notifier block registrators get converted to be per-netns.
2050
+ */
2051
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2052
+ if (ret & NOTIFY_STOP_MASK)
2053
+ return ret;
17442054 return raw_notifier_call_chain(&netdev_chain, val, info);
2055
+}
2056
+
2057
+static int call_netdevice_notifiers_extack(unsigned long val,
2058
+ struct net_device *dev,
2059
+ struct netlink_ext_ack *extack)
2060
+{
2061
+ struct netdev_notifier_info info = {
2062
+ .dev = dev,
2063
+ .extack = extack,
2064
+ };
2065
+
2066
+ return call_netdevice_notifiers_info(val, &info);
17452067 }
17462068
17472069 /**
....@@ -1755,11 +2077,7 @@
17552077
17562078 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
17572079 {
1758
- struct netdev_notifier_info info = {
1759
- .dev = dev,
1760
- };
1761
-
1762
- return call_netdevice_notifiers_info(val, &info);
2080
+ return call_netdevice_notifiers_extack(val, dev, NULL);
17632081 }
17642082 EXPORT_SYMBOL(call_netdevice_notifiers);
17652083
....@@ -1987,6 +2305,17 @@
19872305 return false;
19882306 }
19892307
2308
+/**
2309
+ * dev_nit_active - return true if any network interface taps are in use
2310
+ *
2311
+ * @dev: network device to check for the presence of taps
2312
+ */
2313
+bool dev_nit_active(struct net_device *dev)
2314
+{
2315
+ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2316
+}
2317
+EXPORT_SYMBOL_GPL(dev_nit_active);
2318
+
19902319 /*
19912320 * Support routine. Sends outgoing frames to any network
19922321 * taps currently in use.
....@@ -2002,6 +2331,9 @@
20022331 rcu_read_lock();
20032332 again:
20042333 list_for_each_entry_rcu(ptype, ptype_list, list) {
2334
+ if (ptype->ignore_outgoing)
2335
+ continue;
2336
+
20052337 /* Never send packets back to the socket
20062338 * they originated from - MvS (miquels@drinkel.ow.org)
20072339 */
....@@ -2302,6 +2634,8 @@
23022634 struct xps_map *map, *new_map;
23032635 bool active = false;
23042636 unsigned int nr_ids;
2637
+
2638
+ WARN_ON_ONCE(index >= dev->num_tx_queues);
23052639
23062640 if (dev->num_tc) {
23072641 /* Do not allow XPS on subordinate device directly */
....@@ -2723,7 +3057,6 @@
27233057 sd->output_queue_tailp = &q->next_sched;
27243058 raise_softirq_irqoff(NET_TX_SOFTIRQ);
27253059 local_irq_restore(flags);
2726
- preempt_check_resched_rt();
27273060 }
27283061
27293062 void __netif_schedule(struct Qdisc *q)
....@@ -2745,7 +3078,7 @@
27453078 void netif_schedule_queue(struct netdev_queue *txq)
27463079 {
27473080 rcu_read_lock();
2748
- if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
3081
+ if (!netif_xmit_stopped(txq)) {
27493082 struct Qdisc *q = rcu_dereference(txq->qdisc);
27503083
27513084 __netif_schedule(q);
....@@ -2786,7 +3119,6 @@
27863119 __this_cpu_write(softnet_data.completion_queue, skb);
27873120 raise_softirq_irqoff(NET_TX_SOFTIRQ);
27883121 local_irq_restore(flags);
2789
- preempt_check_resched_rt();
27903122 }
27913123 EXPORT_SYMBOL(__dev_kfree_skb_irq);
27923124
....@@ -2794,8 +3126,10 @@
27943126 {
27953127 if (in_irq() || irqs_disabled())
27963128 __dev_kfree_skb_irq(skb, reason);
3129
+ else if (unlikely(reason == SKB_REASON_DROPPED))
3130
+ kfree_skb(skb);
27973131 else
2798
- dev_kfree_skb(skb);
3132
+ consume_skb(skb);
27993133 }
28003134 EXPORT_SYMBOL(__dev_kfree_skb_any);
28013135
....@@ -2883,12 +3217,10 @@
28833217 else
28843218 name = netdev_name(dev);
28853219 }
2886
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2887
- "gso_type=%d ip_summed=%d\n",
3220
+ skb_dump(KERN_WARNING, skb, false);
3221
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
28883222 name, dev ? &dev->features : &null_features,
2889
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
2890
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2891
- skb_shinfo(skb)->gso_type, skb->ip_summed);
3223
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
28923224 }
28933225
28943226 /*
....@@ -2918,18 +3250,19 @@
29183250 }
29193251
29203252 offset = skb_checksum_start_offset(skb);
2921
- BUG_ON(offset >= skb_headlen(skb));
3253
+ ret = -EINVAL;
3254
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
3255
+ goto out;
3256
+
29223257 csum = skb_checksum(skb, offset, skb->len - offset, 0);
29233258
29243259 offset += skb->csum_offset;
2925
- BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3260
+ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb)))
3261
+ goto out;
29263262
2927
- if (skb_cloned(skb) &&
2928
- !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2929
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2930
- if (ret)
2931
- goto out;
2932
- }
3263
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3264
+ if (ret)
3265
+ goto out;
29333266
29343267 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
29353268 out_set_summed:
....@@ -2964,12 +3297,11 @@
29643297 ret = -EINVAL;
29653298 goto out;
29663299 }
2967
- if (skb_cloned(skb) &&
2968
- !skb_clone_writable(skb, offset + sizeof(__le32))) {
2969
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2970
- if (ret)
2971
- goto out;
2972
- }
3300
+
3301
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3302
+ if (ret)
3303
+ goto out;
3304
+
29733305 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
29743306 skb->len - start, ~(__u32)0,
29753307 crc32c_csum_stub));
....@@ -2995,7 +3327,7 @@
29953327 type = eth->h_proto;
29963328 }
29973329
2998
- return __vlan_get_protocol(skb, type, depth);
3330
+ return vlan_get_protocol_and_depth(skb, type, depth);
29993331 }
30003332
30013333 /**
....@@ -3054,7 +3386,7 @@
30543386 * It may return NULL if the skb requires no segmentation. This is
30553387 * only possible when GSO is used for verifying header integrity.
30563388 *
3057
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3389
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
30583390 */
30593391 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
30603392 netdev_features_t features, bool tx_path)
....@@ -3083,7 +3415,7 @@
30833415 features &= ~NETIF_F_GSO_PARTIAL;
30843416 }
30853417
3086
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3418
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
30873419 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
30883420
30893421 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
....@@ -3094,7 +3426,7 @@
30943426
30953427 segs = skb_mac_gso_segment(skb, features);
30963428
3097
- if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3429
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
30983430 skb_warn_bad_offload(skb);
30993431
31003432 return segs;
....@@ -3103,10 +3435,11 @@
31033435
31043436 /* Take action when hardware reception checksum errors are detected. */
31053437 #ifdef CONFIG_BUG
3106
-void netdev_rx_csum_fault(struct net_device *dev)
3438
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
31073439 {
31083440 if (net_ratelimit()) {
31093441 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3442
+ skb_dump(KERN_ERR, skb, true);
31103443 dump_stack();
31113444 }
31123445 }
....@@ -3156,10 +3489,9 @@
31563489 static netdev_features_t harmonize_features(struct sk_buff *skb,
31573490 netdev_features_t features)
31583491 {
3159
- int tmp;
31603492 __be16 type;
31613493
3162
- type = skb_network_protocol(skb, &tmp);
3494
+ type = skb_network_protocol(skb, NULL);
31633495 features = net_mpls_features(skb, features, type);
31643496
31653497 if (skb->ip_summed != CHECKSUM_NONE &&
....@@ -3256,10 +3588,11 @@
32563588 unsigned int len;
32573589 int rc;
32583590
3259
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3591
+ if (dev_nit_active(dev))
32603592 dev_queue_xmit_nit(skb, dev);
32613593
32623594 len = skb->len;
3595
+ PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
32633596 trace_net_dev_start_xmit(skb, dev);
32643597 rc = netdev_start_xmit(skb, dev, txq, more);
32653598 trace_net_dev_xmit(skb, rc, dev, len);
....@@ -3276,7 +3609,7 @@
32763609 while (skb) {
32773610 struct sk_buff *next = skb->next;
32783611
3279
- skb->next = NULL;
3612
+ skb_mark_not_on_list(skb);
32803613 rc = xmit_one(skb, dev, txq, next != NULL);
32813614 if (unlikely(!dev_xmit_complete(rc))) {
32823615 skb->next = next;
....@@ -3307,7 +3640,7 @@
33073640 int skb_csum_hwoffload_help(struct sk_buff *skb,
33083641 const netdev_features_t features)
33093642 {
3310
- if (unlikely(skb->csum_not_inet))
3643
+ if (unlikely(skb_csum_is_sctp(skb)))
33113644 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
33123645 skb_crc32c_csum_help(skb);
33133646
....@@ -3376,7 +3709,7 @@
33763709
33773710 for (; skb != NULL; skb = next) {
33783711 next = skb->next;
3379
- skb->next = NULL;
3712
+ skb_mark_not_on_list(skb);
33803713
33813714 /* in case skb wont be segmented, point to itself */
33823715 skb->prev = skb;
....@@ -3407,7 +3740,7 @@
34073740 /* To get more precise estimation of bytes sent on wire,
34083741 * we add to pkt_len the headers size of all segments
34093742 */
3410
- if (shinfo->gso_size) {
3743
+ if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
34113744 unsigned int hdr_len;
34123745 u16 gso_segs = shinfo->gso_segs;
34133746
....@@ -3451,13 +3784,9 @@
34513784 qdisc_calculate_pkt_len(skb, q);
34523785
34533786 if (q->flags & TCQ_F_NOLOCK) {
3454
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3455
- __qdisc_drop(skb, &to_free);
3456
- rc = NET_XMIT_DROP;
3457
- } else {
3458
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3787
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3788
+ if (likely(!netif_xmit_frozen_or_stopped(txq)))
34593789 qdisc_run(q);
3460
- }
34613790
34623791 if (unlikely(to_free))
34633792 kfree_skb_list(to_free);
....@@ -3470,11 +3799,7 @@
34703799 * This permits qdisc->running owner to get the lock more
34713800 * often and dequeue packets faster.
34723801 */
3473
-#ifdef CONFIG_PREEMPT_RT_FULL
3474
- contended = true;
3475
-#else
34763802 contended = qdisc_is_running(q);
3477
-#endif
34783803 if (unlikely(contended))
34793804 spin_lock(&q->busylock);
34803805
....@@ -3557,7 +3882,8 @@
35573882 skb_reset_mac_header(skb);
35583883 __skb_pull(skb, skb_network_offset(skb));
35593884 skb->pkt_type = PACKET_LOOPBACK;
3560
- skb->ip_summed = CHECKSUM_UNNECESSARY;
3885
+ if (skb->ip_summed == CHECKSUM_NONE)
3886
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
35613887 WARN_ON(!skb_dst(skb));
35623888 skb_dst_force(skb);
35633889 netif_rx_ni(skb);
....@@ -3576,6 +3902,7 @@
35763902 return skb;
35773903
35783904 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3905
+ qdisc_skb_cb(skb)->mru = 0;
35793906 mini_qdisc_bstats_cpu_update(miniq, skb);
35803907
35813908 switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
....@@ -3676,23 +4003,21 @@
36764003 }
36774004
36784005 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3679
- struct net_device *sb_dev,
3680
- select_queue_fallback_t fallback)
4006
+ struct net_device *sb_dev)
36814007 {
36824008 return 0;
36834009 }
36844010 EXPORT_SYMBOL(dev_pick_tx_zero);
36854011
36864012 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3687
- struct net_device *sb_dev,
3688
- select_queue_fallback_t fallback)
4013
+ struct net_device *sb_dev)
36894014 {
36904015 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
36914016 }
36924017 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
36934018
3694
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3695
- struct net_device *sb_dev)
4019
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4020
+ struct net_device *sb_dev)
36964021 {
36974022 struct sock *sk = skb->sk;
36984023 int queue_index = sk_tx_queue_get(sk);
....@@ -3716,10 +4041,11 @@
37164041
37174042 return queue_index;
37184043 }
4044
+EXPORT_SYMBOL(netdev_pick_tx);
37194045
3720
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3721
- struct sk_buff *skb,
3722
- struct net_device *sb_dev)
4046
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4047
+ struct sk_buff *skb,
4048
+ struct net_device *sb_dev)
37234049 {
37244050 int queue_index = 0;
37254051
....@@ -3734,10 +4060,9 @@
37344060 const struct net_device_ops *ops = dev->netdev_ops;
37354061
37364062 if (ops->ndo_select_queue)
3737
- queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
3738
- __netdev_pick_tx);
4063
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
37394064 else
3740
- queue_index = __netdev_pick_tx(dev, skb, sb_dev);
4065
+ queue_index = netdev_pick_tx(dev, skb, sb_dev);
37414066
37424067 queue_index = netdev_cap_txqueue(dev, queue_index);
37434068 }
....@@ -3781,6 +4106,7 @@
37814106 bool again = false;
37824107
37834108 skb_reset_mac_header(skb);
4109
+ skb_assert_len(skb);
37844110
37854111 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
37864112 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
....@@ -3811,7 +4137,7 @@
38114137 else
38124138 skb_dst_force(skb);
38134139
3814
- txq = netdev_pick_tx(dev, skb, sb_dev);
4140
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
38154141 q = rcu_dereference_bh(txq->qdisc);
38164142
38174143 trace_net_dev_queue(skb);
....@@ -3835,14 +4161,10 @@
38354161 if (dev->flags & IFF_UP) {
38364162 int cpu = smp_processor_id(); /* ok because BHs are off */
38374163
3838
-#ifdef CONFIG_PREEMPT_RT_FULL
3839
- if (READ_ONCE(txq->xmit_lock_owner) != current) {
3840
-#else
38414164 /* Other cpus might concurrently change txq->xmit_lock_owner
38424165 * to -1 or to their cpu id, but not to our id.
38434166 */
38444167 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
3845
-#endif
38464168 if (dev_xmit_recursion())
38474169 goto recursion_alert;
38484170
....@@ -3850,6 +4172,7 @@
38504172 if (!skb)
38514173 goto out;
38524174
4175
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
38534176 HARD_TX_LOCK(dev, txq, cpu);
38544177
38554178 if (!netif_xmit_stopped(txq)) {
....@@ -3897,7 +4220,7 @@
38974220 }
38984221 EXPORT_SYMBOL(dev_queue_xmit_accel);
38994222
3900
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4223
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
39014224 {
39024225 struct net_device *dev = skb->dev;
39034226 struct sk_buff *orig_skb = skb;
....@@ -3915,6 +4238,7 @@
39154238
39164239 skb_set_queue_mapping(skb, queue_id);
39174240 txq = skb_get_tx_queue(dev, skb);
4241
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
39184242
39194243 local_bh_disable();
39204244
....@@ -3926,17 +4250,13 @@
39264250 dev_xmit_recursion_dec();
39274251
39284252 local_bh_enable();
3929
-
3930
- if (!dev_xmit_complete(ret))
3931
- kfree_skb(skb);
3932
-
39334253 return ret;
39344254 drop:
39354255 atomic_long_inc(&dev->tx_dropped);
39364256 kfree_skb_list(skb);
39374257 return NET_XMIT_DROP;
39384258 }
3939
-EXPORT_SYMBOL(dev_direct_xmit);
4259
+EXPORT_SYMBOL(__dev_direct_xmit);
39404260
39414261 /*************************************************************************
39424262 * Receiver routines
....@@ -3954,6 +4274,8 @@
39544274 int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
39554275 int dev_rx_weight __read_mostly = 64;
39564276 int dev_tx_weight __read_mostly = 64;
4277
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4278
+int gro_normal_batch __read_mostly = 8;
39574279
39584280 /* Called with irq disabled */
39594281 static inline void ____napi_schedule(struct softnet_data *sd,
....@@ -3971,9 +4293,9 @@
39714293 u32 rps_cpu_mask __read_mostly;
39724294 EXPORT_SYMBOL(rps_cpu_mask);
39734295
3974
-struct static_key rps_needed __read_mostly;
4296
+struct static_key_false rps_needed __read_mostly;
39754297 EXPORT_SYMBOL(rps_needed);
3976
-struct static_key rfs_needed __read_mostly;
4298
+struct static_key_false rfs_needed __read_mostly;
39774299 EXPORT_SYMBOL(rfs_needed);
39784300
39794301 static struct rps_dev_flow *
....@@ -4068,8 +4390,10 @@
40684390 u32 next_cpu;
40694391 u32 ident;
40704392
4071
- /* First check into global flow table if there is a match */
4072
- ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4393
+ /* First check into global flow table if there is a match.
4394
+ * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4395
+ */
4396
+ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
40734397 if ((ident ^ hash) & ~rps_cpu_mask)
40744398 goto try_rps;
40754399
....@@ -4204,7 +4528,7 @@
42044528 struct softnet_data *sd;
42054529 unsigned int old_flow, new_flow;
42064530
4207
- if (qlen < (netdev_max_backlog >> 1))
4531
+ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
42084532 return false;
42094533
42104534 sd = this_cpu_ptr(&softnet_data);
....@@ -4252,7 +4576,7 @@
42524576 if (!netif_running(skb->dev))
42534577 goto drop;
42544578 qlen = skb_queue_len(&sd->input_pkt_queue);
4255
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4579
+ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
42564580 if (qlen) {
42574581 enqueue:
42584582 __skb_queue_tail(&sd->input_pkt_queue, skb);
....@@ -4277,7 +4601,6 @@
42774601 rps_unlock(sd);
42784602
42794603 local_irq_restore(flags);
4280
- preempt_check_resched_rt();
42814604
42824605 atomic_long_inc(&skb->dev->rx_dropped);
42834606 kfree_skb(skb);
....@@ -4323,7 +4646,7 @@
43234646 /* Reinjected packets coming from act_mirred or similar should
43244647 * not get XDP generic processing.
43254648 */
4326
- if (skb_is_tc_redirected(skb))
4649
+ if (skb_is_redirected(skb))
43274650 return XDP_PASS;
43284651
43294652 /* XDP packets must be linear and must have sufficient headroom
....@@ -4355,6 +4678,11 @@
43554678 xdp->data_meta = xdp->data;
43564679 xdp->data_end = xdp->data + hlen;
43574680 xdp->data_hard_start = skb->data - skb_headroom(skb);
4681
+
4682
+ /* SKB "head" area always have tailroom for skb_shared_info */
4683
+ xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4684
+ xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4685
+
43584686 orig_data_end = xdp->data_end;
43594687 orig_data = xdp->data;
43604688 eth = (struct ethhdr *)xdp->data;
....@@ -4378,14 +4706,11 @@
43784706 skb_reset_network_header(skb);
43794707 }
43804708
4381
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4382
- * pckt.
4383
- */
4384
- off = orig_data_end - xdp->data_end;
4709
+ /* check if bpf_xdp_adjust_tail was used */
4710
+ off = xdp->data_end - orig_data_end;
43854711 if (off != 0) {
43864712 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4387
- skb->len -= off;
4388
-
4713
+ skb->len += off; /* positive on grow, negative on shrink */
43894714 }
43904715
43914716 /* check if XDP changed eth hdr such SKB needs update */
....@@ -4408,10 +4733,10 @@
44084733 break;
44094734 default:
44104735 bpf_warn_invalid_xdp_action(act);
4411
- /* fall through */
4736
+ fallthrough;
44124737 case XDP_ABORTED:
44134738 trace_xdp_exception(skb->dev, xdp_prog, act);
4414
- /* fall through */
4739
+ fallthrough;
44154740 case XDP_DROP:
44164741 do_drop:
44174742 kfree_skb(skb);
....@@ -4431,7 +4756,7 @@
44314756 bool free_skb = true;
44324757 int cpu, rc;
44334758
4434
- txq = netdev_pick_tx(dev, skb, NULL);
4759
+ txq = netdev_core_pick_tx(dev, skb, NULL);
44354760 cpu = smp_processor_id();
44364761 HARD_TX_LOCK(dev, txq, cpu);
44374762 if (!netif_xmit_stopped(txq)) {
....@@ -4445,7 +4770,6 @@
44454770 kfree_skb(skb);
44464771 }
44474772 }
4448
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
44494773
44504774 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
44514775
....@@ -4483,16 +4807,16 @@
44834807 {
44844808 int ret;
44854809
4486
- net_timestamp_check(netdev_tstamp_prequeue, skb);
4810
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
44874811
44884812 trace_netif_rx(skb);
44894813
44904814 #ifdef CONFIG_RPS
4491
- if (static_key_false(&rps_needed)) {
4815
+ if (static_branch_unlikely(&rps_needed)) {
44924816 struct rps_dev_flow voidflow, *rflow = &voidflow;
44934817 int cpu;
44944818
4495
- migrate_disable();
4819
+ preempt_disable();
44964820 rcu_read_lock();
44974821
44984822 cpu = get_rps_cpu(skb->dev, skb, &rflow);
....@@ -4502,14 +4826,14 @@
45024826 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
45034827
45044828 rcu_read_unlock();
4505
- migrate_enable();
4829
+ preempt_enable();
45064830 } else
45074831 #endif
45084832 {
45094833 unsigned int qtail;
45104834
4511
- ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
4512
- put_cpu_light();
4835
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4836
+ put_cpu();
45134837 }
45144838 return ret;
45154839 }
....@@ -4531,9 +4855,14 @@
45314855
45324856 int netif_rx(struct sk_buff *skb)
45334857 {
4858
+ int ret;
4859
+
45344860 trace_netif_rx_entry(skb);
45354861
4536
- return netif_rx_internal(skb);
4862
+ ret = netif_rx_internal(skb);
4863
+ trace_netif_rx_exit(ret);
4864
+
4865
+ return ret;
45374866 }
45384867 EXPORT_SYMBOL(netif_rx);
45394868
....@@ -4543,13 +4872,31 @@
45434872
45444873 trace_netif_rx_ni_entry(skb);
45454874
4546
- local_bh_disable();
4875
+ preempt_disable();
45474876 err = netif_rx_internal(skb);
4548
- local_bh_enable();
4877
+ if (local_softirq_pending())
4878
+ do_softirq();
4879
+ preempt_enable();
4880
+ trace_netif_rx_ni_exit(err);
45494881
45504882 return err;
45514883 }
45524884 EXPORT_SYMBOL(netif_rx_ni);
4885
+
4886
+int netif_rx_any_context(struct sk_buff *skb)
4887
+{
4888
+ /*
4889
+ * If invoked from contexts which do not invoke bottom half
4890
+ * processing either at return from interrupt or when softrqs are
4891
+ * reenabled, use netif_rx_ni() which invokes bottomhalf processing
4892
+ * directly.
4893
+ */
4894
+ if (in_interrupt())
4895
+ return netif_rx(skb);
4896
+ else
4897
+ return netif_rx_ni(skb);
4898
+}
4899
+EXPORT_SYMBOL(netif_rx_any_context);
45534900
45544901 static __latent_entropy void net_tx_action(struct softirq_action *h)
45554902 {
....@@ -4592,25 +4939,43 @@
45924939 sd->output_queue_tailp = &sd->output_queue;
45934940 local_irq_enable();
45944941
4942
+ rcu_read_lock();
4943
+
45954944 while (head) {
45964945 struct Qdisc *q = head;
45974946 spinlock_t *root_lock = NULL;
45984947
45994948 head = head->next_sched;
46004949
4601
- if (!(q->flags & TCQ_F_NOLOCK)) {
4602
- root_lock = qdisc_lock(q);
4603
- spin_lock(root_lock);
4604
- }
46054950 /* We need to make sure head->next_sched is read
46064951 * before clearing __QDISC_STATE_SCHED
46074952 */
46084953 smp_mb__before_atomic();
4954
+
4955
+ if (!(q->flags & TCQ_F_NOLOCK)) {
4956
+ root_lock = qdisc_lock(q);
4957
+ spin_lock(root_lock);
4958
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
4959
+ &q->state))) {
4960
+ /* There is a synchronize_net() between
4961
+ * STATE_DEACTIVATED flag being set and
4962
+ * qdisc_reset()/some_qdisc_is_busy() in
4963
+ * dev_deactivate(), so we can safely bail out
4964
+ * early here to avoid data race between
4965
+ * qdisc_deactivate() and some_qdisc_is_busy()
4966
+ * for lockless qdisc.
4967
+ */
4968
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
4969
+ continue;
4970
+ }
4971
+
46094972 clear_bit(__QDISC_STATE_SCHED, &q->state);
46104973 qdisc_run(q);
46114974 if (root_lock)
46124975 spin_unlock(root_lock);
46134976 }
4977
+
4978
+ rcu_read_unlock();
46144979 }
46154980
46164981 xfrm_dev_backlog(sd);
....@@ -4625,7 +4990,7 @@
46254990
46264991 static inline struct sk_buff *
46274992 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4628
- struct net_device *orig_dev)
4993
+ struct net_device *orig_dev, bool *another)
46294994 {
46304995 #ifdef CONFIG_NET_CLS_ACT
46314996 struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
....@@ -4645,10 +5010,12 @@
46455010 }
46465011
46475012 qdisc_skb_cb(skb)->pkt_len = skb->len;
5013
+ qdisc_skb_cb(skb)->mru = 0;
46485014 skb->tc_at_ingress = 1;
46495015 mini_qdisc_bstats_cpu_update(miniq, skb);
46505016
4651
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
5017
+ switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
5018
+ &cl_res, false)) {
46525019 case TC_ACT_OK:
46535020 case TC_ACT_RECLASSIFY:
46545021 skb->tc_index = TC_H_MIN(cl_res.classid);
....@@ -4668,11 +5035,13 @@
46685035 * redirecting to another netdev
46695036 */
46705037 __skb_push(skb, skb->mac_len);
4671
- skb_do_redirect(skb);
5038
+ if (skb_do_redirect(skb) == -EAGAIN) {
5039
+ __skb_pull(skb, skb->mac_len);
5040
+ *another = true;
5041
+ break;
5042
+ }
46725043 return NULL;
4673
- case TC_ACT_REINSERT:
4674
- /* this does not scrub the packet, and updates stats on error */
4675
- skb_tc_reinsert(skb, &cl_res);
5044
+ case TC_ACT_CONSUMED:
46765045 return NULL;
46775046 default:
46785047 break;
....@@ -4772,7 +5141,6 @@
47725141 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
47735142 int *ret, struct net_device *orig_dev)
47745143 {
4775
-#ifdef CONFIG_NETFILTER_INGRESS
47765144 if (nf_hook_ingress_active(skb)) {
47775145 int ingress_retval;
47785146
....@@ -4786,7 +5154,6 @@
47865154 rcu_read_unlock();
47875155 return ingress_retval;
47885156 }
4789
-#endif /* CONFIG_NETFILTER_INGRESS */
47905157 return 0;
47915158 }
47925159
....@@ -4801,7 +5168,7 @@
48015168 int ret = NET_RX_DROP;
48025169 __be16 type;
48035170
4804
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
5171
+ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
48055172
48065173 trace_netif_receive_skb(skb);
48075174
....@@ -4861,7 +5228,12 @@
48615228 skip_taps:
48625229 #ifdef CONFIG_NET_INGRESS
48635230 if (static_branch_unlikely(&ingress_needed_key)) {
4864
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5231
+ bool another = false;
5232
+
5233
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5234
+ &another);
5235
+ if (another)
5236
+ goto another_round;
48655237 if (!skb)
48665238 goto out;
48675239
....@@ -4869,7 +5241,7 @@
48695241 goto out;
48705242 }
48715243 #endif
4872
- skb_reset_tc(skb);
5244
+ skb_reset_redirect(skb);
48735245 skip_classify:
48745246 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
48755247 goto drop;
....@@ -4906,14 +5278,42 @@
49065278 }
49075279 }
49085280
4909
- if (unlikely(skb_vlan_tag_present(skb))) {
4910
- if (skb_vlan_tag_get_id(skb))
5281
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5282
+check_vlan_id:
5283
+ if (skb_vlan_tag_get_id(skb)) {
5284
+ /* Vlan id is non 0 and vlan_do_receive() above couldn't
5285
+ * find vlan device.
5286
+ */
49115287 skb->pkt_type = PACKET_OTHERHOST;
5288
+ } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5289
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5290
+ /* Outer header is 802.1P with vlan 0, inner header is
5291
+ * 802.1Q or 802.1AD and vlan_do_receive() above could
5292
+ * not find vlan dev for vlan id 0.
5293
+ */
5294
+ __vlan_hwaccel_clear_tag(skb);
5295
+ skb = skb_vlan_untag(skb);
5296
+ if (unlikely(!skb))
5297
+ goto out;
5298
+ if (vlan_do_receive(&skb))
5299
+ /* After stripping off 802.1P header with vlan 0
5300
+ * vlan dev is found for inner header.
5301
+ */
5302
+ goto another_round;
5303
+ else if (unlikely(!skb))
5304
+ goto out;
5305
+ else
5306
+ /* We have stripped outer 802.1P vlan 0 header.
5307
+ * But could not find vlan dev.
5308
+ * check again for vlan id to set OTHERHOST.
5309
+ */
5310
+ goto check_vlan_id;
5311
+ }
49125312 /* Note: we might in the future use prio bits
49135313 * and set skb->priority like in vlan_do_receive()
49145314 * For the time being, just ignore Priority Code Point
49155315 */
4916
- skb->vlan_tci = 0;
5316
+ __vlan_hwaccel_clear_tag(skb);
49175317 }
49185318
49195319 type = skb->protocol;
....@@ -4969,7 +5369,8 @@
49695369
49705370 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
49715371 if (pt_prev)
4972
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5372
+ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5373
+ skb->dev, pt_prev, orig_dev);
49735374 return ret;
49745375 }
49755376
....@@ -4979,7 +5380,7 @@
49795380 *
49805381 * More direct receive version of netif_receive_skb(). It should
49815382 * only be used by callers that have a need to skip RPS and Generic XDP.
4982
- * Caller must also take care of handling if (page_is_)pfmemalloc.
5383
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
49835384 *
49845385 * This function may only be called from softirq context and interrupts
49855386 * should be enabled.
....@@ -5011,7 +5412,8 @@
50115412 if (list_empty(head))
50125413 return;
50135414 if (pt_prev->list_func != NULL)
5014
- pt_prev->list_func(head, pt_prev, orig_dev);
5415
+ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5416
+ ip_list_rcv, head, pt_prev, orig_dev);
50155417 else
50165418 list_for_each_entry_safe(skb, next, head, list) {
50175419 skb_list_del_init(skb);
....@@ -5122,6 +5524,25 @@
51225524 struct bpf_prog *new = xdp->prog;
51235525 int ret = 0;
51245526
5527
+ if (new) {
5528
+ u32 i;
5529
+
5530
+ mutex_lock(&new->aux->used_maps_mutex);
5531
+
5532
+ /* generic XDP does not work with DEVMAPs that can
5533
+ * have a bpf_prog installed on an entry
5534
+ */
5535
+ for (i = 0; i < new->aux->used_map_cnt; i++) {
5536
+ if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
5537
+ cpu_map_prog_allowed(new->aux->used_maps[i])) {
5538
+ mutex_unlock(&new->aux->used_maps_mutex);
5539
+ return -EINVAL;
5540
+ }
5541
+ }
5542
+
5543
+ mutex_unlock(&new->aux->used_maps_mutex);
5544
+ }
5545
+
51255546 switch (xdp->command) {
51265547 case XDP_SETUP_PROG:
51275548 rcu_assign_pointer(dev->xdp_prog, new);
....@@ -5137,10 +5558,6 @@
51375558 }
51385559 break;
51395560
5140
- case XDP_QUERY_PROG:
5141
- xdp->prog_id = old ? old->aux->id : 0;
5142
- break;
5143
-
51445561 default:
51455562 ret = -EINVAL;
51465563 break;
....@@ -5153,14 +5570,14 @@
51535570 {
51545571 int ret;
51555572
5156
- net_timestamp_check(netdev_tstamp_prequeue, skb);
5573
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
51575574
51585575 if (skb_defer_rx_timestamp(skb))
51595576 return NET_RX_SUCCESS;
51605577
51615578 rcu_read_lock();
51625579 #ifdef CONFIG_RPS
5163
- if (static_key_false(&rps_needed)) {
5580
+ if (static_branch_unlikely(&rps_needed)) {
51645581 struct rps_dev_flow voidflow, *rflow = &voidflow;
51655582 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
51665583
....@@ -5183,7 +5600,7 @@
51835600
51845601 INIT_LIST_HEAD(&sublist);
51855602 list_for_each_entry_safe(skb, next, head, list) {
5186
- net_timestamp_check(netdev_tstamp_prequeue, skb);
5603
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
51875604 skb_list_del_init(skb);
51885605 if (!skb_defer_rx_timestamp(skb))
51895606 list_add_tail(&skb->list, &sublist);
....@@ -5192,7 +5609,7 @@
51925609
51935610 rcu_read_lock();
51945611 #ifdef CONFIG_RPS
5195
- if (static_key_false(&rps_needed)) {
5612
+ if (static_branch_unlikely(&rps_needed)) {
51965613 list_for_each_entry_safe(skb, next, head, list) {
51975614 struct rps_dev_flow voidflow, *rflow = &voidflow;
51985615 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
....@@ -5226,9 +5643,14 @@
52265643 */
52275644 int netif_receive_skb(struct sk_buff *skb)
52285645 {
5646
+ int ret;
5647
+
52295648 trace_netif_receive_skb_entry(skb);
52305649
5231
- return netif_receive_skb_internal(skb);
5650
+ ret = netif_receive_skb_internal(skb);
5651
+ trace_netif_receive_skb_exit(ret);
5652
+
5653
+ return ret;
52325654 }
52335655 EXPORT_SYMBOL(netif_receive_skb);
52345656
....@@ -5248,13 +5670,16 @@
52485670
52495671 if (list_empty(head))
52505672 return;
5251
- list_for_each_entry(skb, head, list)
5252
- trace_netif_receive_skb_list_entry(skb);
5673
+ if (trace_netif_receive_skb_list_entry_enabled()) {
5674
+ list_for_each_entry(skb, head, list)
5675
+ trace_netif_receive_skb_list_entry(skb);
5676
+ }
52535677 netif_receive_skb_list_internal(head);
5678
+ trace_netif_receive_skb_list_exit(0);
52545679 }
52555680 EXPORT_SYMBOL(netif_receive_skb_list);
52565681
5257
-DEFINE_PER_CPU(struct work_struct, flush_works);
5682
+static DEFINE_PER_CPU(struct work_struct, flush_works);
52585683
52595684 /* Network device is going away, flush any packets still pending */
52605685 static void flush_backlog(struct work_struct *work)
....@@ -5287,23 +5712,89 @@
52875712 local_bh_enable();
52885713 }
52895714
5715
+static bool flush_required(int cpu)
5716
+{
5717
+#if IS_ENABLED(CONFIG_RPS)
5718
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5719
+ bool do_flush;
5720
+
5721
+ local_irq_disable();
5722
+ rps_lock(sd);
5723
+
5724
+ /* as insertion into process_queue happens with the rps lock held,
5725
+ * process_queue access may race only with dequeue
5726
+ */
5727
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5728
+ !skb_queue_empty_lockless(&sd->process_queue);
5729
+ rps_unlock(sd);
5730
+ local_irq_enable();
5731
+
5732
+ return do_flush;
5733
+#endif
5734
+ /* without RPS we can't safely check input_pkt_queue: during a
5735
+ * concurrent remote skb_queue_splice() we can detect as empty both
5736
+ * input_pkt_queue and process_queue even if the latter could end-up
5737
+ * containing a lot of packets.
5738
+ */
5739
+ return true;
5740
+}
5741
+
52905742 static void flush_all_backlogs(void)
52915743 {
5744
+ static cpumask_t flush_cpus;
52925745 unsigned int cpu;
5746
+
5747
+ /* since we are under rtnl lock protection we can use static data
5748
+ * for the cpumask and avoid allocating on stack the possibly
5749
+ * large mask
5750
+ */
5751
+ ASSERT_RTNL();
52935752
52945753 get_online_cpus();
52955754
5296
- for_each_online_cpu(cpu)
5297
- queue_work_on(cpu, system_highpri_wq,
5298
- per_cpu_ptr(&flush_works, cpu));
5755
+ cpumask_clear(&flush_cpus);
5756
+ for_each_online_cpu(cpu) {
5757
+ if (flush_required(cpu)) {
5758
+ queue_work_on(cpu, system_highpri_wq,
5759
+ per_cpu_ptr(&flush_works, cpu));
5760
+ cpumask_set_cpu(cpu, &flush_cpus);
5761
+ }
5762
+ }
52995763
5300
- for_each_online_cpu(cpu)
5764
+ /* we can have in flight packet[s] on the cpus we are not flushing,
5765
+ * synchronize_net() in unregister_netdevice_many() will take care of
5766
+ * them
5767
+ */
5768
+ for_each_cpu(cpu, &flush_cpus)
53015769 flush_work(per_cpu_ptr(&flush_works, cpu));
53025770
53035771 put_online_cpus();
53045772 }
53055773
5306
-static int napi_gro_complete(struct sk_buff *skb)
5774
+/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5775
+static void gro_normal_list(struct napi_struct *napi)
5776
+{
5777
+ if (!napi->rx_count)
5778
+ return;
5779
+ netif_receive_skb_list_internal(&napi->rx_list);
5780
+ INIT_LIST_HEAD(&napi->rx_list);
5781
+ napi->rx_count = 0;
5782
+}
5783
+
5784
+/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5785
+ * pass the whole batch up to the stack.
5786
+ */
5787
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
5788
+{
5789
+ list_add_tail(&skb->list, &napi->rx_list);
5790
+ napi->rx_count += segs;
5791
+ if (napi->rx_count >= gro_normal_batch)
5792
+ gro_normal_list(napi);
5793
+}
5794
+
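
gro_normal_list() and gro_normal_one() add a small deferred-delivery batch to each NAPI instance: completed GRO_NORMAL skbs are parked on napi->rx_list and handed to the stack in one netif_receive_skb_list_internal() call once the accumulated count reaches gro_normal_batch (napi_complete_done() and napi_poll() drain the partial tail). A minimal sketch of the same batching pattern in plain C, with an int array and printf standing in for the skb list and the stack hand-off:

    #include <stdio.h>

    #define BATCH_LIMIT 8                  /* plays the role of gro_normal_batch */

    struct batcher {
        int pending[BATCH_LIMIT];          /* plays the role of napi->rx_list */
        int count;                         /* plays the role of napi->rx_count */
    };

    /* Hand everything queued so far to the consumer, like gro_normal_list(). */
    static void batch_flush(struct batcher *b)
    {
        if (!b->count)
            return;
        for (int i = 0; i < b->count; i++)
            printf("deliver %d\n", b->pending[i]);
        b->count = 0;
    }

    /* Queue one item and flush once the threshold is hit, like gro_normal_one(). */
    static void batch_add(struct batcher *b, int item)
    {
        b->pending[b->count++] = item;
        if (b->count >= BATCH_LIMIT)
            batch_flush(b);
    }

    int main(void)
    {
        struct batcher b = { .count = 0 };

        for (int i = 0; i < 20; i++)
            batch_add(&b, i);
        batch_flush(&b);                   /* drain the partial tail batch */
        return 0;
    }

Note that the kernel counts GSO segments rather than skbs, so a real batch may overshoot gro_normal_batch by one skb's worth of segments; the sketch keeps a plain item count.
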
5795
+INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5796
+INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5797
+static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
53075798 {
53085799 struct packet_offload *ptype;
53095800 __be16 type = skb->protocol;
....@@ -5322,7 +5813,9 @@
53225813 if (ptype->type != type || !ptype->callbacks.gro_complete)
53235814 continue;
53245815
5325
- err = ptype->callbacks.gro_complete(skb, 0);
5816
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5817
+ ipv6_gro_complete, inet_gro_complete,
5818
+ skb, 0);
53265819 break;
53275820 }
53285821 rcu_read_unlock();
....@@ -5334,7 +5827,8 @@
53345827 }
53355828
53365829 out:
5337
- return netif_receive_skb_internal(skb);
5830
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
5831
+ return NET_RX_SUCCESS;
53385832 }
53395833
53405834 static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
....@@ -5346,9 +5840,8 @@
53465840 list_for_each_entry_safe_reverse(skb, p, head, list) {
53475841 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
53485842 return;
5349
- list_del(&skb->list);
5350
- skb->next = NULL;
5351
- napi_gro_complete(skb);
5843
+ skb_list_del_init(skb);
5844
+ napi_gro_complete(napi, skb);
53525845 napi->gro_hash[index].count--;
53535846 }
53545847
....@@ -5362,11 +5855,13 @@
53625855 */
53635856 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
53645857 {
5365
- u32 i;
5858
+ unsigned long bitmask = napi->gro_bitmask;
5859
+ unsigned int i, base = ~0U;
53665860
5367
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
5368
- if (test_bit(i, &napi->gro_bitmask))
5369
- __napi_gro_flush_chain(napi, i, flush_old);
5861
+ while ((i = ffs(bitmask)) != 0) {
5862
+ bitmask >>= i;
5863
+ base += i;
5864
+ __napi_gro_flush_chain(napi, base, flush_old);
53705865 }
53715866 }
53725867 EXPORT_SYMBOL(napi_gro_flush);
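
The reworked napi_gro_flush() walks only the occupied hash buckets: ffs() yields the 1-based position of the lowest set bit of gro_bitmask, the mask is shifted past that bit, and base accumulates the absolute bucket index (it starts at ~0U so the first addition lands on the right bucket). A standalone sketch of that bit-iteration pattern, with a printf in place of __napi_gro_flush_chain():

    #include <stdio.h>
    #include <strings.h>                /* ffs() */

    int main(void)
    {
        unsigned long bitmask = 0x95;   /* buckets 0, 2, 4 and 7 hold packets */
        unsigned int i, base = ~0U;

        while ((i = ffs(bitmask)) != 0) {
            bitmask >>= i;              /* drop everything up to and including that bit */
            base += i;                  /* absolute index of the bucket just found */
            printf("flush bucket %u\n", base);   /* prints 0, 2, 4, 7 */
        }
        return 0;
    }
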
....@@ -5391,7 +5886,9 @@
53915886 }
53925887
53935888 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5394
- diffs |= p->vlan_tci ^ skb->vlan_tci;
5889
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5890
+ if (skb_vlan_tag_present(p))
5891
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
53955892 diffs |= skb_metadata_dst_cmp(p, skb);
53965893 diffs |= skb_metadata_differs(p, skb);
53975894 if (maclen == ETH_HLEN)
....@@ -5401,13 +5898,26 @@
54015898 diffs = memcmp(skb_mac_header(p),
54025899 skb_mac_header(skb),
54035900 maclen);
5901
+
5902
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
5903
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
5904
+ if (!diffs) {
5905
+ struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
5906
+ struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
5907
+
5908
+ diffs |= (!!p_ext) ^ (!!skb_ext);
5909
+ if (!diffs && unlikely(skb_ext))
5910
+ diffs |= p_ext->chain ^ skb_ext->chain;
5911
+ }
5912
+#endif
5913
+
54045914 NAPI_GRO_CB(p)->same_flow = !diffs;
54055915 }
54065916
54075917 return head;
54085918 }
54095919
5410
-static void skb_gro_reset_offset(struct sk_buff *skb)
5920
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
54115921 {
54125922 const struct skb_shared_info *pinfo = skb_shinfo(skb);
54135923 const skb_frag_t *frag0 = &pinfo->frags[0];
....@@ -5416,10 +5926,9 @@
54165926 NAPI_GRO_CB(skb)->frag0 = NULL;
54175927 NAPI_GRO_CB(skb)->frag0_len = 0;
54185928
5419
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5420
- pinfo->nr_frags &&
5929
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
54215930 !PageHighMem(skb_frag_page(frag0)) &&
5422
- (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
5931
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
54235932 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
54245933 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
54255934 skb_frag_size(frag0),
....@@ -5438,7 +5947,7 @@
54385947 skb->data_len -= grow;
54395948 skb->tail += grow;
54405949
5441
- pinfo->frags[0].page_offset += grow;
5950
+ skb_frag_off_add(&pinfo->frags[0], grow);
54425951 skb_frag_size_sub(&pinfo->frags[0], grow);
54435952
54445953 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
....@@ -5448,7 +5957,7 @@
54485957 }
54495958 }
54505959
5451
-static void gro_flush_oldest(struct list_head *head)
5960
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
54525961 {
54535962 struct sk_buff *oldest;
54545963
....@@ -5463,11 +5972,14 @@
54635972 /* Do not adjust napi->gro_hash[].count, caller is adding a new
54645973 * SKB to the chain.
54655974 */
5466
- list_del(&oldest->list);
5467
- oldest->next = NULL;
5468
- napi_gro_complete(oldest);
5975
+ skb_list_del_init(oldest);
5976
+ napi_gro_complete(napi, oldest);
54695977 }
54705978
5979
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5980
+ struct sk_buff *));
5981
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5982
+ struct sk_buff *));
54715983 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
54725984 {
54735985 u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
....@@ -5517,7 +6029,9 @@
55176029 NAPI_GRO_CB(skb)->csum_valid = 0;
55186030 }
55196031
5520
- pp = ptype->callbacks.gro_receive(gro_head, skb);
6032
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
6033
+ ipv6_gro_receive, inet_gro_receive,
6034
+ gro_head, skb);
55216035 break;
55226036 }
55236037 rcu_read_unlock();
....@@ -5525,7 +6039,7 @@
55256039 if (&ptype->list == head)
55266040 goto normal;
55276041
5528
- if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
6042
+ if (PTR_ERR(pp) == -EINPROGRESS) {
55296043 ret = GRO_CONSUMED;
55306044 goto ok;
55316045 }
....@@ -5534,9 +6048,8 @@
55346048 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
55356049
55366050 if (pp) {
5537
- list_del(&pp->list);
5538
- pp->next = NULL;
5539
- napi_gro_complete(pp);
6051
+ skb_list_del_init(pp);
6052
+ napi_gro_complete(napi, pp);
55406053 napi->gro_hash[hash].count--;
55416054 }
55426055
....@@ -5547,7 +6060,7 @@
55476060 goto normal;
55486061
55496062 if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5550
- gro_flush_oldest(gro_head);
6063
+ gro_flush_oldest(napi, gro_head);
55516064 } else {
55526065 napi->gro_hash[hash].count++;
55536066 }
....@@ -5607,17 +6120,19 @@
56076120
56086121 static void napi_skb_free_stolen_head(struct sk_buff *skb)
56096122 {
6123
+ nf_reset_ct(skb);
56106124 skb_dst_drop(skb);
5611
- secpath_reset(skb);
6125
+ skb_ext_put(skb);
56126126 kmem_cache_free(skbuff_head_cache, skb);
56136127 }
56146128
5615
-static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
6129
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
6130
+ struct sk_buff *skb,
6131
+ gro_result_t ret)
56166132 {
56176133 switch (ret) {
56186134 case GRO_NORMAL:
5619
- if (netif_receive_skb_internal(skb))
5620
- ret = GRO_DROP;
6135
+ gro_normal_one(napi, skb, 1);
56216136 break;
56226137
56236138 case GRO_DROP:
....@@ -5642,12 +6157,17 @@
56426157
56436158 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
56446159 {
6160
+ gro_result_t ret;
6161
+
56456162 skb_mark_napi_id(skb, napi);
56466163 trace_napi_gro_receive_entry(skb);
56476164
5648
- skb_gro_reset_offset(skb);
6165
+ skb_gro_reset_offset(skb, 0);
56496166
5650
- return napi_skb_finish(dev_gro_receive(napi, skb), skb);
6167
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6168
+ trace_napi_gro_receive_exit(ret);
6169
+
6170
+ return ret;
56516171 }
56526172 EXPORT_SYMBOL(napi_gro_receive);
56536173
....@@ -5660,7 +6180,7 @@
56606180 __skb_pull(skb, skb_headlen(skb));
56616181 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
56626182 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5663
- skb->vlan_tci = 0;
6183
+ __vlan_hwaccel_clear_tag(skb);
56646184 skb->dev = napi->dev;
56656185 skb->skb_iif = 0;
56666186
....@@ -5670,7 +6190,8 @@
56706190 skb->encapsulation = 0;
56716191 skb_shinfo(skb)->gso_type = 0;
56726192 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5673
- secpath_reset(skb);
6193
+ skb_ext_reset(skb);
6194
+ nf_reset_ct(skb);
56746195
56756196 napi->skb = skb;
56766197 }
....@@ -5699,8 +6220,8 @@
56996220 case GRO_HELD:
57006221 __skb_push(skb, ETH_HLEN);
57016222 skb->protocol = eth_type_trans(skb, skb->dev);
5702
- if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5703
- ret = GRO_DROP;
6223
+ if (ret == GRO_NORMAL)
6224
+ gro_normal_one(napi, skb, 1);
57046225 break;
57056226
57066227 case GRO_DROP:
....@@ -5735,7 +6256,7 @@
57356256 napi->skb = NULL;
57366257
57376258 skb_reset_mac_header(skb);
5738
- skb_gro_reset_offset(skb);
6259
+ skb_gro_reset_offset(skb, hlen);
57396260
57406261 if (unlikely(skb_gro_header_hard(skb, hlen))) {
57416262 eth = skb_gro_header_slow(skb, hlen, 0);
....@@ -5765,6 +6286,7 @@
57656286
57666287 gro_result_t napi_gro_frags(struct napi_struct *napi)
57676288 {
6289
+ gro_result_t ret;
57686290 struct sk_buff *skb = napi_frags_skb(napi);
57696291
57706292 if (!skb)
....@@ -5772,7 +6294,10 @@
57726294
57736295 trace_napi_gro_frags_entry(skb);
57746296
5775
- return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6297
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6298
+ trace_napi_gro_frags_exit(ret);
6299
+
6300
+ return ret;
57766301 }
57776302 EXPORT_SYMBOL(napi_gro_frags);
57786303
....@@ -5788,10 +6313,11 @@
57886313
57896314 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
57906315 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6316
+ /* See comments in __skb_checksum_complete(). */
57916317 if (likely(!sum)) {
57926318 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
57936319 !skb->csum_complete_sw)
5794
- netdev_rx_csum_fault(skb->dev);
6320
+ netdev_rx_csum_fault(skb->dev, skb);
57956321 }
57966322
57976323 NAPI_GRO_CB(skb)->csum = wsum;
....@@ -5827,14 +6353,12 @@
58276353 sd->rps_ipi_list = NULL;
58286354
58296355 local_irq_enable();
5830
- preempt_check_resched_rt();
58316356
58326357 /* Send pending IPI's to kick RPS processing on remote cpus. */
58336358 net_rps_send_ipi(remsd);
58346359 } else
58356360 #endif
58366361 local_irq_enable();
5837
- preempt_check_resched_rt();
58386362 }
58396363
58406364 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
....@@ -5860,22 +6384,21 @@
58606384 net_rps_action_and_irq_enable(sd);
58616385 }
58626386
5863
- napi->weight = dev_rx_weight;
6387
+ napi->weight = READ_ONCE(dev_rx_weight);
58646388 while (again) {
58656389 struct sk_buff *skb;
58666390
5867
- local_irq_disable();
58686391 while ((skb = __skb_dequeue(&sd->process_queue))) {
5869
- local_irq_enable();
58706392 rcu_read_lock();
58716393 __netif_receive_skb(skb);
58726394 rcu_read_unlock();
58736395 input_queue_head_incr(sd);
58746396 if (++work >= quota)
5875
- goto state_changed;
5876
- local_irq_disable();
6397
+ return work;
6398
+
58776399 }
58786400
6401
+ local_irq_disable();
58796402 rps_lock(sd);
58806403 if (skb_queue_empty(&sd->input_pkt_queue)) {
58816404 /*
....@@ -5896,10 +6419,6 @@
58966419 local_irq_enable();
58976420 }
58986421
5899
-state_changed:
5900
- napi_gro_flush(napi, false);
5901
- sd->current_napi = NULL;
5902
-
59036422 return work;
59046423 }
59056424
....@@ -5917,7 +6436,6 @@
59176436 local_irq_save(flags);
59186437 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
59196438 local_irq_restore(flags);
5920
- preempt_check_resched_rt();
59216439 }
59226440 EXPORT_SYMBOL(__napi_schedule);
59236441
....@@ -5926,7 +6444,7 @@
59266444 * @n: napi context
59276445 *
59286446 * Test if NAPI routine is already running, and if not mark
5929
- * it as running. This is used as a condition variable
6447
+ * it as running. This is used as a condition variable to
59306448 * insure only one NAPI poll instance runs. We also make
59316449 * sure there is no pending NAPI disable.
59326450 */
....@@ -5954,7 +6472,6 @@
59546472 }
59556473 EXPORT_SYMBOL(napi_schedule_prep);
59566474
5957
-#ifndef CONFIG_PREEMPT_RT_FULL
59586475 /**
59596476 * __napi_schedule_irqoff - schedule for receive
59606477 * @n: entry to schedule
....@@ -5973,11 +6490,11 @@
59736490 __napi_schedule(n);
59746491 }
59756492 EXPORT_SYMBOL(__napi_schedule_irqoff);
5976
-#endif
59776493
59786494 bool napi_complete_done(struct napi_struct *n, int work_done)
59796495 {
5980
- unsigned long flags, val, new;
6496
+ unsigned long flags, val, new, timeout = 0;
6497
+ bool ret = true;
59816498
59826499 /*
59836500 * 1) Don't let napi dequeue from the cpu poll list
....@@ -5989,28 +6506,31 @@
59896506 NAPIF_STATE_IN_BUSY_POLL)))
59906507 return false;
59916508
6509
+ if (work_done) {
6510
+ if (n->gro_bitmask)
6511
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
6512
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6513
+ }
6514
+ if (n->defer_hard_irqs_count > 0) {
6515
+ n->defer_hard_irqs_count--;
6516
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
6517
+ if (timeout)
6518
+ ret = false;
6519
+ }
59926520 if (n->gro_bitmask) {
5993
- unsigned long timeout = 0;
5994
-
5995
- if (work_done)
5996
- timeout = n->dev->gro_flush_timeout;
5997
-
59986521 /* When the NAPI instance uses a timeout and keeps postponing
59996522 * it, we need to bound somehow the time packets are kept in
60006523 * the GRO layer
60016524 */
60026525 napi_gro_flush(n, !!timeout);
6003
- if (timeout)
6004
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
6005
- HRTIMER_MODE_REL_PINNED);
60066526 }
6007
- if (unlikely(!list_empty(&n->poll_list))) {
6008
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
60096527
6528
+ gro_normal_list(n);
6529
+
6530
+ if (unlikely(!list_empty(&n->poll_list))) {
60106531 /* If n->poll_list is not empty, we need to mask irqs */
60116532 local_irq_save(flags);
60126533 list_del_init(&n->poll_list);
6013
- sd->current_napi = NULL;
60146534 local_irq_restore(flags);
60156535 }
60166536
....@@ -6034,7 +6554,10 @@
60346554 return false;
60356555 }
60366556
6037
- return true;
6557
+ if (timeout)
6558
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
6559
+ HRTIMER_MODE_REL_PINNED);
6560
+ return ret;
60386561 }
60396562 EXPORT_SYMBOL(napi_complete_done);
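
napi_complete_done() now folds two per-device knobs into one hrtimer decision: gro_flush_timeout is armed when the poll did work, and napi_defer_hard_irqs lets the instance stay in polled mode for a few empty rounds (returning false so the driver does not re-enable its interrupt) before the timer takes over. A reduced model of that decision, using stand-in fields rather than the kernel's napi_struct and net_device:

    #include <stdbool.h>
    #include <stdio.h>

    struct cfg {
        unsigned long gro_flush_timeout;    /* ns; per-device sysfs knob */
        unsigned int napi_defer_hard_irqs;  /* per-device sysfs knob */
    };

    /* Returns true when the hard irq may be re-enabled; *timeout receives the
     * hrtimer delay to arm (0 = none). Mirrors the order of checks above. */
    static bool complete_done(unsigned int *defer_count, bool has_gro,
                              const struct cfg *c, int work_done,
                              unsigned long *timeout)
    {
        bool ret = true;

        *timeout = 0;
        if (work_done) {
            if (has_gro)
                *timeout = c->gro_flush_timeout;
            *defer_count = c->napi_defer_hard_irqs;
        }
        if (*defer_count > 0) {
            (*defer_count)--;
            *timeout = c->gro_flush_timeout;
            if (*timeout)
                ret = false;                /* keep polling, don't re-arm the irq yet */
        }
        return ret;
    }

    int main(void)
    {
        struct cfg c = { .gro_flush_timeout = 20000, .napi_defer_hard_irqs = 2 };
        unsigned int defer = 0;
        unsigned long t;

        for (int round = 0; round < 4; round++) {
            int work = (round == 0) ? 16 : 0;   /* only the first poll finds packets */
            bool rearm = complete_done(&defer, true, &c, work, &t);
            printf("round %d: work=%2d rearm_irq=%d timeout=%lu\n",
                   round, work, rearm, t);
        }
        return 0;
    }
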
60406563
....@@ -6077,10 +6600,19 @@
60776600 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
60786601 */
60796602 rc = napi->poll(napi, BUSY_POLL_BUDGET);
6603
+ /* We can't gro_normal_list() here, because napi->poll() might have
6604
+ * rearmed the napi (napi_complete_done()) in which case it could
6605
+ * already be running on another CPU.
6606
+ */
60806607 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
60816608 netpoll_poll_unlock(have_poll_lock);
6082
- if (rc == BUSY_POLL_BUDGET)
6609
+ if (rc == BUSY_POLL_BUDGET) {
6610
+ /* As the whole budget was spent, we still own the napi so can
6611
+ * safely handle the rx_list.
6612
+ */
6613
+ gro_normal_list(napi);
60836614 __napi_schedule(napi);
6615
+ }
60846616 local_bh_enable();
60856617 }
60866618
....@@ -6125,6 +6657,7 @@
61256657 }
61266658 work = napi_poll(napi, BUSY_POLL_BUDGET);
61276659 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6660
+ gro_normal_list(napi);
61286661 count:
61296662 if (work > 0)
61306663 __NET_ADD_STATS(dev_net(napi->dev),
....@@ -6158,8 +6691,7 @@
61586691
61596692 static void napi_hash_add(struct napi_struct *napi)
61606693 {
6161
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6162
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6694
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
61636695 return;
61646696
61656697 spin_lock(&napi_hash_lock);
....@@ -6180,20 +6712,14 @@
61806712 /* Warning : caller is responsible to make sure rcu grace period
61816713 * is respected before freeing memory containing @napi
61826714 */
6183
-bool napi_hash_del(struct napi_struct *napi)
6715
+static void napi_hash_del(struct napi_struct *napi)
61846716 {
6185
- bool rcu_sync_needed = false;
6186
-
61876717 spin_lock(&napi_hash_lock);
61886718
6189
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6190
- rcu_sync_needed = true;
6191
- hlist_del_rcu(&napi->napi_hash_node);
6192
- }
6719
+ hlist_del_init_rcu(&napi->napi_hash_node);
6720
+
61936721 spin_unlock(&napi_hash_lock);
6194
- return rcu_sync_needed;
61956722 }
6196
-EXPORT_SYMBOL_GPL(napi_hash_del);
61976723
61986724 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
61996725 {
....@@ -6204,7 +6730,7 @@
62046730 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
62056731 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
62066732 */
6207
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6733
+ if (!napi_disable_pending(napi) &&
62086734 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
62096735 __napi_schedule_irqoff(napi);
62106736
....@@ -6225,15 +6751,21 @@
62256751 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
62266752 int (*poll)(struct napi_struct *, int), int weight)
62276753 {
6754
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6755
+ return;
6756
+
62286757 INIT_LIST_HEAD(&napi->poll_list);
6758
+ INIT_HLIST_NODE(&napi->napi_hash_node);
62296759 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
62306760 napi->timer.function = napi_watchdog;
62316761 init_gro_hash(napi);
62326762 napi->skb = NULL;
6763
+ INIT_LIST_HEAD(&napi->rx_list);
6764
+ napi->rx_count = 0;
62336765 napi->poll = poll;
62346766 if (weight > NAPI_POLL_WEIGHT)
6235
- pr_err_once("netif_napi_add() called with weight %d on device %s\n",
6236
- weight, dev->name);
6767
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6768
+ weight);
62376769 napi->weight = weight;
62386770 napi->dev = dev;
62396771 #ifdef CONFIG_NETPOLL
....@@ -6276,26 +6808,19 @@
62766808 }
62776809
62786810 /* Must be called in process context */
6279
-void netif_napi_del(struct napi_struct *napi)
6811
+void __netif_napi_del(struct napi_struct *napi)
62806812 {
6281
- might_sleep();
6282
- if (napi_hash_del(napi))
6283
- synchronize_net();
6284
- list_del_init(&napi->dev_list);
6813
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6814
+ return;
6815
+
6816
+ napi_hash_del(napi);
6817
+ list_del_rcu(&napi->dev_list);
62856818 napi_free_frags(napi);
62866819
62876820 flush_gro_hash(napi);
62886821 napi->gro_bitmask = 0;
62896822 }
6290
-EXPORT_SYMBOL(netif_napi_del);
6291
-
6292
-struct napi_struct *get_current_napi_context(void)
6293
-{
6294
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6295
-
6296
- return sd->current_napi;
6297
-}
6298
-EXPORT_SYMBOL(get_current_napi_context);
6823
+EXPORT_SYMBOL(__netif_napi_del);
62996824
63006825 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
63016826 {
....@@ -6316,14 +6841,13 @@
63166841 */
63176842 work = 0;
63186843 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6319
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6320
-
6321
- sd->current_napi = n;
63226844 work = n->poll(n, weight);
63236845 trace_napi_poll(n, work, weight);
63246846 }
63256847
6326
- WARN_ON_ONCE(work > weight);
6848
+ if (unlikely(work > weight))
6849
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6850
+ n->poll, work, weight);
63276851
63286852 if (likely(work < weight))
63296853 goto out_unlock;
....@@ -6344,6 +6868,8 @@
63446868 */
63456869 napi_gro_flush(n, HZ >= 1000);
63466870 }
6871
+
6872
+ gro_normal_list(n);
63476873
63486874 /* Some drivers may have called napi_schedule
63496875 * prior to exhausting their budget.
....@@ -6366,22 +6892,14 @@
63666892 {
63676893 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
63686894 unsigned long time_limit = jiffies +
6369
- usecs_to_jiffies(netdev_budget_usecs);
6370
- int budget = netdev_budget;
6371
- struct sk_buff_head tofree_q;
6372
- struct sk_buff *skb;
6895
+ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
6896
+ int budget = READ_ONCE(netdev_budget);
63736897 LIST_HEAD(list);
63746898 LIST_HEAD(repoll);
63756899
6376
- __skb_queue_head_init(&tofree_q);
6377
-
63786900 local_irq_disable();
6379
- skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
63806901 list_splice_init(&sd->poll_list, &list);
63816902 local_irq_enable();
6382
-
6383
- while ((skb = __skb_dequeue(&tofree_q)))
6384
- kfree_skb(skb);
63856903
63866904 for (;;) {
63876905 struct napi_struct *n;
....@@ -6412,7 +6930,7 @@
64126930 list_splice_tail(&repoll, &list);
64136931 list_splice(&list, &sd->poll_list);
64146932 if (!list_empty(&sd->poll_list))
6415
- __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
6933
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
64166934
64176935 net_rps_action_and_irq_enable(sd);
64186936 out:
....@@ -6424,6 +6942,9 @@
64246942
64256943 /* upper master flag, there can only be one master device per list */
64266944 bool master;
6945
+
6946
+ /* lookup ignore flag */
6947
+ bool ignore;
64276948
64286949 /* counter for the number of times this device was added to us */
64296950 u16 ref_nr;
....@@ -6447,9 +6968,10 @@
64476968 return NULL;
64486969 }
64496970
6450
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6971
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6972
+ struct netdev_nested_priv *priv)
64516973 {
6452
- struct net_device *dev = data;
6974
+ struct net_device *dev = (struct net_device *)priv->data;
64536975
64546976 return upper_dev == dev;
64556977 }
....@@ -6466,10 +6988,14 @@
64666988 bool netdev_has_upper_dev(struct net_device *dev,
64676989 struct net_device *upper_dev)
64686990 {
6991
+ struct netdev_nested_priv priv = {
6992
+ .data = (void *)upper_dev,
6993
+ };
6994
+
64696995 ASSERT_RTNL();
64706996
6471
- return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6472
- upper_dev);
6997
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6998
+ &priv);
64736999 }
64747000 EXPORT_SYMBOL(netdev_has_upper_dev);
64757001
....@@ -6486,8 +7012,12 @@
64867012 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
64877013 struct net_device *upper_dev)
64887014 {
6489
- return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6490
- upper_dev);
7015
+ struct netdev_nested_priv priv = {
7016
+ .data = (void *)upper_dev,
7017
+ };
7018
+
7019
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7020
+ &priv);
64917021 }
64927022 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
64937023
....@@ -6529,6 +7059,22 @@
65297059 return NULL;
65307060 }
65317061 EXPORT_SYMBOL(netdev_master_upper_dev_get);
7062
+
7063
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7064
+{
7065
+ struct netdev_adjacent *upper;
7066
+
7067
+ ASSERT_RTNL();
7068
+
7069
+ if (list_empty(&dev->adj_list.upper))
7070
+ return NULL;
7071
+
7072
+ upper = list_first_entry(&dev->adj_list.upper,
7073
+ struct netdev_adjacent, list);
7074
+ if (likely(upper->master) && !upper->ignore)
7075
+ return upper->dev;
7076
+ return NULL;
7077
+}
65327078
65337079 /**
65347080 * netdev_has_any_lower_dev - Check if device is linked to some device
....@@ -6580,8 +7126,9 @@
65807126 }
65817127 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
65827128
6583
-static struct net_device *netdev_next_upper_dev(struct net_device *dev,
6584
- struct list_head **iter)
7129
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7130
+ struct list_head **iter,
7131
+ bool *ignore)
65857132 {
65867133 struct netdev_adjacent *upper;
65877134
....@@ -6591,6 +7138,7 @@
65917138 return NULL;
65927139
65937140 *iter = &upper->list;
7141
+ *ignore = upper->ignore;
65947142
65957143 return upper->dev;
65967144 }
....@@ -6612,30 +7160,33 @@
66127160 return upper->dev;
66137161 }
66147162
6615
-static int netdev_walk_all_upper_dev(struct net_device *dev,
6616
- int (*fn)(struct net_device *dev,
6617
- void *data),
6618
- void *data)
7163
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
7164
+ int (*fn)(struct net_device *dev,
7165
+ struct netdev_nested_priv *priv),
7166
+ struct netdev_nested_priv *priv)
66197167 {
66207168 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
66217169 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
66227170 int ret, cur = 0;
7171
+ bool ignore;
66237172
66247173 now = dev;
66257174 iter = &dev->adj_list.upper;
66267175
66277176 while (1) {
66287177 if (now != dev) {
6629
- ret = fn(now, data);
7178
+ ret = fn(now, priv);
66307179 if (ret)
66317180 return ret;
66327181 }
66337182
66347183 next = NULL;
66357184 while (1) {
6636
- udev = netdev_next_upper_dev(now, &iter);
7185
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
66377186 if (!udev)
66387187 break;
7188
+ if (ignore)
7189
+ continue;
66397190
66407191 next = udev;
66417192 niter = &udev->adj_list.upper;
....@@ -6660,8 +7211,8 @@
66607211
66617212 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
66627213 int (*fn)(struct net_device *dev,
6663
- void *data),
6664
- void *data)
7214
+ struct netdev_nested_priv *priv),
7215
+ struct netdev_nested_priv *priv)
66657216 {
66667217 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
66677218 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6672,7 +7223,7 @@
66727223
66737224 while (1) {
66747225 if (now != dev) {
6675
- ret = fn(now, data);
7226
+ ret = fn(now, priv);
66767227 if (ret)
66777228 return ret;
66787229 }
....@@ -6704,6 +7255,20 @@
67047255 return 0;
67057256 }
67067257 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
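
The upper/lower walkers share one traversal scheme: an explicit dev_stack/iter_stack pair records where to resume in the parent's adjacency list when a branch is exhausted, so arbitrarily nested stackings are visited without recursion and the stack depth stays bounded by MAX_NEST_DEV. A condensed, self-contained sketch of that explicit-stack walk over a toy upper-device tree (hypothetical node type and names, array indices in place of list iterators):

    #include <stdio.h>

    #define MAX_NEST 8

    struct node {
        const char *name;
        struct node *upper[4];     /* plays the role of adj_list.upper */
        int n_upper;
    };

    /* Visit every device stacked above 'dev', depth first, without recursion. */
    static void walk_all_upper(struct node *dev)
    {
        struct node *stack[MAX_NEST + 1];
        int idx_stack[MAX_NEST + 1];
        struct node *now = dev;
        int iter = 0, cur = 0;

        while (1) {
            if (iter < now->n_upper) {
                struct node *next = now->upper[iter++];

                printf("visit %s\n", next->name);   /* fn(next, priv) */
                stack[cur] = now;                   /* remember how far we got */
                idx_stack[cur++] = iter;
                now = next;
                iter = 0;
            } else {
                if (!cur)
                    return;                         /* back at the root: done */
                now = stack[--cur];                 /* resume the parent's list */
                iter = idx_stack[cur];
            }
        }
    }

    int main(void)
    {
        struct node vlan = { "vlan0.10", { 0 }, 0 };
        struct node bond = { "bond0",    { 0 }, 0 };
        struct node br   = { "br0",      { &vlan }, 1 };
        struct node eth  = { "eth0",     { &br, &bond }, 2 };

        walk_all_upper(&eth);          /* prints br0, vlan0.10, bond0 */
        return 0;
    }
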
7258
+
7259
+static bool __netdev_has_upper_dev(struct net_device *dev,
7260
+ struct net_device *upper_dev)
7261
+{
7262
+ struct netdev_nested_priv priv = {
7263
+ .flags = 0,
7264
+ .data = (void *)upper_dev,
7265
+ };
7266
+
7267
+ ASSERT_RTNL();
7268
+
7269
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7270
+ &priv);
7271
+}
67077272
67087273 /**
67097274 * netdev_lower_get_next_private - Get the next ->private from the
....@@ -6801,10 +7366,27 @@
68017366 return lower->dev;
68027367 }
68037368
7369
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7370
+ struct list_head **iter,
7371
+ bool *ignore)
7372
+{
7373
+ struct netdev_adjacent *lower;
7374
+
7375
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7376
+
7377
+ if (&lower->list == &dev->adj_list.lower)
7378
+ return NULL;
7379
+
7380
+ *iter = &lower->list;
7381
+ *ignore = lower->ignore;
7382
+
7383
+ return lower->dev;
7384
+}
7385
+
68047386 int netdev_walk_all_lower_dev(struct net_device *dev,
68057387 int (*fn)(struct net_device *dev,
6806
- void *data),
6807
- void *data)
7388
+ struct netdev_nested_priv *priv),
7389
+ struct netdev_nested_priv *priv)
68087390 {
68097391 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
68107392 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6815,7 +7397,7 @@
68157397
68167398 while (1) {
68177399 if (now != dev) {
6818
- ret = fn(now, data);
7400
+ ret = fn(now, priv);
68197401 if (ret)
68207402 return ret;
68217403 }
....@@ -6848,8 +7430,57 @@
68487430 }
68497431 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
68507432
6851
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6852
- struct list_head **iter)
7433
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
7434
+ int (*fn)(struct net_device *dev,
7435
+ struct netdev_nested_priv *priv),
7436
+ struct netdev_nested_priv *priv)
7437
+{
7438
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7439
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7440
+ int ret, cur = 0;
7441
+ bool ignore;
7442
+
7443
+ now = dev;
7444
+ iter = &dev->adj_list.lower;
7445
+
7446
+ while (1) {
7447
+ if (now != dev) {
7448
+ ret = fn(now, priv);
7449
+ if (ret)
7450
+ return ret;
7451
+ }
7452
+
7453
+ next = NULL;
7454
+ while (1) {
7455
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7456
+ if (!ldev)
7457
+ break;
7458
+ if (ignore)
7459
+ continue;
7460
+
7461
+ next = ldev;
7462
+ niter = &ldev->adj_list.lower;
7463
+ dev_stack[cur] = now;
7464
+ iter_stack[cur++] = iter;
7465
+ break;
7466
+ }
7467
+
7468
+ if (!next) {
7469
+ if (!cur)
7470
+ return 0;
7471
+ next = dev_stack[--cur];
7472
+ niter = iter_stack[cur];
7473
+ }
7474
+
7475
+ now = next;
7476
+ iter = niter;
7477
+ }
7478
+
7479
+ return 0;
7480
+}
7481
+
7482
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7483
+ struct list_head **iter)
68537484 {
68547485 struct netdev_adjacent *lower;
68557486
....@@ -6861,17 +7492,21 @@
68617492
68627493 return lower->dev;
68637494 }
7495
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
68647496
68657497 static u8 __netdev_upper_depth(struct net_device *dev)
68667498 {
68677499 struct net_device *udev;
68687500 struct list_head *iter;
68697501 u8 max_depth = 0;
7502
+ bool ignore;
68707503
68717504 for (iter = &dev->adj_list.upper,
6872
- udev = netdev_next_upper_dev(dev, &iter);
7505
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
68737506 udev;
6874
- udev = netdev_next_upper_dev(dev, &iter)) {
7507
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7508
+ if (ignore)
7509
+ continue;
68757510 if (max_depth < udev->upper_level)
68767511 max_depth = udev->upper_level;
68777512 }
....@@ -6884,11 +7519,14 @@
68847519 struct net_device *ldev;
68857520 struct list_head *iter;
68867521 u8 max_depth = 0;
7522
+ bool ignore;
68877523
68887524 for (iter = &dev->adj_list.lower,
6889
- ldev = netdev_next_lower_dev(dev, &iter);
7525
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
68907526 ldev;
6891
- ldev = netdev_next_lower_dev(dev, &iter)) {
7527
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7528
+ if (ignore)
7529
+ continue;
68927530 if (max_depth < ldev->lower_level)
68937531 max_depth = ldev->lower_level;
68947532 }
....@@ -6896,22 +7534,34 @@
68967534 return max_depth;
68977535 }
68987536
6899
-static int __netdev_update_upper_level(struct net_device *dev, void *data)
7537
+static int __netdev_update_upper_level(struct net_device *dev,
7538
+ struct netdev_nested_priv *__unused)
69007539 {
69017540 dev->upper_level = __netdev_upper_depth(dev) + 1;
69027541 return 0;
69037542 }
69047543
6905
-static int __netdev_update_lower_level(struct net_device *dev, void *data)
7544
+static int __netdev_update_lower_level(struct net_device *dev,
7545
+ struct netdev_nested_priv *priv)
69067546 {
69077547 dev->lower_level = __netdev_lower_depth(dev) + 1;
7548
+
7549
+#ifdef CONFIG_LOCKDEP
7550
+ if (!priv)
7551
+ return 0;
7552
+
7553
+ if (priv->flags & NESTED_SYNC_IMM)
7554
+ dev->nested_level = dev->lower_level - 1;
7555
+ if (priv->flags & NESTED_SYNC_TODO)
7556
+ net_unlink_todo(dev);
7557
+#endif
69087558 return 0;
69097559 }
69107560
69117561 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
69127562 int (*fn)(struct net_device *dev,
6913
- void *data),
6914
- void *data)
7563
+ struct netdev_nested_priv *priv),
7564
+ struct netdev_nested_priv *priv)
69157565 {
69167566 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
69177567 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6922,7 +7572,7 @@
69227572
69237573 while (1) {
69247574 if (now != dev) {
6925
- ret = fn(now, data);
7575
+ ret = fn(now, priv);
69267576 if (ret)
69277577 return ret;
69287578 }
....@@ -7052,6 +7702,7 @@
70527702 adj->master = master;
70537703 adj->ref_nr = 1;
70547704 adj->private = private;
7705
+ adj->ignore = false;
70557706 dev_hold(adj_dev);
70567707
70577708 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
....@@ -7181,6 +7832,7 @@
71817832 static int __netdev_upper_dev_link(struct net_device *dev,
71827833 struct net_device *upper_dev, bool master,
71837834 void *upper_priv, void *upper_info,
7835
+ struct netdev_nested_priv *priv,
71847836 struct netlink_ext_ack *extack)
71857837 {
71867838 struct netdev_notifier_changeupper_info changeupper_info = {
....@@ -7202,17 +7854,17 @@
72027854 return -EBUSY;
72037855
72047856 /* To prevent loops, check if dev is not upper device to upper_dev. */
7205
- if (netdev_has_upper_dev(upper_dev, dev))
7857
+ if (__netdev_has_upper_dev(upper_dev, dev))
72067858 return -EBUSY;
72077859
72087860 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
72097861 return -EMLINK;
72107862
72117863 if (!master) {
7212
- if (netdev_has_upper_dev(dev, upper_dev))
7864
+ if (__netdev_has_upper_dev(dev, upper_dev))
72137865 return -EEXIST;
72147866 } else {
7215
- master_dev = netdev_master_upper_dev_get(dev);
7867
+ master_dev = __netdev_master_upper_dev_get(dev);
72167868 if (master_dev)
72177869 return master_dev == upper_dev ? -EEXIST : -EBUSY;
72187870 }
....@@ -7235,10 +7887,11 @@
72357887 goto rollback;
72367888
72377889 __netdev_update_upper_level(dev, NULL);
7238
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7890
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
72397891
7240
- __netdev_update_lower_level(upper_dev, NULL);
7241
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
7892
+ __netdev_update_lower_level(upper_dev, priv);
7893
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7894
+ priv);
72427895
72437896 return 0;
72447897
....@@ -7263,8 +7916,13 @@
72637916 struct net_device *upper_dev,
72647917 struct netlink_ext_ack *extack)
72657918 {
7919
+ struct netdev_nested_priv priv = {
7920
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7921
+ .data = NULL,
7922
+ };
7923
+
72667924 return __netdev_upper_dev_link(dev, upper_dev, false,
7267
- NULL, NULL, extack);
7925
+ NULL, NULL, &priv, extack);
72687926 }
72697927 EXPORT_SYMBOL(netdev_upper_dev_link);
72707928
....@@ -7287,21 +7945,19 @@
72877945 void *upper_priv, void *upper_info,
72887946 struct netlink_ext_ack *extack)
72897947 {
7948
+ struct netdev_nested_priv priv = {
7949
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7950
+ .data = NULL,
7951
+ };
7952
+
72907953 return __netdev_upper_dev_link(dev, upper_dev, true,
7291
- upper_priv, upper_info, extack);
7954
+ upper_priv, upper_info, &priv, extack);
72927955 }
72937956 EXPORT_SYMBOL(netdev_master_upper_dev_link);
72947957
7295
-/**
7296
- * netdev_upper_dev_unlink - Removes a link to upper device
7297
- * @dev: device
7298
- * @upper_dev: new upper device
7299
- *
7300
- * Removes a link to device which is upper to this one. The caller must hold
7301
- * the RTNL lock.
7302
- */
7303
-void netdev_upper_dev_unlink(struct net_device *dev,
7304
- struct net_device *upper_dev)
7958
+static void __netdev_upper_dev_unlink(struct net_device *dev,
7959
+ struct net_device *upper_dev,
7960
+ struct netdev_nested_priv *priv)
73057961 {
73067962 struct netdev_notifier_changeupper_info changeupper_info = {
73077963 .info = {
....@@ -7324,12 +7980,126 @@
73247980 &changeupper_info.info);
73257981
73267982 __netdev_update_upper_level(dev, NULL);
7327
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7983
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
73287984
7329
- __netdev_update_lower_level(upper_dev, NULL);
7330
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
7985
+ __netdev_update_lower_level(upper_dev, priv);
7986
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7987
+ priv);
7988
+}
7989
+
7990
+/**
7991
+ * netdev_upper_dev_unlink - Removes a link to upper device
7992
+ * @dev: device
7993
+ * @upper_dev: new upper device
7994
+ *
7995
+ * Removes a link to device which is upper to this one. The caller must hold
7996
+ * the RTNL lock.
7997
+ */
7998
+void netdev_upper_dev_unlink(struct net_device *dev,
7999
+ struct net_device *upper_dev)
8000
+{
8001
+ struct netdev_nested_priv priv = {
8002
+ .flags = NESTED_SYNC_TODO,
8003
+ .data = NULL,
8004
+ };
8005
+
8006
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
73318007 }
73328008 EXPORT_SYMBOL(netdev_upper_dev_unlink);
8009
+
8010
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8011
+ struct net_device *lower_dev,
8012
+ bool val)
8013
+{
8014
+ struct netdev_adjacent *adj;
8015
+
8016
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8017
+ if (adj)
8018
+ adj->ignore = val;
8019
+
8020
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8021
+ if (adj)
8022
+ adj->ignore = val;
8023
+}
8024
+
8025
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8026
+ struct net_device *lower_dev)
8027
+{
8028
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8029
+}
8030
+
8031
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8032
+ struct net_device *lower_dev)
8033
+{
8034
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8035
+}
8036
+
8037
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
8038
+ struct net_device *new_dev,
8039
+ struct net_device *dev,
8040
+ struct netlink_ext_ack *extack)
8041
+{
8042
+ struct netdev_nested_priv priv = {
8043
+ .flags = 0,
8044
+ .data = NULL,
8045
+ };
8046
+ int err;
8047
+
8048
+ if (!new_dev)
8049
+ return 0;
8050
+
8051
+ if (old_dev && new_dev != old_dev)
8052
+ netdev_adjacent_dev_disable(dev, old_dev);
8053
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8054
+ extack);
8055
+ if (err) {
8056
+ if (old_dev && new_dev != old_dev)
8057
+ netdev_adjacent_dev_enable(dev, old_dev);
8058
+ return err;
8059
+ }
8060
+
8061
+ return 0;
8062
+}
8063
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8064
+
8065
+void netdev_adjacent_change_commit(struct net_device *old_dev,
8066
+ struct net_device *new_dev,
8067
+ struct net_device *dev)
8068
+{
8069
+ struct netdev_nested_priv priv = {
8070
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8071
+ .data = NULL,
8072
+ };
8073
+
8074
+ if (!new_dev || !old_dev)
8075
+ return;
8076
+
8077
+ if (new_dev == old_dev)
8078
+ return;
8079
+
8080
+ netdev_adjacent_dev_enable(dev, old_dev);
8081
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
8082
+}
8083
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
8084
+
8085
+void netdev_adjacent_change_abort(struct net_device *old_dev,
8086
+ struct net_device *new_dev,
8087
+ struct net_device *dev)
8088
+{
8089
+ struct netdev_nested_priv priv = {
8090
+ .flags = 0,
8091
+ .data = NULL,
8092
+ };
8093
+
8094
+ if (!new_dev)
8095
+ return;
8096
+
8097
+ if (old_dev && new_dev != old_dev)
8098
+ netdev_adjacent_dev_enable(dev, old_dev);
8099
+
8100
+ __netdev_upper_dev_unlink(new_dev, dev, &priv);
8101
+}
8102
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
73338103
73348104 /**
73358105 * netdev_bonding_info_change - Dispatch event about slave change
....@@ -7352,6 +8122,29 @@
73528122 &info.info);
73538123 }
73548124 EXPORT_SYMBOL(netdev_bonding_info_change);
8125
+
8126
+/**
8127
+ * netdev_get_xmit_slave - Get the xmit slave of master device
8128
+ * @dev: device
8129
+ * @skb: The packet
8130
+ * @all_slaves: assume all the slaves are active
8131
+ *
8132
+ * The reference counters are not incremented so the caller must be
8133
+ * careful with locks. The caller must hold RCU lock.
8134
+ * %NULL is returned if no slave is found.
8135
+ */
8136
+
8137
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8138
+ struct sk_buff *skb,
8139
+ bool all_slaves)
8140
+{
8141
+ const struct net_device_ops *ops = dev->netdev_ops;
8142
+
8143
+ if (!ops->ndo_get_xmit_slave)
8144
+ return NULL;
8145
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8146
+}
8147
+EXPORT_SYMBOL(netdev_get_xmit_slave);
73558148
73568149 static void netdev_adjacent_add_links(struct net_device *dev)
73578150 {
....@@ -7443,25 +8236,6 @@
74438236 }
74448237 EXPORT_SYMBOL(netdev_lower_dev_get_private);
74458238
7446
-
7447
-int dev_get_nest_level(struct net_device *dev)
7448
-{
7449
- struct net_device *lower = NULL;
7450
- struct list_head *iter;
7451
- int max_nest = -1;
7452
- int nest;
7453
-
7454
- ASSERT_RTNL();
7455
-
7456
- netdev_for_each_lower_dev(dev, lower, iter) {
7457
- nest = dev_get_nest_level(lower);
7458
- if (max_nest < nest)
7459
- max_nest = nest;
7460
- }
7461
-
7462
- return max_nest + 1;
7463
-}
7464
-EXPORT_SYMBOL(dev_get_nest_level);
74658239
74668240 /**
74678241 * netdev_lower_change - Dispatch event about lower device state change
....@@ -7689,7 +8463,8 @@
76898463 }
76908464 EXPORT_SYMBOL(dev_get_flags);
76918465
7692
-int __dev_change_flags(struct net_device *dev, unsigned int flags)
8466
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
8467
+ struct netlink_ext_ack *extack)
76938468 {
76948469 unsigned int old_flags = dev->flags;
76958470 int ret;
....@@ -7726,7 +8501,7 @@
77268501 if (old_flags & IFF_UP)
77278502 __dev_close(dev);
77288503 else
7729
- ret = __dev_open(dev);
8504
+ ret = __dev_open(dev, extack);
77308505 }
77318506
77328507 if ((flags ^ dev->gflags) & IFF_PROMISC) {
....@@ -7786,16 +8561,18 @@
77868561 * dev_change_flags - change device settings
77878562 * @dev: device
77888563 * @flags: device state flags
8564
+ * @extack: netlink extended ack
77898565 *
77908566 * Change settings on device based state flags. The flags are
77918567 * in the userspace exported format.
77928568 */
7793
-int dev_change_flags(struct net_device *dev, unsigned int flags)
8569
+int dev_change_flags(struct net_device *dev, unsigned int flags,
8570
+ struct netlink_ext_ack *extack)
77948571 {
77958572 int ret;
77968573 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
77978574
7798
- ret = __dev_change_flags(dev, flags);
8575
+ ret = __dev_change_flags(dev, flags, extack);
77998576 if (ret < 0)
78008577 return ret;
78018578
....@@ -7938,13 +8715,36 @@
79388715 EXPORT_SYMBOL(dev_set_group);
79398716
79408717 /**
8718
+ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8719
+ * @dev: device
8720
+ * @addr: new address
8721
+ * @extack: netlink extended ack
8722
+ */
8723
+int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8724
+ struct netlink_ext_ack *extack)
8725
+{
8726
+ struct netdev_notifier_pre_changeaddr_info info = {
8727
+ .info.dev = dev,
8728
+ .info.extack = extack,
8729
+ .dev_addr = addr,
8730
+ };
8731
+ int rc;
8732
+
8733
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8734
+ return notifier_to_errno(rc);
8735
+}
8736
+EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8737
+
8738
+/**
79418739 * dev_set_mac_address - Change Media Access Control Address
79428740 * @dev: device
79438741 * @sa: new address
8742
+ * @extack: netlink extended ack
79448743 *
79458744 * Change the hardware (MAC) address of the device
79468745 */
7947
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
8746
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8747
+ struct netlink_ext_ack *extack)
79488748 {
79498749 const struct net_device_ops *ops = dev->netdev_ops;
79508750 int err;
....@@ -7955,6 +8755,9 @@
79558755 return -EINVAL;
79568756 if (!netif_device_present(dev))
79578757 return -ENODEV;
8758
+ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8759
+ if (err)
8760
+ return err;
79588761 err = ops->ndo_set_mac_address(dev, sa);
79598762 if (err)
79608763 return err;
....@@ -7964,6 +8767,48 @@
79648767 return 0;
79658768 }
79668769 EXPORT_SYMBOL(dev_set_mac_address);
8770
+
8771
+static DECLARE_RWSEM(dev_addr_sem);
8772
+
8773
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8774
+ struct netlink_ext_ack *extack)
8775
+{
8776
+ int ret;
8777
+
8778
+ down_write(&dev_addr_sem);
8779
+ ret = dev_set_mac_address(dev, sa, extack);
8780
+ up_write(&dev_addr_sem);
8781
+ return ret;
8782
+}
8783
+EXPORT_SYMBOL(dev_set_mac_address_user);
8784
+
8785
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8786
+{
8787
+ size_t size = sizeof(sa->sa_data);
8788
+ struct net_device *dev;
8789
+ int ret = 0;
8790
+
8791
+ down_read(&dev_addr_sem);
8792
+ rcu_read_lock();
8793
+
8794
+ dev = dev_get_by_name_rcu(net, dev_name);
8795
+ if (!dev) {
8796
+ ret = -ENODEV;
8797
+ goto unlock;
8798
+ }
8799
+ if (!dev->addr_len)
8800
+ memset(sa->sa_data, 0, size);
8801
+ else
8802
+ memcpy(sa->sa_data, dev->dev_addr,
8803
+ min_t(size_t, size, dev->addr_len));
8804
+ sa->sa_family = dev->type;
8805
+
8806
+unlock:
8807
+ rcu_read_unlock();
8808
+ up_read(&dev_addr_sem);
8809
+ return ret;
8810
+}
8811
+EXPORT_SYMBOL(dev_get_mac_address);
79678812
79688813 /**
79698814 * dev_change_carrier - Change device carrier
....@@ -8014,12 +8859,80 @@
80148859 char *name, size_t len)
80158860 {
80168861 const struct net_device_ops *ops = dev->netdev_ops;
8862
+ int err;
80178863
8018
- if (!ops->ndo_get_phys_port_name)
8019
- return -EOPNOTSUPP;
8020
- return ops->ndo_get_phys_port_name(dev, name, len);
8864
+ if (ops->ndo_get_phys_port_name) {
8865
+ err = ops->ndo_get_phys_port_name(dev, name, len);
8866
+ if (err != -EOPNOTSUPP)
8867
+ return err;
8868
+ }
8869
+ return devlink_compat_phys_port_name_get(dev, name, len);
80218870 }
80228871 EXPORT_SYMBOL(dev_get_phys_port_name);
8872
+
8873
+/**
8874
+ * dev_get_port_parent_id - Get the device's port parent identifier
8875
+ * @dev: network device
8876
+ * @ppid: pointer to a storage for the port's parent identifier
8877
+ * @recurse: allow/disallow recursion to lower devices
8878
+ *
8879
+ * Get the device's port parent identifier
8880
+ */
8881
+int dev_get_port_parent_id(struct net_device *dev,
8882
+ struct netdev_phys_item_id *ppid,
8883
+ bool recurse)
8884
+{
8885
+ const struct net_device_ops *ops = dev->netdev_ops;
8886
+ struct netdev_phys_item_id first = { };
8887
+ struct net_device *lower_dev;
8888
+ struct list_head *iter;
8889
+ int err;
8890
+
8891
+ if (ops->ndo_get_port_parent_id) {
8892
+ err = ops->ndo_get_port_parent_id(dev, ppid);
8893
+ if (err != -EOPNOTSUPP)
8894
+ return err;
8895
+ }
8896
+
8897
+ err = devlink_compat_switch_id_get(dev, ppid);
8898
+ if (!err || err != -EOPNOTSUPP)
8899
+ return err;
8900
+
8901
+ if (!recurse)
8902
+ return -EOPNOTSUPP;
8903
+
8904
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
8905
+ err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8906
+ if (err)
8907
+ break;
8908
+ if (!first.id_len)
8909
+ first = *ppid;
8910
+ else if (memcmp(&first, ppid, sizeof(*ppid)))
8911
+ return -EOPNOTSUPP;
8912
+ }
8913
+
8914
+ return err;
8915
+}
8916
+EXPORT_SYMBOL(dev_get_port_parent_id);
8917
+
8918
+/**
8919
+ * netdev_port_same_parent_id - Indicate if two network devices have
8920
+ * the same port parent identifier
8921
+ * @a: first network device
8922
+ * @b: second network device
8923
+ */
8924
+bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8925
+{
8926
+ struct netdev_phys_item_id a_id = { };
8927
+ struct netdev_phys_item_id b_id = { };
8928
+
8929
+ if (dev_get_port_parent_id(a, &a_id, true) ||
8930
+ dev_get_port_parent_id(b, &b_id, true))
8931
+ return false;
8932
+
8933
+ return netdev_phys_item_id_same(&a_id, &b_id);
8934
+}
8935
+EXPORT_SYMBOL(netdev_port_same_parent_id);
80238936
80248937 /**
80258938 * dev_change_proto_down - update protocol port state information
....@@ -8041,67 +8954,495 @@
80418954 }
80428955 EXPORT_SYMBOL(dev_change_proto_down);
80438956
8044
-u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8045
- enum bpf_netdev_command cmd)
8957
+/**
8958
+ * dev_change_proto_down_generic - generic implementation for
8959
+ * ndo_change_proto_down that sets carrier according to
8960
+ * proto_down.
8961
+ *
8962
+ * @dev: device
8963
+ * @proto_down: new value
8964
+ */
8965
+int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
80468966 {
8047
- struct netdev_bpf xdp;
8967
+ if (proto_down)
8968
+ netif_carrier_off(dev);
8969
+ else
8970
+ netif_carrier_on(dev);
8971
+ dev->proto_down = proto_down;
8972
+ return 0;
8973
+}
8974
+EXPORT_SYMBOL(dev_change_proto_down_generic);
80488975
8049
- if (!bpf_op)
8050
- return 0;
8976
+/**
8977
+ * dev_change_proto_down_reason - proto down reason
8978
+ *
8979
+ * @dev: device
8980
+ * @mask: proto down mask
8981
+ * @value: proto down value
8982
+ */
8983
+void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8984
+ u32 value)
8985
+{
8986
+ int b;
80518987
8052
- memset(&xdp, 0, sizeof(xdp));
8053
- xdp.command = cmd;
8988
+ if (!mask) {
8989
+ dev->proto_down_reason = value;
8990
+ } else {
8991
+ for_each_set_bit(b, &mask, 32) {
8992
+ if (value & (1 << b))
8993
+ dev->proto_down_reason |= BIT(b);
8994
+ else
8995
+ dev->proto_down_reason &= ~BIT(b);
8996
+ }
8997
+ }
8998
+}
8999
+EXPORT_SYMBOL(dev_change_proto_down_reason);
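
dev_change_proto_down_reason() updates only the reason bits selected by mask, copying each one from value, while a zero mask overwrites the whole word. For the 32 bits it covers, the per-bit loop is equivalent to a single masked read-modify-write; a small standalone sketch (plain C, hypothetical variable names, not the netdev field):

    #include <stdio.h>

    /* Copy the bits selected by 'mask' from 'value' into '*reason';
     * a zero mask replaces the whole word, as in dev_change_proto_down_reason(). */
    static void update_reason(unsigned int *reason, unsigned long mask, unsigned int value)
    {
        if (!mask)
            *reason = value;
        else
            *reason = (*reason & ~mask) | (value & mask);
    }

    int main(void)
    {
        unsigned int reason = 0x05;          /* bits 0 and 2 already set */

        update_reason(&reason, 0x06, 0x02);  /* touch bits 1-2: set 1, clear 2 */
        printf("reason = 0x%02x\n", reason); /* 0x03 */

        update_reason(&reason, 0, 0x80);     /* zero mask: overwrite everything */
        printf("reason = 0x%02x\n", reason); /* 0x80 */
        return 0;
    }
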
80549000
8055
- /* Query must always succeed. */
8056
- WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
9001
+struct bpf_xdp_link {
9002
+ struct bpf_link link;
9003
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9004
+ int flags;
9005
+};
80579006
8058
- return xdp.prog_id;
9007
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9008
+{
9009
+ if (flags & XDP_FLAGS_HW_MODE)
9010
+ return XDP_MODE_HW;
9011
+ if (flags & XDP_FLAGS_DRV_MODE)
9012
+ return XDP_MODE_DRV;
9013
+ if (flags & XDP_FLAGS_SKB_MODE)
9014
+ return XDP_MODE_SKB;
9015
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
80599016 }
80609017
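
dev_xdp_mode() collapses the XDP_FLAGS_*_MODE request bits into one of the three attachment slots, falling back to native (driver) mode when the driver implements ndo_bpf and to the generic skb path otherwise. A self-contained restatement of that precedence (the flag values and enum here are illustrative stand-ins, not the uapi constants):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative flag bits; the real ones live in the XDP uapi headers. */
    #define HW_MODE_FLAG   0x1
    #define DRV_MODE_FLAG  0x2
    #define SKB_MODE_FLAG  0x4

    enum xdp_mode { MODE_SKB, MODE_DRV, MODE_HW };

    /* Same precedence as dev_xdp_mode(): an explicit flag wins, otherwise the
     * presence of a native hook decides between driver and generic mode. */
    static enum xdp_mode pick_mode(unsigned int flags, bool has_ndo_bpf)
    {
        if (flags & HW_MODE_FLAG)
            return MODE_HW;
        if (flags & DRV_MODE_FLAG)
            return MODE_DRV;
        if (flags & SKB_MODE_FLAG)
            return MODE_SKB;
        return has_ndo_bpf ? MODE_DRV : MODE_SKB;
    }

    int main(void)
    {
        printf("%d\n", pick_mode(HW_MODE_FLAG, true));   /* 2: offload requested */
        printf("%d\n", pick_mode(0, true));              /* 1: default to native */
        printf("%d\n", pick_mode(0, false));             /* 0: generic fallback */
        return 0;
    }
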
8061
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8062
- struct netlink_ext_ack *extack, u32 flags,
8063
- struct bpf_prog *prog)
9018
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9019
+{
9020
+ switch (mode) {
9021
+ case XDP_MODE_SKB:
9022
+ return generic_xdp_install;
9023
+ case XDP_MODE_DRV:
9024
+ case XDP_MODE_HW:
9025
+ return dev->netdev_ops->ndo_bpf;
9026
+ default:
9027
+ return NULL;
9028
+ };
9029
+}
9030
+
9031
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9032
+ enum bpf_xdp_mode mode)
9033
+{
9034
+ return dev->xdp_state[mode].link;
9035
+}
9036
+
9037
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9038
+ enum bpf_xdp_mode mode)
9039
+{
9040
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9041
+
9042
+ if (link)
9043
+ return link->link.prog;
9044
+ return dev->xdp_state[mode].prog;
9045
+}
9046
+
9047
+static u8 dev_xdp_prog_count(struct net_device *dev)
9048
+{
9049
+ u8 count = 0;
9050
+ int i;
9051
+
9052
+ for (i = 0; i < __MAX_XDP_MODE; i++)
9053
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9054
+ count++;
9055
+ return count;
9056
+}
9057
+
9058
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9059
+{
9060
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9061
+
9062
+ return prog ? prog->aux->id : 0;
9063
+}
9064
+
9065
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9066
+ struct bpf_xdp_link *link)
9067
+{
9068
+ dev->xdp_state[mode].link = link;
9069
+ dev->xdp_state[mode].prog = NULL;
9070
+}
9071
+
9072
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9073
+ struct bpf_prog *prog)
9074
+{
9075
+ dev->xdp_state[mode].link = NULL;
9076
+ dev->xdp_state[mode].prog = prog;
9077
+}
9078
+
9079
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9080
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9081
+ u32 flags, struct bpf_prog *prog)
80649082 {
80659083 struct netdev_bpf xdp;
9084
+ int err;
80669085
80679086 memset(&xdp, 0, sizeof(xdp));
8068
- if (flags & XDP_FLAGS_HW_MODE)
8069
- xdp.command = XDP_SETUP_PROG_HW;
8070
- else
8071
- xdp.command = XDP_SETUP_PROG;
9087
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
80729088 xdp.extack = extack;
80739089 xdp.flags = flags;
80749090 xdp.prog = prog;
80759091
8076
- return bpf_op(dev, &xdp);
9092
+ /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9093
+ * "moved" into driver), so they don't increment it on their own, but
9094
+ * they do decrement refcnt when program is detached or replaced.
9095
+ * Given net_device also owns link/prog, we need to bump refcnt here
9096
+ * to prevent drivers from underflowing it.
9097
+ */
9098
+ if (prog)
9099
+ bpf_prog_inc(prog);
9100
+ err = bpf_op(dev, &xdp);
9101
+ if (err) {
9102
+ if (prog)
9103
+ bpf_prog_put(prog);
9104
+ return err;
9105
+ }
9106
+
9107
+ if (mode != XDP_MODE_HW)
9108
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9109
+
9110
+ return 0;
80779111 }
80789112
80799113 static void dev_xdp_uninstall(struct net_device *dev)
80809114 {
8081
- struct netdev_bpf xdp;
8082
- bpf_op_t ndo_bpf;
9115
+ struct bpf_xdp_link *link;
9116
+ struct bpf_prog *prog;
9117
+ enum bpf_xdp_mode mode;
9118
+ bpf_op_t bpf_op;
80839119
8084
- /* Remove generic XDP */
8085
- WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
9120
+ ASSERT_RTNL();
80869121
8087
- /* Remove from the driver */
8088
- ndo_bpf = dev->netdev_ops->ndo_bpf;
8089
- if (!ndo_bpf)
8090
- return;
9122
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9123
+ prog = dev_xdp_prog(dev, mode);
9124
+ if (!prog)
9125
+ continue;
80919126
8092
- memset(&xdp, 0, sizeof(xdp));
8093
- xdp.command = XDP_QUERY_PROG;
8094
- WARN_ON(ndo_bpf(dev, &xdp));
8095
- if (xdp.prog_id)
8096
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8097
- NULL));
9127
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9128
+ if (!bpf_op)
9129
+ continue;
80989130
8099
- /* Remove HW offload */
8100
- memset(&xdp, 0, sizeof(xdp));
8101
- xdp.command = XDP_QUERY_PROG_HW;
8102
- if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8103
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8104
- NULL));
9131
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9132
+
9133
+ /* auto-detach link from net device */
9134
+ link = dev_xdp_link(dev, mode);
9135
+ if (link)
9136
+ link->dev = NULL;
9137
+ else
9138
+ bpf_prog_put(prog);
9139
+
9140
+ dev_xdp_set_link(dev, mode, NULL);
9141
+ }
9142
+}
9143
+
9144
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9145
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9146
+ struct bpf_prog *old_prog, u32 flags)
9147
+{
9148
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9149
+ struct bpf_prog *cur_prog;
9150
+ enum bpf_xdp_mode mode;
9151
+ bpf_op_t bpf_op;
9152
+ int err;
9153
+
9154
+ ASSERT_RTNL();
9155
+
9156
+ /* either link or prog attachment, never both */
9157
+ if (link && (new_prog || old_prog))
9158
+ return -EINVAL;
9159
+ /* link supports only XDP mode flags */
9160
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
9161
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9162
+ return -EINVAL;
9163
+ }
9164
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9165
+ if (num_modes > 1) {
9166
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9167
+ return -EINVAL;
9168
+ }
9169
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9170
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9171
+ NL_SET_ERR_MSG(extack,
9172
+ "More than one program loaded, unset mode is ambiguous");
9173
+ return -EINVAL;
9174
+ }
9175
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9176
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9177
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9178
+ return -EINVAL;
9179
+ }
9180
+
9181
+ mode = dev_xdp_mode(dev, flags);
9182
+ /* can't replace attached link */
9183
+ if (dev_xdp_link(dev, mode)) {
9184
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9185
+ return -EBUSY;
9186
+ }
9187
+
9188
+ cur_prog = dev_xdp_prog(dev, mode);
9189
+ /* can't replace attached prog with link */
9190
+ if (link && cur_prog) {
9191
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9192
+ return -EBUSY;
9193
+ }
9194
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9195
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
9196
+ return -EEXIST;
9197
+ }
9198
+
9199
+ /* put effective new program into new_prog */
9200
+ if (link)
9201
+ new_prog = link->link.prog;
9202
+
9203
+ if (new_prog) {
9204
+ bool offload = mode == XDP_MODE_HW;
9205
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9206
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
9207
+
9208
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9209
+ NL_SET_ERR_MSG(extack, "XDP program already attached");
9210
+ return -EBUSY;
9211
+ }
9212
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
9213
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9214
+ return -EEXIST;
9215
+ }
9216
+ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9217
+ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9218
+ return -EINVAL;
9219
+ }
9220
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9221
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9222
+ return -EINVAL;
9223
+ }
9224
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9225
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9226
+ return -EINVAL;
9227
+ }
9228
+ }
9229
+
9230
+ /* don't call drivers if the effective program didn't change */
9231
+ if (new_prog != cur_prog) {
9232
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9233
+ if (!bpf_op) {
9234
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9235
+ return -EOPNOTSUPP;
9236
+ }
9237
+
9238
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9239
+ if (err)
9240
+ return err;
9241
+ }
9242
+
9243
+ if (link)
9244
+ dev_xdp_set_link(dev, mode, link);
9245
+ else
9246
+ dev_xdp_set_prog(dev, mode, new_prog);
9247
+ if (cur_prog)
9248
+ bpf_prog_put(cur_prog);
9249
+
9250
+ return 0;
9251
+}
9252
+
9253
+static int dev_xdp_attach_link(struct net_device *dev,
9254
+ struct netlink_ext_ack *extack,
9255
+ struct bpf_xdp_link *link)
9256
+{
9257
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9258
+}
9259
+
9260
+static int dev_xdp_detach_link(struct net_device *dev,
9261
+ struct netlink_ext_ack *extack,
9262
+ struct bpf_xdp_link *link)
9263
+{
9264
+ enum bpf_xdp_mode mode;
9265
+ bpf_op_t bpf_op;
9266
+
9267
+ ASSERT_RTNL();
9268
+
9269
+ mode = dev_xdp_mode(dev, link->flags);
9270
+ if (dev_xdp_link(dev, mode) != link)
9271
+ return -EINVAL;
9272
+
9273
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9274
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9275
+ dev_xdp_set_link(dev, mode, NULL);
9276
+ return 0;
9277
+}
9278
+
9279
+static void bpf_xdp_link_release(struct bpf_link *link)
9280
+{
9281
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9282
+
9283
+ rtnl_lock();
9284
+
9285
+ /* if racing with net_device's tear down, xdp_link->dev might be
9286
+ * already NULL, in which case link was already auto-detached
9287
+ */
9288
+ if (xdp_link->dev) {
9289
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9290
+ xdp_link->dev = NULL;
9291
+ }
9292
+
9293
+ rtnl_unlock();
9294
+}
9295
+
9296
+static int bpf_xdp_link_detach(struct bpf_link *link)
9297
+{
9298
+ bpf_xdp_link_release(link);
9299
+ return 0;
9300
+}
9301
+
9302
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
9303
+{
9304
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9305
+
9306
+ kfree(xdp_link);
9307
+}
9308
+
9309
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9310
+ struct seq_file *seq)
9311
+{
9312
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9313
+ u32 ifindex = 0;
9314
+
9315
+ rtnl_lock();
9316
+ if (xdp_link->dev)
9317
+ ifindex = xdp_link->dev->ifindex;
9318
+ rtnl_unlock();
9319
+
9320
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
9321
+}
9322
+
9323
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9324
+ struct bpf_link_info *info)
9325
+{
9326
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9327
+ u32 ifindex = 0;
9328
+
9329
+ rtnl_lock();
9330
+ if (xdp_link->dev)
9331
+ ifindex = xdp_link->dev->ifindex;
9332
+ rtnl_unlock();
9333
+
9334
+ info->xdp.ifindex = ifindex;
9335
+ return 0;
9336
+}
9337
+
9338
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9339
+ struct bpf_prog *old_prog)
9340
+{
9341
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9342
+ enum bpf_xdp_mode mode;
9343
+ bpf_op_t bpf_op;
9344
+ int err = 0;
9345
+
9346
+ rtnl_lock();
9347
+
9348
+ /* link might have been auto-released already, so fail */
9349
+ if (!xdp_link->dev) {
9350
+ err = -ENOLINK;
9351
+ goto out_unlock;
9352
+ }
9353
+
9354
+ if (old_prog && link->prog != old_prog) {
9355
+ err = -EPERM;
9356
+ goto out_unlock;
9357
+ }
9358
+ old_prog = link->prog;
9359
+ if (old_prog->type != new_prog->type ||
9360
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
9361
+ err = -EINVAL;
9362
+ goto out_unlock;
9363
+ }
9364
+
9365
+ if (old_prog == new_prog) {
9366
+ /* no-op, don't disturb drivers */
9367
+ bpf_prog_put(new_prog);
9368
+ goto out_unlock;
9369
+ }
9370
+
9371
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9372
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9373
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9374
+ xdp_link->flags, new_prog);
9375
+ if (err)
9376
+ goto out_unlock;
9377
+
9378
+ old_prog = xchg(&link->prog, new_prog);
9379
+ bpf_prog_put(old_prog);
9380
+
9381
+out_unlock:
9382
+ rtnl_unlock();
9383
+ return err;
9384
+}
9385
+
9386
+static const struct bpf_link_ops bpf_xdp_link_lops = {
9387
+ .release = bpf_xdp_link_release,
9388
+ .dealloc = bpf_xdp_link_dealloc,
9389
+ .detach = bpf_xdp_link_detach,
9390
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
9391
+ .fill_link_info = bpf_xdp_link_fill_link_info,
9392
+ .update_prog = bpf_xdp_link_update,
9393
+};
9394
+
9395
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9396
+{
9397
+ struct net *net = current->nsproxy->net_ns;
9398
+ struct bpf_link_primer link_primer;
9399
+ struct bpf_xdp_link *link;
9400
+ struct net_device *dev;
9401
+ int err, fd;
9402
+
9403
+ rtnl_lock();
9404
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9405
+ if (!dev) {
9406
+ rtnl_unlock();
9407
+ return -EINVAL;
9408
+ }
9409
+
9410
+ link = kzalloc(sizeof(*link), GFP_USER);
9411
+ if (!link) {
9412
+ err = -ENOMEM;
9413
+ goto unlock;
9414
+ }
9415
+
9416
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9417
+ link->dev = dev;
9418
+ link->flags = attr->link_create.flags;
9419
+
9420
+ err = bpf_link_prime(&link->link, &link_primer);
9421
+ if (err) {
9422
+ kfree(link);
9423
+ goto unlock;
9424
+ }
9425
+
9426
+ err = dev_xdp_attach_link(dev, NULL, link);
9427
+ rtnl_unlock();
9428
+
9429
+ if (err) {
9430
+ link->dev = NULL;
9431
+ bpf_link_cleanup(&link_primer);
9432
+ goto out_put_dev;
9433
+ }
9434
+
9435
+ fd = bpf_link_settle(&link_primer);
9436
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9437
+ dev_put(dev);
9438
+ return fd;
9439
+
9440
+unlock:
9441
+ rtnl_unlock();
9442
+
9443
+out_put_dev:
9444
+ dev_put(dev);
9445
+ return err;
81059446 }
81069447
81079448 /**
....@@ -8109,56 +9450,44 @@
81099450 * @dev: device
81109451 * @extack: netlink extended ack
81119452 * @fd: new program fd or negative value to clear
9453
+ * @expected_fd: old program fd that userspace expects to replace or clear
81129454 * @flags: xdp-related flags
81139455 *
81149456 * Set or clear a bpf program for a device
81159457 */
81169458 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8117
- int fd, u32 flags)
9459
+ int fd, int expected_fd, u32 flags)
81189460 {
8119
- const struct net_device_ops *ops = dev->netdev_ops;
8120
- enum bpf_netdev_command query;
8121
- struct bpf_prog *prog = NULL;
8122
- bpf_op_t bpf_op, bpf_chk;
9461
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9462
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
81239463 int err;
81249464
81259465 ASSERT_RTNL();
81269466
8127
- query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8128
-
8129
- bpf_op = bpf_chk = ops->ndo_bpf;
8130
- if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
8131
- return -EOPNOTSUPP;
8132
- if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8133
- bpf_op = generic_xdp_install;
8134
- if (bpf_op == bpf_chk)
8135
- bpf_chk = generic_xdp_install;
8136
-
81379467 if (fd >= 0) {
8138
- if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
8139
- __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
8140
- return -EEXIST;
8141
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
8142
- __dev_xdp_query(dev, bpf_op, query))
8143
- return -EBUSY;
9468
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9469
+ mode != XDP_MODE_SKB);
9470
+ if (IS_ERR(new_prog))
9471
+ return PTR_ERR(new_prog);
9472
+ }
81449473
8145
- prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8146
- bpf_op == ops->ndo_bpf);
8147
- if (IS_ERR(prog))
8148
- return PTR_ERR(prog);
8149
-
8150
- if (!(flags & XDP_FLAGS_HW_MODE) &&
8151
- bpf_prog_is_dev_bound(prog->aux)) {
8152
- NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8153
- bpf_prog_put(prog);
8154
- return -EINVAL;
9474
+ if (expected_fd >= 0) {
9475
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9476
+ mode != XDP_MODE_SKB);
9477
+ if (IS_ERR(old_prog)) {
9478
+ err = PTR_ERR(old_prog);
9479
+ old_prog = NULL;
9480
+ goto err_out;
81559481 }
81569482 }
81579483
8158
- err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8159
- if (err < 0 && prog)
8160
- bpf_prog_put(prog);
9484
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
81619485
9486
+err_out:
9487
+ if (err && new_prog)
9488
+ bpf_prog_put(new_prog);
9489
+ if (old_prog)
9490
+ bpf_prog_put(old_prog);
81629491 return err;
81639492 }
81649493
....@@ -8190,103 +9519,6 @@
81909519 {
81919520 list_add_tail(&dev->todo_list, &net_todo_list);
81929521 dev_net(dev)->dev_unreg_count++;
8193
-}
8194
-
8195
-static void rollback_registered_many(struct list_head *head)
8196
-{
8197
- struct net_device *dev, *tmp;
8198
- LIST_HEAD(close_head);
8199
-
8200
- BUG_ON(dev_boot_phase);
8201
- ASSERT_RTNL();
8202
-
8203
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8204
- /* Some devices call without registering
8205
- * for initialization unwind. Remove those
8206
- * devices and proceed with the remaining.
8207
- */
8208
- if (dev->reg_state == NETREG_UNINITIALIZED) {
8209
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8210
- dev->name, dev);
8211
-
8212
- WARN_ON(1);
8213
- list_del(&dev->unreg_list);
8214
- continue;
8215
- }
8216
- dev->dismantle = true;
8217
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
8218
- }
8219
-
8220
- /* If device is running, close it first. */
8221
- list_for_each_entry(dev, head, unreg_list)
8222
- list_add_tail(&dev->close_list, &close_head);
8223
- dev_close_many(&close_head, true);
8224
-
8225
- list_for_each_entry(dev, head, unreg_list) {
8226
- /* And unlink it from device chain. */
8227
- unlist_netdevice(dev);
8228
-
8229
- dev->reg_state = NETREG_UNREGISTERING;
8230
- }
8231
- flush_all_backlogs();
8232
-
8233
- synchronize_net();
8234
-
8235
- list_for_each_entry(dev, head, unreg_list) {
8236
- struct sk_buff *skb = NULL;
8237
-
8238
- /* Shutdown queueing discipline. */
8239
- dev_shutdown(dev);
8240
-
8241
- dev_xdp_uninstall(dev);
8242
-
8243
- /* Notify protocols, that we are about to destroy
8244
- * this device. They should clean all the things.
8245
- */
8246
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8247
-
8248
- if (!dev->rtnl_link_ops ||
8249
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8250
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8251
- GFP_KERNEL, NULL, 0);
8252
-
8253
- /*
8254
- * Flush the unicast and multicast chains
8255
- */
8256
- dev_uc_flush(dev);
8257
- dev_mc_flush(dev);
8258
-
8259
- if (dev->netdev_ops->ndo_uninit)
8260
- dev->netdev_ops->ndo_uninit(dev);
8261
-
8262
- if (skb)
8263
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8264
-
8265
- /* Notifier chain MUST detach us all upper devices. */
8266
- WARN_ON(netdev_has_any_upper_dev(dev));
8267
- WARN_ON(netdev_has_any_lower_dev(dev));
8268
-
8269
- /* Remove entries from kobject tree */
8270
- netdev_unregister_kobject(dev);
8271
-#ifdef CONFIG_XPS
8272
- /* Remove XPS queueing entries */
8273
- netif_reset_xps_queues_gt(dev, 0);
8274
-#endif
8275
- }
8276
-
8277
- synchronize_net();
8278
-
8279
- list_for_each_entry(dev, head, unreg_list)
8280
- dev_put(dev);
8281
-}
8282
-
8283
-static void rollback_registered(struct net_device *dev)
8284
-{
8285
- LIST_HEAD(single);
8286
-
8287
- list_add(&dev->unreg_list, &single);
8288
- rollback_registered_many(&single);
8289
- list_del(&single);
82909522 }
82919523
82929524 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
....@@ -8434,7 +9666,7 @@
84349666 /* driver might be less strict about feature dependencies */
84359667 features = netdev_fix_features(dev, features);
84369668
8437
- /* some features can't be enabled if they're off an an upper device */
9669
+ /* some features can't be enabled if they're off on an upper device */
84389670 netdev_for_each_upper_dev_rcu(dev, upper, iter)
84399671 features = netdev_sync_upper_features(dev, upper, features);
84409672
....@@ -8558,6 +9790,11 @@
85589790 else
85599791 netif_dormant_off(dev);
85609792
9793
+ if (rootdev->operstate == IF_OPER_TESTING)
9794
+ netif_testing_on(dev);
9795
+ else
9796
+ netif_testing_off(dev);
9797
+
85619798 if (netif_carrier_ok(rootdev))
85629799 netif_carrier_on(dev);
85639800 else
....@@ -8619,7 +9856,7 @@
86199856 /* Initialize queue lock */
86209857 spin_lock_init(&queue->_xmit_lock);
86219858 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
8622
- netdev_queue_clear_owner(queue);
9859
+ queue->xmit_lock_owner = -1;
86239860 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
86249861 queue->dev = dev;
86259862 #ifdef CONFIG_BQL
....@@ -8698,11 +9935,20 @@
86989935 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
86999936 BUG_ON(!net);
87009937
9938
+ ret = ethtool_check_ops(dev->ethtool_ops);
9939
+ if (ret)
9940
+ return ret;
9941
+
87019942 spin_lock_init(&dev->addr_list_lock);
87029943 netdev_set_addr_lockdep_class(dev);
87039944
87049945 ret = dev_get_valid_name(net, dev, dev->name);
87059946 if (ret < 0)
9947
+ goto out;
9948
+
9949
+ ret = -ENOMEM;
9950
+ dev->name_node = netdev_name_node_head_alloc(dev);
9951
+ if (!dev->name_node)
87069952 goto out;
87079953
87089954 /* Init, if this function is available */
....@@ -8711,7 +9957,7 @@
87119957 if (ret) {
87129958 if (ret > 0)
87139959 ret = -EIO;
8714
- goto out;
9960
+ goto err_free_name;
87159961 }
87169962 }
87179963
....@@ -8733,7 +9979,7 @@
87339979 /* Transfer changeable features to wanted_features and enable
87349980 * software offloads (GSO and GRO).
87359981 */
8736
- dev->hw_features |= NETIF_F_SOFT_FEATURES;
9982
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
87379983 dev->features |= NETIF_F_SOFT_FEATURES;
87389984
87399985 if (dev->netdev_ops->ndo_udp_tunnel_add) {
....@@ -8811,17 +10057,10 @@
881110057 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
881210058 ret = notifier_to_errno(ret);
881310059 if (ret) {
8814
- rollback_registered(dev);
8815
- rcu_barrier();
8816
-
8817
- dev->reg_state = NETREG_UNREGISTERED;
8818
- /* We should put the kobject that hold in
8819
- * netdev_unregister_kobject(), otherwise
8820
- * the net device cannot be freed when
8821
- * driver calls free_netdev(), because the
8822
- * kobject is being hold.
8823
- */
8824
- kobject_put(&dev->dev.kobj);
10060
+ /* Expect explicit free_netdev() on failure */
10061
+ dev->needs_free_netdev = false;
10062
+ unregister_netdevice_queue(dev, NULL);
10063
+ goto out;
882510064 }
882610065 /*
882710066 * Prevent userspace races by waiting until the network
....@@ -8839,6 +10078,8 @@
883910078 dev->netdev_ops->ndo_uninit(dev);
884010079 if (dev->priv_destructor)
884110080 dev->priv_destructor(dev);
10081
+err_free_name:
10082
+ netdev_name_node_free(dev->name_node);
884210083 goto out;
884310084 }
884410085 EXPORT_SYMBOL(register_netdevice);
....@@ -8922,6 +10163,8 @@
892210163 }
892310164 EXPORT_SYMBOL(netdev_refcnt_read);
892410165
10166
+#define WAIT_REFS_MIN_MSECS 1
10167
+#define WAIT_REFS_MAX_MSECS 250
892510168 /**
892610169 * netdev_wait_allrefs - wait until all references are gone.
892710170 * @dev: target net_device
....@@ -8937,7 +10180,7 @@
893710180 static void netdev_wait_allrefs(struct net_device *dev)
893810181 {
893910182 unsigned long rebroadcast_time, warning_time;
8940
- int refcnt;
10183
+ int wait = 0, refcnt;
894110184
894210185 linkwatch_forget_dev(dev);
894310186
....@@ -8971,7 +10214,13 @@
897110214 rebroadcast_time = jiffies;
897210215 }
897310216
8974
- msleep(250);
10217
+ if (!wait) {
10218
+ rcu_barrier();
10219
+ wait = WAIT_REFS_MIN_MSECS;
10220
+ } else {
10221
+ msleep(wait);
10222
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10223
+ }
897510224
897610225 refcnt = netdev_refcnt_read(dev);
897710226
....@@ -9010,6 +10259,19 @@
901010259 void netdev_run_todo(void)
901110260 {
901210261 struct list_head list;
10262
+#ifdef CONFIG_LOCKDEP
10263
+ struct list_head unlink_list;
10264
+
10265
+ list_replace_init(&net_unlink_list, &unlink_list);
10266
+
10267
+ while (!list_empty(&unlink_list)) {
10268
+ struct net_device *dev = list_first_entry(&unlink_list,
10269
+ struct net_device,
10270
+ unlink_list);
10271
+ list_del_init(&dev->unlink_list);
10272
+ dev->nested_level = dev->lower_level - 1;
10273
+ }
10274
+#endif
901310275
901410276 /* Snapshot list, allow later requests */
901510277 list_replace_init(&net_todo_list, &list);
....@@ -9043,9 +10305,7 @@
904310305 BUG_ON(!list_empty(&dev->ptype_specific));
904410306 WARN_ON(rcu_access_pointer(dev->ip_ptr));
904510307 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
9046
-#if IS_ENABLED(CONFIG_DECNET)
9047
- WARN_ON(dev->dn_ptr);
9048
-#endif
10308
+
904910309 if (dev->priv_destructor)
905010310 dev->priv_destructor(dev);
905110311 if (dev->needs_free_netdev)
....@@ -9120,6 +10380,40 @@
912010380 return storage;
912110381 }
912210382 EXPORT_SYMBOL(dev_get_stats);
10383
+
10384
+/**
10385
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
10386
+ * @s: place to store stats
10387
+ * @netstats: per-cpu network stats to read from
10388
+ *
10389
+ * Read per-cpu network statistics and populate the related fields in @s.
10390
+ */
10391
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10392
+ const struct pcpu_sw_netstats __percpu *netstats)
10393
+{
10394
+ int cpu;
10395
+
10396
+ for_each_possible_cpu(cpu) {
10397
+ const struct pcpu_sw_netstats *stats;
10398
+ struct pcpu_sw_netstats tmp;
10399
+ unsigned int start;
10400
+
10401
+ stats = per_cpu_ptr(netstats, cpu);
10402
+ do {
10403
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
10404
+ tmp.rx_packets = stats->rx_packets;
10405
+ tmp.rx_bytes = stats->rx_bytes;
10406
+ tmp.tx_packets = stats->tx_packets;
10407
+ tmp.tx_bytes = stats->tx_bytes;
10408
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10409
+
10410
+ s->rx_packets += tmp.rx_packets;
10411
+ s->rx_bytes += tmp.rx_bytes;
10412
+ s->tx_packets += tmp.tx_packets;
10413
+ s->tx_bytes += tmp.tx_bytes;
10414
+ }
10415
+}
10416
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
912310417
912410418 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
912510419 {
....@@ -9222,6 +10516,10 @@
922210516 dev->gso_max_segs = GSO_MAX_SEGS;
922310517 dev->upper_level = 1;
922410518 dev->lower_level = 1;
10519
+#ifdef CONFIG_LOCKDEP
10520
+ dev->nested_level = 0;
10521
+ INIT_LIST_HEAD(&dev->unlink_list);
10522
+#endif
922510523
922610524 INIT_LIST_HEAD(&dev->napi_list);
922710525 INIT_LIST_HEAD(&dev->unreg_list);
....@@ -9231,6 +10529,7 @@
923110529 INIT_LIST_HEAD(&dev->adj_list.lower);
923210530 INIT_LIST_HEAD(&dev->ptype_all);
923310531 INIT_LIST_HEAD(&dev->ptype_specific);
10532
+ INIT_LIST_HEAD(&dev->net_notifier_list);
923410533 #ifdef CONFIG_NET_SCHED
923510534 hash_init(dev->qdisc_hash);
923610535 #endif
....@@ -9288,6 +10587,17 @@
928810587 struct napi_struct *p, *n;
928910588
929010589 might_sleep();
10590
+
10591
+ /* When called immediately after register_netdevice() failed the unwind
10592
+ * handling may still be dismantling the device. Handle that case by
10593
+ * deferring the free.
10594
+ */
10595
+ if (dev->reg_state == NETREG_UNREGISTERING) {
10596
+ ASSERT_RTNL();
10597
+ dev->needs_free_netdev = true;
10598
+ return;
10599
+ }
10600
+
929110601 netif_free_tx_queues(dev);
929210602 netif_free_rx_queues(dev);
929310603
....@@ -9301,6 +10611,8 @@
930110611
930210612 free_percpu(dev->pcpu_refcnt);
930310613 dev->pcpu_refcnt = NULL;
10614
+ free_percpu(dev->xdp_bulkq);
10615
+ dev->xdp_bulkq = NULL;
930410616
930510617 /* Compatibility with error handling in drivers */
930610618 if (dev->reg_state == NETREG_UNINITIALIZED) {
....@@ -9352,9 +10664,10 @@
935210664 if (head) {
935310665 list_move_tail(&dev->unreg_list, head);
935410666 } else {
9355
- rollback_registered(dev);
9356
- /* Finish processing unregister after unlock */
9357
- net_set_todo(dev);
10667
+ LIST_HEAD(single);
10668
+
10669
+ list_add(&dev->unreg_list, &single);
10670
+ unregister_netdevice_many(&single);
935810671 }
935910672 }
936010673 EXPORT_SYMBOL(unregister_netdevice_queue);
....@@ -9368,14 +10681,100 @@
936810681 */
936910682 void unregister_netdevice_many(struct list_head *head)
937010683 {
9371
- struct net_device *dev;
10684
+ struct net_device *dev, *tmp;
10685
+ LIST_HEAD(close_head);
937210686
9373
- if (!list_empty(head)) {
9374
- rollback_registered_many(head);
9375
- list_for_each_entry(dev, head, unreg_list)
9376
- net_set_todo(dev);
9377
- list_del(head);
10687
+ BUG_ON(dev_boot_phase);
10688
+ ASSERT_RTNL();
10689
+
10690
+ if (list_empty(head))
10691
+ return;
10692
+
10693
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10694
+ /* Some devices call without registering
10695
+ * for initialization unwind. Remove those
10696
+ * devices and proceed with the remaining.
10697
+ */
10698
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
10699
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10700
+ dev->name, dev);
10701
+
10702
+ WARN_ON(1);
10703
+ list_del(&dev->unreg_list);
10704
+ continue;
10705
+ }
10706
+ dev->dismantle = true;
10707
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
937810708 }
10709
+
10710
+ /* If device is running, close it first. */
10711
+ list_for_each_entry(dev, head, unreg_list)
10712
+ list_add_tail(&dev->close_list, &close_head);
10713
+ dev_close_many(&close_head, true);
10714
+
10715
+ list_for_each_entry(dev, head, unreg_list) {
10716
+ /* And unlink it from device chain. */
10717
+ unlist_netdevice(dev);
10718
+
10719
+ dev->reg_state = NETREG_UNREGISTERING;
10720
+ }
10721
+ flush_all_backlogs();
10722
+
10723
+ synchronize_net();
10724
+
10725
+ list_for_each_entry(dev, head, unreg_list) {
10726
+ struct sk_buff *skb = NULL;
10727
+
10728
+ /* Shutdown queueing discipline. */
10729
+ dev_shutdown(dev);
10730
+
10731
+ dev_xdp_uninstall(dev);
10732
+
10733
+ /* Notify protocols, that we are about to destroy
10734
+ * this device. They should clean all the things.
10735
+ */
10736
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10737
+
10738
+ if (!dev->rtnl_link_ops ||
10739
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10740
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10741
+ GFP_KERNEL, NULL, 0);
10742
+
10743
+ /*
10744
+ * Flush the unicast and multicast chains
10745
+ */
10746
+ dev_uc_flush(dev);
10747
+ dev_mc_flush(dev);
10748
+
10749
+ netdev_name_node_alt_flush(dev);
10750
+ netdev_name_node_free(dev->name_node);
10751
+
10752
+ if (dev->netdev_ops->ndo_uninit)
10753
+ dev->netdev_ops->ndo_uninit(dev);
10754
+
10755
+ if (skb)
10756
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10757
+
10758
+ /* Notifier chain MUST detach us all upper devices. */
10759
+ WARN_ON(netdev_has_any_upper_dev(dev));
10760
+ WARN_ON(netdev_has_any_lower_dev(dev));
10761
+
10762
+ /* Remove entries from kobject tree */
10763
+ netdev_unregister_kobject(dev);
10764
+#ifdef CONFIG_XPS
10765
+ /* Remove XPS queueing entries */
10766
+ netif_reset_xps_queues_gt(dev, 0);
10767
+#endif
10768
+ }
10769
+
10770
+ synchronize_net();
10771
+
10772
+ list_for_each_entry(dev, head, unreg_list) {
10773
+ dev_put(dev);
10774
+ net_set_todo(dev);
10775
+ }
10776
+
10777
+ list_del(head);
937910778 }
938010779 EXPORT_SYMBOL(unregister_netdevice_many);
938110780
....@@ -9414,6 +10813,7 @@
941410813
941510814 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
941610815 {
10816
+ struct net *net_old = dev_net(dev);
941710817 int err, new_nsid, new_ifindex;
941810818
941910819 ASSERT_RTNL();
....@@ -9429,7 +10829,7 @@
942910829
943010830 /* Get out if there is nothing todo */
943110831 err = 0;
9432
- if (net_eq(dev_net(dev), net))
10832
+ if (net_eq(net_old, net))
943310833 goto out;
943410834
943510835 /* Pick the destination device name, and ensure
....@@ -9490,6 +10890,9 @@
949010890 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
949110891 netdev_adjacent_del_links(dev);
949210892
10893
+ /* Move per-net netdevice notifiers that are following the netdevice */
10894
+ move_netdevice_notifiers_dev_net(dev, net);
10895
+
949310896 /* Actually switch the network namespace */
949410897 dev_net_set(dev, net);
949510898 dev->ifindex = new_ifindex;
....@@ -9500,6 +10903,12 @@
950010903
950110904 /* Fixup kobjects */
950210905 err = device_rename(&dev->dev, dev->name);
10906
+ WARN_ON(err);
10907
+
10908
+ /* Adapt owner in case owning user namespace of target network
10909
+ * namespace is different from the original one.
10910
+ */
10911
+ err = netdev_change_owner(dev, net_old, net);
950310912 WARN_ON(err);
950410913
950510914 /* Add the device back in the hashes */
....@@ -9566,7 +10975,6 @@
956610975
956710976 raise_softirq_irqoff(NET_TX_SOFTIRQ);
956810977 local_irq_enable();
9569
- preempt_check_resched_rt();
957010978
957110979 #ifdef CONFIG_RPS
957210980 remsd = oldsd->rps_ipi_list;
....@@ -9580,12 +10988,9 @@
958010988 netif_rx_ni(skb);
958110989 input_queue_head_incr(oldsd);
958210990 }
9583
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
10991
+ while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
958410992 netif_rx_ni(skb);
958510993 input_queue_head_incr(oldsd);
9586
- }
9587
- while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
9588
- kfree_skb(skb);
958910994 }
959010995
959110996 return 0;
....@@ -9636,7 +11041,7 @@
963611041 static int __net_init netdev_init(struct net *net)
963711042 {
963811043 BUILD_BUG_ON(GRO_HASH_BUCKETS >
9639
- 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
11044
+ 8 * sizeof_field(struct napi_struct, gro_bitmask));
964011045
964111046 if (net != &init_net)
964211047 INIT_LIST_HEAD(&net->dev_base_head);
....@@ -9648,6 +11053,8 @@
964811053 net->dev_index_head = netdev_create_hash();
964911054 if (net->dev_index_head == NULL)
965011055 goto err_idx;
11056
+
11057
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
965111058
965211059 return 0;
965311060
....@@ -9770,7 +11177,7 @@
977011177 continue;
977111178
977211179 /* Leave virtual devices for the generic cleanup */
9773
- if (dev->rtnl_link_ops)
11180
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
977411181 continue;
977511182
977611183 /* Push remaining network devices to init_net */
....@@ -9897,9 +11304,8 @@
989711304
989811305 INIT_WORK(flush, flush_backlog);
989911306
9900
- skb_queue_head_init_raw(&sd->input_pkt_queue);
9901
- skb_queue_head_init_raw(&sd->process_queue);
9902
- skb_queue_head_init_raw(&sd->tofree_queue);
11307
+ skb_queue_head_init(&sd->input_pkt_queue);
11308
+ skb_queue_head_init(&sd->process_queue);
990311309 #ifdef CONFIG_XFRM_OFFLOAD
990411310 skb_queue_head_init(&sd->xfrm_backlog);
990511311 #endif