2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/net/core/dev.c
....@@ -1,10 +1,6 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * NET3 Protocol independent device support routines.
3
- *
4
- * This program is free software; you can redistribute it and/or
5
- * modify it under the terms of the GNU General Public License
6
- * as published by the Free Software Foundation; either version
7
- * 2 of the License, or (at your option) any later version.
84 *
95 * Derived from the non IP parts of dev.c 1.0.19
106 * Authors: Ross Biro
....@@ -102,6 +98,7 @@
10298 #include <net/busy_poll.h>
10399 #include <linux/rtnetlink.h>
104100 #include <linux/stat.h>
101
+#include <net/dsa.h>
105102 #include <net/dst.h>
106103 #include <net/dst_metadata.h>
107104 #include <net/pkt_sched.h>
....@@ -132,7 +129,6 @@
132129 #include <trace/events/napi.h>
133130 #include <trace/events/net.h>
134131 #include <trace/events/skb.h>
135
-#include <linux/pci.h>
136132 #include <linux/inetdevice.h>
137133 #include <linux/cpu_rmap.h>
138134 #include <linux/static_key.h>
....@@ -146,11 +142,15 @@
146142 #include <linux/sctp.h>
147143 #include <net/udp_tunnel.h>
148144 #include <linux/net_namespace.h>
145
+#include <linux/indirect_call_wrapper.h>
146
+#include <net/devlink.h>
147
+#include <linux/pm_runtime.h>
148
+#include <linux/prandom.h>
149
+#include <trace/hooks/net.h>
149150
150151 #include "net-sysfs.h"
151152
152153 #define MAX_GRO_SKBS 8
153
-#define MAX_NEST_DEV 8
154154
155155 /* This should be increased if a protocol with a bigger head is added. */
156156 #define GRO_MAX_HEAD (MAX_HEADER + 128)
....@@ -164,6 +164,9 @@
164164 static int netif_rx_internal(struct sk_buff *skb);
165165 static int call_netdevice_notifiers_info(unsigned long val,
166166 struct netdev_notifier_info *info);
167
+static int call_netdevice_notifiers_extack(unsigned long val,
168
+ struct net_device *dev,
169
+ struct netlink_ext_ack *extack);
167170 static struct napi_struct *napi_by_id(unsigned int napi_id);
168171
169172 /*
....@@ -219,15 +222,137 @@
219222 static inline void rps_lock(struct softnet_data *sd)
220223 {
221224 #ifdef CONFIG_RPS
222
- spin_lock(&sd->input_pkt_queue.lock);
225
+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
223226 #endif
224227 }
225228
226229 static inline void rps_unlock(struct softnet_data *sd)
227230 {
228231 #ifdef CONFIG_RPS
229
- spin_unlock(&sd->input_pkt_queue.lock);
232
+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
230233 #endif
234
+}
235
+
236
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
237
+ const char *name)
238
+{
239
+ struct netdev_name_node *name_node;
240
+
241
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
242
+ if (!name_node)
243
+ return NULL;
244
+ INIT_HLIST_NODE(&name_node->hlist);
245
+ name_node->dev = dev;
246
+ name_node->name = name;
247
+ return name_node;
248
+}
249
+
250
+static struct netdev_name_node *
251
+netdev_name_node_head_alloc(struct net_device *dev)
252
+{
253
+ struct netdev_name_node *name_node;
254
+
255
+ name_node = netdev_name_node_alloc(dev, dev->name);
256
+ if (!name_node)
257
+ return NULL;
258
+ INIT_LIST_HEAD(&name_node->list);
259
+ return name_node;
260
+}
261
+
262
+static void netdev_name_node_free(struct netdev_name_node *name_node)
263
+{
264
+ kfree(name_node);
265
+}
266
+
267
+static void netdev_name_node_add(struct net *net,
268
+ struct netdev_name_node *name_node)
269
+{
270
+ hlist_add_head_rcu(&name_node->hlist,
271
+ dev_name_hash(net, name_node->name));
272
+}
273
+
274
+static void netdev_name_node_del(struct netdev_name_node *name_node)
275
+{
276
+ hlist_del_rcu(&name_node->hlist);
277
+}
278
+
279
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
280
+ const char *name)
281
+{
282
+ struct hlist_head *head = dev_name_hash(net, name);
283
+ struct netdev_name_node *name_node;
284
+
285
+ hlist_for_each_entry(name_node, head, hlist)
286
+ if (!strcmp(name_node->name, name))
287
+ return name_node;
288
+ return NULL;
289
+}
290
+
291
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
292
+ const char *name)
293
+{
294
+ struct hlist_head *head = dev_name_hash(net, name);
295
+ struct netdev_name_node *name_node;
296
+
297
+ hlist_for_each_entry_rcu(name_node, head, hlist)
298
+ if (!strcmp(name_node->name, name))
299
+ return name_node;
300
+ return NULL;
301
+}
302
+
303
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
304
+{
305
+ struct netdev_name_node *name_node;
306
+ struct net *net = dev_net(dev);
307
+
308
+ name_node = netdev_name_node_lookup(net, name);
309
+ if (name_node)
310
+ return -EEXIST;
311
+ name_node = netdev_name_node_alloc(dev, name);
312
+ if (!name_node)
313
+ return -ENOMEM;
314
+ netdev_name_node_add(net, name_node);
315
+ /* The node that holds dev->name acts as a head of per-device list. */
316
+ list_add_tail(&name_node->list, &dev->name_node->list);
317
+
318
+ return 0;
319
+}
320
+EXPORT_SYMBOL(netdev_name_node_alt_create);
321
+
322
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
323
+{
324
+ list_del(&name_node->list);
325
+ netdev_name_node_del(name_node);
326
+ kfree(name_node->name);
327
+ netdev_name_node_free(name_node);
328
+}
329
+
330
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
331
+{
332
+ struct netdev_name_node *name_node;
333
+ struct net *net = dev_net(dev);
334
+
335
+ name_node = netdev_name_node_lookup(net, name);
336
+ if (!name_node)
337
+ return -ENOENT;
338
+ /* lookup might have found our primary name or a name belonging
339
+ * to another device.
340
+ */
341
+ if (name_node == dev->name_node || name_node->dev != dev)
342
+ return -EINVAL;
343
+
344
+ __netdev_name_node_alt_destroy(name_node);
345
+
346
+ return 0;
347
+}
348
+EXPORT_SYMBOL(netdev_name_node_alt_destroy);
349
+
350
+static void netdev_name_node_alt_flush(struct net_device *dev)
351
+{
352
+ struct netdev_name_node *name_node, *tmp;
353
+
354
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
355
+ __netdev_name_node_alt_destroy(name_node);
231356 }
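
Taken together, the helpers above let one net_device answer to several names: the node that carries dev->name heads a per-device list, and each alternative name gets its own node in the same name hash. A hedged usage sketch (the rtnetlink "altname" plumbing that normally drives this is not shown; note that the node keeps the name pointer and kfree()s it on destroy, so ownership of a heap-allocated string passes on success):

static int example_add_altname(struct net_device *dev)
{
	char *alt;
	int err;

	ASSERT_RTNL();

	alt = kstrdup("uplink0", GFP_KERNEL);
	if (!alt)
		return -ENOMEM;

	err = netdev_name_node_alt_create(dev, alt);
	if (err) {		/* -EEXIST if the name is already in use */
		kfree(alt);
		return err;
	}

	/* __dev_get_by_name() / dev_get_by_name_rcu() now resolve
	 * "uplink0" to this device as well.
	 */

	return netdev_name_node_alt_destroy(dev, "uplink0");
}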
232357
233358 /* Device list insertion */
....@@ -239,7 +364,7 @@
239364
240365 write_lock_bh(&dev_base_lock);
241366 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
242
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
367
+ netdev_name_node_add(net, dev->name_node);
243368 hlist_add_head_rcu(&dev->index_hlist,
244369 dev_index_hash(net, dev->ifindex));
245370 write_unlock_bh(&dev_base_lock);
....@@ -257,7 +382,7 @@
257382 /* Unlink dev from the device chain */
258383 write_lock_bh(&dev_base_lock);
259384 list_del_rcu(&dev->dev_list);
260
- hlist_del_rcu(&dev->name_hlist);
385
+ netdev_name_node_del(dev->name_node);
261386 hlist_del_rcu(&dev->index_hlist);
262387 write_unlock_bh(&dev_base_lock);
263388
....@@ -355,6 +480,7 @@
355480 unsigned short dev_type)
356481 {
357482 }
483
+
358484 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
359485 {
360486 }
....@@ -385,6 +511,12 @@
385511
386512 static inline struct list_head *ptype_head(const struct packet_type *pt)
387513 {
514
+ struct list_head vendor_pt = { .next = NULL, };
515
+
516
+ trace_android_vh_ptype_head(pt, &vendor_pt);
517
+ if (vendor_pt.next)
518
+ return vendor_pt.next;
519
+
388520 if (pt->type == htons(ETH_P_ALL))
389521 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
390522 else
....@@ -735,14 +867,10 @@
735867
736868 struct net_device *__dev_get_by_name(struct net *net, const char *name)
737869 {
738
- struct net_device *dev;
739
- struct hlist_head *head = dev_name_hash(net, name);
870
+ struct netdev_name_node *node_name;
740871
741
- hlist_for_each_entry(dev, head, name_hlist)
742
- if (!strncmp(dev->name, name, IFNAMSIZ))
743
- return dev;
744
-
745
- return NULL;
872
+ node_name = netdev_name_node_lookup(net, name);
873
+ return node_name ? node_name->dev : NULL;
746874 }
747875 EXPORT_SYMBOL(__dev_get_by_name);
748876
....@@ -760,14 +888,10 @@
760888
761889 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
762890 {
763
- struct net_device *dev;
764
- struct hlist_head *head = dev_name_hash(net, name);
891
+ struct netdev_name_node *node_name;
765892
766
- hlist_for_each_entry_rcu(dev, head, name_hlist)
767
- if (!strncmp(dev->name, name, IFNAMSIZ))
768
- return dev;
769
-
770
- return NULL;
893
+ node_name = netdev_name_node_lookup_rcu(net, name);
894
+ return node_name ? node_name->dev : NULL;
771895 }
772896 EXPORT_SYMBOL(dev_get_by_name_rcu);
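
Both lookup helpers now go through the name-node hash, so they also find devices by their alternative names. A minimal, hedged usage sketch of the RCU variant (no reference is taken, so the device pointer is only valid inside the read-side section):

static int example_ifindex_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		ifindex = dev->ifindex;
	rcu_read_unlock();

	return ifindex;	/* 0 if no such device */
}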
773897
....@@ -1015,7 +1139,7 @@
10151139 * @name: name string
10161140 *
10171141 * Network device names need to be valid file names to
1018
- * to allow sysfs to work. We also disallow any kind of
1142
+ * allow sysfs to work. We also disallow any kind of
10191143 * whitespace.
10201144 */
10211145 bool dev_valid_name(const char *name)
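
A few concrete cases of the rule documented above, as a hedged illustration only:

static void example_name_checks(void)
{
	WARN_ON(!dev_valid_name("eth0"));	/* plain name: valid          */
	WARN_ON(dev_valid_name("eth/0"));	/* '/' would break sysfs paths */
	WARN_ON(dev_valid_name("."));		/* "." and ".." are reserved   */
	WARN_ON(dev_valid_name("my if"));	/* whitespace is rejected      */
}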
....@@ -1078,6 +1202,18 @@
10781202 return -ENOMEM;
10791203
10801204 for_each_netdev(net, d) {
1205
+ struct netdev_name_node *name_node;
1206
+ list_for_each_entry(name_node, &d->name_node->list, list) {
1207
+ if (!sscanf(name_node->name, name, &i))
1208
+ continue;
1209
+ if (i < 0 || i >= max_netdevices)
1210
+ continue;
1211
+
1212
+ /* avoid cases where sscanf is not exact inverse of printf */
1213
+ snprintf(buf, IFNAMSIZ, name, i);
1214
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
1215
+ set_bit(i, inuse);
1216
+ }
10811217 if (!sscanf(d->name, name, &i))
10821218 continue;
10831219 if (i < 0 || i >= max_netdevices)
....@@ -1138,8 +1274,8 @@
11381274 }
11391275 EXPORT_SYMBOL(dev_alloc_name);
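
The scan above now also walks each device's alternative-name list, so an altname such as "eth3" reserves index 3 exactly as a primary name would. A hedged sketch of the allocator from a hypothetical registration path: with "eth0" and "eth2" already present (as primary names or altnames), bits 0 and 2 are marked in the bitmap and the new device becomes "eth1".

static int example_name_and_register(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();

	err = dev_alloc_name(dev, "eth%d");	/* lowest free index wins */
	if (err < 0)
		return err;

	return register_netdevice(dev);
}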
11401276
1141
-int dev_get_valid_name(struct net *net, struct net_device *dev,
1142
- const char *name)
1277
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
1278
+ const char *name)
11431279 {
11441280 BUG_ON(!net);
11451281
....@@ -1155,7 +1291,6 @@
11551291
11561292 return 0;
11571293 }
1158
-EXPORT_SYMBOL(dev_get_valid_name);
11591294
11601295 /**
11611296 * dev_change_name - change name of a device
....@@ -1229,13 +1364,13 @@
12291364 netdev_adjacent_rename_links(dev, oldname);
12301365
12311366 write_lock_bh(&dev_base_lock);
1232
- hlist_del_rcu(&dev->name_hlist);
1367
+ netdev_name_node_del(dev->name_node);
12331368 write_unlock_bh(&dev_base_lock);
12341369
12351370 synchronize_rcu();
12361371
12371372 write_lock_bh(&dev_base_lock);
1238
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1373
+ netdev_name_node_add(net, dev->name_node);
12391374 write_unlock_bh(&dev_base_lock);
12401375
12411376 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
....@@ -1285,8 +1420,8 @@
12851420 }
12861421
12871422 mutex_lock(&ifalias_mutex);
1288
- rcu_swap_protected(dev->ifalias, new_alias,
1289
- mutex_is_locked(&ifalias_mutex));
1423
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1424
+ mutex_is_locked(&ifalias_mutex));
12901425 mutex_unlock(&ifalias_mutex);
12911426
12921427 if (new_alias)
....@@ -1372,15 +1507,20 @@
13721507 }
13731508 EXPORT_SYMBOL(netdev_notify_peers);
13741509
1375
-static int __dev_open(struct net_device *dev)
1510
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
13761511 {
13771512 const struct net_device_ops *ops = dev->netdev_ops;
13781513 int ret;
13791514
13801515 ASSERT_RTNL();
13811516
1382
- if (!netif_device_present(dev))
1383
- return -ENODEV;
1517
+ if (!netif_device_present(dev)) {
1518
+ /* may be detached because parent is runtime-suspended */
1519
+ if (dev->dev.parent)
1520
+ pm_runtime_resume(dev->dev.parent);
1521
+ if (!netif_device_present(dev))
1522
+ return -ENODEV;
1523
+ }
13841524
13851525 /* Block netpoll from trying to do any rx path servicing.
13861526 * If we don't do this there is a chance ndo_poll_controller
....@@ -1388,7 +1528,7 @@
13881528 */
13891529 netpoll_poll_disable(dev);
13901530
1391
- ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1531
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
13921532 ret = notifier_to_errno(ret);
13931533 if (ret)
13941534 return ret;
....@@ -1417,7 +1557,8 @@
14171557
14181558 /**
14191559 * dev_open - prepare an interface for use.
1420
- * @dev: device to open
1560
+ * @dev: device to open
1561
+ * @extack: netlink extended ack
14211562 *
14221563 * Takes a device from down to up state. The device's private open
14231564 * function is invoked and then the multicast lists are loaded. Finally
....@@ -1427,14 +1568,14 @@
14271568 * Calling this function on an active interface is a nop. On a failure
14281569 * a negative errno code is returned.
14291570 */
1430
-int dev_open(struct net_device *dev)
1571
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
14311572 {
14321573 int ret;
14331574
14341575 if (dev->flags & IFF_UP)
14351576 return 0;
14361577
1437
- ret = __dev_open(dev);
1578
+ ret = __dev_open(dev, extack);
14381579 if (ret < 0)
14391580 return ret;
14401581
....@@ -1596,6 +1737,7 @@
15961737 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
15971738 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
15981739 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1740
+ N(PRE_CHANGEADDR)
15991741 }
16001742 #undef N
16011743 return "UNKNOWN_NETDEV_EVENT";
....@@ -1610,6 +1752,62 @@
16101752 };
16111753
16121754 return nb->notifier_call(nb, val, &info);
1755
+}
1756
+
1757
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
1758
+ struct net_device *dev)
1759
+{
1760
+ int err;
1761
+
1762
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1763
+ err = notifier_to_errno(err);
1764
+ if (err)
1765
+ return err;
1766
+
1767
+ if (!(dev->flags & IFF_UP))
1768
+ return 0;
1769
+
1770
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
1771
+ return 0;
1772
+}
1773
+
1774
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1775
+ struct net_device *dev)
1776
+{
1777
+ if (dev->flags & IFF_UP) {
1778
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1779
+ dev);
1780
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1781
+ }
1782
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1783
+}
1784
+
1785
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1786
+ struct net *net)
1787
+{
1788
+ struct net_device *dev;
1789
+ int err;
1790
+
1791
+ for_each_netdev(net, dev) {
1792
+ err = call_netdevice_register_notifiers(nb, dev);
1793
+ if (err)
1794
+ goto rollback;
1795
+ }
1796
+ return 0;
1797
+
1798
+rollback:
1799
+ for_each_netdev_continue_reverse(net, dev)
1800
+ call_netdevice_unregister_notifiers(nb, dev);
1801
+ return err;
1802
+}
1803
+
1804
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1805
+ struct net *net)
1806
+{
1807
+ struct net_device *dev;
1808
+
1809
+ for_each_netdev(net, dev)
1810
+ call_netdevice_unregister_notifiers(nb, dev);
16131811 }
16141812
16151813 static int dev_boot_phase = 1;
....@@ -1630,8 +1828,6 @@
16301828
16311829 int register_netdevice_notifier(struct notifier_block *nb)
16321830 {
1633
- struct net_device *dev;
1634
- struct net_device *last;
16351831 struct net *net;
16361832 int err;
16371833
....@@ -1644,17 +1840,9 @@
16441840 if (dev_boot_phase)
16451841 goto unlock;
16461842 for_each_net(net) {
1647
- for_each_netdev(net, dev) {
1648
- err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1649
- err = notifier_to_errno(err);
1650
- if (err)
1651
- goto rollback;
1652
-
1653
- if (!(dev->flags & IFF_UP))
1654
- continue;
1655
-
1656
- call_netdevice_notifier(nb, NETDEV_UP, dev);
1657
- }
1843
+ err = call_netdevice_register_net_notifiers(nb, net);
1844
+ if (err)
1845
+ goto rollback;
16581846 }
16591847
16601848 unlock:
....@@ -1663,22 +1851,9 @@
16631851 return err;
16641852
16651853 rollback:
1666
- last = dev;
1667
- for_each_net(net) {
1668
- for_each_netdev(net, dev) {
1669
- if (dev == last)
1670
- goto outroll;
1854
+ for_each_net_continue_reverse(net)
1855
+ call_netdevice_unregister_net_notifiers(nb, net);
16711856
1672
- if (dev->flags & IFF_UP) {
1673
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1674
- dev);
1675
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1676
- }
1677
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1678
- }
1679
- }
1680
-
1681
-outroll:
16821857 raw_notifier_chain_unregister(&netdev_chain, nb);
16831858 goto unlock;
16841859 }
....@@ -1700,7 +1875,6 @@
17001875
17011876 int unregister_netdevice_notifier(struct notifier_block *nb)
17021877 {
1703
- struct net_device *dev;
17041878 struct net *net;
17051879 int err;
17061880
....@@ -1711,22 +1885,147 @@
17111885 if (err)
17121886 goto unlock;
17131887
1714
- for_each_net(net) {
1715
- for_each_netdev(net, dev) {
1716
- if (dev->flags & IFF_UP) {
1717
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1718
- dev);
1719
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1720
- }
1721
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1722
- }
1723
- }
1888
+ for_each_net(net)
1889
+ call_netdevice_unregister_net_notifiers(nb, net);
1890
+
17241891 unlock:
17251892 rtnl_unlock();
17261893 up_write(&pernet_ops_rwsem);
17271894 return err;
17281895 }
17291896 EXPORT_SYMBOL(unregister_netdevice_notifier);
1897
+
1898
+static int __register_netdevice_notifier_net(struct net *net,
1899
+ struct notifier_block *nb,
1900
+ bool ignore_call_fail)
1901
+{
1902
+ int err;
1903
+
1904
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
1905
+ if (err)
1906
+ return err;
1907
+ if (dev_boot_phase)
1908
+ return 0;
1909
+
1910
+ err = call_netdevice_register_net_notifiers(nb, net);
1911
+ if (err && !ignore_call_fail)
1912
+ goto chain_unregister;
1913
+
1914
+ return 0;
1915
+
1916
+chain_unregister:
1917
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
1918
+ return err;
1919
+}
1920
+
1921
+static int __unregister_netdevice_notifier_net(struct net *net,
1922
+ struct notifier_block *nb)
1923
+{
1924
+ int err;
1925
+
1926
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1927
+ if (err)
1928
+ return err;
1929
+
1930
+ call_netdevice_unregister_net_notifiers(nb, net);
1931
+ return 0;
1932
+}
1933
+
1934
+/**
1935
+ * register_netdevice_notifier_net - register a per-netns network notifier block
1936
+ * @net: network namespace
1937
+ * @nb: notifier
1938
+ *
1939
+ * Register a notifier to be called when network device events occur.
1940
+ * The notifier passed is linked into the kernel structures and must
1941
+ * not be reused until it has been unregistered. A negative errno code
1942
+ * is returned on a failure.
1943
+ *
1944
+ * When registered all registration and up events are replayed
1945
+ * to the new notifier to allow device to have a race free
1946
+ * view of the network device list.
1947
+ */
1948
+
1949
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1950
+{
1951
+ int err;
1952
+
1953
+ rtnl_lock();
1954
+ err = __register_netdevice_notifier_net(net, nb, false);
1955
+ rtnl_unlock();
1956
+ return err;
1957
+}
1958
+EXPORT_SYMBOL(register_netdevice_notifier_net);
1959
+
1960
+/**
1961
+ * unregister_netdevice_notifier_net - unregister a per-netns
1962
+ * network notifier block
1963
+ * @net: network namespace
1964
+ * @nb: notifier
1965
+ *
1966
+ * Unregister a notifier previously registered by
1967
+ * register_netdevice_notifier(). The notifier is unlinked into the
1968
+ * kernel structures and may then be reused. A negative errno code
1969
+ * is returned on a failure.
1970
+ *
1971
+ * After unregistering unregister and down device events are synthesized
1972
+ * for all devices on the device list to the removed notifier to remove
1973
+ * the need for special case cleanup code.
1974
+ */
1975
+
1976
+int unregister_netdevice_notifier_net(struct net *net,
1977
+ struct notifier_block *nb)
1978
+{
1979
+ int err;
1980
+
1981
+ rtnl_lock();
1982
+ err = __unregister_netdevice_notifier_net(net, nb);
1983
+ rtnl_unlock();
1984
+ return err;
1985
+}
1986
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
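
A hedged sketch of how a module might use the per-netns variant instead of the global chain (module and handler names are hypothetical):

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		netdev_info(dev, "came up\n");
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_init(void)
{
	/* Replays NETDEV_REGISTER/NETDEV_UP for devices already in init_net. */
	return register_netdevice_notifier_net(&init_net, &example_nb);
}

static void __exit example_exit(void)
{
	unregister_netdevice_notifier_net(&init_net, &example_nb);
}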
1987
+
1988
+int register_netdevice_notifier_dev_net(struct net_device *dev,
1989
+ struct notifier_block *nb,
1990
+ struct netdev_net_notifier *nn)
1991
+{
1992
+ int err;
1993
+
1994
+ rtnl_lock();
1995
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1996
+ if (!err) {
1997
+ nn->nb = nb;
1998
+ list_add(&nn->list, &dev->net_notifier_list);
1999
+ }
2000
+ rtnl_unlock();
2001
+ return err;
2002
+}
2003
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
2004
+
2005
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
2006
+ struct notifier_block *nb,
2007
+ struct netdev_net_notifier *nn)
2008
+{
2009
+ int err;
2010
+
2011
+ rtnl_lock();
2012
+ list_del(&nn->list);
2013
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2014
+ rtnl_unlock();
2015
+ return err;
2016
+}
2017
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
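
The *_dev_net variants additionally record the notifier in dev->net_notifier_list through the caller-supplied struct netdev_net_notifier, so move_netdevice_notifiers_dev_net() below can re-home it when the device changes namespaces. A hedged sketch of a driver embedding that tracking structure in its private data (names are hypothetical; priv->nb.notifier_call is assumed to be set up elsewhere):

struct example_priv {
	struct notifier_block nb;
	struct netdev_net_notifier nn;	/* lets the core move nb with the dev */
};

static int example_watch(struct net_device *dev, struct example_priv *priv)
{
	return register_netdevice_notifier_dev_net(dev, &priv->nb, &priv->nn);
}

static void example_unwatch(struct net_device *dev, struct example_priv *priv)
{
	unregister_netdevice_notifier_dev_net(dev, &priv->nb, &priv->nn);
}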
2018
+
2019
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2020
+ struct net *net)
2021
+{
2022
+ struct netdev_net_notifier *nn;
2023
+
2024
+ list_for_each_entry(nn, &dev->net_notifier_list, list) {
2025
+ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2026
+ __register_netdevice_notifier_net(net, nn->nb, true);
2027
+ }
2028
+}
17302029
17312030 /**
17322031 * call_netdevice_notifiers_info - call all network notifier blocks
....@@ -1740,8 +2039,31 @@
17402039 static int call_netdevice_notifiers_info(unsigned long val,
17412040 struct netdev_notifier_info *info)
17422041 {
2042
+ struct net *net = dev_net(info->dev);
2043
+ int ret;
2044
+
17432045 ASSERT_RTNL();
2046
+
2047
+ /* Run per-netns notifier block chain first, then run the global one.
2048
+ * Hopefully, one day, the global one is going to be removed after
2049
+ * all notifier block registrators get converted to be per-netns.
2050
+ */
2051
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2052
+ if (ret & NOTIFY_STOP_MASK)
2053
+ return ret;
17442054 return raw_notifier_call_chain(&netdev_chain, val, info);
2055
+}
2056
+
2057
+static int call_netdevice_notifiers_extack(unsigned long val,
2058
+ struct net_device *dev,
2059
+ struct netlink_ext_ack *extack)
2060
+{
2061
+ struct netdev_notifier_info info = {
2062
+ .dev = dev,
2063
+ .extack = extack,
2064
+ };
2065
+
2066
+ return call_netdevice_notifiers_info(val, &info);
17452067 }
17462068
17472069 /**
....@@ -1755,11 +2077,7 @@
17552077
17562078 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
17572079 {
1758
- struct netdev_notifier_info info = {
1759
- .dev = dev,
1760
- };
1761
-
1762
- return call_netdevice_notifiers_info(val, &info);
2080
+ return call_netdevice_notifiers_extack(val, dev, NULL);
17632081 }
17642082 EXPORT_SYMBOL(call_netdevice_notifiers);
17652083
....@@ -1987,6 +2305,17 @@
19872305 return false;
19882306 }
19892307
2308
+/**
2309
+ * dev_nit_active - return true if any network interface taps are in use
2310
+ *
2311
+ * @dev: network device to check for the presence of taps
2312
+ */
2313
+bool dev_nit_active(struct net_device *dev)
2314
+{
2315
+ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2316
+}
2317
+EXPORT_SYMBOL_GPL(dev_nit_active);
2318
+
19902319 /*
19912320 * Support routine. Sends outgoing frames to any network
19922321 * taps currently in use.
....@@ -2002,6 +2331,9 @@
20022331 rcu_read_lock();
20032332 again:
20042333 list_for_each_entry_rcu(ptype, ptype_list, list) {
2334
+ if (ptype->ignore_outgoing)
2335
+ continue;
2336
+
20052337 /* Never send packets back to the socket
20062338 * they originated from - MvS (miquels@drinkel.ow.org)
20072339 */
....@@ -2723,6 +3055,7 @@
27233055 sd->output_queue_tailp = &q->next_sched;
27243056 raise_softirq_irqoff(NET_TX_SOFTIRQ);
27253057 local_irq_restore(flags);
3058
+ preempt_check_resched_rt();
27263059 }
27273060
27283061 void __netif_schedule(struct Qdisc *q)
....@@ -2744,7 +3077,7 @@
27443077 void netif_schedule_queue(struct netdev_queue *txq)
27453078 {
27463079 rcu_read_lock();
2747
- if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
3080
+ if (!netif_xmit_stopped(txq)) {
27483081 struct Qdisc *q = rcu_dereference(txq->qdisc);
27493082
27503083 __netif_schedule(q);
....@@ -2785,6 +3118,7 @@
27853118 __this_cpu_write(softnet_data.completion_queue, skb);
27863119 raise_softirq_irqoff(NET_TX_SOFTIRQ);
27873120 local_irq_restore(flags);
3121
+ preempt_check_resched_rt();
27883122 }
27893123 EXPORT_SYMBOL(__dev_kfree_skb_irq);
27903124
....@@ -2881,12 +3215,10 @@
28813215 else
28823216 name = netdev_name(dev);
28833217 }
2884
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2885
- "gso_type=%d ip_summed=%d\n",
3218
+ skb_dump(KERN_WARNING, skb, false);
3219
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
28863220 name, dev ? &dev->features : &null_features,
2887
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
2888
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2889
- skb_shinfo(skb)->gso_type, skb->ip_summed);
3221
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
28903222 }
28913223
28923224 /*
....@@ -2916,18 +3248,19 @@
29163248 }
29173249
29183250 offset = skb_checksum_start_offset(skb);
2919
- BUG_ON(offset >= skb_headlen(skb));
3251
+ ret = -EINVAL;
3252
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
3253
+ goto out;
3254
+
29203255 csum = skb_checksum(skb, offset, skb->len - offset, 0);
29213256
29223257 offset += skb->csum_offset;
2923
- BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3258
+ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb)))
3259
+ goto out;
29243260
2925
- if (skb_cloned(skb) &&
2926
- !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2927
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2928
- if (ret)
2929
- goto out;
2930
- }
3261
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3262
+ if (ret)
3263
+ goto out;
29313264
29323265 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
29333266 out_set_summed:
....@@ -2962,12 +3295,11 @@
29623295 ret = -EINVAL;
29633296 goto out;
29643297 }
2965
- if (skb_cloned(skb) &&
2966
- !skb_clone_writable(skb, offset + sizeof(__le32))) {
2967
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2968
- if (ret)
2969
- goto out;
2970
- }
3298
+
3299
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3300
+ if (ret)
3301
+ goto out;
3302
+
29713303 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
29723304 skb->len - start, ~(__u32)0,
29733305 crc32c_csum_stub));
....@@ -3052,7 +3384,7 @@
30523384 * It may return NULL if the skb requires no segmentation. This is
30533385 * only possible when GSO is used for verifying header integrity.
30543386 *
3055
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3387
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
30563388 */
30573389 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
30583390 netdev_features_t features, bool tx_path)
....@@ -3081,7 +3413,7 @@
30813413 features &= ~NETIF_F_GSO_PARTIAL;
30823414 }
30833415
3084
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3416
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
30853417 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
30863418
30873419 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
....@@ -3092,7 +3424,7 @@
30923424
30933425 segs = skb_mac_gso_segment(skb, features);
30943426
3095
- if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3427
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
30963428 skb_warn_bad_offload(skb);
30973429
30983430 return segs;
....@@ -3101,10 +3433,11 @@
31013433
31023434 /* Take action when hardware reception checksum errors are detected. */
31033435 #ifdef CONFIG_BUG
3104
-void netdev_rx_csum_fault(struct net_device *dev)
3436
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
31053437 {
31063438 if (net_ratelimit()) {
31073439 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3440
+ skb_dump(KERN_ERR, skb, true);
31083441 dump_stack();
31093442 }
31103443 }
....@@ -3154,10 +3487,9 @@
31543487 static netdev_features_t harmonize_features(struct sk_buff *skb,
31553488 netdev_features_t features)
31563489 {
3157
- int tmp;
31583490 __be16 type;
31593491
3160
- type = skb_network_protocol(skb, &tmp);
3492
+ type = skb_network_protocol(skb, NULL);
31613493 features = net_mpls_features(skb, features, type);
31623494
31633495 if (skb->ip_summed != CHECKSUM_NONE &&
....@@ -3254,10 +3586,11 @@
32543586 unsigned int len;
32553587 int rc;
32563588
3257
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3589
+ if (dev_nit_active(dev))
32583590 dev_queue_xmit_nit(skb, dev);
32593591
32603592 len = skb->len;
3593
+ PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
32613594 trace_net_dev_start_xmit(skb, dev);
32623595 rc = netdev_start_xmit(skb, dev, txq, more);
32633596 trace_net_dev_xmit(skb, rc, dev, len);
....@@ -3274,7 +3607,7 @@
32743607 while (skb) {
32753608 struct sk_buff *next = skb->next;
32763609
3277
- skb->next = NULL;
3610
+ skb_mark_not_on_list(skb);
32783611 rc = xmit_one(skb, dev, txq, next != NULL);
32793612 if (unlikely(!dev_xmit_complete(rc))) {
32803613 skb->next = next;
....@@ -3374,7 +3707,7 @@
33743707
33753708 for (; skb != NULL; skb = next) {
33763709 next = skb->next;
3377
- skb->next = NULL;
3710
+ skb_mark_not_on_list(skb);
33783711
33793712 /* in case skb wont be segmented, point to itself */
33803713 skb->prev = skb;
....@@ -3405,7 +3738,7 @@
34053738 /* To get more precise estimation of bytes sent on wire,
34063739 * we add to pkt_len the headers size of all segments
34073740 */
3408
- if (shinfo->gso_size) {
3741
+ if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
34093742 unsigned int hdr_len;
34103743 u16 gso_segs = shinfo->gso_segs;
34113744
....@@ -3449,13 +3782,9 @@
34493782 qdisc_calculate_pkt_len(skb, q);
34503783
34513784 if (q->flags & TCQ_F_NOLOCK) {
3452
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3453
- __qdisc_drop(skb, &to_free);
3454
- rc = NET_XMIT_DROP;
3455
- } else {
3456
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3785
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3786
+ if (likely(!netif_xmit_frozen_or_stopped(txq)))
34573787 qdisc_run(q);
3458
- }
34593788
34603789 if (unlikely(to_free))
34613790 kfree_skb_list(to_free);
....@@ -3468,7 +3797,11 @@
34683797 * This permits qdisc->running owner to get the lock more
34693798 * often and dequeue packets faster.
34703799 */
3800
+#ifdef CONFIG_PREEMPT_RT
3801
+ contended = true;
3802
+#else
34713803 contended = qdisc_is_running(q);
3804
+#endif
34723805 if (unlikely(contended))
34733806 spin_lock(&q->busylock);
34743807
....@@ -3551,7 +3884,8 @@
35513884 skb_reset_mac_header(skb);
35523885 __skb_pull(skb, skb_network_offset(skb));
35533886 skb->pkt_type = PACKET_LOOPBACK;
3554
- skb->ip_summed = CHECKSUM_UNNECESSARY;
3887
+ if (skb->ip_summed == CHECKSUM_NONE)
3888
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
35553889 WARN_ON(!skb_dst(skb));
35563890 skb_dst_force(skb);
35573891 netif_rx_ni(skb);
....@@ -3570,6 +3904,7 @@
35703904 return skb;
35713905
35723906 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3907
+ qdisc_skb_cb(skb)->mru = 0;
35733908 mini_qdisc_bstats_cpu_update(miniq, skb);
35743909
35753910 switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
....@@ -3670,23 +4005,21 @@
36704005 }
36714006
36724007 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3673
- struct net_device *sb_dev,
3674
- select_queue_fallback_t fallback)
4008
+ struct net_device *sb_dev)
36754009 {
36764010 return 0;
36774011 }
36784012 EXPORT_SYMBOL(dev_pick_tx_zero);
36794013
36804014 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3681
- struct net_device *sb_dev,
3682
- select_queue_fallback_t fallback)
4015
+ struct net_device *sb_dev)
36834016 {
36844017 return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
36854018 }
36864019 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
36874020
3688
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3689
- struct net_device *sb_dev)
4021
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4022
+ struct net_device *sb_dev)
36904023 {
36914024 struct sock *sk = skb->sk;
36924025 int queue_index = sk_tx_queue_get(sk);
....@@ -3710,10 +4043,11 @@
37104043
37114044 return queue_index;
37124045 }
4046
+EXPORT_SYMBOL(netdev_pick_tx);
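
The hunks above drop the select_queue_fallback_t argument: drivers that want the default behaviour now call the exported netdev_pick_tx() (the old __netdev_pick_tx()) themselves. A hedged sketch of a driver's .ndo_select_queue under the new signature (the "special traffic" test is hypothetical):

static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				struct net_device *sb_dev)
{
	/* Steer hypothetical control traffic to queue 0 ... */
	if (skb->priority == TC_PRIO_CONTROL)
		return 0;

	/* ... and fall back to the stack's default selection. */
	return netdev_pick_tx(dev, skb, sb_dev);
}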
37134047
3714
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3715
- struct sk_buff *skb,
3716
- struct net_device *sb_dev)
4048
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4049
+ struct sk_buff *skb,
4050
+ struct net_device *sb_dev)
37174051 {
37184052 int queue_index = 0;
37194053
....@@ -3728,10 +4062,9 @@
37284062 const struct net_device_ops *ops = dev->netdev_ops;
37294063
37304064 if (ops->ndo_select_queue)
3731
- queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
3732
- __netdev_pick_tx);
4065
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
37334066 else
3734
- queue_index = __netdev_pick_tx(dev, skb, sb_dev);
4067
+ queue_index = netdev_pick_tx(dev, skb, sb_dev);
37354068
37364069 queue_index = netdev_cap_txqueue(dev, queue_index);
37374070 }
....@@ -3775,6 +4108,7 @@
37754108 bool again = false;
37764109
37774110 skb_reset_mac_header(skb);
4111
+ skb_assert_len(skb);
37784112
37794113 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
37804114 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
....@@ -3805,7 +4139,7 @@
38054139 else
38064140 skb_dst_force(skb);
38074141
3808
- txq = netdev_pick_tx(dev, skb, sb_dev);
4142
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
38094143 q = rcu_dereference_bh(txq->qdisc);
38104144
38114145 trace_net_dev_queue(skb);
....@@ -3840,6 +4174,7 @@
38404174 if (!skb)
38414175 goto out;
38424176
4177
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
38434178 HARD_TX_LOCK(dev, txq, cpu);
38444179
38454180 if (!netif_xmit_stopped(txq)) {
....@@ -3887,7 +4222,7 @@
38874222 }
38884223 EXPORT_SYMBOL(dev_queue_xmit_accel);
38894224
3890
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4225
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
38914226 {
38924227 struct net_device *dev = skb->dev;
38934228 struct sk_buff *orig_skb = skb;
....@@ -3905,6 +4240,7 @@
39054240
39064241 skb_set_queue_mapping(skb, queue_id);
39074242 txq = skb_get_tx_queue(dev, skb);
4243
+ PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
39084244
39094245 local_bh_disable();
39104246
....@@ -3916,17 +4252,13 @@
39164252 dev_xmit_recursion_dec();
39174253
39184254 local_bh_enable();
3919
-
3920
- if (!dev_xmit_complete(ret))
3921
- kfree_skb(skb);
3922
-
39234255 return ret;
39244256 drop:
39254257 atomic_long_inc(&dev->tx_dropped);
39264258 kfree_skb_list(skb);
39274259 return NET_XMIT_DROP;
39284260 }
3929
-EXPORT_SYMBOL(dev_direct_xmit);
4261
+EXPORT_SYMBOL(__dev_direct_xmit);
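
With the rename above, the exported helper no longer frees the skb when the transmit did not complete; a thin caller-facing wrapper keeps the old semantics. A hedged sketch of what such a wrapper looks like (kept outside this file, e.g. in netdevice.h; shown here only to make the split visible):

static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
	int ret = __dev_direct_xmit(skb, queue_id);

	if (!dev_xmit_complete(ret))
		kfree_skb(skb);

	return ret;
}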
39304262
39314263 /*************************************************************************
39324264 * Receiver routines
....@@ -3944,6 +4276,8 @@
39444276 int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
39454277 int dev_rx_weight __read_mostly = 64;
39464278 int dev_tx_weight __read_mostly = 64;
4279
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4280
+int gro_normal_batch __read_mostly = 8;
39474281
39484282 /* Called with irq disabled */
39494283 static inline void ____napi_schedule(struct softnet_data *sd,
....@@ -3961,9 +4295,9 @@
39614295 u32 rps_cpu_mask __read_mostly;
39624296 EXPORT_SYMBOL(rps_cpu_mask);
39634297
3964
-struct static_key rps_needed __read_mostly;
4298
+struct static_key_false rps_needed __read_mostly;
39654299 EXPORT_SYMBOL(rps_needed);
3966
-struct static_key rfs_needed __read_mostly;
4300
+struct static_key_false rfs_needed __read_mostly;
39674301 EXPORT_SYMBOL(rfs_needed);
39684302
39694303 static struct rps_dev_flow *
....@@ -4194,7 +4528,7 @@
41944528 struct softnet_data *sd;
41954529 unsigned int old_flow, new_flow;
41964530
4197
- if (qlen < (netdev_max_backlog >> 1))
4531
+ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
41984532 return false;
41994533
42004534 sd = this_cpu_ptr(&softnet_data);
....@@ -4242,7 +4576,7 @@
42424576 if (!netif_running(skb->dev))
42434577 goto drop;
42444578 qlen = skb_queue_len(&sd->input_pkt_queue);
4245
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4579
+ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
42464580 if (qlen) {
42474581 enqueue:
42484582 __skb_queue_tail(&sd->input_pkt_queue, skb);
....@@ -4267,6 +4601,7 @@
42674601 rps_unlock(sd);
42684602
42694603 local_irq_restore(flags);
4604
+ preempt_check_resched_rt();
42704605
42714606 atomic_long_inc(&skb->dev->rx_dropped);
42724607 kfree_skb(skb);
....@@ -4312,7 +4647,7 @@
43124647 /* Reinjected packets coming from act_mirred or similar should
43134648 * not get XDP generic processing.
43144649 */
4315
- if (skb_is_tc_redirected(skb))
4650
+ if (skb_is_redirected(skb))
43164651 return XDP_PASS;
43174652
43184653 /* XDP packets must be linear and must have sufficient headroom
....@@ -4344,6 +4679,11 @@
43444679 xdp->data_meta = xdp->data;
43454680 xdp->data_end = xdp->data + hlen;
43464681 xdp->data_hard_start = skb->data - skb_headroom(skb);
4682
+
4683
+ /* SKB "head" area always have tailroom for skb_shared_info */
4684
+ xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4685
+ xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4686
+
43474687 orig_data_end = xdp->data_end;
43484688 orig_data = xdp->data;
43494689 eth = (struct ethhdr *)xdp->data;
....@@ -4367,14 +4707,11 @@
43674707 skb_reset_network_header(skb);
43684708 }
43694709
4370
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
4371
- * pckt.
4372
- */
4373
- off = orig_data_end - xdp->data_end;
4710
+ /* check if bpf_xdp_adjust_tail was used */
4711
+ off = xdp->data_end - orig_data_end;
43744712 if (off != 0) {
43754713 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4376
- skb->len -= off;
4377
-
4714
+ skb->len += off; /* positive on grow, negative on shrink */
43784715 }
43794716
43804717 /* check if XDP changed eth hdr such SKB needs update */
....@@ -4397,10 +4734,10 @@
43974734 break;
43984735 default:
43994736 bpf_warn_invalid_xdp_action(act);
4400
- /* fall through */
4737
+ fallthrough;
44014738 case XDP_ABORTED:
44024739 trace_xdp_exception(skb->dev, xdp_prog, act);
4403
- /* fall through */
4740
+ fallthrough;
44044741 case XDP_DROP:
44054742 do_drop:
44064743 kfree_skb(skb);
....@@ -4420,7 +4757,7 @@
44204757 bool free_skb = true;
44214758 int cpu, rc;
44224759
4423
- txq = netdev_pick_tx(dev, skb, NULL);
4760
+ txq = netdev_core_pick_tx(dev, skb, NULL);
44244761 cpu = smp_processor_id();
44254762 HARD_TX_LOCK(dev, txq, cpu);
44264763 if (!netif_xmit_stopped(txq)) {
....@@ -4434,7 +4771,6 @@
44344771 kfree_skb(skb);
44354772 }
44364773 }
4437
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
44384774
44394775 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
44404776
....@@ -4472,16 +4808,16 @@
44724808 {
44734809 int ret;
44744810
4475
- net_timestamp_check(netdev_tstamp_prequeue, skb);
4811
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
44764812
44774813 trace_netif_rx(skb);
44784814
44794815 #ifdef CONFIG_RPS
4480
- if (static_key_false(&rps_needed)) {
4816
+ if (static_branch_unlikely(&rps_needed)) {
44814817 struct rps_dev_flow voidflow, *rflow = &voidflow;
44824818 int cpu;
44834819
4484
- preempt_disable();
4820
+ migrate_disable();
44854821 rcu_read_lock();
44864822
44874823 cpu = get_rps_cpu(skb->dev, skb, &rflow);
....@@ -4491,14 +4827,14 @@
44914827 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
44924828
44934829 rcu_read_unlock();
4494
- preempt_enable();
4830
+ migrate_enable();
44954831 } else
44964832 #endif
44974833 {
44984834 unsigned int qtail;
44994835
4500
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4501
- put_cpu();
4836
+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
4837
+ put_cpu_light();
45024838 }
45034839 return ret;
45044840 }
....@@ -4520,9 +4856,14 @@
45204856
45214857 int netif_rx(struct sk_buff *skb)
45224858 {
4859
+ int ret;
4860
+
45234861 trace_netif_rx_entry(skb);
45244862
4525
- return netif_rx_internal(skb);
4863
+ ret = netif_rx_internal(skb);
4864
+ trace_netif_rx_exit(ret);
4865
+
4866
+ return ret;
45264867 }
45274868 EXPORT_SYMBOL(netif_rx);
45284869
....@@ -4532,15 +4873,29 @@
45324873
45334874 trace_netif_rx_ni_entry(skb);
45344875
4535
- preempt_disable();
4876
+ local_bh_disable();
45364877 err = netif_rx_internal(skb);
4537
- if (local_softirq_pending())
4538
- do_softirq();
4539
- preempt_enable();
4878
+ local_bh_enable();
4879
+ trace_netif_rx_ni_exit(err);
45404880
45414881 return err;
45424882 }
45434883 EXPORT_SYMBOL(netif_rx_ni);
4884
+
4885
+int netif_rx_any_context(struct sk_buff *skb)
4886
+{
4887
+ /*
4888
+ * If invoked from contexts which do not invoke bottom half
4889
+ * processing either at return from interrupt or when softrqs are
4890
+ * reenabled, use netif_rx_ni() which invokes bottomhalf processing
4891
+ * directly.
4892
+ */
4893
+ if (in_interrupt())
4894
+ return netif_rx(skb);
4895
+ else
4896
+ return netif_rx_ni(skb);
4897
+}
4898
+EXPORT_SYMBOL(netif_rx_any_context);
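
A hedged usage sketch: a receive path that can be reached both from a hardirq handler and from process context can hand packets to the stack through the new helper without tracking the context itself:

static void example_deliver(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx_any_context(skb);	/* picks netif_rx() vs netif_rx_ni() */
}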
45444899
45454900 static __latent_entropy void net_tx_action(struct softirq_action *h)
45464901 {
....@@ -4583,25 +4938,43 @@
45834938 sd->output_queue_tailp = &sd->output_queue;
45844939 local_irq_enable();
45854940
4941
+ rcu_read_lock();
4942
+
45864943 while (head) {
45874944 struct Qdisc *q = head;
45884945 spinlock_t *root_lock = NULL;
45894946
45904947 head = head->next_sched;
45914948
4592
- if (!(q->flags & TCQ_F_NOLOCK)) {
4593
- root_lock = qdisc_lock(q);
4594
- spin_lock(root_lock);
4595
- }
45964949 /* We need to make sure head->next_sched is read
45974950 * before clearing __QDISC_STATE_SCHED
45984951 */
45994952 smp_mb__before_atomic();
4953
+
4954
+ if (!(q->flags & TCQ_F_NOLOCK)) {
4955
+ root_lock = qdisc_lock(q);
4956
+ spin_lock(root_lock);
4957
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
4958
+ &q->state))) {
4959
+ /* There is a synchronize_net() between
4960
+ * STATE_DEACTIVATED flag being set and
4961
+ * qdisc_reset()/some_qdisc_is_busy() in
4962
+ * dev_deactivate(), so we can safely bail out
4963
+ * early here to avoid data race between
4964
+ * qdisc_deactivate() and some_qdisc_is_busy()
4965
+ * for lockless qdisc.
4966
+ */
4967
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
4968
+ continue;
4969
+ }
4970
+
46004971 clear_bit(__QDISC_STATE_SCHED, &q->state);
46014972 qdisc_run(q);
46024973 if (root_lock)
46034974 spin_unlock(root_lock);
46044975 }
4976
+
4977
+ rcu_read_unlock();
46054978 }
46064979
46074980 xfrm_dev_backlog(sd);
....@@ -4616,7 +4989,7 @@
46164989
46174990 static inline struct sk_buff *
46184991 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4619
- struct net_device *orig_dev)
4992
+ struct net_device *orig_dev, bool *another)
46204993 {
46214994 #ifdef CONFIG_NET_CLS_ACT
46224995 struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
....@@ -4636,10 +5009,12 @@
46365009 }
46375010
46385011 qdisc_skb_cb(skb)->pkt_len = skb->len;
5012
+ qdisc_skb_cb(skb)->mru = 0;
46395013 skb->tc_at_ingress = 1;
46405014 mini_qdisc_bstats_cpu_update(miniq, skb);
46415015
4642
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
5016
+ switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
5017
+ &cl_res, false)) {
46435018 case TC_ACT_OK:
46445019 case TC_ACT_RECLASSIFY:
46455020 skb->tc_index = TC_H_MIN(cl_res.classid);
....@@ -4659,11 +5034,13 @@
46595034 * redirecting to another netdev
46605035 */
46615036 __skb_push(skb, skb->mac_len);
4662
- skb_do_redirect(skb);
5037
+ if (skb_do_redirect(skb) == -EAGAIN) {
5038
+ __skb_pull(skb, skb->mac_len);
5039
+ *another = true;
5040
+ break;
5041
+ }
46635042 return NULL;
4664
- case TC_ACT_REINSERT:
4665
- /* this does not scrub the packet, and updates stats on error */
4666
- skb_tc_reinsert(skb, &cl_res);
5043
+ case TC_ACT_CONSUMED:
46675044 return NULL;
46685045 default:
46695046 break;
....@@ -4763,7 +5140,6 @@
47635140 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
47645141 int *ret, struct net_device *orig_dev)
47655142 {
4766
-#ifdef CONFIG_NETFILTER_INGRESS
47675143 if (nf_hook_ingress_active(skb)) {
47685144 int ingress_retval;
47695145
....@@ -4777,7 +5153,6 @@
47775153 rcu_read_unlock();
47785154 return ingress_retval;
47795155 }
4780
-#endif /* CONFIG_NETFILTER_INGRESS */
47815156 return 0;
47825157 }
47835158
....@@ -4792,7 +5167,7 @@
47925167 int ret = NET_RX_DROP;
47935168 __be16 type;
47945169
4795
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
5170
+ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
47965171
47975172 trace_netif_receive_skb(skb);
47985173
....@@ -4852,7 +5227,12 @@
48525227 skip_taps:
48535228 #ifdef CONFIG_NET_INGRESS
48545229 if (static_branch_unlikely(&ingress_needed_key)) {
4855
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5230
+ bool another = false;
5231
+
5232
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5233
+ &another);
5234
+ if (another)
5235
+ goto another_round;
48565236 if (!skb)
48575237 goto out;
48585238
....@@ -4860,7 +5240,7 @@
48605240 goto out;
48615241 }
48625242 #endif
4863
- skb_reset_tc(skb);
5243
+ skb_reset_redirect(skb);
48645244 skip_classify:
48655245 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
48665246 goto drop;
....@@ -4897,14 +5277,42 @@
48975277 }
48985278 }
48995279
4900
- if (unlikely(skb_vlan_tag_present(skb))) {
4901
- if (skb_vlan_tag_get_id(skb))
5280
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5281
+check_vlan_id:
5282
+ if (skb_vlan_tag_get_id(skb)) {
5283
+ /* Vlan id is non 0 and vlan_do_receive() above couldn't
5284
+ * find vlan device.
5285
+ */
49025286 skb->pkt_type = PACKET_OTHERHOST;
5287
+ } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5288
+ skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5289
+ /* Outer header is 802.1P with vlan 0, inner header is
5290
+ * 802.1Q or 802.1AD and vlan_do_receive() above could
5291
+ * not find vlan dev for vlan id 0.
5292
+ */
5293
+ __vlan_hwaccel_clear_tag(skb);
5294
+ skb = skb_vlan_untag(skb);
5295
+ if (unlikely(!skb))
5296
+ goto out;
5297
+ if (vlan_do_receive(&skb))
5298
+ /* After stripping off 802.1P header with vlan 0
5299
+ * vlan dev is found for inner header.
5300
+ */
5301
+ goto another_round;
5302
+ else if (unlikely(!skb))
5303
+ goto out;
5304
+ else
5305
+ /* We have stripped outer 802.1P vlan 0 header.
5306
+ * But could not find vlan dev.
5307
+ * check again for vlan id to set OTHERHOST.
5308
+ */
5309
+ goto check_vlan_id;
5310
+ }
49035311 /* Note: we might in the future use prio bits
49045312 * and set skb->priority like in vlan_do_receive()
49055313 * For the time being, just ignore Priority Code Point
49065314 */
4907
- skb->vlan_tci = 0;
5315
+ __vlan_hwaccel_clear_tag(skb);
49085316 }
49095317
49105318 type = skb->protocol;
....@@ -4960,7 +5368,8 @@
49605368
49615369 ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
49625370 if (pt_prev)
4963
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5371
+ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5372
+ skb->dev, pt_prev, orig_dev);
49645373 return ret;
49655374 }
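
The INDIRECT_CALL_INET() call above (together with the <linux/indirect_call_wrapper.h> include added earlier) avoids a retpolined indirect call on the hot receive path: the function pointer is compared against the expected IPv6/IPv4 handlers and, on a match, called directly. A simplified, hedged sketch of the idea; the real macros differ in detail and also handle the CONFIG_IPV6=n case:

/* Hedged, simplified sketch -- not the real macros. */
#define EXAMPLE_CALL_1(f, f1, ...)					\
	(likely((f) == (f1)) ? (f1)(__VA_ARGS__) : (f)(__VA_ARGS__))
#define EXAMPLE_CALL_2(f, f2, f1, ...)					\
	(likely((f) == (f2)) ? (f2)(__VA_ARGS__) :			\
			       EXAMPLE_CALL_1(f, f1, __VA_ARGS__))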
49665375
....@@ -4970,7 +5379,7 @@
49705379 *
49715380 * More direct receive version of netif_receive_skb(). It should
49725381 * only be used by callers that have a need to skip RPS and Generic XDP.
4973
- * Caller must also take care of handling if (page_is_)pfmemalloc.
5382
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
49745383 *
49755384 * This function may only be called from softirq context and interrupts
49765385 * should be enabled.
....@@ -5002,7 +5411,8 @@
50025411 if (list_empty(head))
50035412 return;
50045413 if (pt_prev->list_func != NULL)
5005
- pt_prev->list_func(head, pt_prev, orig_dev);
5414
+ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5415
+ ip_list_rcv, head, pt_prev, orig_dev);
50065416 else
50075417 list_for_each_entry_safe(skb, next, head, list) {
50085418 skb_list_del_init(skb);
....@@ -5113,6 +5523,25 @@
51135523 struct bpf_prog *new = xdp->prog;
51145524 int ret = 0;
51155525
5526
+ if (new) {
5527
+ u32 i;
5528
+
5529
+ mutex_lock(&new->aux->used_maps_mutex);
5530
+
5531
+ /* generic XDP does not work with DEVMAPs that can
5532
+ * have a bpf_prog installed on an entry
5533
+ */
5534
+ for (i = 0; i < new->aux->used_map_cnt; i++) {
5535
+ if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
5536
+ cpu_map_prog_allowed(new->aux->used_maps[i])) {
5537
+ mutex_unlock(&new->aux->used_maps_mutex);
5538
+ return -EINVAL;
5539
+ }
5540
+ }
5541
+
5542
+ mutex_unlock(&new->aux->used_maps_mutex);
5543
+ }
5544
+
51165545 switch (xdp->command) {
51175546 case XDP_SETUP_PROG:
51185547 rcu_assign_pointer(dev->xdp_prog, new);
....@@ -5128,10 +5557,6 @@
51285557 }
51295558 break;
51305559
5131
- case XDP_QUERY_PROG:
5132
- xdp->prog_id = old ? old->aux->id : 0;
5133
- break;
5134
-
51355560 default:
51365561 ret = -EINVAL;
51375562 break;
....@@ -5144,14 +5569,14 @@
51445569 {
51455570 int ret;
51465571
5147
- net_timestamp_check(netdev_tstamp_prequeue, skb);
5572
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
51485573
51495574 if (skb_defer_rx_timestamp(skb))
51505575 return NET_RX_SUCCESS;
51515576
51525577 rcu_read_lock();
51535578 #ifdef CONFIG_RPS
5154
- if (static_key_false(&rps_needed)) {
5579
+ if (static_branch_unlikely(&rps_needed)) {
51555580 struct rps_dev_flow voidflow, *rflow = &voidflow;
51565581 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
51575582
....@@ -5174,7 +5599,7 @@
51745599
51755600 INIT_LIST_HEAD(&sublist);
51765601 list_for_each_entry_safe(skb, next, head, list) {
5177
- net_timestamp_check(netdev_tstamp_prequeue, skb);
5602
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
51785603 skb_list_del_init(skb);
51795604 if (!skb_defer_rx_timestamp(skb))
51805605 list_add_tail(&skb->list, &sublist);
....@@ -5183,7 +5608,7 @@
51835608
51845609 rcu_read_lock();
51855610 #ifdef CONFIG_RPS
5186
- if (static_key_false(&rps_needed)) {
5611
+ if (static_branch_unlikely(&rps_needed)) {
51875612 list_for_each_entry_safe(skb, next, head, list) {
51885613 struct rps_dev_flow voidflow, *rflow = &voidflow;
51895614 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
....@@ -5217,9 +5642,14 @@
52175642 */
52185643 int netif_receive_skb(struct sk_buff *skb)
52195644 {
5645
+ int ret;
5646
+
52205647 trace_netif_receive_skb_entry(skb);
52215648
5222
- return netif_receive_skb_internal(skb);
5649
+ ret = netif_receive_skb_internal(skb);
5650
+ trace_netif_receive_skb_exit(ret);
5651
+
5652
+ return ret;
52235653 }
52245654 EXPORT_SYMBOL(netif_receive_skb);
52255655
....@@ -5239,13 +5669,16 @@
52395669
52405670 if (list_empty(head))
52415671 return;
5242
- list_for_each_entry(skb, head, list)
5243
- trace_netif_receive_skb_list_entry(skb);
5672
+ if (trace_netif_receive_skb_list_entry_enabled()) {
5673
+ list_for_each_entry(skb, head, list)
5674
+ trace_netif_receive_skb_list_entry(skb);
5675
+ }
52445676 netif_receive_skb_list_internal(head);
5677
+ trace_netif_receive_skb_list_exit(0);
52455678 }
52465679 EXPORT_SYMBOL(netif_receive_skb_list);
52475680
5248
-DEFINE_PER_CPU(struct work_struct, flush_works);
5681
+static DEFINE_PER_CPU(struct work_struct, flush_works);
52495682
52505683 /* Network device is going away, flush any packets still pending */
52515684 static void flush_backlog(struct work_struct *work)
....@@ -5278,23 +5711,89 @@
52785711 local_bh_enable();
52795712 }
52805713
5714
+static bool flush_required(int cpu)
5715
+{
5716
+#if IS_ENABLED(CONFIG_RPS)
5717
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5718
+ bool do_flush;
5719
+
5720
+ local_irq_disable();
5721
+ rps_lock(sd);
5722
+
5723
+ /* as insertion into process_queue happens with the rps lock held,
5724
+ * process_queue access may race only with dequeue
5725
+ */
5726
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5727
+ !skb_queue_empty_lockless(&sd->process_queue);
5728
+ rps_unlock(sd);
5729
+ local_irq_enable();
5730
+
5731
+ return do_flush;
5732
+#endif
5733
+ /* without RPS we can't safely check input_pkt_queue: during a
5734
+ * concurrent remote skb_queue_splice() we can detect as empty both
5735
+ * input_pkt_queue and process_queue even if the latter could end-up
5736
+ * containing a lot of packets.
5737
+ */
5738
+ return true;
5739
+}
5740
+
52815741 static void flush_all_backlogs(void)
52825742 {
5743
+ static cpumask_t flush_cpus;
52835744 unsigned int cpu;
5745
+
5746
+ /* since we are under rtnl lock protection we can use static data
5747
+ * for the cpumask and avoid allocating on stack the possibly
5748
+ * large mask
5749
+ */
5750
+ ASSERT_RTNL();
52845751
52855752 get_online_cpus();
52865753
5287
- for_each_online_cpu(cpu)
5288
- queue_work_on(cpu, system_highpri_wq,
5289
- per_cpu_ptr(&flush_works, cpu));
5754
+ cpumask_clear(&flush_cpus);
5755
+ for_each_online_cpu(cpu) {
5756
+ if (flush_required(cpu)) {
5757
+ queue_work_on(cpu, system_highpri_wq,
5758
+ per_cpu_ptr(&flush_works, cpu));
5759
+ cpumask_set_cpu(cpu, &flush_cpus);
5760
+ }
5761
+ }
52905762
5291
- for_each_online_cpu(cpu)
5763
+ /* we can have in flight packet[s] on the cpus we are not flushing,
5764
+ * synchronize_net() in unregister_netdevice_many() will take care of
5765
+ * them
5766
+ */
5767
+ for_each_cpu(cpu, &flush_cpus)
52925768 flush_work(per_cpu_ptr(&flush_works, cpu));
52935769
52945770 put_online_cpus();
52955771 }
52965772
5297
-static int napi_gro_complete(struct sk_buff *skb)
5773
+/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5774
+static void gro_normal_list(struct napi_struct *napi)
5775
+{
5776
+ if (!napi->rx_count)
5777
+ return;
5778
+ netif_receive_skb_list_internal(&napi->rx_list);
5779
+ INIT_LIST_HEAD(&napi->rx_list);
5780
+ napi->rx_count = 0;
5781
+}
5782
+
5783
+/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5784
+ * pass the whole batch up to the stack.
5785
+ */
5786
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
5787
+{
5788
+ list_add_tail(&skb->list, &napi->rx_list);
5789
+ napi->rx_count += segs;
5790
+ if (napi->rx_count >= gro_normal_batch)
5791
+ gro_normal_list(napi);
5792
+}
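
gro_normal_one()/gro_normal_list() batch fully-aggregated or non-GRO skbs on napi->rx_list and push them to netif_receive_skb_list_internal() once gro_normal_batch segments have accumulated; any partial batch is flushed when the NAPI poll completes. A hedged sketch of a poll loop that feeds this path (the driver fetch helper is hypothetical):

static int example_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_fetch_rx(napi);	/* hypothetical */

		if (!skb)
			break;
		napi_gro_receive(napi, skb);	/* GRO_NORMAL -> gro_normal_one() */
		work++;
	}

	if (work < budget)
		napi_complete_done(napi, work);	/* flushes the partial batch */

	return work;
}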
5793
+
5794
+INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5795
+INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5796
+static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
52985797 {
52995798 struct packet_offload *ptype;
53005799 __be16 type = skb->protocol;
....@@ -5313,7 +5812,9 @@
53135812 if (ptype->type != type || !ptype->callbacks.gro_complete)
53145813 continue;
53155814
5316
- err = ptype->callbacks.gro_complete(skb, 0);
5815
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5816
+ ipv6_gro_complete, inet_gro_complete,
5817
+ skb, 0);
53175818 break;
53185819 }
53195820 rcu_read_unlock();
....@@ -5325,7 +5826,8 @@
53255826 }
53265827
53275828 out:
5328
- return netif_receive_skb_internal(skb);
5829
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
5830
+ return NET_RX_SUCCESS;
53295831 }
53305832
53315833 static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
....@@ -5337,9 +5839,8 @@
53375839 list_for_each_entry_safe_reverse(skb, p, head, list) {
53385840 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
53395841 return;
5340
- list_del(&skb->list);
5341
- skb->next = NULL;
5342
- napi_gro_complete(skb);
5842
+ skb_list_del_init(skb);
5843
+ napi_gro_complete(napi, skb);
53435844 napi->gro_hash[index].count--;
53445845 }
53455846
....@@ -5353,11 +5854,13 @@
53535854 */
53545855 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
53555856 {
5356
- u32 i;
5857
+ unsigned long bitmask = napi->gro_bitmask;
5858
+ unsigned int i, base = ~0U;
53575859
5358
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
5359
- if (test_bit(i, &napi->gro_bitmask))
5360
- __napi_gro_flush_chain(napi, i, flush_old);
5860
+ while ((i = ffs(bitmask)) != 0) {
5861
+ bitmask >>= i;
5862
+ base += i;
5863
+ __napi_gro_flush_chain(napi, base, flush_old);
53615864 }
53625865 }
53635866 EXPORT_SYMBOL(napi_gro_flush);
....@@ -5382,7 +5885,9 @@
53825885 }
53835886
53845887 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5385
- diffs |= p->vlan_tci ^ skb->vlan_tci;
5888
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5889
+ if (skb_vlan_tag_present(p))
5890
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
53865891 diffs |= skb_metadata_dst_cmp(p, skb);
53875892 diffs |= skb_metadata_differs(p, skb);
53885893 if (maclen == ETH_HLEN)
....@@ -5392,13 +5897,26 @@
53925897 diffs = memcmp(skb_mac_header(p),
53935898 skb_mac_header(skb),
53945899 maclen);
5900
+
5901
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
5902
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
5903
+ if (!diffs) {
5904
+ struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
5905
+ struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
5906
+
5907
+ diffs |= (!!p_ext) ^ (!!skb_ext);
5908
+ if (!diffs && unlikely(skb_ext))
5909
+ diffs |= p_ext->chain ^ skb_ext->chain;
5910
+ }
5911
+#endif
5912
+
53955913 NAPI_GRO_CB(p)->same_flow = !diffs;
53965914 }
53975915
53985916 return head;
53995917 }
54005918
5401
-static void skb_gro_reset_offset(struct sk_buff *skb)
5919
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
54025920 {
54035921 const struct skb_shared_info *pinfo = skb_shinfo(skb);
54045922 const skb_frag_t *frag0 = &pinfo->frags[0];
....@@ -5407,10 +5925,9 @@
54075925 NAPI_GRO_CB(skb)->frag0 = NULL;
54085926 NAPI_GRO_CB(skb)->frag0_len = 0;
54095927
5410
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
5411
- pinfo->nr_frags &&
5928
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
54125929 !PageHighMem(skb_frag_page(frag0)) &&
5413
- (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
5930
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
54145931 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
54155932 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
54165933 skb_frag_size(frag0),
....@@ -5429,7 +5946,7 @@
54295946 skb->data_len -= grow;
54305947 skb->tail += grow;
54315948
5432
- pinfo->frags[0].page_offset += grow;
5949
+ skb_frag_off_add(&pinfo->frags[0], grow);
54335950 skb_frag_size_sub(&pinfo->frags[0], grow);
54345951
54355952 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
....@@ -5439,7 +5956,7 @@
54395956 }
54405957 }
54415958
5442
-static void gro_flush_oldest(struct list_head *head)
5959
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
54435960 {
54445961 struct sk_buff *oldest;
54455962
....@@ -5454,11 +5971,14 @@
54545971 /* Do not adjust napi->gro_hash[].count, caller is adding a new
54555972 * SKB to the chain.
54565973 */
5457
- list_del(&oldest->list);
5458
- oldest->next = NULL;
5459
- napi_gro_complete(oldest);
5974
+ skb_list_del_init(oldest);
5975
+ napi_gro_complete(napi, oldest);
54605976 }
54615977
5978
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5979
+ struct sk_buff *));
5980
+INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5981
+ struct sk_buff *));
54625982 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
54635983 {
54645984 u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
....@@ -5508,7 +6028,9 @@
55086028 NAPI_GRO_CB(skb)->csum_valid = 0;
55096029 }
55106030
5511
- pp = ptype->callbacks.gro_receive(gro_head, skb);
6031
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
6032
+ ipv6_gro_receive, inet_gro_receive,
6033
+ gro_head, skb);
55126034 break;
55136035 }
55146036 rcu_read_unlock();
....@@ -5516,7 +6038,7 @@
55166038 if (&ptype->list == head)
55176039 goto normal;
55186040
5519
- if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
6041
+ if (PTR_ERR(pp) == -EINPROGRESS) {
55206042 ret = GRO_CONSUMED;
55216043 goto ok;
55226044 }
....@@ -5525,9 +6047,8 @@
55256047 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
55266048
55276049 if (pp) {
5528
- list_del(&pp->list);
5529
- pp->next = NULL;
5530
- napi_gro_complete(pp);
6050
+ skb_list_del_init(pp);
6051
+ napi_gro_complete(napi, pp);
55316052 napi->gro_hash[hash].count--;
55326053 }
55336054
....@@ -5538,7 +6059,7 @@
55386059 goto normal;
55396060
55406061 if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5541
- gro_flush_oldest(gro_head);
6062
+ gro_flush_oldest(napi, gro_head);
55426063 } else {
55436064 napi->gro_hash[hash].count++;
55446065 }
....@@ -5599,16 +6120,17 @@
55996120 static void napi_skb_free_stolen_head(struct sk_buff *skb)
56006121 {
56016122 skb_dst_drop(skb);
5602
- secpath_reset(skb);
6123
+ skb_ext_put(skb);
56036124 kmem_cache_free(skbuff_head_cache, skb);
56046125 }
56056126
5606
-static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
6127
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
6128
+ struct sk_buff *skb,
6129
+ gro_result_t ret)
56076130 {
56086131 switch (ret) {
56096132 case GRO_NORMAL:
5610
- if (netif_receive_skb_internal(skb))
5611
- ret = GRO_DROP;
6133
+ gro_normal_one(napi, skb, 1);
56126134 break;
56136135
56146136 case GRO_DROP:
....@@ -5633,12 +6155,17 @@
56336155
56346156 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
56356157 {
6158
+ gro_result_t ret;
6159
+
56366160 skb_mark_napi_id(skb, napi);
56376161 trace_napi_gro_receive_entry(skb);
56386162
5639
- skb_gro_reset_offset(skb);
6163
+ skb_gro_reset_offset(skb, 0);
56406164
5641
- return napi_skb_finish(dev_gro_receive(napi, skb), skb);
6165
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6166
+ trace_napi_gro_receive_exit(ret);
6167
+
6168
+ return ret;
56426169 }
56436170 EXPORT_SYMBOL(napi_gro_receive);
56446171
....@@ -5651,7 +6178,7 @@
56516178 __skb_pull(skb, skb_headlen(skb));
56526179 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
56536180 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5654
- skb->vlan_tci = 0;
6181
+ __vlan_hwaccel_clear_tag(skb);
56556182 skb->dev = napi->dev;
56566183 skb->skb_iif = 0;
56576184
....@@ -5661,7 +6188,8 @@
56616188 skb->encapsulation = 0;
56626189 skb_shinfo(skb)->gso_type = 0;
56636190 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5664
- secpath_reset(skb);
6191
+ skb_ext_reset(skb);
6192
+ nf_reset_ct(skb);
56656193
56666194 napi->skb = skb;
56676195 }
....@@ -5690,8 +6218,8 @@
56906218 case GRO_HELD:
56916219 __skb_push(skb, ETH_HLEN);
56926220 skb->protocol = eth_type_trans(skb, skb->dev);
5693
- if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5694
- ret = GRO_DROP;
6221
+ if (ret == GRO_NORMAL)
6222
+ gro_normal_one(napi, skb, 1);
56956223 break;
56966224
56976225 case GRO_DROP:
....@@ -5726,7 +6254,7 @@
57266254 napi->skb = NULL;
57276255
57286256 skb_reset_mac_header(skb);
5729
- skb_gro_reset_offset(skb);
6257
+ skb_gro_reset_offset(skb, hlen);
57306258
57316259 if (unlikely(skb_gro_header_hard(skb, hlen))) {
57326260 eth = skb_gro_header_slow(skb, hlen, 0);
....@@ -5756,6 +6284,7 @@
57566284
57576285 gro_result_t napi_gro_frags(struct napi_struct *napi)
57586286 {
6287
+ gro_result_t ret;
57596288 struct sk_buff *skb = napi_frags_skb(napi);
57606289
57616290 if (!skb)
....@@ -5763,7 +6292,10 @@
57636292
57646293 trace_napi_gro_frags_entry(skb);
57656294
5766
- return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6295
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6296
+ trace_napi_gro_frags_exit(ret);
6297
+
6298
+ return ret;
57676299 }
57686300 EXPORT_SYMBOL(napi_gro_frags);
57696301
....@@ -5779,10 +6311,11 @@
57796311
57806312 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
57816313 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6314
+ /* See comments in __skb_checksum_complete(). */
57826315 if (likely(!sum)) {
57836316 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
57846317 !skb->csum_complete_sw)
5785
- netdev_rx_csum_fault(skb->dev);
6318
+ netdev_rx_csum_fault(skb->dev, skb);
57866319 }
57876320
57886321 NAPI_GRO_CB(skb)->csum = wsum;
....@@ -5818,12 +6351,14 @@
58186351 sd->rps_ipi_list = NULL;
58196352
58206353 local_irq_enable();
6354
+ preempt_check_resched_rt();
58216355
58226356 /* Send pending IPI's to kick RPS processing on remote cpus. */
58236357 net_rps_send_ipi(remsd);
58246358 } else
58256359 #endif
58266360 local_irq_enable();
6361
+ preempt_check_resched_rt();
58276362 }
58286363
58296364 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
....@@ -5849,7 +6384,7 @@
58496384 net_rps_action_and_irq_enable(sd);
58506385 }
58516386
5852
- napi->weight = dev_rx_weight;
6387
+ napi->weight = READ_ONCE(dev_rx_weight);
58536388 while (again) {
58546389 struct sk_buff *skb;
58556390
....@@ -5859,7 +6394,8 @@
58596394 rcu_read_unlock();
58606395 input_queue_head_incr(sd);
58616396 if (++work >= quota)
5862
- goto state_changed;
6397
+ return work;
6398
+
58636399 }
58646400
58656401 local_irq_disable();
....@@ -5883,10 +6419,6 @@
58836419 local_irq_enable();
58846420 }
58856421
5886
-state_changed:
5887
- napi_gro_flush(napi, false);
5888
- sd->current_napi = NULL;
5889
-
58906422 return work;
58916423 }
58926424
....@@ -5904,6 +6436,7 @@
59046436 local_irq_save(flags);
59056437 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
59066438 local_irq_restore(flags);
6439
+ preempt_check_resched_rt();
59076440 }
59086441 EXPORT_SYMBOL(__napi_schedule);
59096442
....@@ -5912,7 +6445,7 @@
59126445 * @n: napi context
59136446 *
59146447 * Test if NAPI routine is already running, and if not mark
5915
- * it as running. This is used as a condition variable
6448
+ * it as running. This is used as a condition variable to
59166449 * insure only one NAPI poll instance runs. We also make
59176450 * sure there is no pending NAPI disable.
59186451 */
....@@ -5961,7 +6494,8 @@
59616494
59626495 bool napi_complete_done(struct napi_struct *n, int work_done)
59636496 {
5964
- unsigned long flags, val, new;
6497
+ unsigned long flags, val, new, timeout = 0;
6498
+ bool ret = true;
59656499
59666500 /*
59676501 * 1) Don't let napi dequeue from the cpu poll list
....@@ -5973,28 +6507,31 @@
59736507 NAPIF_STATE_IN_BUSY_POLL)))
59746508 return false;
59756509
6510
+ if (work_done) {
6511
+ if (n->gro_bitmask)
6512
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
6513
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6514
+ }
6515
+ if (n->defer_hard_irqs_count > 0) {
6516
+ n->defer_hard_irqs_count--;
6517
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
6518
+ if (timeout)
6519
+ ret = false;
6520
+ }
59766521 if (n->gro_bitmask) {
5977
- unsigned long timeout = 0;
5978
-
5979
- if (work_done)
5980
- timeout = n->dev->gro_flush_timeout;
5981
-
59826522 /* When the NAPI instance uses a timeout and keeps postponing
59836523 * it, we need to bound somehow the time packets are kept in
59846524 * the GRO layer
59856525 */
59866526 napi_gro_flush(n, !!timeout);
5987
- if (timeout)
5988
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
5989
- HRTIMER_MODE_REL_PINNED);
59906527 }
5991
- if (unlikely(!list_empty(&n->poll_list))) {
5992
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
59936528
6529
+ gro_normal_list(n);
6530
+
6531
+ if (unlikely(!list_empty(&n->poll_list))) {
59946532 /* If n->poll_list is not empty, we need to mask irqs */
59956533 local_irq_save(flags);
59966534 list_del_init(&n->poll_list);
5997
- sd->current_napi = NULL;
59986535 local_irq_restore(flags);
59996536 }
60006537
....@@ -6018,7 +6555,10 @@
60186555 return false;
60196556 }
60206557
6021
- return true;
6558
+ if (timeout)
6559
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
6560
+ HRTIMER_MODE_REL_PINNED);
6561
+ return ret;
60226562 }
60236563 EXPORT_SYMBOL(napi_complete_done);
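For reference (not part of the diff): the two knobs consulted above are per-netdev attributes, and the driver-side idiom is unchanged. The values below are only examples.

/* gro_flush_timeout is in nanoseconds, napi_defer_hard_irqs is a count:
 *
 *     echo 20000 > /sys/class/net/eth0/gro_flush_timeout
 *     echo 2     > /sys/class/net/eth0/napi_defer_hard_irqs
 *
 * With that setting, an idle poll no longer re-enables the device IRQ
 * right away: napi_complete_done() returns false for up to two
 * consecutive polls and re-arms the hrtimer 20us out, so the softirq
 * keeps polling instead of taking an interrupt per packet.
 */

/* Drivers already have to honour the return value, e.g.: */
if (work_done < budget && napi_complete_done(napi, work_done))
	mydrv_enable_irq(ring);		/* hypothetical helper */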
60246564
....@@ -6061,10 +6601,19 @@
60616601 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
60626602 */
60636603 rc = napi->poll(napi, BUSY_POLL_BUDGET);
6604
+ /* We can't gro_normal_list() here, because napi->poll() might have
6605
+ * rearmed the napi (napi_complete_done()) in which case it could
6606
+ * already be running on another CPU.
6607
+ */
60646608 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
60656609 netpoll_poll_unlock(have_poll_lock);
6066
- if (rc == BUSY_POLL_BUDGET)
6610
+ if (rc == BUSY_POLL_BUDGET) {
6611
+ /* As the whole budget was spent, we still own the napi so can
6612
+ * safely handle the rx_list.
6613
+ */
6614
+ gro_normal_list(napi);
60676615 __napi_schedule(napi);
6616
+ }
60686617 local_bh_enable();
60696618 }
60706619
....@@ -6109,6 +6658,7 @@
61096658 }
61106659 work = napi_poll(napi, BUSY_POLL_BUDGET);
61116660 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6661
+ gro_normal_list(napi);
61126662 count:
61136663 if (work > 0)
61146664 __NET_ADD_STATS(dev_net(napi->dev),
....@@ -6142,8 +6692,7 @@
61426692
61436693 static void napi_hash_add(struct napi_struct *napi)
61446694 {
6145
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6146
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6695
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
61476696 return;
61486697
61496698 spin_lock(&napi_hash_lock);
....@@ -6164,20 +6713,14 @@
61646713 /* Warning : caller is responsible to make sure rcu grace period
61656714 * is respected before freeing memory containing @napi
61666715 */
6167
-bool napi_hash_del(struct napi_struct *napi)
6716
+static void napi_hash_del(struct napi_struct *napi)
61686717 {
6169
- bool rcu_sync_needed = false;
6170
-
61716718 spin_lock(&napi_hash_lock);
61726719
6173
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6174
- rcu_sync_needed = true;
6175
- hlist_del_rcu(&napi->napi_hash_node);
6176
- }
6720
+ hlist_del_init_rcu(&napi->napi_hash_node);
6721
+
61776722 spin_unlock(&napi_hash_lock);
6178
- return rcu_sync_needed;
61796723 }
6180
-EXPORT_SYMBOL_GPL(napi_hash_del);
61816724
61826725 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
61836726 {
....@@ -6188,7 +6731,7 @@
61886731 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
61896732 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
61906733 */
6191
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
6734
+ if (!napi_disable_pending(napi) &&
61926735 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
61936736 __napi_schedule_irqoff(napi);
61946737
....@@ -6209,15 +6752,21 @@
62096752 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
62106753 int (*poll)(struct napi_struct *, int), int weight)
62116754 {
6755
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6756
+ return;
6757
+
62126758 INIT_LIST_HEAD(&napi->poll_list);
6759
+ INIT_HLIST_NODE(&napi->napi_hash_node);
62136760 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
62146761 napi->timer.function = napi_watchdog;
62156762 init_gro_hash(napi);
62166763 napi->skb = NULL;
6764
+ INIT_LIST_HEAD(&napi->rx_list);
6765
+ napi->rx_count = 0;
62176766 napi->poll = poll;
62186767 if (weight > NAPI_POLL_WEIGHT)
6219
- pr_err_once("netif_napi_add() called with weight %d on device %s\n",
6220
- weight, dev->name);
6768
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6769
+ weight);
62216770 napi->weight = weight;
62226771 napi->dev = dev;
62236772 #ifdef CONFIG_NETPOLL
....@@ -6260,26 +6809,19 @@
62606809 }
62616810
62626811 /* Must be called in process context */
6263
-void netif_napi_del(struct napi_struct *napi)
6812
+void __netif_napi_del(struct napi_struct *napi)
62646813 {
6265
- might_sleep();
6266
- if (napi_hash_del(napi))
6267
- synchronize_net();
6268
- list_del_init(&napi->dev_list);
6814
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6815
+ return;
6816
+
6817
+ napi_hash_del(napi);
6818
+ list_del_rcu(&napi->dev_list);
62696819 napi_free_frags(napi);
62706820
62716821 flush_gro_hash(napi);
62726822 napi->gro_bitmask = 0;
62736823 }
6274
-EXPORT_SYMBOL(netif_napi_del);
6275
-
6276
-struct napi_struct *get_current_napi_context(void)
6277
-{
6278
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6279
-
6280
- return sd->current_napi;
6281
-}
6282
-EXPORT_SYMBOL(get_current_napi_context);
6824
+EXPORT_SYMBOL(__netif_napi_del);
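For illustration only (not part of the diff): how a driver pairs the add/del halves. In this tree netif_napi_del() itself becomes an inline wrapper (in netdevice.h) that calls __netif_napi_del() and then synchronize_net(), so existing drivers keep calling the wrapper. "mydrv" is hypothetical; mydrv_poll() is the poll sketch shown earlier.

#include <linux/netdevice.h>

static int mydrv_poll(struct napi_struct *napi, int budget);	/* see earlier sketch */

struct mydrv_ring {
	struct napi_struct napi;
	/* ring state ... */
};

static void mydrv_ring_init(struct net_device *dev, struct mydrv_ring *ring)
{
	netif_napi_add(dev, &ring->napi, mydrv_poll, NAPI_POLL_WEIGHT);
	napi_enable(&ring->napi);
}

static void mydrv_ring_teardown(struct mydrv_ring *ring)
{
	napi_disable(&ring->napi);
	netif_napi_del(&ring->napi);	/* unhash + unlink, then RCU grace period */
}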
62836825
62846826 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
62856827 {
....@@ -6300,14 +6842,13 @@
63006842 */
63016843 work = 0;
63026844 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6303
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6304
-
6305
- sd->current_napi = n;
63066845 work = n->poll(n, weight);
63076846 trace_napi_poll(n, work, weight);
63086847 }
63096848
6310
- WARN_ON_ONCE(work > weight);
6849
+ if (unlikely(work > weight))
6850
+ pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6851
+ n->poll, work, weight);
63116852
63126853 if (likely(work < weight))
63136854 goto out_unlock;
....@@ -6328,6 +6869,8 @@
63286869 */
63296870 napi_gro_flush(n, HZ >= 1000);
63306871 }
6872
+
6873
+ gro_normal_list(n);
63316874
63326875 /* Some drivers may have called napi_schedule
63336876 * prior to exhausting their budget.
....@@ -6350,8 +6893,8 @@
63506893 {
63516894 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
63526895 unsigned long time_limit = jiffies +
6353
- usecs_to_jiffies(netdev_budget_usecs);
6354
- int budget = netdev_budget;
6896
+ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
6897
+ int budget = READ_ONCE(netdev_budget);
63556898 LIST_HEAD(list);
63566899 LIST_HEAD(repoll);
63576900
....@@ -6401,6 +6944,9 @@
64016944 /* upper master flag, there can only be one master device per list */
64026945 bool master;
64036946
6947
+ /* lookup ignore flag */
6948
+ bool ignore;
6949
+
64046950 /* counter for the number of times this device was added to us */
64056951 u16 ref_nr;
64066952
....@@ -6423,9 +6969,10 @@
64236969 return NULL;
64246970 }
64256971
6426
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6972
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6973
+ struct netdev_nested_priv *priv)
64276974 {
6428
- struct net_device *dev = data;
6975
+ struct net_device *dev = (struct net_device *)priv->data;
64296976
64306977 return upper_dev == dev;
64316978 }
....@@ -6442,10 +6989,14 @@
64426989 bool netdev_has_upper_dev(struct net_device *dev,
64436990 struct net_device *upper_dev)
64446991 {
6992
+ struct netdev_nested_priv priv = {
6993
+ .data = (void *)upper_dev,
6994
+ };
6995
+
64456996 ASSERT_RTNL();
64466997
6447
- return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6448
- upper_dev);
6998
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6999
+ &priv);
64497000 }
64507001 EXPORT_SYMBOL(netdev_has_upper_dev);
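For illustration only (not part of the diff): the struct netdev_nested_priv calling convention the walk helpers now take, sketched with a hypothetical caller that counts the devices stacked above a netdev.

#include <linux/netdevice.h>
#include <linux/rcupdate.h>

static int my_count_one(struct net_device *upper, struct netdev_nested_priv *priv)
{
	int *count = (int *)priv->data;

	(*count)++;
	return 0;			/* non-zero would abort the walk */
}

static int my_count_uppers(struct net_device *dev)
{
	int count = 0;
	struct netdev_nested_priv priv = {
		.flags = 0,
		.data  = (void *)&count,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, my_count_one, &priv);
	rcu_read_unlock();

	return count;
}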
64517002
....@@ -6462,8 +7013,12 @@
64627013 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
64637014 struct net_device *upper_dev)
64647015 {
6465
- return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
6466
- upper_dev);
7016
+ struct netdev_nested_priv priv = {
7017
+ .data = (void *)upper_dev,
7018
+ };
7019
+
7020
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7021
+ &priv);
64677022 }
64687023 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
64697024
....@@ -6505,6 +7060,22 @@
65057060 return NULL;
65067061 }
65077062 EXPORT_SYMBOL(netdev_master_upper_dev_get);
7063
+
7064
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7065
+{
7066
+ struct netdev_adjacent *upper;
7067
+
7068
+ ASSERT_RTNL();
7069
+
7070
+ if (list_empty(&dev->adj_list.upper))
7071
+ return NULL;
7072
+
7073
+ upper = list_first_entry(&dev->adj_list.upper,
7074
+ struct netdev_adjacent, list);
7075
+ if (likely(upper->master) && !upper->ignore)
7076
+ return upper->dev;
7077
+ return NULL;
7078
+}
65087079
65097080 /**
65107081 * netdev_has_any_lower_dev - Check if device is linked to some device
....@@ -6556,8 +7127,9 @@
65567127 }
65577128 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
65587129
6559
-static struct net_device *netdev_next_upper_dev(struct net_device *dev,
6560
- struct list_head **iter)
7130
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7131
+ struct list_head **iter,
7132
+ bool *ignore)
65617133 {
65627134 struct netdev_adjacent *upper;
65637135
....@@ -6567,6 +7139,7 @@
65677139 return NULL;
65687140
65697141 *iter = &upper->list;
7142
+ *ignore = upper->ignore;
65707143
65717144 return upper->dev;
65727145 }
....@@ -6588,30 +7161,33 @@
65887161 return upper->dev;
65897162 }
65907163
6591
-static int netdev_walk_all_upper_dev(struct net_device *dev,
6592
- int (*fn)(struct net_device *dev,
6593
- void *data),
6594
- void *data)
7164
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
7165
+ int (*fn)(struct net_device *dev,
7166
+ struct netdev_nested_priv *priv),
7167
+ struct netdev_nested_priv *priv)
65957168 {
65967169 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
65977170 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
65987171 int ret, cur = 0;
7172
+ bool ignore;
65997173
66007174 now = dev;
66017175 iter = &dev->adj_list.upper;
66027176
66037177 while (1) {
66047178 if (now != dev) {
6605
- ret = fn(now, data);
7179
+ ret = fn(now, priv);
66067180 if (ret)
66077181 return ret;
66087182 }
66097183
66107184 next = NULL;
66117185 while (1) {
6612
- udev = netdev_next_upper_dev(now, &iter);
7186
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
66137187 if (!udev)
66147188 break;
7189
+ if (ignore)
7190
+ continue;
66157191
66167192 next = udev;
66177193 niter = &udev->adj_list.upper;
....@@ -6636,8 +7212,8 @@
66367212
66377213 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
66387214 int (*fn)(struct net_device *dev,
6639
- void *data),
6640
- void *data)
7215
+ struct netdev_nested_priv *priv),
7216
+ struct netdev_nested_priv *priv)
66417217 {
66427218 struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
66437219 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6648,7 +7224,7 @@
66487224
66497225 while (1) {
66507226 if (now != dev) {
6651
- ret = fn(now, data);
7227
+ ret = fn(now, priv);
66527228 if (ret)
66537229 return ret;
66547230 }
....@@ -6680,6 +7256,20 @@
66807256 return 0;
66817257 }
66827258 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7259
+
7260
+static bool __netdev_has_upper_dev(struct net_device *dev,
7261
+ struct net_device *upper_dev)
7262
+{
7263
+ struct netdev_nested_priv priv = {
7264
+ .flags = 0,
7265
+ .data = (void *)upper_dev,
7266
+ };
7267
+
7268
+ ASSERT_RTNL();
7269
+
7270
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7271
+ &priv);
7272
+}
66837273
66847274 /**
66857275 * netdev_lower_get_next_private - Get the next ->private from the
....@@ -6777,10 +7367,27 @@
67777367 return lower->dev;
67787368 }
67797369
7370
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7371
+ struct list_head **iter,
7372
+ bool *ignore)
7373
+{
7374
+ struct netdev_adjacent *lower;
7375
+
7376
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7377
+
7378
+ if (&lower->list == &dev->adj_list.lower)
7379
+ return NULL;
7380
+
7381
+ *iter = &lower->list;
7382
+ *ignore = lower->ignore;
7383
+
7384
+ return lower->dev;
7385
+}
7386
+
67807387 int netdev_walk_all_lower_dev(struct net_device *dev,
67817388 int (*fn)(struct net_device *dev,
6782
- void *data),
6783
- void *data)
7389
+ struct netdev_nested_priv *priv),
7390
+ struct netdev_nested_priv *priv)
67847391 {
67857392 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
67867393 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6791,7 +7398,7 @@
67917398
67927399 while (1) {
67937400 if (now != dev) {
6794
- ret = fn(now, data);
7401
+ ret = fn(now, priv);
67957402 if (ret)
67967403 return ret;
67977404 }
....@@ -6824,8 +7431,57 @@
68247431 }
68257432 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
68267433
6827
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6828
- struct list_head **iter)
7434
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
7435
+ int (*fn)(struct net_device *dev,
7436
+ struct netdev_nested_priv *priv),
7437
+ struct netdev_nested_priv *priv)
7438
+{
7439
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7440
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7441
+ int ret, cur = 0;
7442
+ bool ignore;
7443
+
7444
+ now = dev;
7445
+ iter = &dev->adj_list.lower;
7446
+
7447
+ while (1) {
7448
+ if (now != dev) {
7449
+ ret = fn(now, priv);
7450
+ if (ret)
7451
+ return ret;
7452
+ }
7453
+
7454
+ next = NULL;
7455
+ while (1) {
7456
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7457
+ if (!ldev)
7458
+ break;
7459
+ if (ignore)
7460
+ continue;
7461
+
7462
+ next = ldev;
7463
+ niter = &ldev->adj_list.lower;
7464
+ dev_stack[cur] = now;
7465
+ iter_stack[cur++] = iter;
7466
+ break;
7467
+ }
7468
+
7469
+ if (!next) {
7470
+ if (!cur)
7471
+ return 0;
7472
+ next = dev_stack[--cur];
7473
+ niter = iter_stack[cur];
7474
+ }
7475
+
7476
+ now = next;
7477
+ iter = niter;
7478
+ }
7479
+
7480
+ return 0;
7481
+}
7482
+
7483
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7484
+ struct list_head **iter)
68297485 {
68307486 struct netdev_adjacent *lower;
68317487
....@@ -6837,17 +7493,21 @@
68377493
68387494 return lower->dev;
68397495 }
7496
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
68407497
68417498 static u8 __netdev_upper_depth(struct net_device *dev)
68427499 {
68437500 struct net_device *udev;
68447501 struct list_head *iter;
68457502 u8 max_depth = 0;
7503
+ bool ignore;
68467504
68477505 for (iter = &dev->adj_list.upper,
6848
- udev = netdev_next_upper_dev(dev, &iter);
7506
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
68497507 udev;
6850
- udev = netdev_next_upper_dev(dev, &iter)) {
7508
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7509
+ if (ignore)
7510
+ continue;
68517511 if (max_depth < udev->upper_level)
68527512 max_depth = udev->upper_level;
68537513 }
....@@ -6860,11 +7520,14 @@
68607520 struct net_device *ldev;
68617521 struct list_head *iter;
68627522 u8 max_depth = 0;
7523
+ bool ignore;
68637524
68647525 for (iter = &dev->adj_list.lower,
6865
- ldev = netdev_next_lower_dev(dev, &iter);
7526
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
68667527 ldev;
6867
- ldev = netdev_next_lower_dev(dev, &iter)) {
7528
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7529
+ if (ignore)
7530
+ continue;
68687531 if (max_depth < ldev->lower_level)
68697532 max_depth = ldev->lower_level;
68707533 }
....@@ -6872,22 +7535,34 @@
68727535 return max_depth;
68737536 }
68747537
6875
-static int __netdev_update_upper_level(struct net_device *dev, void *data)
7538
+static int __netdev_update_upper_level(struct net_device *dev,
7539
+ struct netdev_nested_priv *__unused)
68767540 {
68777541 dev->upper_level = __netdev_upper_depth(dev) + 1;
68787542 return 0;
68797543 }
68807544
6881
-static int __netdev_update_lower_level(struct net_device *dev, void *data)
7545
+static int __netdev_update_lower_level(struct net_device *dev,
7546
+ struct netdev_nested_priv *priv)
68827547 {
68837548 dev->lower_level = __netdev_lower_depth(dev) + 1;
7549
+
7550
+#ifdef CONFIG_LOCKDEP
7551
+ if (!priv)
7552
+ return 0;
7553
+
7554
+ if (priv->flags & NESTED_SYNC_IMM)
7555
+ dev->nested_level = dev->lower_level - 1;
7556
+ if (priv->flags & NESTED_SYNC_TODO)
7557
+ net_unlink_todo(dev);
7558
+#endif
68847559 return 0;
68857560 }
68867561
68877562 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
68887563 int (*fn)(struct net_device *dev,
6889
- void *data),
6890
- void *data)
7564
+ struct netdev_nested_priv *priv),
7565
+ struct netdev_nested_priv *priv)
68917566 {
68927567 struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
68937568 struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
....@@ -6898,7 +7573,7 @@
68987573
68997574 while (1) {
69007575 if (now != dev) {
6901
- ret = fn(now, data);
7576
+ ret = fn(now, priv);
69027577 if (ret)
69037578 return ret;
69047579 }
....@@ -7028,6 +7703,7 @@
70287703 adj->master = master;
70297704 adj->ref_nr = 1;
70307705 adj->private = private;
7706
+ adj->ignore = false;
70317707 dev_hold(adj_dev);
70327708
70337709 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
....@@ -7157,6 +7833,7 @@
71577833 static int __netdev_upper_dev_link(struct net_device *dev,
71587834 struct net_device *upper_dev, bool master,
71597835 void *upper_priv, void *upper_info,
7836
+ struct netdev_nested_priv *priv,
71607837 struct netlink_ext_ack *extack)
71617838 {
71627839 struct netdev_notifier_changeupper_info changeupper_info = {
....@@ -7178,17 +7855,17 @@
71787855 return -EBUSY;
71797856
71807857 /* To prevent loops, check if dev is not upper device to upper_dev. */
7181
- if (netdev_has_upper_dev(upper_dev, dev))
7858
+ if (__netdev_has_upper_dev(upper_dev, dev))
71827859 return -EBUSY;
71837860
71847861 if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
71857862 return -EMLINK;
71867863
71877864 if (!master) {
7188
- if (netdev_has_upper_dev(dev, upper_dev))
7865
+ if (__netdev_has_upper_dev(dev, upper_dev))
71897866 return -EEXIST;
71907867 } else {
7191
- master_dev = netdev_master_upper_dev_get(dev);
7868
+ master_dev = __netdev_master_upper_dev_get(dev);
71927869 if (master_dev)
71937870 return master_dev == upper_dev ? -EEXIST : -EBUSY;
71947871 }
....@@ -7211,10 +7888,11 @@
72117888 goto rollback;
72127889
72137890 __netdev_update_upper_level(dev, NULL);
7214
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7891
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
72157892
7216
- __netdev_update_lower_level(upper_dev, NULL);
7217
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
7893
+ __netdev_update_lower_level(upper_dev, priv);
7894
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7895
+ priv);
72187896
72197897 return 0;
72207898
....@@ -7239,8 +7917,13 @@
72397917 struct net_device *upper_dev,
72407918 struct netlink_ext_ack *extack)
72417919 {
7920
+ struct netdev_nested_priv priv = {
7921
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7922
+ .data = NULL,
7923
+ };
7924
+
72427925 return __netdev_upper_dev_link(dev, upper_dev, false,
7243
- NULL, NULL, extack);
7926
+ NULL, NULL, &priv, extack);
72447927 }
72457928 EXPORT_SYMBOL(netdev_upper_dev_link);
72467929
....@@ -7263,21 +7946,19 @@
72637946 void *upper_priv, void *upper_info,
72647947 struct netlink_ext_ack *extack)
72657948 {
7949
+ struct netdev_nested_priv priv = {
7950
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7951
+ .data = NULL,
7952
+ };
7953
+
72667954 return __netdev_upper_dev_link(dev, upper_dev, true,
7267
- upper_priv, upper_info, extack);
7955
+ upper_priv, upper_info, &priv, extack);
72687956 }
72697957 EXPORT_SYMBOL(netdev_master_upper_dev_link);
72707958
7271
-/**
7272
- * netdev_upper_dev_unlink - Removes a link to upper device
7273
- * @dev: device
7274
- * @upper_dev: new upper device
7275
- *
7276
- * Removes a link to device which is upper to this one. The caller must hold
7277
- * the RTNL lock.
7278
- */
7279
-void netdev_upper_dev_unlink(struct net_device *dev,
7280
- struct net_device *upper_dev)
7959
+static void __netdev_upper_dev_unlink(struct net_device *dev,
7960
+ struct net_device *upper_dev,
7961
+ struct netdev_nested_priv *priv)
72817962 {
72827963 struct netdev_notifier_changeupper_info changeupper_info = {
72837964 .info = {
....@@ -7300,12 +7981,126 @@
73007981 &changeupper_info.info);
73017982
73027983 __netdev_update_upper_level(dev, NULL);
7303
- netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7984
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
73047985
7305
- __netdev_update_lower_level(upper_dev, NULL);
7306
- netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL);
7986
+ __netdev_update_lower_level(upper_dev, priv);
7987
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7988
+ priv);
7989
+}
7990
+
7991
+/**
7992
+ * netdev_upper_dev_unlink - Removes a link to upper device
7993
+ * @dev: device
7994
+ * @upper_dev: new upper device
7995
+ *
7996
+ * Removes a link to device which is upper to this one. The caller must hold
7997
+ * the RTNL lock.
7998
+ */
7999
+void netdev_upper_dev_unlink(struct net_device *dev,
8000
+ struct net_device *upper_dev)
8001
+{
8002
+ struct netdev_nested_priv priv = {
8003
+ .flags = NESTED_SYNC_TODO,
8004
+ .data = NULL,
8005
+ };
8006
+
8007
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
73078008 }
73088009 EXPORT_SYMBOL(netdev_upper_dev_unlink);
8010
+
8011
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8012
+ struct net_device *lower_dev,
8013
+ bool val)
8014
+{
8015
+ struct netdev_adjacent *adj;
8016
+
8017
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8018
+ if (adj)
8019
+ adj->ignore = val;
8020
+
8021
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8022
+ if (adj)
8023
+ adj->ignore = val;
8024
+}
8025
+
8026
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8027
+ struct net_device *lower_dev)
8028
+{
8029
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8030
+}
8031
+
8032
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8033
+ struct net_device *lower_dev)
8034
+{
8035
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8036
+}
8037
+
8038
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
8039
+ struct net_device *new_dev,
8040
+ struct net_device *dev,
8041
+ struct netlink_ext_ack *extack)
8042
+{
8043
+ struct netdev_nested_priv priv = {
8044
+ .flags = 0,
8045
+ .data = NULL,
8046
+ };
8047
+ int err;
8048
+
8049
+ if (!new_dev)
8050
+ return 0;
8051
+
8052
+ if (old_dev && new_dev != old_dev)
8053
+ netdev_adjacent_dev_disable(dev, old_dev);
8054
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8055
+ extack);
8056
+ if (err) {
8057
+ if (old_dev && new_dev != old_dev)
8058
+ netdev_adjacent_dev_enable(dev, old_dev);
8059
+ return err;
8060
+ }
8061
+
8062
+ return 0;
8063
+}
8064
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8065
+
8066
+void netdev_adjacent_change_commit(struct net_device *old_dev,
8067
+ struct net_device *new_dev,
8068
+ struct net_device *dev)
8069
+{
8070
+ struct netdev_nested_priv priv = {
8071
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8072
+ .data = NULL,
8073
+ };
8074
+
8075
+ if (!new_dev || !old_dev)
8076
+ return;
8077
+
8078
+ if (new_dev == old_dev)
8079
+ return;
8080
+
8081
+ netdev_adjacent_dev_enable(dev, old_dev);
8082
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
8083
+}
8084
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
8085
+
8086
+void netdev_adjacent_change_abort(struct net_device *old_dev,
8087
+ struct net_device *new_dev,
8088
+ struct net_device *dev)
8089
+{
8090
+ struct netdev_nested_priv priv = {
8091
+ .flags = 0,
8092
+ .data = NULL,
8093
+ };
8094
+
8095
+ if (!new_dev)
8096
+ return;
8097
+
8098
+ if (old_dev && new_dev != old_dev)
8099
+ netdev_adjacent_dev_enable(dev, old_dev);
8100
+
8101
+ __netdev_upper_dev_unlink(new_dev, dev, &priv);
8102
+}
8103
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
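For illustration only (not part of the diff): the intended prepare/commit/abort sequence for the three helpers above, sketched for a hypothetical bonding-style driver swapping its active lower device. The new "ignore" flag lets the candidate link be validated while the old one is still in place, without the loop/nesting checks tripping over seeing both.

static int my_swap_active_lower(struct net_device *master,
				struct net_device *old_slave,
				struct net_device *new_slave,
				struct netlink_ext_ack *extack)
{
	int err;

	err = netdev_adjacent_change_prepare(old_slave, new_slave, master,
					     extack);
	if (err)
		return err;

	err = my_driver_switch_active(master, new_slave);	/* hypothetical */
	if (err) {
		netdev_adjacent_change_abort(old_slave, new_slave, master);
		return err;
	}

	netdev_adjacent_change_commit(old_slave, new_slave, master);
	return 0;
}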
73098104
73108105 /**
73118106 * netdev_bonding_info_change - Dispatch event about slave change
....@@ -7328,6 +8123,29 @@
73288123 &info.info);
73298124 }
73308125 EXPORT_SYMBOL(netdev_bonding_info_change);
8126
+
8127
+/**
8128
+ * netdev_get_xmit_slave - Get the xmit slave of master device
8129
+ * @dev: device
8130
+ * @skb: The packet
8131
+ * @all_slaves: assume all the slaves are active
8132
+ *
8133
+ * The reference counters are not incremented so the caller must be
8134
+ * careful with locks. The caller must hold RCU lock.
8135
+ * %NULL is returned if no slave is found.
8136
+ */
8137
+
8138
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8139
+ struct sk_buff *skb,
8140
+ bool all_slaves)
8141
+{
8142
+ const struct net_device_ops *ops = dev->netdev_ops;
8143
+
8144
+ if (!ops->ndo_get_xmit_slave)
8145
+ return NULL;
8146
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8147
+}
8148
+EXPORT_SYMBOL(netdev_get_xmit_slave);
73318149
73328150 static void netdev_adjacent_add_links(struct net_device *dev)
73338151 {
....@@ -7419,25 +8237,6 @@
74198237 }
74208238 EXPORT_SYMBOL(netdev_lower_dev_get_private);
74218239
7422
-
7423
-int dev_get_nest_level(struct net_device *dev)
7424
-{
7425
- struct net_device *lower = NULL;
7426
- struct list_head *iter;
7427
- int max_nest = -1;
7428
- int nest;
7429
-
7430
- ASSERT_RTNL();
7431
-
7432
- netdev_for_each_lower_dev(dev, lower, iter) {
7433
- nest = dev_get_nest_level(lower);
7434
- if (max_nest < nest)
7435
- max_nest = nest;
7436
- }
7437
-
7438
- return max_nest + 1;
7439
-}
7440
-EXPORT_SYMBOL(dev_get_nest_level);
74418240
74428241 /**
74438242 * netdev_lower_change - Dispatch event about lower device state change
....@@ -7665,7 +8464,8 @@
76658464 }
76668465 EXPORT_SYMBOL(dev_get_flags);
76678466
7668
-int __dev_change_flags(struct net_device *dev, unsigned int flags)
8467
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
8468
+ struct netlink_ext_ack *extack)
76698469 {
76708470 unsigned int old_flags = dev->flags;
76718471 int ret;
....@@ -7702,7 +8502,7 @@
77028502 if (old_flags & IFF_UP)
77038503 __dev_close(dev);
77048504 else
7705
- ret = __dev_open(dev);
8505
+ ret = __dev_open(dev, extack);
77068506 }
77078507
77088508 if ((flags ^ dev->gflags) & IFF_PROMISC) {
....@@ -7762,16 +8562,18 @@
77628562 * dev_change_flags - change device settings
77638563 * @dev: device
77648564 * @flags: device state flags
8565
+ * @extack: netlink extended ack
77658566 *
77668567 * Change settings on device based state flags. The flags are
77678568 * in the userspace exported format.
77688569 */
7769
-int dev_change_flags(struct net_device *dev, unsigned int flags)
8570
+int dev_change_flags(struct net_device *dev, unsigned int flags,
8571
+ struct netlink_ext_ack *extack)
77708572 {
77718573 int ret;
77728574 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
77738575
7774
- ret = __dev_change_flags(dev, flags);
8576
+ ret = __dev_change_flags(dev, flags, extack);
77758577 if (ret < 0)
77768578 return ret;
77778579
....@@ -7914,13 +8716,36 @@
79148716 EXPORT_SYMBOL(dev_set_group);
79158717
79168718 /**
8719
+ * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8720
+ * @dev: device
8721
+ * @addr: new address
8722
+ * @extack: netlink extended ack
8723
+ */
8724
+int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8725
+ struct netlink_ext_ack *extack)
8726
+{
8727
+ struct netdev_notifier_pre_changeaddr_info info = {
8728
+ .info.dev = dev,
8729
+ .info.extack = extack,
8730
+ .dev_addr = addr,
8731
+ };
8732
+ int rc;
8733
+
8734
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8735
+ return notifier_to_errno(rc);
8736
+}
8737
+EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8738
+
8739
+/**
79178740 * dev_set_mac_address - Change Media Access Control Address
79188741 * @dev: device
79198742 * @sa: new address
8743
+ * @extack: netlink extended ack
79208744 *
79218745 * Change the hardware (MAC) address of the device
79228746 */
7923
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
8747
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8748
+ struct netlink_ext_ack *extack)
79248749 {
79258750 const struct net_device_ops *ops = dev->netdev_ops;
79268751 int err;
....@@ -7931,6 +8756,9 @@
79318756 return -EINVAL;
79328757 if (!netif_device_present(dev))
79338758 return -ENODEV;
8759
+ err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8760
+ if (err)
8761
+ return err;
79348762 err = ops->ndo_set_mac_address(dev, sa);
79358763 if (err)
79368764 return err;
....@@ -7940,6 +8768,48 @@
79408768 return 0;
79418769 }
79428770 EXPORT_SYMBOL(dev_set_mac_address);
8771
+
8772
+static DECLARE_RWSEM(dev_addr_sem);
8773
+
8774
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8775
+ struct netlink_ext_ack *extack)
8776
+{
8777
+ int ret;
8778
+
8779
+ down_write(&dev_addr_sem);
8780
+ ret = dev_set_mac_address(dev, sa, extack);
8781
+ up_write(&dev_addr_sem);
8782
+ return ret;
8783
+}
8784
+EXPORT_SYMBOL(dev_set_mac_address_user);
8785
+
8786
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8787
+{
8788
+ size_t size = sizeof(sa->sa_data);
8789
+ struct net_device *dev;
8790
+ int ret = 0;
8791
+
8792
+ down_read(&dev_addr_sem);
8793
+ rcu_read_lock();
8794
+
8795
+ dev = dev_get_by_name_rcu(net, dev_name);
8796
+ if (!dev) {
8797
+ ret = -ENODEV;
8798
+ goto unlock;
8799
+ }
8800
+ if (!dev->addr_len)
8801
+ memset(sa->sa_data, 0, size);
8802
+ else
8803
+ memcpy(sa->sa_data, dev->dev_addr,
8804
+ min_t(size_t, size, dev->addr_len));
8805
+ sa->sa_family = dev->type;
8806
+
8807
+unlock:
8808
+ rcu_read_unlock();
8809
+ up_read(&dev_addr_sem);
8810
+ return ret;
8811
+}
8812
+EXPORT_SYMBOL(dev_get_mac_address);
79438813
79448814 /**
79458815 * dev_change_carrier - Change device carrier
....@@ -7990,12 +8860,80 @@
79908860 char *name, size_t len)
79918861 {
79928862 const struct net_device_ops *ops = dev->netdev_ops;
8863
+ int err;
79938864
7994
- if (!ops->ndo_get_phys_port_name)
7995
- return -EOPNOTSUPP;
7996
- return ops->ndo_get_phys_port_name(dev, name, len);
8865
+ if (ops->ndo_get_phys_port_name) {
8866
+ err = ops->ndo_get_phys_port_name(dev, name, len);
8867
+ if (err != -EOPNOTSUPP)
8868
+ return err;
8869
+ }
8870
+ return devlink_compat_phys_port_name_get(dev, name, len);
79978871 }
79988872 EXPORT_SYMBOL(dev_get_phys_port_name);
8873
+
8874
+/**
8875
+ * dev_get_port_parent_id - Get the device's port parent identifier
8876
+ * @dev: network device
8877
+ * @ppid: pointer to a storage for the port's parent identifier
8878
+ * @recurse: allow/disallow recursion to lower devices
8879
+ *
8880
+ * Get the devices's port parent identifier
8881
+ */
8882
+int dev_get_port_parent_id(struct net_device *dev,
8883
+ struct netdev_phys_item_id *ppid,
8884
+ bool recurse)
8885
+{
8886
+ const struct net_device_ops *ops = dev->netdev_ops;
8887
+ struct netdev_phys_item_id first = { };
8888
+ struct net_device *lower_dev;
8889
+ struct list_head *iter;
8890
+ int err;
8891
+
8892
+ if (ops->ndo_get_port_parent_id) {
8893
+ err = ops->ndo_get_port_parent_id(dev, ppid);
8894
+ if (err != -EOPNOTSUPP)
8895
+ return err;
8896
+ }
8897
+
8898
+ err = devlink_compat_switch_id_get(dev, ppid);
8899
+ if (!err || err != -EOPNOTSUPP)
8900
+ return err;
8901
+
8902
+ if (!recurse)
8903
+ return -EOPNOTSUPP;
8904
+
8905
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
8906
+ err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8907
+ if (err)
8908
+ break;
8909
+ if (!first.id_len)
8910
+ first = *ppid;
8911
+ else if (memcmp(&first, ppid, sizeof(*ppid)))
8912
+ return -EOPNOTSUPP;
8913
+ }
8914
+
8915
+ return err;
8916
+}
8917
+EXPORT_SYMBOL(dev_get_port_parent_id);
8918
+
8919
+/**
8920
+ * netdev_port_same_parent_id - Indicate if two network devices have
8921
+ * the same port parent identifier
8922
+ * @a: first network device
8923
+ * @b: second network device
8924
+ */
8925
+bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8926
+{
8927
+ struct netdev_phys_item_id a_id = { };
8928
+ struct netdev_phys_item_id b_id = { };
8929
+
8930
+ if (dev_get_port_parent_id(a, &a_id, true) ||
8931
+ dev_get_port_parent_id(b, &b_id, true))
8932
+ return false;
8933
+
8934
+ return netdev_phys_item_id_same(&a_id, &b_id);
8935
+}
8936
+EXPORT_SYMBOL(netdev_port_same_parent_id);
79998937
80008938 /**
80018939 * dev_change_proto_down - update protocol port state information
....@@ -8017,67 +8955,495 @@
80178955 }
80188956 EXPORT_SYMBOL(dev_change_proto_down);
80198957
8020
-u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8021
- enum bpf_netdev_command cmd)
8958
+/**
8959
+ * dev_change_proto_down_generic - generic implementation for
8960
+ * ndo_change_proto_down that sets carrier according to
8961
+ * proto_down.
8962
+ *
8963
+ * @dev: device
8964
+ * @proto_down: new value
8965
+ */
8966
+int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
80228967 {
8023
- struct netdev_bpf xdp;
8968
+ if (proto_down)
8969
+ netif_carrier_off(dev);
8970
+ else
8971
+ netif_carrier_on(dev);
8972
+ dev->proto_down = proto_down;
8973
+ return 0;
8974
+}
8975
+EXPORT_SYMBOL(dev_change_proto_down_generic);
80248976
8025
- if (!bpf_op)
8026
- return 0;
8977
+/**
8978
+ * dev_change_proto_down_reason - proto down reason
8979
+ *
8980
+ * @dev: device
8981
+ * @mask: proto down mask
8982
+ * @value: proto down value
8983
+ */
8984
+void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8985
+ u32 value)
8986
+{
8987
+ int b;
80278988
8028
- memset(&xdp, 0, sizeof(xdp));
8029
- xdp.command = cmd;
8989
+ if (!mask) {
8990
+ dev->proto_down_reason = value;
8991
+ } else {
8992
+ for_each_set_bit(b, &mask, 32) {
8993
+ if (value & (1 << b))
8994
+ dev->proto_down_reason |= BIT(b);
8995
+ else
8996
+ dev->proto_down_reason &= ~BIT(b);
8997
+ }
8998
+ }
8999
+}
9000
+EXPORT_SYMBOL(dev_change_proto_down_reason);
80309001
8031
- /* Query must always succeed. */
8032
- WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
9002
+struct bpf_xdp_link {
9003
+ struct bpf_link link;
9004
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9005
+ int flags;
9006
+};
80339007
8034
- return xdp.prog_id;
9008
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9009
+{
9010
+ if (flags & XDP_FLAGS_HW_MODE)
9011
+ return XDP_MODE_HW;
9012
+ if (flags & XDP_FLAGS_DRV_MODE)
9013
+ return XDP_MODE_DRV;
9014
+ if (flags & XDP_FLAGS_SKB_MODE)
9015
+ return XDP_MODE_SKB;
9016
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
80359017 }
80369018
8037
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8038
- struct netlink_ext_ack *extack, u32 flags,
8039
- struct bpf_prog *prog)
9019
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9020
+{
9021
+ switch (mode) {
9022
+ case XDP_MODE_SKB:
9023
+ return generic_xdp_install;
9024
+ case XDP_MODE_DRV:
9025
+ case XDP_MODE_HW:
9026
+ return dev->netdev_ops->ndo_bpf;
9027
+ default:
9028
+ return NULL;
9029
+ };
9030
+}
9031
+
9032
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9033
+ enum bpf_xdp_mode mode)
9034
+{
9035
+ return dev->xdp_state[mode].link;
9036
+}
9037
+
9038
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9039
+ enum bpf_xdp_mode mode)
9040
+{
9041
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9042
+
9043
+ if (link)
9044
+ return link->link.prog;
9045
+ return dev->xdp_state[mode].prog;
9046
+}
9047
+
9048
+static u8 dev_xdp_prog_count(struct net_device *dev)
9049
+{
9050
+ u8 count = 0;
9051
+ int i;
9052
+
9053
+ for (i = 0; i < __MAX_XDP_MODE; i++)
9054
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9055
+ count++;
9056
+ return count;
9057
+}
9058
+
9059
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9060
+{
9061
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9062
+
9063
+ return prog ? prog->aux->id : 0;
9064
+}
9065
+
9066
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9067
+ struct bpf_xdp_link *link)
9068
+{
9069
+ dev->xdp_state[mode].link = link;
9070
+ dev->xdp_state[mode].prog = NULL;
9071
+}
9072
+
9073
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9074
+ struct bpf_prog *prog)
9075
+{
9076
+ dev->xdp_state[mode].link = NULL;
9077
+ dev->xdp_state[mode].prog = prog;
9078
+}
9079
+
9080
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9081
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9082
+ u32 flags, struct bpf_prog *prog)
80409083 {
80419084 struct netdev_bpf xdp;
9085
+ int err;
80429086
80439087 memset(&xdp, 0, sizeof(xdp));
8044
- if (flags & XDP_FLAGS_HW_MODE)
8045
- xdp.command = XDP_SETUP_PROG_HW;
8046
- else
8047
- xdp.command = XDP_SETUP_PROG;
9088
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
80489089 xdp.extack = extack;
80499090 xdp.flags = flags;
80509091 xdp.prog = prog;
80519092
8052
- return bpf_op(dev, &xdp);
9093
+ /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9094
+ * "moved" into driver), so they don't increment it on their own, but
9095
+ * they do decrement refcnt when program is detached or replaced.
9096
+ * Given net_device also owns link/prog, we need to bump refcnt here
9097
+ * to prevent drivers from underflowing it.
9098
+ */
9099
+ if (prog)
9100
+ bpf_prog_inc(prog);
9101
+ err = bpf_op(dev, &xdp);
9102
+ if (err) {
9103
+ if (prog)
9104
+ bpf_prog_put(prog);
9105
+ return err;
9106
+ }
9107
+
9108
+ if (mode != XDP_MODE_HW)
9109
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9110
+
9111
+ return 0;
80539112 }
80549113
80559114 static void dev_xdp_uninstall(struct net_device *dev)
80569115 {
8057
- struct netdev_bpf xdp;
8058
- bpf_op_t ndo_bpf;
9116
+ struct bpf_xdp_link *link;
9117
+ struct bpf_prog *prog;
9118
+ enum bpf_xdp_mode mode;
9119
+ bpf_op_t bpf_op;
80599120
8060
- /* Remove generic XDP */
8061
- WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
9121
+ ASSERT_RTNL();
80629122
8063
- /* Remove from the driver */
8064
- ndo_bpf = dev->netdev_ops->ndo_bpf;
8065
- if (!ndo_bpf)
8066
- return;
9123
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9124
+ prog = dev_xdp_prog(dev, mode);
9125
+ if (!prog)
9126
+ continue;
80679127
8068
- memset(&xdp, 0, sizeof(xdp));
8069
- xdp.command = XDP_QUERY_PROG;
8070
- WARN_ON(ndo_bpf(dev, &xdp));
8071
- if (xdp.prog_id)
8072
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8073
- NULL));
9128
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9129
+ if (!bpf_op)
9130
+ continue;
80749131
8075
- /* Remove HW offload */
8076
- memset(&xdp, 0, sizeof(xdp));
8077
- xdp.command = XDP_QUERY_PROG_HW;
8078
- if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8079
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8080
- NULL));
9132
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9133
+
9134
+ /* auto-detach link from net device */
9135
+ link = dev_xdp_link(dev, mode);
9136
+ if (link)
9137
+ link->dev = NULL;
9138
+ else
9139
+ bpf_prog_put(prog);
9140
+
9141
+ dev_xdp_set_link(dev, mode, NULL);
9142
+ }
9143
+}
9144
+
9145
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9146
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9147
+ struct bpf_prog *old_prog, u32 flags)
9148
+{
9149
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9150
+ struct bpf_prog *cur_prog;
9151
+ enum bpf_xdp_mode mode;
9152
+ bpf_op_t bpf_op;
9153
+ int err;
9154
+
9155
+ ASSERT_RTNL();
9156
+
9157
+ /* either link or prog attachment, never both */
9158
+ if (link && (new_prog || old_prog))
9159
+ return -EINVAL;
9160
+ /* link supports only XDP mode flags */
9161
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
9162
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9163
+ return -EINVAL;
9164
+ }
9165
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9166
+ if (num_modes > 1) {
9167
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9168
+ return -EINVAL;
9169
+ }
9170
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9171
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9172
+ NL_SET_ERR_MSG(extack,
9173
+ "More than one program loaded, unset mode is ambiguous");
9174
+ return -EINVAL;
9175
+ }
9176
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9177
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9178
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9179
+ return -EINVAL;
9180
+ }
9181
+
9182
+ mode = dev_xdp_mode(dev, flags);
9183
+ /* can't replace attached link */
9184
+ if (dev_xdp_link(dev, mode)) {
9185
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9186
+ return -EBUSY;
9187
+ }
9188
+
9189
+ cur_prog = dev_xdp_prog(dev, mode);
9190
+ /* can't replace attached prog with link */
9191
+ if (link && cur_prog) {
9192
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9193
+ return -EBUSY;
9194
+ }
9195
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9196
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
9197
+ return -EEXIST;
9198
+ }
9199
+
9200
+ /* put effective new program into new_prog */
9201
+ if (link)
9202
+ new_prog = link->link.prog;
9203
+
9204
+ if (new_prog) {
9205
+ bool offload = mode == XDP_MODE_HW;
9206
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9207
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
9208
+
9209
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9210
+ NL_SET_ERR_MSG(extack, "XDP program already attached");
9211
+ return -EBUSY;
9212
+ }
9213
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
9214
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9215
+ return -EEXIST;
9216
+ }
9217
+ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9218
+ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9219
+ return -EINVAL;
9220
+ }
9221
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9222
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9223
+ return -EINVAL;
9224
+ }
9225
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9226
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9227
+ return -EINVAL;
9228
+ }
9229
+ }
9230
+
9231
+ /* don't call drivers if the effective program didn't change */
9232
+ if (new_prog != cur_prog) {
9233
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9234
+ if (!bpf_op) {
9235
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9236
+ return -EOPNOTSUPP;
9237
+ }
9238
+
9239
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9240
+ if (err)
9241
+ return err;
9242
+ }
9243
+
9244
+ if (link)
9245
+ dev_xdp_set_link(dev, mode, link);
9246
+ else
9247
+ dev_xdp_set_prog(dev, mode, new_prog);
9248
+ if (cur_prog)
9249
+ bpf_prog_put(cur_prog);
9250
+
9251
+ return 0;
9252
+}
9253
+
9254
+static int dev_xdp_attach_link(struct net_device *dev,
9255
+ struct netlink_ext_ack *extack,
9256
+ struct bpf_xdp_link *link)
9257
+{
9258
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9259
+}
9260
+
9261
+static int dev_xdp_detach_link(struct net_device *dev,
9262
+ struct netlink_ext_ack *extack,
9263
+ struct bpf_xdp_link *link)
9264
+{
9265
+ enum bpf_xdp_mode mode;
9266
+ bpf_op_t bpf_op;
9267
+
9268
+ ASSERT_RTNL();
9269
+
9270
+ mode = dev_xdp_mode(dev, link->flags);
9271
+ if (dev_xdp_link(dev, mode) != link)
9272
+ return -EINVAL;
9273
+
9274
+ bpf_op = dev_xdp_bpf_op(dev, mode);
9275
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9276
+ dev_xdp_set_link(dev, mode, NULL);
9277
+ return 0;
9278
+}
9279
+
9280
+static void bpf_xdp_link_release(struct bpf_link *link)
9281
+{
9282
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9283
+
9284
+ rtnl_lock();
9285
+
9286
+ /* if racing with net_device's tear down, xdp_link->dev might be
9287
+ * already NULL, in which case link was already auto-detached
9288
+ */
9289
+ if (xdp_link->dev) {
9290
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9291
+ xdp_link->dev = NULL;
9292
+ }
9293
+
9294
+ rtnl_unlock();
9295
+}
9296
+
9297
+static int bpf_xdp_link_detach(struct bpf_link *link)
9298
+{
9299
+ bpf_xdp_link_release(link);
9300
+ return 0;
9301
+}
9302
+
9303
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
9304
+{
9305
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9306
+
9307
+ kfree(xdp_link);
9308
+}
9309
+
9310
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9311
+ struct seq_file *seq)
9312
+{
9313
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9314
+ u32 ifindex = 0;
9315
+
9316
+ rtnl_lock();
9317
+ if (xdp_link->dev)
9318
+ ifindex = xdp_link->dev->ifindex;
9319
+ rtnl_unlock();
9320
+
9321
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
9322
+}
9323
+
9324
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9325
+ struct bpf_link_info *info)
9326
+{
9327
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9328
+ u32 ifindex = 0;
9329
+
9330
+ rtnl_lock();
9331
+ if (xdp_link->dev)
9332
+ ifindex = xdp_link->dev->ifindex;
9333
+ rtnl_unlock();
9334
+
9335
+ info->xdp.ifindex = ifindex;
9336
+ return 0;
9337
+}
9338
+
9339
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9340
+ struct bpf_prog *old_prog)
9341
+{
9342
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9343
+ enum bpf_xdp_mode mode;
9344
+ bpf_op_t bpf_op;
9345
+ int err = 0;
9346
+
9347
+ rtnl_lock();
9348
+
9349
+ /* link might have been auto-released already, so fail */
9350
+ if (!xdp_link->dev) {
9351
+ err = -ENOLINK;
9352
+ goto out_unlock;
9353
+ }
9354
+
9355
+ if (old_prog && link->prog != old_prog) {
9356
+ err = -EPERM;
9357
+ goto out_unlock;
9358
+ }
9359
+ old_prog = link->prog;
9360
+ if (old_prog->type != new_prog->type ||
9361
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
9362
+ err = -EINVAL;
9363
+ goto out_unlock;
9364
+ }
9365
+
9366
+ if (old_prog == new_prog) {
9367
+ /* no-op, don't disturb drivers */
9368
+ bpf_prog_put(new_prog);
9369
+ goto out_unlock;
9370
+ }
9371
+
9372
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9373
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9374
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9375
+ xdp_link->flags, new_prog);
9376
+ if (err)
9377
+ goto out_unlock;
9378
+
9379
+ old_prog = xchg(&link->prog, new_prog);
9380
+ bpf_prog_put(old_prog);
9381
+
9382
+out_unlock:
9383
+ rtnl_unlock();
9384
+ return err;
9385
+}
9386
+
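[Editorial note] bpf_xdp_link_update() above is the kernel side of the BPF_LINK_UPDATE command: it swaps the program behind an existing XDP link, optionally only if the caller still holds the program it expects. A hedged user-space sketch of driving that path through libbpf (bpf_link_update(), LIBBPF_OPTS and BPF_F_REPLACE are assumed to be available in the libbpf/uapi versions in use; the fds are placeholders):

	#include <linux/bpf.h>
	#include <bpf/bpf.h>

	int swap_xdp_link_prog(int link_fd, int old_prog_fd, int new_prog_fd)
	{
		LIBBPF_OPTS(bpf_link_update_opts, opts,
			    .flags = BPF_F_REPLACE,
			    .old_prog_fd = old_prog_fd);

		/* fails with -EPERM if old_prog_fd no longer matches link->prog,
		 * mirroring the check at the top of bpf_xdp_link_update()
		 */
		return bpf_link_update(link_fd, new_prog_fd, &opts);
	}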
9387
+static const struct bpf_link_ops bpf_xdp_link_lops = {
9388
+ .release = bpf_xdp_link_release,
9389
+ .dealloc = bpf_xdp_link_dealloc,
9390
+ .detach = bpf_xdp_link_detach,
9391
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
9392
+ .fill_link_info = bpf_xdp_link_fill_link_info,
9393
+ .update_prog = bpf_xdp_link_update,
9394
+};
9395
+
9396
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9397
+{
9398
+ struct net *net = current->nsproxy->net_ns;
9399
+ struct bpf_link_primer link_primer;
9400
+ struct bpf_xdp_link *link;
9401
+ struct net_device *dev;
9402
+ int err, fd;
9403
+
9404
+ rtnl_lock();
9405
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9406
+ if (!dev) {
9407
+ rtnl_unlock();
9408
+ return -EINVAL;
9409
+ }
9410
+
9411
+ link = kzalloc(sizeof(*link), GFP_USER);
9412
+ if (!link) {
9413
+ err = -ENOMEM;
9414
+ goto unlock;
9415
+ }
9416
+
9417
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9418
+ link->dev = dev;
9419
+ link->flags = attr->link_create.flags;
9420
+
9421
+ err = bpf_link_prime(&link->link, &link_primer);
9422
+ if (err) {
9423
+ kfree(link);
9424
+ goto unlock;
9425
+ }
9426
+
9427
+ err = dev_xdp_attach_link(dev, NULL, link);
9428
+ rtnl_unlock();
9429
+
9430
+ if (err) {
9431
+ link->dev = NULL;
9432
+ bpf_link_cleanup(&link_primer);
9433
+ goto out_put_dev;
9434
+ }
9435
+
9436
+ fd = bpf_link_settle(&link_primer);
9437
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9438
+ dev_put(dev);
9439
+ return fd;
9440
+
9441
+unlock:
9442
+ rtnl_unlock();
9443
+
9444
+out_put_dev:
9445
+ dev_put(dev);
9446
+ return err;
80819447 }
80829448
80839449 /**
....@@ -8085,56 +9451,44 @@
80859451 * @dev: device
80869452 * @extack: netlink extended ack
80879453 * @fd: new program fd or negative value to clear
9454
+ * @expected_fd: old program fd that userspace expects to replace or clear
80889455 * @flags: xdp-related flags
80899456 *
80909457 * Set or clear a bpf program for a device
80919458 */
80929459 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8093
- int fd, u32 flags)
9460
+ int fd, int expected_fd, u32 flags)
80949461 {
8095
- const struct net_device_ops *ops = dev->netdev_ops;
8096
- enum bpf_netdev_command query;
8097
- struct bpf_prog *prog = NULL;
8098
- bpf_op_t bpf_op, bpf_chk;
9462
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9463
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
80999464 int err;
81009465
81019466 ASSERT_RTNL();
81029467
8103
- query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8104
-
8105
- bpf_op = bpf_chk = ops->ndo_bpf;
8106
- if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
8107
- return -EOPNOTSUPP;
8108
- if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8109
- bpf_op = generic_xdp_install;
8110
- if (bpf_op == bpf_chk)
8111
- bpf_chk = generic_xdp_install;
8112
-
81139468 if (fd >= 0) {
8114
- if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
8115
- __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
8116
- return -EEXIST;
8117
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
8118
- __dev_xdp_query(dev, bpf_op, query))
8119
- return -EBUSY;
9469
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9470
+ mode != XDP_MODE_SKB);
9471
+ if (IS_ERR(new_prog))
9472
+ return PTR_ERR(new_prog);
9473
+ }
81209474
8121
- prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8122
- bpf_op == ops->ndo_bpf);
8123
- if (IS_ERR(prog))
8124
- return PTR_ERR(prog);
8125
-
8126
- if (!(flags & XDP_FLAGS_HW_MODE) &&
8127
- bpf_prog_is_dev_bound(prog->aux)) {
8128
- NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8129
- bpf_prog_put(prog);
8130
- return -EINVAL;
9475
+ if (expected_fd >= 0) {
9476
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9477
+ mode != XDP_MODE_SKB);
9478
+ if (IS_ERR(old_prog)) {
9479
+ err = PTR_ERR(old_prog);
9480
+ old_prog = NULL;
9481
+ goto err_out;
81319482 }
81329483 }
81339484
8134
- err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8135
- if (err < 0 && prog)
8136
- bpf_prog_put(prog);
9485
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
81379486
9487
+err_out:
9488
+ if (err && new_prog)
9489
+ bpf_prog_put(new_prog);
9490
+ if (old_prog)
9491
+ bpf_prog_put(old_prog);
81389492 return err;
81399493 }
81409494
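[Editorial note] dev_change_xdp_fd() is the entry point used by the rtnetlink IFLA_XDP path; the new expected_fd parameter gives that interface compare-and-swap semantics. A hedged user-space sketch of what this enables, using libbpf's bpf_xdp_attach() (assumed available; XDP_FLAGS_REPLACE is what carries expected_fd down to this function):

	#include <linux/if_link.h>
	#include <bpf/libbpf.h>

	int replace_xdp_prog(int ifindex, int old_prog_fd, int new_prog_fd)
	{
		LIBBPF_OPTS(bpf_xdp_attach_opts, opts, .old_prog_fd = old_prog_fd);

		/* the kernel only installs new_prog_fd if old_prog_fd is still
		 * the attached program; otherwise dev_xdp_attach() returns -EEXIST
		 */
		return bpf_xdp_attach(ifindex, new_prog_fd, XDP_FLAGS_REPLACE, &opts);
	}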
....@@ -8166,103 +9520,6 @@
81669520 {
81679521 list_add_tail(&dev->todo_list, &net_todo_list);
81689522 dev_net(dev)->dev_unreg_count++;
8169
-}
8170
-
8171
-static void rollback_registered_many(struct list_head *head)
8172
-{
8173
- struct net_device *dev, *tmp;
8174
- LIST_HEAD(close_head);
8175
-
8176
- BUG_ON(dev_boot_phase);
8177
- ASSERT_RTNL();
8178
-
8179
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8180
- /* Some devices call without registering
8181
- * for initialization unwind. Remove those
8182
- * devices and proceed with the remaining.
8183
- */
8184
- if (dev->reg_state == NETREG_UNINITIALIZED) {
8185
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8186
- dev->name, dev);
8187
-
8188
- WARN_ON(1);
8189
- list_del(&dev->unreg_list);
8190
- continue;
8191
- }
8192
- dev->dismantle = true;
8193
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
8194
- }
8195
-
8196
- /* If device is running, close it first. */
8197
- list_for_each_entry(dev, head, unreg_list)
8198
- list_add_tail(&dev->close_list, &close_head);
8199
- dev_close_many(&close_head, true);
8200
-
8201
- list_for_each_entry(dev, head, unreg_list) {
8202
- /* And unlink it from device chain. */
8203
- unlist_netdevice(dev);
8204
-
8205
- dev->reg_state = NETREG_UNREGISTERING;
8206
- }
8207
- flush_all_backlogs();
8208
-
8209
- synchronize_net();
8210
-
8211
- list_for_each_entry(dev, head, unreg_list) {
8212
- struct sk_buff *skb = NULL;
8213
-
8214
- /* Shutdown queueing discipline. */
8215
- dev_shutdown(dev);
8216
-
8217
- dev_xdp_uninstall(dev);
8218
-
8219
- /* Notify protocols, that we are about to destroy
8220
- * this device. They should clean all the things.
8221
- */
8222
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8223
-
8224
- if (!dev->rtnl_link_ops ||
8225
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8226
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8227
- GFP_KERNEL, NULL, 0);
8228
-
8229
- /*
8230
- * Flush the unicast and multicast chains
8231
- */
8232
- dev_uc_flush(dev);
8233
- dev_mc_flush(dev);
8234
-
8235
- if (dev->netdev_ops->ndo_uninit)
8236
- dev->netdev_ops->ndo_uninit(dev);
8237
-
8238
- if (skb)
8239
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8240
-
8241
- /* Notifier chain MUST detach us all upper devices. */
8242
- WARN_ON(netdev_has_any_upper_dev(dev));
8243
- WARN_ON(netdev_has_any_lower_dev(dev));
8244
-
8245
- /* Remove entries from kobject tree */
8246
- netdev_unregister_kobject(dev);
8247
-#ifdef CONFIG_XPS
8248
- /* Remove XPS queueing entries */
8249
- netif_reset_xps_queues_gt(dev, 0);
8250
-#endif
8251
- }
8252
-
8253
- synchronize_net();
8254
-
8255
- list_for_each_entry(dev, head, unreg_list)
8256
- dev_put(dev);
8257
-}
8258
-
8259
-static void rollback_registered(struct net_device *dev)
8260
-{
8261
- LIST_HEAD(single);
8262
-
8263
- list_add(&dev->unreg_list, &single);
8264
- rollback_registered_many(&single);
8265
- list_del(&single);
82669523 }
82679524
82689525 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
....@@ -8410,7 +9667,7 @@
84109667 /* driver might be less strict about feature dependencies */
84119668 features = netdev_fix_features(dev, features);
84129669
8413
- /* some features can't be enabled if they're off an an upper device */
9670
+ /* some features can't be enabled if they're off on an upper device */
84149671 netdev_for_each_upper_dev_rcu(dev, upper, iter)
84159672 features = netdev_sync_upper_features(dev, upper, features);
84169673
....@@ -8533,6 +9790,11 @@
85339790 netif_dormant_on(dev);
85349791 else
85359792 netif_dormant_off(dev);
9793
+
9794
+ if (rootdev->operstate == IF_OPER_TESTING)
9795
+ netif_testing_on(dev);
9796
+ else
9797
+ netif_testing_off(dev);
85369798
85379799 if (netif_carrier_ok(rootdev))
85389800 netif_carrier_on(dev);
....@@ -8674,11 +9936,20 @@
86749936 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
86759937 BUG_ON(!net);
86769938
9939
+ ret = ethtool_check_ops(dev->ethtool_ops);
9940
+ if (ret)
9941
+ return ret;
9942
+
86779943 spin_lock_init(&dev->addr_list_lock);
86789944 netdev_set_addr_lockdep_class(dev);
86799945
86809946 ret = dev_get_valid_name(net, dev, dev->name);
86819947 if (ret < 0)
9948
+ goto out;
9949
+
9950
+ ret = -ENOMEM;
9951
+ dev->name_node = netdev_name_node_head_alloc(dev);
9952
+ if (!dev->name_node)
86829953 goto out;
86839954
86849955 /* Init, if this function is available */
....@@ -8687,7 +9958,7 @@
86879958 if (ret) {
86889959 if (ret > 0)
86899960 ret = -EIO;
8690
- goto out;
9961
+ goto err_free_name;
86919962 }
86929963 }
86939964
....@@ -8709,7 +9980,7 @@
87099980 /* Transfer changeable features to wanted_features and enable
87109981 * software offloads (GSO and GRO).
87119982 */
8712
- dev->hw_features |= NETIF_F_SOFT_FEATURES;
9983
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
87139984 dev->features |= NETIF_F_SOFT_FEATURES;
87149985
87159986 if (dev->netdev_ops->ndo_udp_tunnel_add) {
....@@ -8787,17 +10058,10 @@
878710058 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
878810059 ret = notifier_to_errno(ret);
878910060 if (ret) {
8790
- rollback_registered(dev);
8791
- rcu_barrier();
8792
-
8793
- dev->reg_state = NETREG_UNREGISTERED;
8794
- /* We should put the kobject that hold in
8795
- * netdev_unregister_kobject(), otherwise
8796
- * the net device cannot be freed when
8797
- * driver calls free_netdev(), because the
8798
- * kobject is being hold.
8799
- */
8800
- kobject_put(&dev->dev.kobj);
10061
+ /* Expect explicit free_netdev() on failure */
10062
+ dev->needs_free_netdev = false;
10063
+ unregister_netdevice_queue(dev, NULL);
10064
+ goto out;
880110065 }
880210066 /*
880310067 * Prevent userspace races by waiting until the network
....@@ -8815,6 +10079,8 @@
881510079 dev->netdev_ops->ndo_uninit(dev);
881610080 if (dev->priv_destructor)
881710081 dev->priv_destructor(dev);
10082
+err_free_name:
10083
+ netdev_name_node_free(dev->name_node);
881810084 goto out;
881910085 }
882010086 EXPORT_SYMBOL(register_netdevice);
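[Editorial note] The rewritten error path above no longer frees the device when the NETDEV_REGISTER notifier fails; together with the free_netdev() change later in this diff, the final free becomes the caller's job. A sketch of the driver-side pattern this expects (illustrative only, the function name is made up):

	static int example_register(struct net_device *dev)
	{
		int err;

		rtnl_lock();
		err = register_netdevice(dev);
		if (err)
			/* the unwind left dev in NETREG_UNREGISTERING; calling
			 * free_netdev() here (still under RTNL) only marks it,
			 * and netdev_run_todo() performs the actual free
			 */
			free_netdev(dev);
		rtnl_unlock();
		return err;
	}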
....@@ -8898,6 +10164,8 @@
889810164 }
889910165 EXPORT_SYMBOL(netdev_refcnt_read);
890010166
10167
+#define WAIT_REFS_MIN_MSECS 1
10168
+#define WAIT_REFS_MAX_MSECS 250
890110169 /**
890210170 * netdev_wait_allrefs - wait until all references are gone.
890310171 * @dev: target net_device
....@@ -8913,7 +10181,7 @@
891310181 static void netdev_wait_allrefs(struct net_device *dev)
891410182 {
891510183 unsigned long rebroadcast_time, warning_time;
8916
- int refcnt;
10184
+ int wait = 0, refcnt;
891710185
891810186 linkwatch_forget_dev(dev);
891910187
....@@ -8947,7 +10215,13 @@
894710215 rebroadcast_time = jiffies;
894810216 }
894910217
8950
- msleep(250);
10218
+ if (!wait) {
10219
+ rcu_barrier();
10220
+ wait = WAIT_REFS_MIN_MSECS;
10221
+ } else {
10222
+ msleep(wait);
10223
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10224
+ }
895110225
895210226 refcnt = netdev_refcnt_read(dev);
895310227
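[Editorial note] The replacement for the flat msleep(250) above first issues an rcu_barrier() and then backs off exponentially, so with WAIT_REFS_MIN_MSECS=1 and WAIT_REFS_MAX_MSECS=250 the sleeps go 1, 2, 4, ..., 128, 250, 250, ... ms. A tiny stand-alone illustration of that progression (user-space, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		int wait = 0;

		for (int i = 0; i < 12; i++) {
			if (!wait) {
				puts("rcu_barrier(), no sleep yet");
				wait = 1;		/* WAIT_REFS_MIN_MSECS */
			} else {
				printf("msleep(%d)\n", wait);
				wait <<= 1;
				if (wait > 250)		/* WAIT_REFS_MAX_MSECS */
					wait = 250;
			}
		}
		return 0;
	}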
....@@ -8986,6 +10260,19 @@
898610260 void netdev_run_todo(void)
898710261 {
898810262 struct list_head list;
10263
+#ifdef CONFIG_LOCKDEP
10264
+ struct list_head unlink_list;
10265
+
10266
+ list_replace_init(&net_unlink_list, &unlink_list);
10267
+
10268
+ while (!list_empty(&unlink_list)) {
10269
+ struct net_device *dev = list_first_entry(&unlink_list,
10270
+ struct net_device,
10271
+ unlink_list);
10272
+ list_del_init(&dev->unlink_list);
10273
+ dev->nested_level = dev->lower_level - 1;
10274
+ }
10275
+#endif
898910276
899010277 /* Snapshot list, allow later requests */
899110278 list_replace_init(&net_todo_list, &list);
....@@ -9097,6 +10384,40 @@
909710384 }
909810385 EXPORT_SYMBOL(dev_get_stats);
909910386
10387
+/**
10388
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
10389
+ * @s: place to store stats
10390
+ * @netstats: per-cpu network stats to read from
10391
+ *
10392
+ * Read per-cpu network statistics and populate the related fields in @s.
10393
+ */
10394
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10395
+ const struct pcpu_sw_netstats __percpu *netstats)
10396
+{
10397
+ int cpu;
10398
+
10399
+ for_each_possible_cpu(cpu) {
10400
+ const struct pcpu_sw_netstats *stats;
10401
+ struct pcpu_sw_netstats tmp;
10402
+ unsigned int start;
10403
+
10404
+ stats = per_cpu_ptr(netstats, cpu);
10405
+ do {
10406
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
10407
+ tmp.rx_packets = stats->rx_packets;
10408
+ tmp.rx_bytes = stats->rx_bytes;
10409
+ tmp.tx_packets = stats->tx_packets;
10410
+ tmp.tx_bytes = stats->tx_bytes;
10411
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10412
+
10413
+ s->rx_packets += tmp.rx_packets;
10414
+ s->rx_bytes += tmp.rx_bytes;
10415
+ s->tx_packets += tmp.tx_packets;
10416
+ s->tx_bytes += tmp.tx_bytes;
10417
+ }
10418
+}
10419
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10420
+
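[Editorial note] dev_fetch_sw_netstats() factors out the per-cpu tstats folding that drivers used to open-code. A minimal sketch of how a driver's .ndo_get_stats64 might use it (assuming the driver allocated dev->tstats as per-cpu struct pcpu_sw_netstats; the function name is made up):

	static void example_get_stats64(struct net_device *dev,
					struct rtnl_link_stats64 *stats)
	{
		/* start from the driver-maintained error/drop counters ... */
		netdev_stats_to_stats64(stats, &dev->stats);
		/* ... and fold in the per-cpu rx/tx packet and byte counts */
		dev_fetch_sw_netstats(stats, dev->tstats);
	}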
910010421 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
910110422 {
910210423 struct netdev_queue *queue = dev_ingress_queue(dev);
....@@ -9198,6 +10519,10 @@
919810519 dev->gso_max_segs = GSO_MAX_SEGS;
919910520 dev->upper_level = 1;
920010521 dev->lower_level = 1;
10522
+#ifdef CONFIG_LOCKDEP
10523
+ dev->nested_level = 0;
10524
+ INIT_LIST_HEAD(&dev->unlink_list);
10525
+#endif
920110526
920210527 INIT_LIST_HEAD(&dev->napi_list);
920310528 INIT_LIST_HEAD(&dev->unreg_list);
....@@ -9207,6 +10532,7 @@
920710532 INIT_LIST_HEAD(&dev->adj_list.lower);
920810533 INIT_LIST_HEAD(&dev->ptype_all);
920910534 INIT_LIST_HEAD(&dev->ptype_specific);
10535
+ INIT_LIST_HEAD(&dev->net_notifier_list);
921010536 #ifdef CONFIG_NET_SCHED
921110537 hash_init(dev->qdisc_hash);
921210538 #endif
....@@ -9264,6 +10590,17 @@
926410590 struct napi_struct *p, *n;
926510591
926610592 might_sleep();
10593
+
10594
+ /* When called immediately after register_netdevice() failed the unwind
10595
+ * handling may still be dismantling the device. Handle that case by
10596
+ * deferring the free.
10597
+ */
10598
+ if (dev->reg_state == NETREG_UNREGISTERING) {
10599
+ ASSERT_RTNL();
10600
+ dev->needs_free_netdev = true;
10601
+ return;
10602
+ }
10603
+
926710604 netif_free_tx_queues(dev);
926810605 netif_free_rx_queues(dev);
926910606
....@@ -9277,6 +10614,8 @@
927710614
927810615 free_percpu(dev->pcpu_refcnt);
927910616 dev->pcpu_refcnt = NULL;
10617
+ free_percpu(dev->xdp_bulkq);
10618
+ dev->xdp_bulkq = NULL;
928010619
928110620 /* Compatibility with error handling in drivers */
928210621 if (dev->reg_state == NETREG_UNINITIALIZED) {
....@@ -9328,9 +10667,10 @@
932810667 if (head) {
932910668 list_move_tail(&dev->unreg_list, head);
933010669 } else {
9331
- rollback_registered(dev);
9332
- /* Finish processing unregister after unlock */
9333
- net_set_todo(dev);
10670
+ LIST_HEAD(single);
10671
+
10672
+ list_add(&dev->unreg_list, &single);
10673
+ unregister_netdevice_many(&single);
933410674 }
933510675 }
933610676 EXPORT_SYMBOL(unregister_netdevice_queue);
....@@ -9344,14 +10684,100 @@
934410684 */
934510685 void unregister_netdevice_many(struct list_head *head)
934610686 {
9347
- struct net_device *dev;
10687
+ struct net_device *dev, *tmp;
10688
+ LIST_HEAD(close_head);
934810689
9349
- if (!list_empty(head)) {
9350
- rollback_registered_many(head);
9351
- list_for_each_entry(dev, head, unreg_list)
9352
- net_set_todo(dev);
9353
- list_del(head);
10690
+ BUG_ON(dev_boot_phase);
10691
+ ASSERT_RTNL();
10692
+
10693
+ if (list_empty(head))
10694
+ return;
10695
+
10696
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10697
+ /* Some devices call without registering
10698
+ * for initialization unwind. Remove those
10699
+ * devices and proceed with the remaining.
10700
+ */
10701
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
10702
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10703
+ dev->name, dev);
10704
+
10705
+ WARN_ON(1);
10706
+ list_del(&dev->unreg_list);
10707
+ continue;
10708
+ }
10709
+ dev->dismantle = true;
10710
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
935410711 }
10712
+
10713
+ /* If device is running, close it first. */
10714
+ list_for_each_entry(dev, head, unreg_list)
10715
+ list_add_tail(&dev->close_list, &close_head);
10716
+ dev_close_many(&close_head, true);
10717
+
10718
+ list_for_each_entry(dev, head, unreg_list) {
10719
+ /* And unlink it from device chain. */
10720
+ unlist_netdevice(dev);
10721
+
10722
+ dev->reg_state = NETREG_UNREGISTERING;
10723
+ }
10724
+ flush_all_backlogs();
10725
+
10726
+ synchronize_net();
10727
+
10728
+ list_for_each_entry(dev, head, unreg_list) {
10729
+ struct sk_buff *skb = NULL;
10730
+
10731
+ /* Shutdown queueing discipline. */
10732
+ dev_shutdown(dev);
10733
+
10734
+ dev_xdp_uninstall(dev);
10735
+
10736
+ /* Notify protocols that we are about to destroy
10737
+ * this device. They should clean all the things.
10738
+ */
10739
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10740
+
10741
+ if (!dev->rtnl_link_ops ||
10742
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10743
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10744
+ GFP_KERNEL, NULL, 0);
10745
+
10746
+ /*
10747
+ * Flush the unicast and multicast chains
10748
+ */
10749
+ dev_uc_flush(dev);
10750
+ dev_mc_flush(dev);
10751
+
10752
+ netdev_name_node_alt_flush(dev);
10753
+ netdev_name_node_free(dev->name_node);
10754
+
10755
+ if (dev->netdev_ops->ndo_uninit)
10756
+ dev->netdev_ops->ndo_uninit(dev);
10757
+
10758
+ if (skb)
10759
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10760
+
10761
+ /* Notifier chain MUST detach us all upper devices. */
10762
+ WARN_ON(netdev_has_any_upper_dev(dev));
10763
+ WARN_ON(netdev_has_any_lower_dev(dev));
10764
+
10765
+ /* Remove entries from kobject tree */
10766
+ netdev_unregister_kobject(dev);
10767
+#ifdef CONFIG_XPS
10768
+ /* Remove XPS queueing entries */
10769
+ netif_reset_xps_queues_gt(dev, 0);
10770
+#endif
10771
+ }
10772
+
10773
+ synchronize_net();
10774
+
10775
+ list_for_each_entry(dev, head, unreg_list) {
10776
+ dev_put(dev);
10777
+ net_set_todo(dev);
10778
+ }
10779
+
10780
+ list_del(head);
935510781 }
935610782 EXPORT_SYMBOL(unregister_netdevice_many);
935710783
....@@ -9390,6 +10816,7 @@
939010816
939110817 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
939210818 {
10819
+ struct net *net_old = dev_net(dev);
939310820 int err, new_nsid, new_ifindex;
939410821
939510822 ASSERT_RTNL();
....@@ -9405,7 +10832,7 @@
940510832
940610833 /* Get out if there is nothing todo */
940710834 err = 0;
9408
- if (net_eq(dev_net(dev), net))
10835
+ if (net_eq(net_old, net))
940910836 goto out;
941010837
941110838 /* Pick the destination device name, and ensure
....@@ -9466,6 +10893,9 @@
946610893 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
946710894 netdev_adjacent_del_links(dev);
946810895
10896
+ /* Move per-net netdevice notifiers that are following the netdevice */
10897
+ move_netdevice_notifiers_dev_net(dev, net);
10898
+
946910899 /* Actually switch the network namespace */
947010900 dev_net_set(dev, net);
947110901 dev->ifindex = new_ifindex;
....@@ -9476,6 +10906,12 @@
947610906
947710907 /* Fixup kobjects */
947810908 err = device_rename(&dev->dev, dev->name);
10909
+ WARN_ON(err);
10910
+
10911
+ /* Adapt owner in case owning user namespace of target network
10912
+ * namespace is different from the original one.
10913
+ */
10914
+ err = netdev_change_owner(dev, net_old, net);
947910915 WARN_ON(err);
948010916
948110917 /* Add the device back in the hashes */
....@@ -9542,6 +10978,7 @@
954210978
954310979 raise_softirq_irqoff(NET_TX_SOFTIRQ);
954410980 local_irq_enable();
10981
+ preempt_check_resched_rt();
954510982
954610983 #ifdef CONFIG_RPS
954710984 remsd = oldsd->rps_ipi_list;
....@@ -9555,7 +10992,7 @@
955510992 netif_rx_ni(skb);
955610993 input_queue_head_incr(oldsd);
955710994 }
9558
- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10995
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
955910996 netif_rx_ni(skb);
956010997 input_queue_head_incr(oldsd);
956110998 }
....@@ -9608,7 +11045,7 @@
960811045 static int __net_init netdev_init(struct net *net)
960911046 {
961011047 BUILD_BUG_ON(GRO_HASH_BUCKETS >
9611
- 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
11048
+ 8 * sizeof_field(struct napi_struct, gro_bitmask));
961211049
961311050 if (net != &init_net)
961411051 INIT_LIST_HEAD(&net->dev_base_head);
....@@ -9620,6 +11057,8 @@
962011057 net->dev_index_head = netdev_create_hash();
962111058 if (net->dev_index_head == NULL)
962211059 goto err_idx;
11060
+
11061
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
962311062
962411063 return 0;
962511064
....@@ -9742,7 +11181,7 @@
974211181 continue;
974311182
974411183 /* Leave virtual devices for the generic cleanup */
9745
- if (dev->rtnl_link_ops)
11184
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
974611185 continue;
974711186
974811187 /* Push remaining network devices to init_net */
....@@ -9869,7 +11308,7 @@
986911308
987011309 INIT_WORK(flush, flush_backlog);
987111310
9872
- skb_queue_head_init(&sd->input_pkt_queue);
11311
+ skb_queue_head_init_raw(&sd->input_pkt_queue);
987311312 skb_queue_head_init(&sd->process_queue);
987411313 #ifdef CONFIG_XFRM_OFFLOAD
987511314 skb_queue_head_init(&sd->xfrm_backlog);