hc
2024-09-20 a36159eec6ca17402b0e146b86efaf76568dc353
kernel/drivers/net/vrf.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * vrf.c: device driver to encapsulate a VRF space
34 *
....@@ -6,11 +7,6 @@
67 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
78 *
89 * Based on dummy, team and ipvlan drivers
9
- *
10
- * This program is free software; you can redistribute it and/or modify
11
- * it under the terms of the GNU General Public License as published by
12
- * the Free Software Foundation; either version 2 of the License, or
13
- * (at your option) any later version.
1410 */
1511
1612 #include <linux/module.h>
....@@ -25,6 +21,7 @@
2521 #include <net/rtnetlink.h>
2622 #include <linux/u64_stats_sync.h>
2723 #include <linux/hashtable.h>
24
+#include <linux/spinlock_types.h>
2825
2926 #include <linux/inetdevice.h>
3027 #include <net/arp.h>
....@@ -37,13 +34,78 @@
3734 #include <net/l3mdev.h>
3835 #include <net/fib_rules.h>
3936 #include <net/netns/generic.h>
37
+#include <net/netfilter/nf_conntrack.h>
4038
4139 #define DRV_NAME "vrf"
42
-#define DRV_VERSION "1.0"
40
+#define DRV_VERSION "1.1"
4341
4442 #define FIB_RULE_PREF 1000 /* default preference for FIB rules */
4543
44
+#define HT_MAP_BITS 4
45
+#define HASH_INITVAL ((u32)0xcafef00d)
46
+
47
+struct vrf_map {
48
+ DECLARE_HASHTABLE(ht, HT_MAP_BITS);
49
+ spinlock_t vmap_lock;
50
+
51
+ /* shared_tables:
52
+ * count how many distinct tables do not comply with the strict mode
53
+ * requirement.
54
+ * shared_tables value must be 0 in order to enable the strict mode.
55
+ *
56
+ * example of the evolution of shared_tables:
57
+ * | time
58
+ * add vrf0 --> table 100 shared_tables = 0 | t0
59
+ * add vrf1 --> table 101 shared_tables = 0 | t1
60
+ * add vrf2 --> table 100 shared_tables = 1 | t2
61
+ * add vrf3 --> table 100 shared_tables = 1 | t3
62
+ * add vrf4 --> table 101 shared_tables = 2 v t4
63
+ *
64
+ * shared_tables is a "step function" (or "staircase function")
65
+ * and it is increased by one when the second vrf is associated to a
66
+ * table.
67
+ *
68
+ * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
69
+ *
70
+ * at t3, another dev (vrf3) is bound to the same table 100 but the
71
+ * value of shared_tables is still 1.
72
+ * This means that no matter how many new vrfs will register on the
73
+ * table 100, the shared_tables will not increase (considering only
74
+ * table 100).
75
+ *
76
+ * at t4, vrf4 is bound to table 101, and shared_tables = 2.
77
+ *
78
+ * Looking at the value of shared_tables we can immediately know if
79
+ * the strict_mode can or cannot be enforced. Indeed, strict_mode
80
+ * can be enforced iff shared_tables = 0.
81
+ *
82
+ * Conversely, shared_tables is decreased when a vrf is de-associated
83
+ * from a table with exactly two associated vrfs.
84
+ */
85
+ u32 shared_tables;
86
+
87
+ bool strict_mode;
88
+};
89
+
90
+struct vrf_map_elem {
91
+ struct hlist_node hnode;
92
+ struct list_head vrf_list; /* VRFs registered to this table */
93
+
94
+ u32 table_id;
95
+ int users;
96
+ int ifindex;
97
+};
98
+
4699 static unsigned int vrf_net_id;
100
+
101
+/* per netns vrf data */
102
+struct netns_vrf {
103
+ /* protected by rtnl lock */
104
+ bool add_fib_rules;
105
+
106
+ struct vrf_map vmap;
107
+ struct ctl_table_header *ctl_hdr;
108
+};
47109
48110 struct net_vrf {
49111 struct rtable __rcu *rth;
....@@ -52,6 +114,9 @@
52114 struct fib6_table *fib6_table;
53115 #endif
54116 u32 tb_id;
117
+
118
+ struct list_head me_list; /* entry in vrf_map_elem */
119
+ int ifindex;
55120 };
56121
57122 struct pcpu_dstats {
....@@ -107,6 +172,214 @@
107172 }
108173 }
109174
175
+static struct vrf_map *netns_vrf_map(struct net *net)
176
+{
177
+ struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
178
+
179
+ return &nn_vrf->vmap;
180
+}
181
+
182
+static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
183
+{
184
+ return netns_vrf_map(dev_net(dev));
185
+}
186
+
187
+static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
188
+{
189
+ struct list_head *me_head = &me->vrf_list;
190
+ struct net_vrf *vrf;
191
+
192
+ if (list_empty(me_head))
193
+ return -ENODEV;
194
+
195
+ vrf = list_first_entry(me_head, struct net_vrf, me_list);
196
+
197
+ return vrf->ifindex;
198
+}
199
+
200
+static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
201
+{
202
+ struct vrf_map_elem *me;
203
+
204
+ me = kmalloc(sizeof(*me), flags);
205
+ if (!me)
206
+ return NULL;
207
+
208
+ return me;
209
+}
210
+
211
+static void vrf_map_elem_free(struct vrf_map_elem *me)
212
+{
213
+ kfree(me);
214
+}
215
+
216
+static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id,
217
+ int ifindex, int users)
218
+{
219
+ me->table_id = table_id;
220
+ me->ifindex = ifindex;
221
+ me->users = users;
222
+ INIT_LIST_HEAD(&me->vrf_list);
223
+}
224
+
225
+static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap,
226
+ u32 table_id)
227
+{
228
+ struct vrf_map_elem *me;
229
+ u32 key;
230
+
231
+ key = jhash_1word(table_id, HASH_INITVAL);
232
+ hash_for_each_possible(vmap->ht, me, hnode, key) {
233
+ if (me->table_id == table_id)
234
+ return me;
235
+ }
236
+
237
+ return NULL;
238
+}
239
+
240
+static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me)
241
+{
242
+ u32 table_id = me->table_id;
243
+ u32 key;
244
+
245
+ key = jhash_1word(table_id, HASH_INITVAL);
246
+ hash_add(vmap->ht, &me->hnode, key);
247
+}
248
+
249
+static void vrf_map_del_elem(struct vrf_map_elem *me)
250
+{
251
+ hash_del(&me->hnode);
252
+}
253
+
254
+static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
255
+{
256
+ spin_lock(&vmap->vmap_lock);
257
+}
258
+
259
+static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
260
+{
261
+ spin_unlock(&vmap->vmap_lock);
262
+}
263
+
264
+/* called with rtnl lock held */
265
+static int
266
+vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
267
+{
268
+ struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
269
+ struct net_vrf *vrf = netdev_priv(dev);
270
+ struct vrf_map_elem *new_me, *me;
271
+ u32 table_id = vrf->tb_id;
272
+ bool free_new_me = false;
273
+ int users;
274
+ int res;
275
+
276
+ /* we pre-allocate elements used in the spin-locked section (so that we
277
+ * keep the spinlock as short as possible).
278
+ */
279
+ new_me = vrf_map_elem_alloc(GFP_KERNEL);
280
+ if (!new_me)
281
+ return -ENOMEM;
282
+
283
+ vrf_map_elem_init(new_me, table_id, dev->ifindex, 0);
284
+
285
+ vrf_map_lock(vmap);
286
+
287
+ me = vrf_map_lookup_elem(vmap, table_id);
288
+ if (!me) {
289
+ me = new_me;
290
+ vrf_map_add_elem(vmap, me);
291
+ goto link_vrf;
292
+ }
293
+
294
+ /* we already have an entry in the vrf_map, so it means there is (at
295
+ * least) a vrf registered on the specific table.
296
+ */
297
+ free_new_me = true;
298
+ if (vmap->strict_mode) {
299
+ /* vrfs cannot share the same table */
300
+ NL_SET_ERR_MSG(extack, "Table is used by another VRF");
301
+ res = -EBUSY;
302
+ goto unlock;
303
+ }
304
+
305
+link_vrf:
306
+ users = ++me->users;
307
+ if (users == 2)
308
+ ++vmap->shared_tables;
309
+
310
+ list_add(&vrf->me_list, &me->vrf_list);
311
+
312
+ res = 0;
313
+
314
+unlock:
315
+ vrf_map_unlock(vmap);
316
+
317
+ /* clean-up, if needed */
318
+ if (free_new_me)
319
+ vrf_map_elem_free(new_me);
320
+
321
+ return res;
322
+}
323
+
324
+/* called with rtnl lock held */
325
+static void vrf_map_unregister_dev(struct net_device *dev)
326
+{
327
+ struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
328
+ struct net_vrf *vrf = netdev_priv(dev);
329
+ u32 table_id = vrf->tb_id;
330
+ struct vrf_map_elem *me;
331
+ int users;
332
+
333
+ vrf_map_lock(vmap);
334
+
335
+ me = vrf_map_lookup_elem(vmap, table_id);
336
+ if (!me)
337
+ goto unlock;
338
+
339
+ list_del(&vrf->me_list);
340
+
341
+ users = --me->users;
342
+ if (users == 1) {
343
+ --vmap->shared_tables;
344
+ } else if (users == 0) {
345
+ vrf_map_del_elem(me);
346
+
347
+ /* no one will refer to this element anymore */
348
+ vrf_map_elem_free(me);
349
+ }
350
+
351
+unlock:
352
+ vrf_map_unlock(vmap);
353
+}
354
+
355
+/* return the vrf device index associated with the table_id */
356
+static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
357
+{
358
+ struct vrf_map *vmap = netns_vrf_map(net);
359
+ struct vrf_map_elem *me;
360
+ int ifindex;
361
+
362
+ vrf_map_lock(vmap);
363
+
364
+ if (!vmap->strict_mode) {
365
+ ifindex = -EPERM;
366
+ goto unlock;
367
+ }
368
+
369
+ me = vrf_map_lookup_elem(vmap, table_id);
370
+ if (!me) {
371
+ ifindex = -ENODEV;
372
+ goto unlock;
373
+ }
374
+
375
+ ifindex = vrf_map_elem_get_vrf_ifindex(me);
376
+
377
+unlock:
378
+ vrf_map_unlock(vmap);
379
+
380
+ return ifindex;
381
+}
382
+
110383 /* by default VRF devices do not have a qdisc and are expected
111384 * to be created with only a single queue.
112385 */
....@@ -151,11 +424,25 @@
151424 return NETDEV_TX_OK;
152425 }
153426
427
+static void vrf_nf_set_untracked(struct sk_buff *skb)
428
+{
429
+ if (skb_get_nfct(skb) == 0)
430
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
431
+}
432
+
433
+static void vrf_nf_reset_ct(struct sk_buff *skb)
434
+{
435
+ if (skb_get_nfct(skb) == IP_CT_UNTRACKED)
436
+ nf_reset_ct(skb);
437
+}
438
+
154439 #if IS_ENABLED(CONFIG_IPV6)
155440 static int vrf_ip6_local_out(struct net *net, struct sock *sk,
156441 struct sk_buff *skb)
157442 {
158443 int err;
444
+
445
+ vrf_nf_reset_ct(skb);
159446
160447 err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
161448 sk, skb, NULL, skb_dst(skb)->dev, dst_output);
....@@ -236,6 +523,8 @@
236523 struct sk_buff *skb)
237524 {
238525 int err;
526
+
527
+ vrf_nf_reset_ct(skb);
239528
240529 err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
241530 skb, NULL, skb_dst(skb)->dev, dst_output);
....@@ -357,8 +646,7 @@
357646 skb_pull(skb, ETH_HLEN);
358647 }
359648
360
- /* reset skb device */
361
- nf_reset(skb);
649
+ vrf_nf_reset_ct(skb);
362650 }
363651
364652 #if IS_ENABLED(CONFIG_IPV6)
....@@ -368,11 +656,11 @@
368656 {
369657 struct dst_entry *dst = skb_dst(skb);
370658 struct net_device *dev = dst->dev;
659
+ const struct in6_addr *nexthop;
371660 struct neighbour *neigh;
372
- struct in6_addr *nexthop;
373661 int ret;
374662
375
- nf_reset(skb);
663
+ vrf_nf_reset_ct(skb);
376664
377665 skb->protocol = htons(ETH_P_IPV6);
378666 skb->dev = dev;
....@@ -384,7 +672,7 @@
384672 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
385673 if (!IS_ERR(neigh)) {
386674 sock_confirm_neigh(skb, neigh);
387
- ret = neigh_output(neigh, skb);
675
+ ret = neigh_output(neigh, skb, false);
388676 rcu_read_unlock_bh();
389677 return ret;
390678 }
....@@ -503,6 +791,8 @@
503791 if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
504792 return skb;
505793
794
+ vrf_nf_set_untracked(skb);
795
+
506796 if (qdisc_tx_is_default(vrf_dev) ||
507797 IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
508798 return vrf_ip6_out_direct(vrf_dev, sk, skb);
....@@ -534,7 +824,7 @@
534824
535825 static int vrf_rt6_create(struct net_device *dev)
536826 {
537
- int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
827
+ int flags = DST_NOPOLICY | DST_NOXFRM;
538828 struct net_vrf *vrf = netdev_priv(dev);
539829 struct net *net = dev_net(dev);
540830 struct rt6_info *rt6;
....@@ -587,10 +877,10 @@
587877 struct net_device *dev = dst->dev;
588878 unsigned int hh_len = LL_RESERVED_SPACE(dev);
589879 struct neighbour *neigh;
590
- u32 nexthop;
880
+ bool is_v6gw = false;
591881 int ret = -EINVAL;
592882
593
- nf_reset(skb);
883
+ vrf_nf_reset_ct(skb);
594884
595885 /* Be paranoid, rather than too clever. */
596886 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
....@@ -610,13 +900,11 @@
610900
611901 rcu_read_lock_bh();
612902
613
- nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
614
- neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
615
- if (unlikely(!neigh))
616
- neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
903
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
617904 if (!IS_ERR(neigh)) {
618905 sock_confirm_neigh(skb, neigh);
619
- ret = neigh_output(neigh, skb);
906
+ /* if crossing protocols, cannot use the cached header */
907
+ ret = neigh_output(neigh, skb, is_v6gw);
620908 rcu_read_unlock_bh();
621909 return ret;
622910 }
....@@ -741,6 +1029,8 @@
7411029 ipv4_is_lbcast(ip_hdr(skb)->daddr))
7421030 return skb;
7431031
1032
+ vrf_nf_set_untracked(skb);
1033
+
7441034 if (qdisc_tx_is_default(vrf_dev) ||
7451035 IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
7461036 return vrf_ip_out_direct(vrf_dev, sk, skb);
....@@ -795,7 +1085,7 @@
7951085 return -ENOMEM;
7961086
7971087 /* create a dst for routing packets out through a VRF device */
798
- rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
1088
+ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1);
7991089 if (!rth)
8001090 return -ENOMEM;
8011091
....@@ -809,7 +1099,8 @@
8091099 /**************************** device handling ********************/
8101100
8111101 /* cycle interface to flush neighbor cache and move routes across tables */
812
-static void cycle_netdev(struct net_device *dev)
1102
+static void cycle_netdev(struct net_device *dev,
1103
+ struct netlink_ext_ack *extack)
8131104 {
8141105 unsigned int flags = dev->flags;
8151106 int ret;
....@@ -817,9 +1108,9 @@
8171108 if (!netif_running(dev))
8181109 return;
8191110
820
- ret = dev_change_flags(dev, flags & ~IFF_UP);
1111
+ ret = dev_change_flags(dev, flags & ~IFF_UP, extack);
8211112 if (ret >= 0)
822
- ret = dev_change_flags(dev, flags);
1113
+ ret = dev_change_flags(dev, flags, extack);
8231114
8241115 if (ret < 0) {
8251116 netdev_err(dev,
....@@ -847,7 +1138,7 @@
8471138 if (ret < 0)
8481139 goto err;
8491140
850
- cycle_netdev(port_dev);
1141
+ cycle_netdev(port_dev, extack);
8511142
8521143 return 0;
8531144
....@@ -877,7 +1168,7 @@
8771168 netdev_upper_dev_unlink(port_dev, dev);
8781169 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
8791170
880
- cycle_netdev(port_dev);
1171
+ cycle_netdev(port_dev, NULL);
8811172
8821173 return 0;
8831174 }
....@@ -915,9 +1206,6 @@
9151206
9161207 dev->flags = IFF_MASTER | IFF_NOARP;
9171208
918
- /* MTU is irrelevant for VRF device; set to 64k similar to lo */
919
- dev->mtu = 64 * 1024;
920
-
9211209 /* similarly, oper state is irrelevant; set to up to avoid confusion */
9221210 dev->operstate = IF_OPER_UP;
9231211 netdev_lockdep_set_classes(dev);
....@@ -936,6 +1224,7 @@
9361224 .ndo_init = vrf_dev_init,
9371225 .ndo_uninit = vrf_dev_uninit,
9381226 .ndo_start_xmit = vrf_xmit,
1227
+ .ndo_set_mac_address = eth_mac_addr,
9391228 .ndo_get_stats64 = vrf_get_stats64,
9401229 .ndo_add_slave = vrf_add_slave,
9411230 .ndo_del_slave = vrf_del_slave,
....@@ -1043,24 +1332,29 @@
10431332 struct sk_buff *skb)
10441333 {
10451334 int orig_iif = skb->skb_iif;
1046
- bool need_strict;
1335
+ bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
1336
+ bool is_ndisc = ipv6_ndisc_frame(skb);
10471337
1048
- /* loopback traffic; do not push through packet taps again.
1049
- * Reset pkt_type for upper layers to process skb
1338
+ /* loopback, multicast & non-ND link-local traffic; do not push through
1339
+ * packet taps again. Reset pkt_type for upper layers to process skb.
1340
+ * For strict packets with a source LLA, determine the dst using the
1341
+ * original ifindex.
10501342 */
1051
- if (skb->pkt_type == PACKET_LOOPBACK) {
1343
+ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
10521344 skb->dev = vrf_dev;
10531345 skb->skb_iif = vrf_dev->ifindex;
10541346 IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
1055
- skb->pkt_type = PACKET_HOST;
1347
+
1348
+ if (skb->pkt_type == PACKET_LOOPBACK)
1349
+ skb->pkt_type = PACKET_HOST;
1350
+ else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)
1351
+ vrf_ip6_input_dst(skb, vrf_dev, orig_iif);
1352
+
10561353 goto out;
10571354 }
10581355
1059
- /* if packet is NDISC or addressed to multicast or link-local
1060
- * then keep the ingress interface
1061
- */
1062
- need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
1063
- if (!ipv6_ndisc_frame(skb) && !need_strict) {
1356
+ /* if packet is NDISC then keep the ingress interface */
1357
+ if (!is_ndisc) {
10641358 vrf_rx_stats(vrf_dev, skb->len);
10651359 skb->dev = vrf_dev;
10661360 skb->skb_iif = vrf_dev->ifindex;
....@@ -1139,12 +1433,14 @@
11391433 #if IS_ENABLED(CONFIG_IPV6)
11401434 /* send to link-local or multicast address via interface enslaved to
11411435 * VRF device. Force lookup to VRF table without changing flow struct.
1436
+ * Note: Caller to this function must hold rcu_read_lock() and no refcnt
1437
+ * is taken on the dst by this function.
11421438 */
11431439 static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
11441440 struct flowi6 *fl6)
11451441 {
11461442 struct net *net = dev_net(dev);
1147
- int flags = RT6_LOOKUP_F_IFACE;
1443
+ int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
11481444 struct dst_entry *dst = NULL;
11491445 struct rt6_info *rt;
11501446
....@@ -1154,7 +1450,6 @@
11541450 */
11551451 if (fl6->flowi6_oif == dev->ifindex) {
11561452 dst = &net->ipv6.ip6_null_entry->dst;
1157
- dst_hold(dst);
11581453 return dst;
11591454 }
11601455
....@@ -1208,7 +1503,8 @@
12081503 struct sk_buff *skb;
12091504 int err;
12101505
1211
- if (family == AF_INET6 && !ipv6_mod_enabled())
1506
+ if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
1507
+ !ipv6_mod_enabled())
12121508 return 0;
12131509
12141510 skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
....@@ -1277,7 +1573,18 @@
12771573 goto ipmr_err;
12781574 #endif
12791575
1576
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1577
+ err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
1578
+ if (err < 0)
1579
+ goto ip6mr_err;
1580
+#endif
1581
+
12801582 return 0;
1583
+
1584
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1585
+ip6mr_err:
1586
+ vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false);
1587
+#endif
12811588
12821589 #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
12831590 ipmr_err:
....@@ -1325,6 +1632,15 @@
13251632 /* default to no qdisc; user can add if desired */
13261633 dev->priv_flags |= IFF_NO_QUEUE;
13271634 dev->priv_flags |= IFF_NO_RX_HANDLER;
1635
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1636
+
1637
+ /* VRF devices do not care about MTU, but if the MTU is set
1638
+ * too low then the ipv4 and ipv6 protocols are disabled
1639
+ * which breaks networking.
1640
+ */
1641
+ dev->min_mtu = IPV6_MIN_MTU;
1642
+ dev->max_mtu = IP6_MAX_MTU;
1643
+ dev->mtu = dev->max_mtu;
13281644 }
13291645
13301646 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
....@@ -1351,6 +1667,8 @@
13511667 netdev_for_each_lower_dev(dev, port_dev, iter)
13521668 vrf_del_slave(dev, port_dev);
13531669
1670
+ vrf_map_unregister_dev(dev);
1671
+
13541672 unregister_netdevice_queue(dev, head);
13551673 }
13561674
....@@ -1359,6 +1677,7 @@
13591677 struct netlink_ext_ack *extack)
13601678 {
13611679 struct net_vrf *vrf = netdev_priv(dev);
1680
+ struct netns_vrf *nn_vrf;
13621681 bool *add_fib_rules;
13631682 struct net *net;
13641683 int err;
....@@ -1381,11 +1700,26 @@
13811700 if (err)
13821701 goto out;
13831702
1703
+ /* mapping between table_id and vrf;
1704
+ * note: such binding could not be done in the dev init function
1705
+ * because dev->ifindex id is not available yet.
1706
+ */
1707
+ vrf->ifindex = dev->ifindex;
1708
+
1709
+ err = vrf_map_register_dev(dev, extack);
1710
+ if (err) {
1711
+ unregister_netdevice(dev);
1712
+ goto out;
1713
+ }
1714
+
13841715 net = dev_net(dev);
1385
- add_fib_rules = net_generic(net, vrf_net_id);
1716
+ nn_vrf = net_generic(net, vrf_net_id);
1717
+
1718
+ add_fib_rules = &nn_vrf->add_fib_rules;
13861719 if (*add_fib_rules) {
13871720 err = vrf_add_fib_rules(dev);
13881721 if (err) {
1722
+ vrf_map_unregister_dev(dev);
13891723 unregister_netdevice(dev);
13901724 goto out;
13911725 }
....@@ -1472,20 +1806,164 @@
14721806 .notifier_call = vrf_device_event,
14731807 };
14741808
1475
-/* Initialize per network namespace state */
1476
-static int __net_init vrf_netns_init(struct net *net)
1809
+static int vrf_map_init(struct vrf_map *vmap)
14771810 {
1478
- bool *add_fib_rules = net_generic(net, vrf_net_id);
1811
+ spin_lock_init(&vmap->vmap_lock);
1812
+ hash_init(vmap->ht);
14791813
1480
- *add_fib_rules = true;
1814
+ vmap->strict_mode = false;
14811815
14821816 return 0;
14831817 }
14841818
1819
+#ifdef CONFIG_SYSCTL
1820
+static bool vrf_strict_mode(struct vrf_map *vmap)
1821
+{
1822
+ bool strict_mode;
1823
+
1824
+ vrf_map_lock(vmap);
1825
+ strict_mode = vmap->strict_mode;
1826
+ vrf_map_unlock(vmap);
1827
+
1828
+ return strict_mode;
1829
+}
1830
+
1831
+static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
1832
+{
1833
+ bool *cur_mode;
1834
+ int res = 0;
1835
+
1836
+ vrf_map_lock(vmap);
1837
+
1838
+ cur_mode = &vmap->strict_mode;
1839
+ if (*cur_mode == new_mode)
1840
+ goto unlock;
1841
+
1842
+ if (*cur_mode) {
1843
+ /* disable strict mode */
1844
+ *cur_mode = false;
1845
+ } else {
1846
+ if (vmap->shared_tables) {
1847
+ /* we cannot allow strict_mode because there are some
1848
+ * vrfs that share one or more tables.
1849
+ */
1850
+ res = -EBUSY;
1851
+ goto unlock;
1852
+ }
1853
+
1854
+ /* no tables are shared among vrfs, so we can go back
1855
+ * to a 1:1 association between a vrf and its table.
1856
+ */
1857
+ *cur_mode = true;
1858
+ }
1859
+
1860
+unlock:
1861
+ vrf_map_unlock(vmap);
1862
+
1863
+ return res;
1864
+}
1865
+
1866
+static int vrf_shared_table_handler(struct ctl_table *table, int write,
1867
+ void *buffer, size_t *lenp, loff_t *ppos)
1868
+{
1869
+ struct net *net = (struct net *)table->extra1;
1870
+ struct vrf_map *vmap = netns_vrf_map(net);
1871
+ int proc_strict_mode = 0;
1872
+ struct ctl_table tmp = {
1873
+ .procname = table->procname,
1874
+ .data = &proc_strict_mode,
1875
+ .maxlen = sizeof(int),
1876
+ .mode = table->mode,
1877
+ .extra1 = SYSCTL_ZERO,
1878
+ .extra2 = SYSCTL_ONE,
1879
+ };
1880
+ int ret;
1881
+
1882
+ if (!write)
1883
+ proc_strict_mode = vrf_strict_mode(vmap);
1884
+
1885
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
1886
+
1887
+ if (write && ret == 0)
1888
+ ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);
1889
+
1890
+ return ret;
1891
+}
1892
+
1893
+static const struct ctl_table vrf_table[] = {
1894
+ {
1895
+ .procname = "strict_mode",
1896
+ .data = NULL,
1897
+ .maxlen = sizeof(int),
1898
+ .mode = 0644,
1899
+ .proc_handler = vrf_shared_table_handler,
1900
+ /* set by the vrf_netns_init */
1901
+ .extra1 = NULL,
1902
+ },
1903
+ { },
1904
+};
1905
+
1906
+static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
1907
+{
1908
+ struct ctl_table *table;
1909
+
1910
+ table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
1911
+ if (!table)
1912
+ return -ENOMEM;
1913
+
1914
+ /* init the extra1 parameter with the reference to current netns */
1915
+ table[0].extra1 = net;
1916
+
1917
+ nn_vrf->ctl_hdr = register_net_sysctl(net, "net/vrf", table);
1918
+ if (!nn_vrf->ctl_hdr) {
1919
+ kfree(table);
1920
+ return -ENOMEM;
1921
+ }
1922
+
1923
+ return 0;
1924
+}
1925
+
1926
+static void vrf_netns_exit_sysctl(struct net *net)
1927
+{
1928
+ struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
1929
+ struct ctl_table *table;
1930
+
1931
+ table = nn_vrf->ctl_hdr->ctl_table_arg;
1932
+ unregister_net_sysctl_table(nn_vrf->ctl_hdr);
1933
+ kfree(table);
1934
+}
1935
+#else
1936
+static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
1937
+{
1938
+ return 0;
1939
+}
1940
+
1941
+static void vrf_netns_exit_sysctl(struct net *net)
1942
+{
1943
+}
1944
+#endif
1945
+
1946
+/* Initialize per network namespace state */
1947
+static int __net_init vrf_netns_init(struct net *net)
1948
+{
1949
+ struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
1950
+
1951
+ nn_vrf->add_fib_rules = true;
1952
+ vrf_map_init(&nn_vrf->vmap);
1953
+
1954
+ return vrf_netns_init_sysctl(net, nn_vrf);
1955
+}
1956
+
1957
+static void __net_exit vrf_netns_exit(struct net *net)
1958
+{
1959
+ vrf_netns_exit_sysctl(net);
1960
+}
1961
+
14851962 static struct pernet_operations vrf_net_ops __net_initdata = {
14861963 .init = vrf_netns_init,
1964
+ .exit = vrf_netns_exit,
14871965 .id = &vrf_net_id,
1488
- .size = sizeof(bool),
1966
+ .size = sizeof(struct netns_vrf),
14891967 };
14901968
14911969 static int __init vrf_init_module(void)
....@@ -1498,14 +1976,24 @@
14981976 if (rc < 0)
14991977 goto error;
15001978
1979
+ rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
1980
+ vrf_ifindex_lookup_by_table_id);
1981
+ if (rc < 0)
1982
+ goto unreg_pernet;
1983
+
15011984 rc = rtnl_link_register(&vrf_link_ops);
1502
- if (rc < 0) {
1503
- unregister_pernet_subsys(&vrf_net_ops);
1504
- goto error;
1505
- }
1985
+ if (rc < 0)
1986
+ goto table_lookup_unreg;
15061987
15071988 return 0;
15081989
1990
+table_lookup_unreg:
1991
+ l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
1992
+ vrf_ifindex_lookup_by_table_id);
1993
+
1994
+unreg_pernet:
1995
+ unregister_pernet_subsys(&vrf_net_ops);
1996
+
15091997 error:
15101998 unregister_netdevice_notifier(&vrf_notifier_block);
15111999 return rc;