.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * vrf.c: device driver to encapsulate a VRF space |
---|
3 | 4 | * |
---|
.. | .. |
---|
6 | 7 | * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> |
---|
7 | 8 | * |
---|
8 | 9 | * Based on dummy, team and ipvlan drivers |
---|
9 | | - * |
---|
10 | | - * This program is free software; you can redistribute it and/or modify |
---|
11 | | - * it under the terms of the GNU General Public License as published by |
---|
12 | | - * the Free Software Foundation; either version 2 of the License, or |
---|
13 | | - * (at your option) any later version. |
---|
14 | 10 | */ |
---|
15 | 11 | |
---|
16 | 12 | #include <linux/module.h> |
---|
.. | .. |
---|
25 | 21 | #include <net/rtnetlink.h> |
---|
26 | 22 | #include <linux/u64_stats_sync.h> |
---|
27 | 23 | #include <linux/hashtable.h> |
---|
| 24 | +#include <linux/spinlock_types.h> |
---|
28 | 25 | |
---|
29 | 26 | #include <linux/inetdevice.h> |
---|
30 | 27 | #include <net/arp.h> |
---|
.. | .. |
---|
37 | 34 | #include <net/l3mdev.h> |
---|
38 | 35 | #include <net/fib_rules.h> |
---|
39 | 36 | #include <net/netns/generic.h> |
---|
| 37 | +#include <net/netfilter/nf_conntrack.h> |
---|
40 | 38 | |
---|
41 | 39 | #define DRV_NAME "vrf" |
---|
42 | | -#define DRV_VERSION "1.0" |
---|
| 40 | +#define DRV_VERSION "1.1" |
---|
43 | 41 | |
---|
44 | 42 | #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ |
---|
45 | 43 | |
---|
| 44 | +#define HT_MAP_BITS 4 |
---|
| 45 | +#define HASH_INITVAL ((u32)0xcafef00d) |
---|
| 46 | + |
---|
| 47 | +struct vrf_map { |
---|
| 48 | + DECLARE_HASHTABLE(ht, HT_MAP_BITS); |
---|
| 49 | + spinlock_t vmap_lock; |
---|
| 50 | + |
---|
| 51 | + /* shared_tables: |
---|
| 52 | + * count how many distinct tables do not comply with the strict mode |
---|
| 53 | + * requirement. |
---|
| 54 | + * shared_tables value must be 0 in order to enable the strict mode. |
---|
| 55 | + * |
---|
| 56 | + * example of the evolution of shared_tables: |
---|
| 57 | + * | time |
---|
| 58 | + * add vrf0 --> table 100 shared_tables = 0 | t0 |
---|
| 59 | + * add vrf1 --> table 101 shared_tables = 0 | t1 |
---|
| 60 | + * add vrf2 --> table 100 shared_tables = 1 | t2 |
---|
| 61 | + * add vrf3 --> table 100 shared_tables = 1 | t3 |
---|
| 62 | + * add vrf4 --> table 101 shared_tables = 2 v t4 |
---|
| 63 | + * |
---|
| 64 | + * shared_tables is a "step function" (or "staircase function") |
---|
| 65 | + * and it is increased by one when the second vrf is associated to a |
---|
| 66 | + * table. |
---|
| 67 | + * |
---|
| 68 | + * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. |
---|
| 69 | + * |
---|
| 70 | + * at t3, another dev (vrf3) is bound to the same table 100 but the |
---|
| 71 | + * value of shared_tables is still 1. |
---|
| 72 | + * This means that no matter how many new vrfs will register on the |
---|
| 73 | + * table 100, the shared_tables will not increase (considering only |
---|
| 74 | + * table 100). |
---|
| 75 | + * |
---|
| 76 | + * at t4, vrf4 is bound to table 101, and shared_tables = 2. |
---|
| 77 | + * |
---|
| 78 | + * Looking at the value of shared_tables we can immediately know if |
---|
| 79 | + * the strict_mode can or cannot be enforced. Indeed, strict_mode |
---|
| 80 | + * can be enforced iff shared_tables = 0. |
---|
| 81 | + * |
---|
| 82 | + * Conversely, shared_tables is decreased when a vrf is de-associated |
---|
| 83 | + * from a table with exactly two associated vrfs. |
---|
| 84 | + */ |
---|
| 85 | + u32 shared_tables; |
---|
| 86 | + |
---|
| 87 | + bool strict_mode; |
---|
| 88 | +}; |
---|
| 89 | + |
---|
| 90 | +struct vrf_map_elem { |
---|
| 91 | + struct hlist_node hnode; |
---|
| 92 | + struct list_head vrf_list; /* VRFs registered to this table */ |
---|
| 93 | + |
---|
| 94 | + u32 table_id; |
---|
| 95 | + int users; |
---|
| 96 | + int ifindex; |
---|
| 97 | +}; |
---|
| 98 | + |
---|
46 | 99 | static unsigned int vrf_net_id; |
---|
| 100 | + |
---|
| 101 | +/* per netns vrf data */ |
---|
| 102 | +struct netns_vrf { |
---|
| 103 | + /* protected by rtnl lock */ |
---|
| 104 | + bool add_fib_rules; |
---|
| 105 | + |
---|
| 106 | + struct vrf_map vmap; |
---|
| 107 | + struct ctl_table_header *ctl_hdr; |
---|
| 108 | +}; |
---|
47 | 109 | |
---|
48 | 110 | struct net_vrf { |
---|
49 | 111 | struct rtable __rcu *rth; |
---|
.. | .. |
---|
52 | 114 | struct fib6_table *fib6_table; |
---|
53 | 115 | #endif |
---|
54 | 116 | u32 tb_id; |
---|
| 117 | + |
---|
| 118 | + struct list_head me_list; /* entry in vrf_map_elem */ |
---|
| 119 | + int ifindex; |
---|
55 | 120 | }; |
---|
56 | 121 | |
---|
57 | 122 | struct pcpu_dstats { |
---|
.. | .. |
---|
107 | 172 | } |
---|
108 | 173 | } |
---|
109 | 174 | |
---|
| 175 | +static struct vrf_map *netns_vrf_map(struct net *net) |
---|
| 176 | +{ |
---|
| 177 | + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); |
---|
| 178 | + |
---|
| 179 | + return &nn_vrf->vmap; |
---|
| 180 | +} |
---|
| 181 | + |
---|
/* Return the vrf_map of the network namespace that @dev belongs to */
static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	return netns_vrf_map(net);
}
---|
| 186 | + |
---|
| 187 | +static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) |
---|
| 188 | +{ |
---|
| 189 | + struct list_head *me_head = &me->vrf_list; |
---|
| 190 | + struct net_vrf *vrf; |
---|
| 191 | + |
---|
| 192 | + if (list_empty(me_head)) |
---|
| 193 | + return -ENODEV; |
---|
| 194 | + |
---|
| 195 | + vrf = list_first_entry(me_head, struct net_vrf, me_list); |
---|
| 196 | + |
---|
| 197 | + return vrf->ifindex; |
---|
| 198 | +} |
---|
| 199 | + |
---|
| 200 | +static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) |
---|
| 201 | +{ |
---|
| 202 | + struct vrf_map_elem *me; |
---|
| 203 | + |
---|
| 204 | + me = kmalloc(sizeof(*me), flags); |
---|
| 205 | + if (!me) |
---|
| 206 | + return NULL; |
---|
| 207 | + |
---|
| 208 | + return me; |
---|
| 209 | +} |
---|
| 210 | + |
---|
/* Release a map element obtained from vrf_map_elem_alloc() */
static void vrf_map_elem_free(struct vrf_map_elem *me)
{
	kfree(me);
}
---|
| 215 | + |
---|
| 216 | +static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, |
---|
| 217 | + int ifindex, int users) |
---|
| 218 | +{ |
---|
| 219 | + me->table_id = table_id; |
---|
| 220 | + me->ifindex = ifindex; |
---|
| 221 | + me->users = users; |
---|
| 222 | + INIT_LIST_HEAD(&me->vrf_list); |
---|
| 223 | +} |
---|
| 224 | + |
---|
| 225 | +static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, |
---|
| 226 | + u32 table_id) |
---|
| 227 | +{ |
---|
| 228 | + struct vrf_map_elem *me; |
---|
| 229 | + u32 key; |
---|
| 230 | + |
---|
| 231 | + key = jhash_1word(table_id, HASH_INITVAL); |
---|
| 232 | + hash_for_each_possible(vmap->ht, me, hnode, key) { |
---|
| 233 | + if (me->table_id == table_id) |
---|
| 234 | + return me; |
---|
| 235 | + } |
---|
| 236 | + |
---|
| 237 | + return NULL; |
---|
| 238 | +} |
---|
| 239 | + |
---|
| 240 | +static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) |
---|
| 241 | +{ |
---|
| 242 | + u32 table_id = me->table_id; |
---|
| 243 | + u32 key; |
---|
| 244 | + |
---|
| 245 | + key = jhash_1word(table_id, HASH_INITVAL); |
---|
| 246 | + hash_add(vmap->ht, &me->hnode, key); |
---|
| 247 | +} |
---|
| 248 | + |
---|
| 249 | +static void vrf_map_del_elem(struct vrf_map_elem *me) |
---|
| 250 | +{ |
---|
| 251 | + hash_del(&me->hnode); |
---|
| 252 | +} |
---|
| 253 | + |
---|
| 254 | +static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) |
---|
| 255 | +{ |
---|
| 256 | + spin_lock(&vmap->vmap_lock); |
---|
| 257 | +} |
---|
| 258 | + |
---|
| 259 | +static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) |
---|
| 260 | +{ |
---|
| 261 | + spin_unlock(&vmap->vmap_lock); |
---|
| 262 | +} |
---|
| 263 | + |
---|
| 264 | +/* called with rtnl lock held */ |
---|
| 265 | +static int |
---|
| 266 | +vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) |
---|
| 267 | +{ |
---|
| 268 | + struct vrf_map *vmap = netns_vrf_map_by_dev(dev); |
---|
| 269 | + struct net_vrf *vrf = netdev_priv(dev); |
---|
| 270 | + struct vrf_map_elem *new_me, *me; |
---|
| 271 | + u32 table_id = vrf->tb_id; |
---|
| 272 | + bool free_new_me = false; |
---|
| 273 | + int users; |
---|
| 274 | + int res; |
---|
| 275 | + |
---|
| 276 | + /* we pre-allocate elements used in the spin-locked section (so that we |
---|
| 277 | + * keep the spinlock as short as possibile). |
---|
| 278 | + */ |
---|
| 279 | + new_me = vrf_map_elem_alloc(GFP_KERNEL); |
---|
| 280 | + if (!new_me) |
---|
| 281 | + return -ENOMEM; |
---|
| 282 | + |
---|
| 283 | + vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); |
---|
| 284 | + |
---|
| 285 | + vrf_map_lock(vmap); |
---|
| 286 | + |
---|
| 287 | + me = vrf_map_lookup_elem(vmap, table_id); |
---|
| 288 | + if (!me) { |
---|
| 289 | + me = new_me; |
---|
| 290 | + vrf_map_add_elem(vmap, me); |
---|
| 291 | + goto link_vrf; |
---|
| 292 | + } |
---|
| 293 | + |
---|
| 294 | + /* we already have an entry in the vrf_map, so it means there is (at |
---|
| 295 | + * least) a vrf registered on the specific table. |
---|
| 296 | + */ |
---|
| 297 | + free_new_me = true; |
---|
| 298 | + if (vmap->strict_mode) { |
---|
| 299 | + /* vrfs cannot share the same table */ |
---|
| 300 | + NL_SET_ERR_MSG(extack, "Table is used by another VRF"); |
---|
| 301 | + res = -EBUSY; |
---|
| 302 | + goto unlock; |
---|
| 303 | + } |
---|
| 304 | + |
---|
| 305 | +link_vrf: |
---|
| 306 | + users = ++me->users; |
---|
| 307 | + if (users == 2) |
---|
| 308 | + ++vmap->shared_tables; |
---|
| 309 | + |
---|
| 310 | + list_add(&vrf->me_list, &me->vrf_list); |
---|
| 311 | + |
---|
| 312 | + res = 0; |
---|
| 313 | + |
---|
| 314 | +unlock: |
---|
| 315 | + vrf_map_unlock(vmap); |
---|
| 316 | + |
---|
| 317 | + /* clean-up, if needed */ |
---|
| 318 | + if (free_new_me) |
---|
| 319 | + vrf_map_elem_free(new_me); |
---|
| 320 | + |
---|
| 321 | + return res; |
---|
| 322 | +} |
---|
| 323 | + |
---|
| 324 | +/* called with rtnl lock held */ |
---|
| 325 | +static void vrf_map_unregister_dev(struct net_device *dev) |
---|
| 326 | +{ |
---|
| 327 | + struct vrf_map *vmap = netns_vrf_map_by_dev(dev); |
---|
| 328 | + struct net_vrf *vrf = netdev_priv(dev); |
---|
| 329 | + u32 table_id = vrf->tb_id; |
---|
| 330 | + struct vrf_map_elem *me; |
---|
| 331 | + int users; |
---|
| 332 | + |
---|
| 333 | + vrf_map_lock(vmap); |
---|
| 334 | + |
---|
| 335 | + me = vrf_map_lookup_elem(vmap, table_id); |
---|
| 336 | + if (!me) |
---|
| 337 | + goto unlock; |
---|
| 338 | + |
---|
| 339 | + list_del(&vrf->me_list); |
---|
| 340 | + |
---|
| 341 | + users = --me->users; |
---|
| 342 | + if (users == 1) { |
---|
| 343 | + --vmap->shared_tables; |
---|
| 344 | + } else if (users == 0) { |
---|
| 345 | + vrf_map_del_elem(me); |
---|
| 346 | + |
---|
| 347 | + /* no one will refer to this element anymore */ |
---|
| 348 | + vrf_map_elem_free(me); |
---|
| 349 | + } |
---|
| 350 | + |
---|
| 351 | +unlock: |
---|
| 352 | + vrf_map_unlock(vmap); |
---|
| 353 | +} |
---|
| 354 | + |
---|
| 355 | +/* return the vrf device index associated with the table_id */ |
---|
| 356 | +static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) |
---|
| 357 | +{ |
---|
| 358 | + struct vrf_map *vmap = netns_vrf_map(net); |
---|
| 359 | + struct vrf_map_elem *me; |
---|
| 360 | + int ifindex; |
---|
| 361 | + |
---|
| 362 | + vrf_map_lock(vmap); |
---|
| 363 | + |
---|
| 364 | + if (!vmap->strict_mode) { |
---|
| 365 | + ifindex = -EPERM; |
---|
| 366 | + goto unlock; |
---|
| 367 | + } |
---|
| 368 | + |
---|
| 369 | + me = vrf_map_lookup_elem(vmap, table_id); |
---|
| 370 | + if (!me) { |
---|
| 371 | + ifindex = -ENODEV; |
---|
| 372 | + goto unlock; |
---|
| 373 | + } |
---|
| 374 | + |
---|
| 375 | + ifindex = vrf_map_elem_get_vrf_ifindex(me); |
---|
| 376 | + |
---|
| 377 | +unlock: |
---|
| 378 | + vrf_map_unlock(vmap); |
---|
| 379 | + |
---|
| 380 | + return ifindex; |
---|
| 381 | +} |
---|
| 382 | + |
---|
110 | 383 | /* by default VRF devices do not have a qdisc and are expected |
---|
111 | 384 | * to be created with only a single queue. |
---|
112 | 385 | */ |
---|
.. | .. |
---|
151 | 424 | return NETDEV_TX_OK; |
---|
152 | 425 | } |
---|
153 | 426 | |
---|
| 427 | +static void vrf_nf_set_untracked(struct sk_buff *skb) |
---|
| 428 | +{ |
---|
| 429 | + if (skb_get_nfct(skb) == 0) |
---|
| 430 | + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); |
---|
| 431 | +} |
---|
| 432 | + |
---|
| 433 | +static void vrf_nf_reset_ct(struct sk_buff *skb) |
---|
| 434 | +{ |
---|
| 435 | + if (skb_get_nfct(skb) == IP_CT_UNTRACKED) |
---|
| 436 | + nf_reset_ct(skb); |
---|
| 437 | +} |
---|
| 438 | + |
---|
154 | 439 | #if IS_ENABLED(CONFIG_IPV6) |
---|
155 | 440 | static int vrf_ip6_local_out(struct net *net, struct sock *sk, |
---|
156 | 441 | struct sk_buff *skb) |
---|
157 | 442 | { |
---|
158 | 443 | int err; |
---|
| 444 | + |
---|
| 445 | + vrf_nf_reset_ct(skb); |
---|
159 | 446 | |
---|
160 | 447 | err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, |
---|
161 | 448 | sk, skb, NULL, skb_dst(skb)->dev, dst_output); |
---|
.. | .. |
---|
236 | 523 | struct sk_buff *skb) |
---|
237 | 524 | { |
---|
238 | 525 | int err; |
---|
| 526 | + |
---|
| 527 | + vrf_nf_reset_ct(skb); |
---|
239 | 528 | |
---|
240 | 529 | err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, |
---|
241 | 530 | skb, NULL, skb_dst(skb)->dev, dst_output); |
---|
.. | .. |
---|
357 | 646 | skb_pull(skb, ETH_HLEN); |
---|
358 | 647 | } |
---|
359 | 648 | |
---|
360 | | - /* reset skb device */ |
---|
361 | | - nf_reset(skb); |
---|
| 649 | + vrf_nf_reset_ct(skb); |
---|
362 | 650 | } |
---|
363 | 651 | |
---|
364 | 652 | #if IS_ENABLED(CONFIG_IPV6) |
---|
.. | .. |
---|
368 | 656 | { |
---|
369 | 657 | struct dst_entry *dst = skb_dst(skb); |
---|
370 | 658 | struct net_device *dev = dst->dev; |
---|
| 659 | + const struct in6_addr *nexthop; |
---|
371 | 660 | struct neighbour *neigh; |
---|
372 | | - struct in6_addr *nexthop; |
---|
373 | 661 | int ret; |
---|
374 | 662 | |
---|
375 | | - nf_reset(skb); |
---|
| 663 | + vrf_nf_reset_ct(skb); |
---|
376 | 664 | |
---|
377 | 665 | skb->protocol = htons(ETH_P_IPV6); |
---|
378 | 666 | skb->dev = dev; |
---|
.. | .. |
---|
384 | 672 | neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); |
---|
385 | 673 | if (!IS_ERR(neigh)) { |
---|
386 | 674 | sock_confirm_neigh(skb, neigh); |
---|
387 | | - ret = neigh_output(neigh, skb); |
---|
| 675 | + ret = neigh_output(neigh, skb, false); |
---|
388 | 676 | rcu_read_unlock_bh(); |
---|
389 | 677 | return ret; |
---|
390 | 678 | } |
---|
.. | .. |
---|
503 | 791 | if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) |
---|
504 | 792 | return skb; |
---|
505 | 793 | |
---|
| 794 | + vrf_nf_set_untracked(skb); |
---|
| 795 | + |
---|
506 | 796 | if (qdisc_tx_is_default(vrf_dev) || |
---|
507 | 797 | IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) |
---|
508 | 798 | return vrf_ip6_out_direct(vrf_dev, sk, skb); |
---|
.. | .. |
---|
534 | 824 | |
---|
535 | 825 | static int vrf_rt6_create(struct net_device *dev) |
---|
536 | 826 | { |
---|
537 | | - int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM; |
---|
| 827 | + int flags = DST_NOPOLICY | DST_NOXFRM; |
---|
538 | 828 | struct net_vrf *vrf = netdev_priv(dev); |
---|
539 | 829 | struct net *net = dev_net(dev); |
---|
540 | 830 | struct rt6_info *rt6; |
---|
.. | .. |
---|
587 | 877 | struct net_device *dev = dst->dev; |
---|
588 | 878 | unsigned int hh_len = LL_RESERVED_SPACE(dev); |
---|
589 | 879 | struct neighbour *neigh; |
---|
590 | | - u32 nexthop; |
---|
| 880 | + bool is_v6gw = false; |
---|
591 | 881 | int ret = -EINVAL; |
---|
592 | 882 | |
---|
593 | | - nf_reset(skb); |
---|
| 883 | + vrf_nf_reset_ct(skb); |
---|
594 | 884 | |
---|
595 | 885 | /* Be paranoid, rather than too clever. */ |
---|
596 | 886 | if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { |
---|
.. | .. |
---|
610 | 900 | |
---|
611 | 901 | rcu_read_lock_bh(); |
---|
612 | 902 | |
---|
613 | | - nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); |
---|
614 | | - neigh = __ipv4_neigh_lookup_noref(dev, nexthop); |
---|
615 | | - if (unlikely(!neigh)) |
---|
616 | | - neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); |
---|
| 903 | + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); |
---|
617 | 904 | if (!IS_ERR(neigh)) { |
---|
618 | 905 | sock_confirm_neigh(skb, neigh); |
---|
619 | | - ret = neigh_output(neigh, skb); |
---|
| 906 | + /* if crossing protocols, can not use the cached header */ |
---|
| 907 | + ret = neigh_output(neigh, skb, is_v6gw); |
---|
620 | 908 | rcu_read_unlock_bh(); |
---|
621 | 909 | return ret; |
---|
622 | 910 | } |
---|
.. | .. |
---|
741 | 1029 | ipv4_is_lbcast(ip_hdr(skb)->daddr)) |
---|
742 | 1030 | return skb; |
---|
743 | 1031 | |
---|
| 1032 | + vrf_nf_set_untracked(skb); |
---|
| 1033 | + |
---|
744 | 1034 | if (qdisc_tx_is_default(vrf_dev) || |
---|
745 | 1035 | IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) |
---|
746 | 1036 | return vrf_ip_out_direct(vrf_dev, sk, skb); |
---|
.. | .. |
---|
795 | 1085 | return -ENOMEM; |
---|
796 | 1086 | |
---|
797 | 1087 | /* create a dst for routing packets out through a VRF device */ |
---|
798 | | - rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0); |
---|
| 1088 | + rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1); |
---|
799 | 1089 | if (!rth) |
---|
800 | 1090 | return -ENOMEM; |
---|
801 | 1091 | |
---|
.. | .. |
---|
809 | 1099 | /**************************** device handling ********************/ |
---|
810 | 1100 | |
---|
811 | 1101 | /* cycle interface to flush neighbor cache and move routes across tables */ |
---|
812 | | -static void cycle_netdev(struct net_device *dev) |
---|
| 1102 | +static void cycle_netdev(struct net_device *dev, |
---|
| 1103 | + struct netlink_ext_ack *extack) |
---|
813 | 1104 | { |
---|
814 | 1105 | unsigned int flags = dev->flags; |
---|
815 | 1106 | int ret; |
---|
.. | .. |
---|
817 | 1108 | if (!netif_running(dev)) |
---|
818 | 1109 | return; |
---|
819 | 1110 | |
---|
820 | | - ret = dev_change_flags(dev, flags & ~IFF_UP); |
---|
| 1111 | + ret = dev_change_flags(dev, flags & ~IFF_UP, extack); |
---|
821 | 1112 | if (ret >= 0) |
---|
822 | | - ret = dev_change_flags(dev, flags); |
---|
| 1113 | + ret = dev_change_flags(dev, flags, extack); |
---|
823 | 1114 | |
---|
824 | 1115 | if (ret < 0) { |
---|
825 | 1116 | netdev_err(dev, |
---|
.. | .. |
---|
847 | 1138 | if (ret < 0) |
---|
848 | 1139 | goto err; |
---|
849 | 1140 | |
---|
850 | | - cycle_netdev(port_dev); |
---|
| 1141 | + cycle_netdev(port_dev, extack); |
---|
851 | 1142 | |
---|
852 | 1143 | return 0; |
---|
853 | 1144 | |
---|
.. | .. |
---|
877 | 1168 | netdev_upper_dev_unlink(port_dev, dev); |
---|
878 | 1169 | port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; |
---|
879 | 1170 | |
---|
880 | | - cycle_netdev(port_dev); |
---|
| 1171 | + cycle_netdev(port_dev, NULL); |
---|
881 | 1172 | |
---|
882 | 1173 | return 0; |
---|
883 | 1174 | } |
---|
.. | .. |
---|
915 | 1206 | |
---|
916 | 1207 | dev->flags = IFF_MASTER | IFF_NOARP; |
---|
917 | 1208 | |
---|
918 | | - /* MTU is irrelevant for VRF device; set to 64k similar to lo */ |
---|
919 | | - dev->mtu = 64 * 1024; |
---|
920 | | - |
---|
921 | 1209 | /* similarly, oper state is irrelevant; set to up to avoid confusion */ |
---|
922 | 1210 | dev->operstate = IF_OPER_UP; |
---|
923 | 1211 | netdev_lockdep_set_classes(dev); |
---|
.. | .. |
---|
936 | 1224 | .ndo_init = vrf_dev_init, |
---|
937 | 1225 | .ndo_uninit = vrf_dev_uninit, |
---|
938 | 1226 | .ndo_start_xmit = vrf_xmit, |
---|
| 1227 | + .ndo_set_mac_address = eth_mac_addr, |
---|
939 | 1228 | .ndo_get_stats64 = vrf_get_stats64, |
---|
940 | 1229 | .ndo_add_slave = vrf_add_slave, |
---|
941 | 1230 | .ndo_del_slave = vrf_del_slave, |
---|
.. | .. |
---|
1043 | 1332 | struct sk_buff *skb) |
---|
1044 | 1333 | { |
---|
1045 | 1334 | int orig_iif = skb->skb_iif; |
---|
1046 | | - bool need_strict; |
---|
| 1335 | + bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); |
---|
| 1336 | + bool is_ndisc = ipv6_ndisc_frame(skb); |
---|
1047 | 1337 | |
---|
1048 | | - /* loopback traffic; do not push through packet taps again. |
---|
1049 | | - * Reset pkt_type for upper layers to process skb |
---|
| 1338 | + /* loopback, multicast & non-ND link-local traffic; do not push through |
---|
| 1339 | + * packet taps again. Reset pkt_type for upper layers to process skb. |
---|
| 1340 | + * For strict packets with a source LLA, determine the dst using the |
---|
| 1341 | + * original ifindex. |
---|
1050 | 1342 | */ |
---|
1051 | | - if (skb->pkt_type == PACKET_LOOPBACK) { |
---|
| 1343 | + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { |
---|
1052 | 1344 | skb->dev = vrf_dev; |
---|
1053 | 1345 | skb->skb_iif = vrf_dev->ifindex; |
---|
1054 | 1346 | IP6CB(skb)->flags |= IP6SKB_L3SLAVE; |
---|
1055 | | - skb->pkt_type = PACKET_HOST; |
---|
| 1347 | + |
---|
| 1348 | + if (skb->pkt_type == PACKET_LOOPBACK) |
---|
| 1349 | + skb->pkt_type = PACKET_HOST; |
---|
| 1350 | + else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) |
---|
| 1351 | + vrf_ip6_input_dst(skb, vrf_dev, orig_iif); |
---|
| 1352 | + |
---|
1056 | 1353 | goto out; |
---|
1057 | 1354 | } |
---|
1058 | 1355 | |
---|
1059 | | - /* if packet is NDISC or addressed to multicast or link-local |
---|
1060 | | - * then keep the ingress interface |
---|
1061 | | - */ |
---|
1062 | | - need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); |
---|
1063 | | - if (!ipv6_ndisc_frame(skb) && !need_strict) { |
---|
| 1356 | + /* if packet is NDISC then keep the ingress interface */ |
---|
| 1357 | + if (!is_ndisc) { |
---|
1064 | 1358 | vrf_rx_stats(vrf_dev, skb->len); |
---|
1065 | 1359 | skb->dev = vrf_dev; |
---|
1066 | 1360 | skb->skb_iif = vrf_dev->ifindex; |
---|
.. | .. |
---|
1139 | 1433 | #if IS_ENABLED(CONFIG_IPV6) |
---|
1140 | 1434 | /* send to link-local or multicast address via interface enslaved to |
---|
1141 | 1435 | * VRF device. Force lookup to VRF table without changing flow struct. |
---|
| 1436 | + * Note: Caller to this function must hold rcu_read_lock() and no refcnt |
---|
| 1437 | + * is taken on the dst by this function. |
---|
1142 | 1438 | */ |
---|
1143 | 1439 | static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, |
---|
1144 | 1440 | struct flowi6 *fl6) |
---|
1145 | 1441 | { |
---|
1146 | 1442 | struct net *net = dev_net(dev); |
---|
1147 | | - int flags = RT6_LOOKUP_F_IFACE; |
---|
| 1443 | + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; |
---|
1148 | 1444 | struct dst_entry *dst = NULL; |
---|
1149 | 1445 | struct rt6_info *rt; |
---|
1150 | 1446 | |
---|
.. | .. |
---|
1154 | 1450 | */ |
---|
1155 | 1451 | if (fl6->flowi6_oif == dev->ifindex) { |
---|
1156 | 1452 | dst = &net->ipv6.ip6_null_entry->dst; |
---|
1157 | | - dst_hold(dst); |
---|
1158 | 1453 | return dst; |
---|
1159 | 1454 | } |
---|
1160 | 1455 | |
---|
.. | .. |
---|
1208 | 1503 | struct sk_buff *skb; |
---|
1209 | 1504 | int err; |
---|
1210 | 1505 | |
---|
1211 | | - if (family == AF_INET6 && !ipv6_mod_enabled()) |
---|
| 1506 | + if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && |
---|
| 1507 | + !ipv6_mod_enabled()) |
---|
1212 | 1508 | return 0; |
---|
1213 | 1509 | |
---|
1214 | 1510 | skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); |
---|
.. | .. |
---|
1277 | 1573 | goto ipmr_err; |
---|
1278 | 1574 | #endif |
---|
1279 | 1575 | |
---|
| 1576 | +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) |
---|
| 1577 | + err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); |
---|
| 1578 | + if (err < 0) |
---|
| 1579 | + goto ip6mr_err; |
---|
| 1580 | +#endif |
---|
| 1581 | + |
---|
1280 | 1582 | return 0; |
---|
| 1583 | + |
---|
| 1584 | +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) |
---|
| 1585 | +ip6mr_err: |
---|
| 1586 | + vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); |
---|
| 1587 | +#endif |
---|
1281 | 1588 | |
---|
1282 | 1589 | #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) |
---|
1283 | 1590 | ipmr_err: |
---|
.. | .. |
---|
1325 | 1632 | /* default to no qdisc; user can add if desired */ |
---|
1326 | 1633 | dev->priv_flags |= IFF_NO_QUEUE; |
---|
1327 | 1634 | dev->priv_flags |= IFF_NO_RX_HANDLER; |
---|
| 1635 | + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; |
---|
| 1636 | + |
---|
| 1637 | + /* VRF devices do not care about MTU, but if the MTU is set |
---|
| 1638 | + * too low then the ipv4 and ipv6 protocols are disabled |
---|
| 1639 | + * which breaks networking. |
---|
| 1640 | + */ |
---|
| 1641 | + dev->min_mtu = IPV6_MIN_MTU; |
---|
| 1642 | + dev->max_mtu = IP6_MAX_MTU; |
---|
| 1643 | + dev->mtu = dev->max_mtu; |
---|
1328 | 1644 | } |
---|
1329 | 1645 | |
---|
1330 | 1646 | static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], |
---|
.. | .. |
---|
1351 | 1667 | netdev_for_each_lower_dev(dev, port_dev, iter) |
---|
1352 | 1668 | vrf_del_slave(dev, port_dev); |
---|
1353 | 1669 | |
---|
| 1670 | + vrf_map_unregister_dev(dev); |
---|
| 1671 | + |
---|
1354 | 1672 | unregister_netdevice_queue(dev, head); |
---|
1355 | 1673 | } |
---|
1356 | 1674 | |
---|
.. | .. |
---|
1359 | 1677 | struct netlink_ext_ack *extack) |
---|
1360 | 1678 | { |
---|
1361 | 1679 | struct net_vrf *vrf = netdev_priv(dev); |
---|
| 1680 | + struct netns_vrf *nn_vrf; |
---|
1362 | 1681 | bool *add_fib_rules; |
---|
1363 | 1682 | struct net *net; |
---|
1364 | 1683 | int err; |
---|
.. | .. |
---|
1381 | 1700 | if (err) |
---|
1382 | 1701 | goto out; |
---|
1383 | 1702 | |
---|
| 1703 | + /* mapping between table_id and vrf; |
---|
| 1704 | + * note: such binding could not be done in the dev init function |
---|
| 1705 | + * because dev->ifindex id is not available yet. |
---|
| 1706 | + */ |
---|
| 1707 | + vrf->ifindex = dev->ifindex; |
---|
| 1708 | + |
---|
| 1709 | + err = vrf_map_register_dev(dev, extack); |
---|
| 1710 | + if (err) { |
---|
| 1711 | + unregister_netdevice(dev); |
---|
| 1712 | + goto out; |
---|
| 1713 | + } |
---|
| 1714 | + |
---|
1384 | 1715 | net = dev_net(dev); |
---|
1385 | | - add_fib_rules = net_generic(net, vrf_net_id); |
---|
| 1716 | + nn_vrf = net_generic(net, vrf_net_id); |
---|
| 1717 | + |
---|
| 1718 | + add_fib_rules = &nn_vrf->add_fib_rules; |
---|
1386 | 1719 | if (*add_fib_rules) { |
---|
1387 | 1720 | err = vrf_add_fib_rules(dev); |
---|
1388 | 1721 | if (err) { |
---|
| 1722 | + vrf_map_unregister_dev(dev); |
---|
1389 | 1723 | unregister_netdevice(dev); |
---|
1390 | 1724 | goto out; |
---|
1391 | 1725 | } |
---|
.. | .. |
---|
1472 | 1806 | .notifier_call = vrf_device_event, |
---|
1473 | 1807 | }; |
---|
1474 | 1808 | |
---|
1475 | | -/* Initialize per network namespace state */ |
---|
1476 | | -static int __net_init vrf_netns_init(struct net *net) |
---|
| 1809 | +static int vrf_map_init(struct vrf_map *vmap) |
---|
1477 | 1810 | { |
---|
1478 | | - bool *add_fib_rules = net_generic(net, vrf_net_id); |
---|
| 1811 | + spin_lock_init(&vmap->vmap_lock); |
---|
| 1812 | + hash_init(vmap->ht); |
---|
1479 | 1813 | |
---|
1480 | | - *add_fib_rules = true; |
---|
| 1814 | + vmap->strict_mode = false; |
---|
1481 | 1815 | |
---|
1482 | 1816 | return 0; |
---|
1483 | 1817 | } |
---|
1484 | 1818 | |
---|
| 1819 | +#ifdef CONFIG_SYSCTL |
---|
| 1820 | +static bool vrf_strict_mode(struct vrf_map *vmap) |
---|
| 1821 | +{ |
---|
| 1822 | + bool strict_mode; |
---|
| 1823 | + |
---|
| 1824 | + vrf_map_lock(vmap); |
---|
| 1825 | + strict_mode = vmap->strict_mode; |
---|
| 1826 | + vrf_map_unlock(vmap); |
---|
| 1827 | + |
---|
| 1828 | + return strict_mode; |
---|
| 1829 | +} |
---|
| 1830 | + |
---|
| 1831 | +static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) |
---|
| 1832 | +{ |
---|
| 1833 | + bool *cur_mode; |
---|
| 1834 | + int res = 0; |
---|
| 1835 | + |
---|
| 1836 | + vrf_map_lock(vmap); |
---|
| 1837 | + |
---|
| 1838 | + cur_mode = &vmap->strict_mode; |
---|
| 1839 | + if (*cur_mode == new_mode) |
---|
| 1840 | + goto unlock; |
---|
| 1841 | + |
---|
| 1842 | + if (*cur_mode) { |
---|
| 1843 | + /* disable strict mode */ |
---|
| 1844 | + *cur_mode = false; |
---|
| 1845 | + } else { |
---|
| 1846 | + if (vmap->shared_tables) { |
---|
| 1847 | + /* we cannot allow strict_mode because there are some |
---|
| 1848 | + * vrfs that share one or more tables. |
---|
| 1849 | + */ |
---|
| 1850 | + res = -EBUSY; |
---|
| 1851 | + goto unlock; |
---|
| 1852 | + } |
---|
| 1853 | + |
---|
| 1854 | + /* no tables are shared among vrfs, so we can go back |
---|
| 1855 | + * to 1:1 association between a vrf with its table. |
---|
| 1856 | + */ |
---|
| 1857 | + *cur_mode = true; |
---|
| 1858 | + } |
---|
| 1859 | + |
---|
| 1860 | +unlock: |
---|
| 1861 | + vrf_map_unlock(vmap); |
---|
| 1862 | + |
---|
| 1863 | + return res; |
---|
| 1864 | +} |
---|
| 1865 | + |
---|
| 1866 | +static int vrf_shared_table_handler(struct ctl_table *table, int write, |
---|
| 1867 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
| 1868 | +{ |
---|
| 1869 | + struct net *net = (struct net *)table->extra1; |
---|
| 1870 | + struct vrf_map *vmap = netns_vrf_map(net); |
---|
| 1871 | + int proc_strict_mode = 0; |
---|
| 1872 | + struct ctl_table tmp = { |
---|
| 1873 | + .procname = table->procname, |
---|
| 1874 | + .data = &proc_strict_mode, |
---|
| 1875 | + .maxlen = sizeof(int), |
---|
| 1876 | + .mode = table->mode, |
---|
| 1877 | + .extra1 = SYSCTL_ZERO, |
---|
| 1878 | + .extra2 = SYSCTL_ONE, |
---|
| 1879 | + }; |
---|
| 1880 | + int ret; |
---|
| 1881 | + |
---|
| 1882 | + if (!write) |
---|
| 1883 | + proc_strict_mode = vrf_strict_mode(vmap); |
---|
| 1884 | + |
---|
| 1885 | + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
---|
| 1886 | + |
---|
| 1887 | + if (write && ret == 0) |
---|
| 1888 | + ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); |
---|
| 1889 | + |
---|
| 1890 | + return ret; |
---|
| 1891 | +} |
---|
| 1892 | + |
---|
| 1893 | +static const struct ctl_table vrf_table[] = { |
---|
| 1894 | + { |
---|
| 1895 | + .procname = "strict_mode", |
---|
| 1896 | + .data = NULL, |
---|
| 1897 | + .maxlen = sizeof(int), |
---|
| 1898 | + .mode = 0644, |
---|
| 1899 | + .proc_handler = vrf_shared_table_handler, |
---|
| 1900 | + /* set by the vrf_netns_init */ |
---|
| 1901 | + .extra1 = NULL, |
---|
| 1902 | + }, |
---|
| 1903 | + { }, |
---|
| 1904 | +}; |
---|
| 1905 | + |
---|
| 1906 | +static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) |
---|
| 1907 | +{ |
---|
| 1908 | + struct ctl_table *table; |
---|
| 1909 | + |
---|
| 1910 | + table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); |
---|
| 1911 | + if (!table) |
---|
| 1912 | + return -ENOMEM; |
---|
| 1913 | + |
---|
| 1914 | + /* init the extra1 parameter with the reference to current netns */ |
---|
| 1915 | + table[0].extra1 = net; |
---|
| 1916 | + |
---|
| 1917 | + nn_vrf->ctl_hdr = register_net_sysctl(net, "net/vrf", table); |
---|
| 1918 | + if (!nn_vrf->ctl_hdr) { |
---|
| 1919 | + kfree(table); |
---|
| 1920 | + return -ENOMEM; |
---|
| 1921 | + } |
---|
| 1922 | + |
---|
| 1923 | + return 0; |
---|
| 1924 | +} |
---|
| 1925 | + |
---|
| 1926 | +static void vrf_netns_exit_sysctl(struct net *net) |
---|
| 1927 | +{ |
---|
| 1928 | + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); |
---|
| 1929 | + struct ctl_table *table; |
---|
| 1930 | + |
---|
| 1931 | + table = nn_vrf->ctl_hdr->ctl_table_arg; |
---|
| 1932 | + unregister_net_sysctl_table(nn_vrf->ctl_hdr); |
---|
| 1933 | + kfree(table); |
---|
| 1934 | +} |
---|
| 1935 | +#else |
---|
/* CONFIG_SYSCTL disabled: nothing to register, report success */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}
---|
| 1940 | + |
---|
/* CONFIG_SYSCTL disabled: nothing was registered, nothing to undo */
static void vrf_netns_exit_sysctl(struct net *net)
{
}
---|
| 1944 | +#endif |
---|
| 1945 | + |
---|
| 1946 | +/* Initialize per network namespace state */ |
---|
| 1947 | +static int __net_init vrf_netns_init(struct net *net) |
---|
| 1948 | +{ |
---|
| 1949 | + struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); |
---|
| 1950 | + |
---|
| 1951 | + nn_vrf->add_fib_rules = true; |
---|
| 1952 | + vrf_map_init(&nn_vrf->vmap); |
---|
| 1953 | + |
---|
| 1954 | + return vrf_netns_init_sysctl(net, nn_vrf); |
---|
| 1955 | +} |
---|
| 1956 | + |
---|
| 1957 | +static void __net_exit vrf_netns_exit(struct net *net) |
---|
| 1958 | +{ |
---|
| 1959 | + vrf_netns_exit_sysctl(net); |
---|
| 1960 | +} |
---|
| 1961 | + |
---|
1485 | 1962 | static struct pernet_operations vrf_net_ops __net_initdata = { |
---|
1486 | 1963 | .init = vrf_netns_init, |
---|
| 1964 | + .exit = vrf_netns_exit, |
---|
1487 | 1965 | .id = &vrf_net_id, |
---|
1488 | | - .size = sizeof(bool), |
---|
| 1966 | + .size = sizeof(struct netns_vrf), |
---|
1489 | 1967 | }; |
---|
1490 | 1968 | |
---|
1491 | 1969 | static int __init vrf_init_module(void) |
---|
.. | .. |
---|
1498 | 1976 | if (rc < 0) |
---|
1499 | 1977 | goto error; |
---|
1500 | 1978 | |
---|
| 1979 | + rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, |
---|
| 1980 | + vrf_ifindex_lookup_by_table_id); |
---|
| 1981 | + if (rc < 0) |
---|
| 1982 | + goto unreg_pernet; |
---|
| 1983 | + |
---|
1501 | 1984 | rc = rtnl_link_register(&vrf_link_ops); |
---|
1502 | | - if (rc < 0) { |
---|
1503 | | - unregister_pernet_subsys(&vrf_net_ops); |
---|
1504 | | - goto error; |
---|
1505 | | - } |
---|
| 1985 | + if (rc < 0) |
---|
| 1986 | + goto table_lookup_unreg; |
---|
1506 | 1987 | |
---|
1507 | 1988 | return 0; |
---|
1508 | 1989 | |
---|
| 1990 | +table_lookup_unreg: |
---|
| 1991 | + l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, |
---|
| 1992 | + vrf_ifindex_lookup_by_table_id); |
---|
| 1993 | + |
---|
| 1994 | +unreg_pernet: |
---|
| 1995 | + unregister_pernet_subsys(&vrf_net_ops); |
---|
| 1996 | + |
---|
1509 | 1997 | error: |
---|
1510 | 1998 | unregister_netdevice_notifier(&vrf_notifier_block); |
---|
1511 | 1999 | return rc; |
---|