hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/net/ipv6/route.c
....@@ -1,14 +1,10 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Linux INET6 implementation
34 * FIB front-end.
45 *
56 * Authors:
67 * Pedro Roque <roque@di.fc.ul.pt>
7
- *
8
- * This program is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU General Public License
10
- * as published by the Free Software Foundation; either version
11
- * 2 of the License, or (at your option) any later version.
128 */
139
1410 /* Changes:
....@@ -60,12 +56,13 @@
6056 #include <net/xfrm.h>
6157 #include <net/netevent.h>
6258 #include <net/netlink.h>
63
-#include <net/nexthop.h>
59
+#include <net/rtnh.h>
6460 #include <net/lwtunnel.h>
6561 #include <net/ip_tunnels.h>
6662 #include <net/l3mdev.h>
6763 #include <net/ip.h>
6864 #include <linux/uaccess.h>
65
+#include <linux/btf_ids.h>
6966
7067 #ifdef CONFIG_SYSCTL
7168 #include <linux/sysctl.h>
....@@ -104,14 +101,15 @@
104101 bool confirm_neigh);
105102 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
106103 struct sk_buff *skb);
107
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
108
-static size_t rt6_nlmsg_size(struct fib6_info *rt);
104
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
105
+ int strict);
106
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
109107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
110108 struct fib6_info *rt, struct dst_entry *dst,
111109 struct in6_addr *dest, struct in6_addr *src,
112110 int iif, int type, u32 portid, u32 seq,
113111 unsigned int flags);
114
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
112
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
115113 const struct in6_addr *daddr,
116114 const struct in6_addr *saddr);
117115
....@@ -181,7 +179,7 @@
181179 }
182180
183181 if (rt_dev == dev) {
184
- rt->dst.dev = loopback_dev;
182
+ rt->dst.dev = blackhole_netdev;
185183 dev_hold(rt->dst.dev);
186184 dev_put(rt_dev);
187185 }
....@@ -223,7 +221,8 @@
223221 {
224222 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
225223
226
- return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
224
+ return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
225
+ dst->dev, skb, daddr);
227226 }
228227
229228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
....@@ -231,7 +230,7 @@
231230 struct net_device *dev = dst->dev;
232231 struct rt6_info *rt = (struct rt6_info *)dst;
233232
234
- daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233
+ daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
235234 if (!daddr)
236235 return;
237236 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
....@@ -260,34 +259,16 @@
260259 .confirm_neigh = ip6_confirm_neigh,
261260 };
262261
263
-static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
264
-{
265
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
266
-
267
- return mtu ? : dst->dev->mtu;
268
-}
269
-
270
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
271
- struct sk_buff *skb, u32 mtu,
272
- bool confirm_neigh)
273
-{
274
-}
275
-
276
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
277
- struct sk_buff *skb)
278
-{
279
-}
280
-
281262 static struct dst_ops ip6_dst_blackhole_ops = {
282
- .family = AF_INET6,
283
- .destroy = ip6_dst_destroy,
284
- .check = ip6_dst_check,
285
- .mtu = ip6_blackhole_mtu,
286
- .default_advmss = ip6_default_advmss,
287
- .update_pmtu = ip6_rt_blackhole_update_pmtu,
288
- .redirect = ip6_rt_blackhole_redirect,
289
- .cow_metrics = dst_cow_metrics_generic,
290
- .neigh_lookup = ip6_dst_neigh_lookup,
263
+ .family = AF_INET6,
264
+ .default_advmss = ip6_default_advmss,
265
+ .neigh_lookup = ip6_dst_neigh_lookup,
266
+ .check = ip6_dst_check,
267
+ .destroy = ip6_dst_destroy,
268
+ .cow_metrics = dst_cow_metrics_generic,
269
+ .update_pmtu = dst_blackhole_update_pmtu,
270
+ .redirect = dst_blackhole_redirect,
271
+ .mtu = dst_blackhole_mtu,
291272 };
292273
293274 static const u32 ip6_template_metrics[RTAX_MAX] = {
....@@ -298,7 +279,7 @@
298279 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
299280 .fib6_protocol = RTPROT_KERNEL,
300281 .fib6_metric = ~(u32)0,
301
- .fib6_ref = ATOMIC_INIT(1),
282
+ .fib6_ref = REFCOUNT_INIT(1),
302283 .fib6_type = RTN_UNREACHABLE,
303284 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
304285 };
....@@ -369,14 +350,11 @@
369350
370351 static void ip6_dst_destroy(struct dst_entry *dst)
371352 {
372
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
373353 struct rt6_info *rt = (struct rt6_info *)dst;
374354 struct fib6_info *from;
375355 struct inet6_dev *idev;
376356
377
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
378
- kfree(p);
379
-
357
+ ip_dst_metrics_put(dst);
380358 rt6_uncached_list_del(rt);
381359
382360 idev = rt->rt6i_idev;
....@@ -430,75 +408,183 @@
430408 return false;
431409 }
432410
433
-struct fib6_info *fib6_multipath_select(const struct net *net,
434
- struct fib6_info *match,
435
- struct flowi6 *fl6, int oif,
436
- const struct sk_buff *skb,
437
- int strict)
411
+void fib6_select_path(const struct net *net, struct fib6_result *res,
412
+ struct flowi6 *fl6, int oif, bool have_oif_match,
413
+ const struct sk_buff *skb, int strict)
438414 {
439415 struct fib6_info *sibling, *next_sibling;
416
+ struct fib6_info *match = res->f6i;
417
+
418
+ if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
419
+ goto out;
420
+
421
+ if (match->nh && have_oif_match && res->nh)
422
+ return;
440423
441424 /* We might have already computed the hash for ICMPv6 errors. In such
442425 * case it will always be non-zero. Otherwise now is the time to do it.
443426 */
444
- if (!fl6->mp_hash)
427
+ if (!fl6->mp_hash &&
428
+ (!match->nh || nexthop_is_multipath(match->nh)))
445429 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
446430
447
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
448
- return match;
431
+ if (unlikely(match->nh)) {
432
+ nexthop_path_fib6_result(res, fl6->mp_hash);
433
+ return;
434
+ }
435
+
436
+ if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
437
+ goto out;
449438
450439 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
451440 fib6_siblings) {
441
+ const struct fib6_nh *nh = sibling->fib6_nh;
452442 int nh_upper_bound;
453443
454
- nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
444
+ nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
455445 if (fl6->mp_hash > nh_upper_bound)
456446 continue;
457
- if (rt6_score_route(sibling, oif, strict) < 0)
447
+ if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
458448 break;
459449 match = sibling;
460450 break;
461451 }
462452
463
- return match;
453
+out:
454
+ res->f6i = match;
455
+ res->nh = match->fib6_nh;
464456 }
465457
466458 /*
467459 * Route lookup. rcu_read_lock() should be held.
468460 */
469461
470
-static inline struct fib6_info *rt6_device_match(struct net *net,
471
- struct fib6_info *rt,
472
- const struct in6_addr *saddr,
473
- int oif,
474
- int flags)
462
+static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
463
+ const struct in6_addr *saddr, int oif, int flags)
475464 {
476
- struct fib6_info *sprt;
465
+ const struct net_device *dev;
477466
478
- if (!oif && ipv6_addr_any(saddr) &&
479
- !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
480
- return rt;
467
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
468
+ return false;
481469
482
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
483
- const struct net_device *dev = sprt->fib6_nh.nh_dev;
470
+ dev = nh->fib_nh_dev;
471
+ if (oif) {
472
+ if (dev->ifindex == oif)
473
+ return true;
474
+ } else {
475
+ if (ipv6_chk_addr(net, saddr, dev,
476
+ flags & RT6_LOOKUP_F_IFACE))
477
+ return true;
478
+ }
484479
485
- if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
486
- continue;
480
+ return false;
481
+}
487482
488
- if (oif) {
489
- if (dev->ifindex == oif)
490
- return sprt;
483
+struct fib6_nh_dm_arg {
484
+ struct net *net;
485
+ const struct in6_addr *saddr;
486
+ int oif;
487
+ int flags;
488
+ struct fib6_nh *nh;
489
+};
490
+
491
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
492
+{
493
+ struct fib6_nh_dm_arg *arg = _arg;
494
+
495
+ arg->nh = nh;
496
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
497
+ arg->flags);
498
+}
499
+
500
+/* returns fib6_nh from nexthop or NULL */
501
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
502
+ struct fib6_result *res,
503
+ const struct in6_addr *saddr,
504
+ int oif, int flags)
505
+{
506
+ struct fib6_nh_dm_arg arg = {
507
+ .net = net,
508
+ .saddr = saddr,
509
+ .oif = oif,
510
+ .flags = flags,
511
+ };
512
+
513
+ if (nexthop_is_blackhole(nh))
514
+ return NULL;
515
+
516
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
517
+ return arg.nh;
518
+
519
+ return NULL;
520
+}
521
+
522
+static void rt6_device_match(struct net *net, struct fib6_result *res,
523
+ const struct in6_addr *saddr, int oif, int flags)
524
+{
525
+ struct fib6_info *f6i = res->f6i;
526
+ struct fib6_info *spf6i;
527
+ struct fib6_nh *nh;
528
+
529
+ if (!oif && ipv6_addr_any(saddr)) {
530
+ if (unlikely(f6i->nh)) {
531
+ nh = nexthop_fib6_nh(f6i->nh);
532
+ if (nexthop_is_blackhole(f6i->nh))
533
+ goto out_blackhole;
491534 } else {
492
- if (ipv6_chk_addr(net, saddr, dev,
493
- flags & RT6_LOOKUP_F_IFACE))
494
- return sprt;
535
+ nh = f6i->fib6_nh;
536
+ }
537
+ if (!(nh->fib_nh_flags & RTNH_F_DEAD))
538
+ goto out;
539
+ }
540
+
541
+ for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
542
+ bool matched = false;
543
+
544
+ if (unlikely(spf6i->nh)) {
545
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
546
+ oif, flags);
547
+ if (nh)
548
+ matched = true;
549
+ } else {
550
+ nh = spf6i->fib6_nh;
551
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
552
+ matched = true;
553
+ }
554
+ if (matched) {
555
+ res->f6i = spf6i;
556
+ goto out;
495557 }
496558 }
497559
498
- if (oif && flags & RT6_LOOKUP_F_IFACE)
499
- return net->ipv6.fib6_null_entry;
560
+ if (oif && flags & RT6_LOOKUP_F_IFACE) {
561
+ res->f6i = net->ipv6.fib6_null_entry;
562
+ nh = res->f6i->fib6_nh;
563
+ goto out;
564
+ }
500565
501
- return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
566
+ if (unlikely(f6i->nh)) {
567
+ nh = nexthop_fib6_nh(f6i->nh);
568
+ if (nexthop_is_blackhole(f6i->nh))
569
+ goto out_blackhole;
570
+ } else {
571
+ nh = f6i->fib6_nh;
572
+ }
573
+
574
+ if (nh->fib_nh_flags & RTNH_F_DEAD) {
575
+ res->f6i = net->ipv6.fib6_null_entry;
576
+ nh = res->f6i->fib6_nh;
577
+ }
578
+out:
579
+ res->nh = nh;
580
+ res->fib6_type = res->f6i->fib6_type;
581
+ res->fib6_flags = res->f6i->fib6_flags;
582
+ return;
583
+
584
+out_blackhole:
585
+ res->fib6_flags |= RTF_REJECT;
586
+ res->fib6_type = RTN_BLACKHOLE;
587
+ res->nh = nh;
502588 }
503589
504590 #ifdef CONFIG_IPV6_ROUTER_PREF
....@@ -520,7 +606,7 @@
520606 kfree(work);
521607 }
522608
523
-static void rt6_probe(struct fib6_info *rt)
609
+static void rt6_probe(struct fib6_nh *fib6_nh)
524610 {
525611 struct __rt6_probe_work *work = NULL;
526612 const struct in6_addr *nh_gw;
....@@ -537,13 +623,13 @@
537623 * Router Reachability Probe MUST be rate-limited
538624 * to no more than one per minute.
539625 */
540
- if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
626
+ if (!fib6_nh->fib_nh_gw_family)
541627 return;
542628
543
- nh_gw = &rt->fib6_nh.nh_gw;
544
- dev = rt->fib6_nh.nh_dev;
629
+ nh_gw = &fib6_nh->fib_nh_gw6;
630
+ dev = fib6_nh->fib_nh_dev;
545631 rcu_read_lock_bh();
546
- last_probe = READ_ONCE(rt->last_probe);
632
+ last_probe = READ_ONCE(fib6_nh->last_probe);
547633 idev = __in6_dev_get(dev);
548634 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
549635 if (neigh) {
....@@ -564,7 +650,7 @@
564650 work = kmalloc(sizeof(*work), GFP_ATOMIC);
565651 }
566652
567
- if (!work || cmpxchg(&rt->last_probe,
653
+ if (!work || cmpxchg(&fib6_nh->last_probe,
568654 last_probe, jiffies) != last_probe) {
569655 kfree(work);
570656 } else {
....@@ -579,7 +665,7 @@
579665 rcu_read_unlock_bh();
580666 }
581667 #else
582
-static inline void rt6_probe(struct fib6_info *rt)
668
+static inline void rt6_probe(struct fib6_nh *fib6_nh)
583669 {
584670 }
585671 #endif
....@@ -587,27 +673,14 @@
587673 /*
588674 * Default Router Selection (RFC 2461 6.3.6)
589675 */
590
-static inline int rt6_check_dev(struct fib6_info *rt, int oif)
591
-{
592
- const struct net_device *dev = rt->fib6_nh.nh_dev;
593
-
594
- if (!oif || dev->ifindex == oif)
595
- return 2;
596
- return 0;
597
-}
598
-
599
-static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
676
+static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
600677 {
601678 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
602679 struct neighbour *neigh;
603680
604
- if (rt->fib6_flags & RTF_NONEXTHOP ||
605
- !(rt->fib6_flags & RTF_GATEWAY))
606
- return RT6_NUD_SUCCEED;
607
-
608681 rcu_read_lock_bh();
609
- neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
610
- &rt->fib6_nh.nh_gw);
682
+ neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
683
+ &fib6_nh->fib_nh_gw6);
611684 if (neigh) {
612685 read_lock(&neigh->lock);
613686 if (neigh->nud_state & NUD_VALID)
....@@ -628,58 +701,44 @@
628701 return ret;
629702 }
630703
631
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
704
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
705
+ int strict)
632706 {
633
- int m;
707
+ int m = 0;
634708
635
- m = rt6_check_dev(rt, oif);
709
+ if (!oif || nh->fib_nh_dev->ifindex == oif)
710
+ m = 2;
711
+
636712 if (!m && (strict & RT6_LOOKUP_F_IFACE))
637713 return RT6_NUD_FAIL_HARD;
638714 #ifdef CONFIG_IPV6_ROUTER_PREF
639
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
715
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
640716 #endif
641
- if (strict & RT6_LOOKUP_F_REACHABLE) {
642
- int n = rt6_check_neigh(rt);
717
+ if ((strict & RT6_LOOKUP_F_REACHABLE) &&
718
+ !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
719
+ int n = rt6_check_neigh(nh);
643720 if (n < 0)
644721 return n;
645722 }
646723 return m;
647724 }
648725
649
-/* called with rc_read_lock held */
650
-static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
726
+static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
727
+ int oif, int strict, int *mpri, bool *do_rr)
651728 {
652
- const struct net_device *dev = fib6_info_nh_dev(f6i);
653
- bool rc = false;
654
-
655
- if (dev) {
656
- const struct inet6_dev *idev = __in6_dev_get(dev);
657
-
658
- rc = !!idev->cnf.ignore_routes_with_linkdown;
659
- }
660
-
661
- return rc;
662
-}
663
-
664
-static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
665
- int *mpri, struct fib6_info *match,
666
- bool *do_rr)
667
-{
668
- int m;
669729 bool match_do_rr = false;
730
+ bool rc = false;
731
+ int m;
670732
671
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
733
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
672734 goto out;
673735
674
- if (fib6_ignore_linkdown(rt) &&
675
- rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
736
+ if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
737
+ nh->fib_nh_flags & RTNH_F_LINKDOWN &&
676738 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
677739 goto out;
678740
679
- if (fib6_check_expired(rt))
680
- goto out;
681
-
682
- m = rt6_score_route(rt, oif, strict);
741
+ m = rt6_score_route(nh, fib6_flags, oif, strict);
683742 if (m == RT6_NUD_FAIL_DO_RR) {
684743 match_do_rr = true;
685744 m = 0; /* lowest valid score */
....@@ -688,67 +747,127 @@
688747 }
689748
690749 if (strict & RT6_LOOKUP_F_REACHABLE)
691
- rt6_probe(rt);
750
+ rt6_probe(nh);
692751
693752 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
694753 if (m > *mpri) {
695754 *do_rr = match_do_rr;
696755 *mpri = m;
697
- match = rt;
756
+ rc = true;
698757 }
699758 out:
700
- return match;
759
+ return rc;
701760 }
702761
703
-static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
704
- struct fib6_info *leaf,
705
- struct fib6_info *rr_head,
706
- u32 metric, int oif, int strict,
707
- bool *do_rr)
762
+struct fib6_nh_frl_arg {
763
+ u32 flags;
764
+ int oif;
765
+ int strict;
766
+ int *mpri;
767
+ bool *do_rr;
768
+ struct fib6_nh *nh;
769
+};
770
+
771
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
708772 {
709
- struct fib6_info *rt, *match, *cont;
773
+ struct fib6_nh_frl_arg *arg = _arg;
774
+
775
+ arg->nh = nh;
776
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
777
+ arg->mpri, arg->do_rr);
778
+}
779
+
780
+static void __find_rr_leaf(struct fib6_info *f6i_start,
781
+ struct fib6_info *nomatch, u32 metric,
782
+ struct fib6_result *res, struct fib6_info **cont,
783
+ int oif, int strict, bool *do_rr, int *mpri)
784
+{
785
+ struct fib6_info *f6i;
786
+
787
+ for (f6i = f6i_start;
788
+ f6i && f6i != nomatch;
789
+ f6i = rcu_dereference(f6i->fib6_next)) {
790
+ bool matched = false;
791
+ struct fib6_nh *nh;
792
+
793
+ if (cont && f6i->fib6_metric != metric) {
794
+ *cont = f6i;
795
+ return;
796
+ }
797
+
798
+ if (fib6_check_expired(f6i))
799
+ continue;
800
+
801
+ if (unlikely(f6i->nh)) {
802
+ struct fib6_nh_frl_arg arg = {
803
+ .flags = f6i->fib6_flags,
804
+ .oif = oif,
805
+ .strict = strict,
806
+ .mpri = mpri,
807
+ .do_rr = do_rr
808
+ };
809
+
810
+ if (nexthop_is_blackhole(f6i->nh)) {
811
+ res->fib6_flags = RTF_REJECT;
812
+ res->fib6_type = RTN_BLACKHOLE;
813
+ res->f6i = f6i;
814
+ res->nh = nexthop_fib6_nh(f6i->nh);
815
+ return;
816
+ }
817
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
818
+ &arg)) {
819
+ matched = true;
820
+ nh = arg.nh;
821
+ }
822
+ } else {
823
+ nh = f6i->fib6_nh;
824
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
825
+ mpri, do_rr))
826
+ matched = true;
827
+ }
828
+ if (matched) {
829
+ res->f6i = f6i;
830
+ res->nh = nh;
831
+ res->fib6_flags = f6i->fib6_flags;
832
+ res->fib6_type = f6i->fib6_type;
833
+ }
834
+ }
835
+}
836
+
837
+static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
838
+ struct fib6_info *rr_head, int oif, int strict,
839
+ bool *do_rr, struct fib6_result *res)
840
+{
841
+ u32 metric = rr_head->fib6_metric;
842
+ struct fib6_info *cont = NULL;
710843 int mpri = -1;
711844
712
- match = NULL;
713
- cont = NULL;
714
- for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
715
- if (rt->fib6_metric != metric) {
716
- cont = rt;
717
- break;
718
- }
845
+ __find_rr_leaf(rr_head, NULL, metric, res, &cont,
846
+ oif, strict, do_rr, &mpri);
719847
720
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
721
- }
848
+ __find_rr_leaf(leaf, rr_head, metric, res, &cont,
849
+ oif, strict, do_rr, &mpri);
722850
723
- for (rt = leaf; rt && rt != rr_head;
724
- rt = rcu_dereference(rt->fib6_next)) {
725
- if (rt->fib6_metric != metric) {
726
- cont = rt;
727
- break;
728
- }
851
+ if (res->f6i || !cont)
852
+ return;
729853
730
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
731
- }
732
-
733
- if (match || !cont)
734
- return match;
735
-
736
- for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
737
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
738
-
739
- return match;
854
+ __find_rr_leaf(cont, NULL, metric, res, NULL,
855
+ oif, strict, do_rr, &mpri);
740856 }
741857
742
-static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
743
- int oif, int strict)
858
+static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
859
+ struct fib6_result *res, int strict)
744860 {
745861 struct fib6_info *leaf = rcu_dereference(fn->leaf);
746
- struct fib6_info *match, *rt0;
862
+ struct fib6_info *rt0;
747863 bool do_rr = false;
748864 int key_plen;
749865
866
+ /* make sure this function or its helpers sets f6i */
867
+ res->f6i = NULL;
868
+
750869 if (!leaf || leaf == net->ipv6.fib6_null_entry)
751
- return net->ipv6.fib6_null_entry;
870
+ goto out;
752871
753872 rt0 = rcu_dereference(fn->rr_ptr);
754873 if (!rt0)
....@@ -765,11 +884,9 @@
765884 key_plen = rt0->fib6_src.plen;
766885 #endif
767886 if (fn->fn_bit != key_plen)
768
- return net->ipv6.fib6_null_entry;
887
+ goto out;
769888
770
- match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
771
- &do_rr);
772
-
889
+ find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
773890 if (do_rr) {
774891 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
775892
....@@ -786,12 +903,19 @@
786903 }
787904 }
788905
789
- return match ? match : net->ipv6.fib6_null_entry;
906
+out:
907
+ if (!res->f6i) {
908
+ res->f6i = net->ipv6.fib6_null_entry;
909
+ res->nh = res->f6i->fib6_nh;
910
+ res->fib6_flags = res->f6i->fib6_flags;
911
+ res->fib6_type = res->f6i->fib6_type;
912
+ }
790913 }
791914
792
-static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
915
+static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
793916 {
794
- return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
917
+ return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
918
+ res->nh->fib_nh_gw_family;
795919 }
796920
797921 #ifdef CONFIG_IPV6_ROUTE_INFO
....@@ -847,7 +971,7 @@
847971 gwaddr, dev);
848972
849973 if (rt && !lifetime) {
850
- ip6_del_rt(net, rt);
974
+ ip6_del_rt(net, rt, false);
851975 rt = NULL;
852976 }
853977
....@@ -875,17 +999,17 @@
875999 */
8761000
8771001 /* called with rcu_lock held */
878
-static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
1002
+static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
8791003 {
880
- struct net_device *dev = rt->fib6_nh.nh_dev;
1004
+ struct net_device *dev = res->nh->fib_nh_dev;
8811005
882
- if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1006
+ if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
8831007 /* for copies of local routes, dst->dev needs to be the
8841008 * device if it is a master device, the master device if
8851009 * device is enslaved, and the loopback as the default
8861010 */
8871011 if (netif_is_l3_slave(dev) &&
888
- !rt6_need_strict(&rt->fib6_dst.addr))
1012
+ !rt6_need_strict(&res->f6i->fib6_dst.addr))
8891013 dev = l3mdev_master_dev_rcu(dev);
8901014 else if (!netif_is_l3_master(dev))
8911015 dev = dev_net(dev)->loopback_dev;
....@@ -925,17 +1049,15 @@
9251049 flags |= DST_NOCOUNT;
9261050 if (rt->dst_nopolicy)
9271051 flags |= DST_NOPOLICY;
928
- if (rt->dst_host)
929
- flags |= DST_HOST;
9301052
9311053 return flags;
9321054 }
9331055
934
-static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
1056
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
9351057 {
936
- rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
1058
+ rt->dst.error = ip6_rt_type_to_error(fib6_type);
9371059
938
- switch (ort->fib6_type) {
1060
+ switch (fib6_type) {
9391061 case RTN_BLACKHOLE:
9401062 rt->dst.output = dst_discard_out;
9411063 rt->dst.input = dst_discard;
....@@ -953,26 +1075,28 @@
9531075 }
9541076 }
9551077
956
-static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
1078
+static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
9571079 {
958
- if (ort->fib6_flags & RTF_REJECT) {
959
- ip6_rt_init_dst_reject(rt, ort);
1080
+ struct fib6_info *f6i = res->f6i;
1081
+
1082
+ if (res->fib6_flags & RTF_REJECT) {
1083
+ ip6_rt_init_dst_reject(rt, res->fib6_type);
9601084 return;
9611085 }
9621086
9631087 rt->dst.error = 0;
9641088 rt->dst.output = ip6_output;
9651089
966
- if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
1090
+ if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
9671091 rt->dst.input = ip6_input;
968
- } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1092
+ } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
9691093 rt->dst.input = ip6_mc_input;
9701094 } else {
9711095 rt->dst.input = ip6_forward;
9721096 }
9731097
974
- if (ort->fib6_nh.nh_lwtstate) {
975
- rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1098
+ if (res->nh->fib_nh_lws) {
1099
+ rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
9761100 lwtunnel_set_redirect(&rt->dst);
9771101 }
9781102
....@@ -984,29 +1108,29 @@
9841108 {
9851109 rt->rt6i_flags &= ~RTF_EXPIRES;
9861110 rcu_assign_pointer(rt->from, from);
987
- dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
988
- if (from->fib6_metrics != &dst_default_metrics) {
989
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
990
- refcount_inc(&from->fib6_metrics->refcnt);
991
- }
1111
+ ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
9921112 }
9931113
994
-/* Caller must already hold reference to @ort */
995
-static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
1114
+/* Caller must already hold reference to f6i in result */
1115
+static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
9961116 {
997
- struct net_device *dev = fib6_info_nh_dev(ort);
1117
+ const struct fib6_nh *nh = res->nh;
1118
+ const struct net_device *dev = nh->fib_nh_dev;
1119
+ struct fib6_info *f6i = res->f6i;
9981120
999
- ip6_rt_init_dst(rt, ort);
1121
+ ip6_rt_init_dst(rt, res);
10001122
1001
- rt->rt6i_dst = ort->fib6_dst;
1123
+ rt->rt6i_dst = f6i->fib6_dst;
10021124 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1003
- rt->rt6i_gateway = ort->fib6_nh.nh_gw;
1004
- rt->rt6i_flags = ort->fib6_flags;
1005
- rt6_set_from(rt, ort);
1125
+ rt->rt6i_flags = res->fib6_flags;
1126
+ if (nh->fib_nh_gw_family) {
1127
+ rt->rt6i_gateway = nh->fib_nh_gw6;
1128
+ rt->rt6i_flags |= RTF_GATEWAY;
1129
+ }
1130
+ rt6_set_from(rt, f6i);
10061131 #ifdef CONFIG_IPV6_SUBTREES
1007
- rt->rt6i_src = ort->fib6_src;
1132
+ rt->rt6i_src = f6i->fib6_src;
10081133 #endif
1009
- rt->rt6i_prefsrc = ort->fib6_prefsrc;
10101134 }
10111135
10121136 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
....@@ -1027,14 +1151,13 @@
10271151 }
10281152 }
10291153
1030
-static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1031
- bool null_fallback)
1154
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
10321155 {
10331156 struct rt6_info *rt = *prt;
10341157
10351158 if (dst_hold_safe(&rt->dst))
10361159 return true;
1037
- if (null_fallback) {
1160
+ if (net) {
10381161 rt = net->ipv6.ip6_null_entry;
10391162 dst_hold(&rt->dst);
10401163 } else {
....@@ -1045,22 +1168,24 @@
10451168 }
10461169
10471170 /* called with rcu_lock held */
1048
-static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1171
+static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
10491172 {
1050
- unsigned short flags = fib6_info_dst_flags(rt);
1051
- struct net_device *dev = rt->fib6_nh.nh_dev;
1173
+ struct net_device *dev = res->nh->fib_nh_dev;
1174
+ struct fib6_info *f6i = res->f6i;
1175
+ unsigned short flags;
10521176 struct rt6_info *nrt;
10531177
1054
- if (!fib6_info_hold_safe(rt))
1178
+ if (!fib6_info_hold_safe(f6i))
10551179 goto fallback;
10561180
1181
+ flags = fib6_info_dst_flags(f6i);
10571182 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
10581183 if (!nrt) {
1059
- fib6_info_release(rt);
1184
+ fib6_info_release(f6i);
10601185 goto fallback;
10611186 }
10621187
1063
- ip6_rt_copy_init(nrt, rt);
1188
+ ip6_rt_copy_init(nrt, res);
10641189 return nrt;
10651190
10661191 fallback:
....@@ -1069,13 +1194,13 @@
10691194 return nrt;
10701195 }
10711196
1072
-static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1197
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
10731198 struct fib6_table *table,
10741199 struct flowi6 *fl6,
10751200 const struct sk_buff *skb,
10761201 int flags)
10771202 {
1078
- struct fib6_info *f6i;
1203
+ struct fib6_result res = {};
10791204 struct fib6_node *fn;
10801205 struct rt6_info *rt;
10811206
....@@ -1085,36 +1210,40 @@
10851210 rcu_read_lock();
10861211 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
10871212 restart:
1088
- f6i = rcu_dereference(fn->leaf);
1089
- if (!f6i) {
1090
- f6i = net->ipv6.fib6_null_entry;
1091
- } else {
1092
- f6i = rt6_device_match(net, f6i, &fl6->saddr,
1093
- fl6->flowi6_oif, flags);
1094
- if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1095
- f6i = fib6_multipath_select(net, f6i, fl6,
1096
- fl6->flowi6_oif, skb,
1097
- flags);
1098
- }
1099
- if (f6i == net->ipv6.fib6_null_entry) {
1213
+ res.f6i = rcu_dereference(fn->leaf);
1214
+ if (!res.f6i)
1215
+ res.f6i = net->ipv6.fib6_null_entry;
1216
+ else
1217
+ rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1218
+ flags);
1219
+
1220
+ if (res.f6i == net->ipv6.fib6_null_entry) {
11001221 fn = fib6_backtrack(fn, &fl6->saddr);
11011222 if (fn)
11021223 goto restart;
1103
- }
11041224
1105
- trace_fib6_table_lookup(net, f6i, table, fl6);
1106
-
1107
- /* Search through exception table */
1108
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1109
- if (rt) {
1110
- if (ip6_hold_safe(net, &rt, true))
1111
- dst_use_noref(&rt->dst, jiffies);
1112
- } else if (f6i == net->ipv6.fib6_null_entry) {
11131225 rt = net->ipv6.ip6_null_entry;
11141226 dst_hold(&rt->dst);
1115
- } else {
1116
- rt = ip6_create_rt_rcu(f6i);
1227
+ goto out;
1228
+ } else if (res.fib6_flags & RTF_REJECT) {
1229
+ goto do_create;
11171230 }
1231
+
1232
+ fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1233
+ fl6->flowi6_oif != 0, skb, flags);
1234
+
1235
+ /* Search through exception table */
1236
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1237
+ if (rt) {
1238
+ if (ip6_hold_safe(net, &rt))
1239
+ dst_use_noref(&rt->dst, jiffies);
1240
+ } else {
1241
+do_create:
1242
+ rt = ip6_create_rt_rcu(&res);
1243
+ }
1244
+
1245
+out:
1246
+ trace_fib6_table_lookup(net, &res, table, fl6);
11181247
11191248 rcu_read_unlock();
11201249
....@@ -1181,10 +1310,11 @@
11811310 return __ip6_ins_rt(rt, &info, NULL);
11821311 }
11831312
1184
-static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1313
+static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
11851314 const struct in6_addr *daddr,
11861315 const struct in6_addr *saddr)
11871316 {
1317
+ struct fib6_info *f6i = res->f6i;
11881318 struct net_device *dev;
11891319 struct rt6_info *rt;
11901320
....@@ -1192,25 +1322,24 @@
11921322 * Clone the route.
11931323 */
11941324
1195
- if (!fib6_info_hold_safe(ort))
1325
+ if (!fib6_info_hold_safe(f6i))
11961326 return NULL;
11971327
1198
- dev = ip6_rt_get_dev_rcu(ort);
1328
+ dev = ip6_rt_get_dev_rcu(res);
11991329 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
12001330 if (!rt) {
1201
- fib6_info_release(ort);
1331
+ fib6_info_release(f6i);
12021332 return NULL;
12031333 }
12041334
1205
- ip6_rt_copy_init(rt, ort);
1335
+ ip6_rt_copy_init(rt, res);
12061336 rt->rt6i_flags |= RTF_CACHE;
1207
- rt->dst.flags |= DST_HOST;
12081337 rt->rt6i_dst.addr = *daddr;
12091338 rt->rt6i_dst.plen = 128;
12101339
1211
- if (!rt6_is_gw_or_nonexthop(ort)) {
1212
- if (ort->fib6_dst.plen != 128 &&
1213
- ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1340
+ if (!rt6_is_gw_or_nonexthop(res)) {
1341
+ if (f6i->fib6_dst.plen != 128 &&
1342
+ ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
12141343 rt->rt6i_flags |= RTF_ANYCAST;
12151344 #ifdef CONFIG_IPV6_SUBTREES
12161345 if (rt->rt6i_src.plen && saddr) {
....@@ -1223,59 +1352,75 @@
12231352 return rt;
12241353 }
12251354
1226
-static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1355
+static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
12271356 {
1228
- unsigned short flags = fib6_info_dst_flags(rt);
1357
+ struct fib6_info *f6i = res->f6i;
1358
+ unsigned short flags = fib6_info_dst_flags(f6i);
12291359 struct net_device *dev;
12301360 struct rt6_info *pcpu_rt;
12311361
1232
- if (!fib6_info_hold_safe(rt))
1362
+ if (!fib6_info_hold_safe(f6i))
12331363 return NULL;
12341364
12351365 rcu_read_lock();
1236
- dev = ip6_rt_get_dev_rcu(rt);
1237
- pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1366
+ dev = ip6_rt_get_dev_rcu(res);
1367
+ pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
12381368 rcu_read_unlock();
12391369 if (!pcpu_rt) {
1240
- fib6_info_release(rt);
1370
+ fib6_info_release(f6i);
12411371 return NULL;
12421372 }
1243
- ip6_rt_copy_init(pcpu_rt, rt);
1373
+ ip6_rt_copy_init(pcpu_rt, res);
12441374 pcpu_rt->rt6i_flags |= RTF_PCPU;
1375
+
1376
+ if (f6i->nh)
1377
+ pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
1378
+
12451379 return pcpu_rt;
12461380 }
12471381
1248
-/* It should be called with rcu_read_lock() acquired */
1249
-static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1382
+static bool rt6_is_valid(const struct rt6_info *rt6)
12501383 {
1251
- struct rt6_info *pcpu_rt, **p;
1384
+ return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
1385
+}
12521386
1253
- p = this_cpu_ptr(rt->rt6i_pcpu);
1254
- pcpu_rt = *p;
1387
+/* It should be called with rcu_read_lock() acquired */
1388
+static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1389
+{
1390
+ struct rt6_info *pcpu_rt;
12551391
1256
- if (pcpu_rt)
1257
- ip6_hold_safe(NULL, &pcpu_rt, false);
1392
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1393
+
1394
+ if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
1395
+ struct rt6_info *prev, **p;
1396
+
1397
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
1398
+ prev = xchg(p, NULL);
1399
+ if (prev) {
1400
+ dst_dev_put(&prev->dst);
1401
+ dst_release(&prev->dst);
1402
+ }
1403
+
1404
+ pcpu_rt = NULL;
1405
+ }
12581406
12591407 return pcpu_rt;
12601408 }
12611409
12621410 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1263
- struct fib6_info *rt)
1411
+ const struct fib6_result *res)
12641412 {
12651413 struct rt6_info *pcpu_rt, *prev, **p;
12661414
1267
- pcpu_rt = ip6_rt_pcpu_alloc(rt);
1268
- if (!pcpu_rt) {
1269
- dst_hold(&net->ipv6.ip6_null_entry->dst);
1270
- return net->ipv6.ip6_null_entry;
1271
- }
1415
+ pcpu_rt = ip6_rt_pcpu_alloc(res);
1416
+ if (!pcpu_rt)
1417
+ return NULL;
12721418
1273
- dst_hold(&pcpu_rt->dst);
1274
- p = this_cpu_ptr(rt->rt6i_pcpu);
1419
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
12751420 prev = cmpxchg(p, NULL, pcpu_rt);
12761421 BUG_ON(prev);
12771422
1278
- if (rt->fib6_destroying) {
1423
+ if (res->f6i->fib6_destroying) {
12791424 struct fib6_info *from;
12801425
12811426 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
....@@ -1426,14 +1571,15 @@
14261571 return NULL;
14271572 }
14281573
1429
-static unsigned int fib6_mtu(const struct fib6_info *rt)
1574
+static unsigned int fib6_mtu(const struct fib6_result *res)
14301575 {
1576
+ const struct fib6_nh *nh = res->nh;
14311577 unsigned int mtu;
14321578
1433
- if (rt->fib6_pmtu) {
1434
- mtu = rt->fib6_pmtu;
1579
+ if (res->f6i->fib6_pmtu) {
1580
+ mtu = res->f6i->fib6_pmtu;
14351581 } else {
1436
- struct net_device *dev = fib6_info_nh_dev(rt);
1582
+ struct net_device *dev = nh->fib_nh_dev;
14371583 struct inet6_dev *idev;
14381584
14391585 rcu_read_lock();
....@@ -1444,28 +1590,78 @@
14441590
14451591 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
14461592
1447
- return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1593
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1594
+}
1595
+
1596
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
1597
+
1598
+/* used when the flushed bit is not relevant, only access to the bucket
1599
+ * (i.e., all bucket users except rt6_insert_exception);
1600
+ *
1601
+ * called under rcu lock; sometimes called with rt6_exception_lock held
1602
+ */
1603
+static
1604
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1605
+ spinlock_t *lock)
1606
+{
1607
+ struct rt6_exception_bucket *bucket;
1608
+
1609
+ if (lock)
1610
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1611
+ lockdep_is_held(lock));
1612
+ else
1613
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
1614
+
1615
+ /* remove bucket flushed bit if set */
1616
+ if (bucket) {
1617
+ unsigned long p = (unsigned long)bucket;
1618
+
1619
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1620
+ bucket = (struct rt6_exception_bucket *)p;
1621
+ }
1622
+
1623
+ return bucket;
1624
+}
1625
+
1626
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1627
+{
1628
+ unsigned long p = (unsigned long)bucket;
1629
+
1630
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1631
+}
1632
+
1633
+/* called with rt6_exception_lock held */
1634
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1635
+ spinlock_t *lock)
1636
+{
1637
+ struct rt6_exception_bucket *bucket;
1638
+ unsigned long p;
1639
+
1640
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1641
+ lockdep_is_held(lock));
1642
+
1643
+ p = (unsigned long)bucket;
1644
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1645
+ bucket = (struct rt6_exception_bucket *)p;
1646
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
14481647 }
14491648
14501649 static int rt6_insert_exception(struct rt6_info *nrt,
1451
- struct fib6_info *ort)
1650
+ const struct fib6_result *res)
14521651 {
14531652 struct net *net = dev_net(nrt->dst.dev);
14541653 struct rt6_exception_bucket *bucket;
1654
+ struct fib6_info *f6i = res->f6i;
14551655 struct in6_addr *src_key = NULL;
14561656 struct rt6_exception *rt6_ex;
1657
+ struct fib6_nh *nh = res->nh;
14571658 int max_depth;
14581659 int err = 0;
14591660
14601661 spin_lock_bh(&rt6_exception_lock);
14611662
1462
- if (ort->exception_bucket_flushed) {
1463
- err = -EINVAL;
1464
- goto out;
1465
- }
1466
-
1467
- bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1468
- lockdep_is_held(&rt6_exception_lock));
1663
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1664
+ lockdep_is_held(&rt6_exception_lock));
14691665 if (!bucket) {
14701666 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
14711667 GFP_ATOMIC);
....@@ -1473,29 +1669,27 @@
14731669 err = -ENOMEM;
14741670 goto out;
14751671 }
1476
- rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1672
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1673
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1674
+ err = -EINVAL;
1675
+ goto out;
14771676 }
14781677
14791678 #ifdef CONFIG_IPV6_SUBTREES
1480
- /* rt6i_src.plen != 0 indicates ort is in subtree
1679
+ /* fib6_src.plen != 0 indicates f6i is in subtree
14811680 * and exception table is indexed by a hash of
1482
- * both rt6i_dst and rt6i_src.
1681
+ * both fib6_dst and fib6_src.
14831682 * Otherwise, the exception table is indexed by
1484
- * a hash of only rt6i_dst.
1683
+ * a hash of only fib6_dst.
14851684 */
1486
- if (ort->fib6_src.plen)
1685
+ if (f6i->fib6_src.plen)
14871686 src_key = &nrt->rt6i_src.addr;
14881687 #endif
1489
-
1490
- /* Update rt6i_prefsrc as it could be changed
1491
- * in rt6_remove_prefsrc()
1492
- */
1493
- nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1494
- /* rt6_mtu_change() might lower mtu on ort.
1688
+ /* rt6_mtu_change() might lower mtu on f6i.
14951689 * Only insert this exception route if its mtu
1496
- * is less than ort's mtu value.
1690
+ * is less than f6i's mtu value.
14971691 */
1498
- if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1692
+ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
14991693 err = -EINVAL;
15001694 goto out;
15011695 }
....@@ -1526,16 +1720,16 @@
15261720
15271721 /* Update fn->fn_sernum to invalidate all cached dst */
15281722 if (!err) {
1529
- spin_lock_bh(&ort->fib6_table->tb6_lock);
1530
- fib6_update_sernum(net, ort);
1531
- spin_unlock_bh(&ort->fib6_table->tb6_lock);
1723
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
1724
+ fib6_update_sernum(net, f6i);
1725
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
15321726 fib6_force_start_gc(net);
15331727 }
15341728
15351729 return err;
15361730 }
15371731
1538
-void rt6_flush_exceptions(struct fib6_info *rt)
1732
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
15391733 {
15401734 struct rt6_exception_bucket *bucket;
15411735 struct rt6_exception *rt6_ex;
....@@ -1543,41 +1737,62 @@
15431737 int i;
15441738
15451739 spin_lock_bh(&rt6_exception_lock);
1546
- /* Prevent rt6_insert_exception() to recreate the bucket list */
1547
- rt->exception_bucket_flushed = 1;
15481740
1549
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550
- lockdep_is_held(&rt6_exception_lock));
1741
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
15511742 if (!bucket)
15521743 goto out;
15531744
1745
+ /* Prevent rt6_insert_exception() from recreating the bucket list */
1746
+ if (!from)
1747
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1748
+
15541749 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1555
- hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1556
- rt6_remove_exception(bucket, rt6_ex);
1557
- WARN_ON_ONCE(bucket->depth);
1750
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1751
+ if (!from ||
1752
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
1753
+ rt6_remove_exception(bucket, rt6_ex);
1754
+ }
1755
+ WARN_ON_ONCE(!from && bucket->depth);
15581756 bucket++;
15591757 }
1560
-
15611758 out:
15621759 spin_unlock_bh(&rt6_exception_lock);
1760
+}
1761
+
1762
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
1763
+{
1764
+ struct fib6_info *f6i = arg;
1765
+
1766
+ fib6_nh_flush_exceptions(nh, f6i);
1767
+
1768
+ return 0;
1769
+}
1770
+
1771
+void rt6_flush_exceptions(struct fib6_info *f6i)
1772
+{
1773
+ if (f6i->nh)
1774
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1775
+ f6i);
1776
+ else
1777
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
15631778 }
15641779
15651780 /* Find cached rt in the hash table inside passed in rt
15661781 * Caller has to hold rcu_read_lock()
15671782 */
1568
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1783
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
15691784 const struct in6_addr *daddr,
15701785 const struct in6_addr *saddr)
15711786 {
15721787 const struct in6_addr *src_key = NULL;
15731788 struct rt6_exception_bucket *bucket;
15741789 struct rt6_exception *rt6_ex;
1575
- struct rt6_info *res = NULL;
1790
+ struct rt6_info *ret = NULL;
15761791
15771792 #ifdef CONFIG_IPV6_SUBTREES
1578
- /* rt6i_src.plen != 0 indicates rt is in subtree
1793
+ /* fib6_src.plen != 0 indicates f6i is in subtree
15791794 * and exception table is indexed by a hash of
1580
- * both rt6i_dst and rt6i_src.
1795
+ * both fib6_dst and fib6_src.
15811796 * However, the src addr used to create the hash
15821797 * might not be exactly the passed in saddr which
15831798 * is a /128 addr from the flow.
....@@ -1586,47 +1801,42 @@
15861801 * (See the logic in ip6_rt_cache_alloc() on how
15871802 * rt->rt6i_src is updated.)
15881803 */
1589
- if (rt->fib6_src.plen)
1804
+ if (res->f6i->fib6_src.plen)
15901805 src_key = saddr;
15911806 find_ex:
15921807 #endif
1593
- bucket = rcu_dereference(rt->rt6i_exception_bucket);
1808
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
15941809 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
15951810
15961811 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1597
- res = rt6_ex->rt6i;
1812
+ ret = rt6_ex->rt6i;
15981813
15991814 #ifdef CONFIG_IPV6_SUBTREES
16001815 /* Use fib6_src as src_key and redo lookup */
1601
- if (!res && src_key && src_key != &rt->fib6_src.addr) {
1602
- src_key = &rt->fib6_src.addr;
1816
+ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1817
+ src_key = &res->f6i->fib6_src.addr;
16031818 goto find_ex;
16041819 }
16051820 #endif
16061821
1607
- return res;
1822
+ return ret;
16081823 }
16091824
16101825 /* Remove the passed in cached rt from the hash table that contains it */
1611
-static int rt6_remove_exception_rt(struct rt6_info *rt)
1826
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1827
+ const struct rt6_info *rt)
16121828 {
1829
+ const struct in6_addr *src_key = NULL;
16131830 struct rt6_exception_bucket *bucket;
1614
- struct in6_addr *src_key = NULL;
16151831 struct rt6_exception *rt6_ex;
1616
- struct fib6_info *from;
16171832 int err;
16181833
1619
- from = rcu_dereference(rt->from);
1620
- if (!from ||
1621
- !(rt->rt6i_flags & RTF_CACHE))
1622
- return -EINVAL;
1623
-
1624
- if (!rcu_access_pointer(from->rt6i_exception_bucket))
1834
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
16251835 return -ENOENT;
16261836
16271837 spin_lock_bh(&rt6_exception_lock);
1628
- bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1629
- lockdep_is_held(&rt6_exception_lock));
1838
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1839
+
16301840 #ifdef CONFIG_IPV6_SUBTREES
16311841 /* rt6i_src.plen != 0 indicates 'from' is in subtree
16321842 * and exception table is indexed by a hash of
....@@ -1634,7 +1844,7 @@
16341844 * Otherwise, the exception table is indexed by
16351845 * a hash of only rt6i_dst.
16361846 */
1637
- if (from->fib6_src.plen)
1847
+ if (plen)
16381848 src_key = &rt->rt6i_src.addr;
16391849 #endif
16401850 rt6_ex = __rt6_find_exception_spinlock(&bucket,
....@@ -1651,23 +1861,60 @@
16511861 return err;
16521862 }
16531863
1864
+struct fib6_nh_excptn_arg {
1865
+ struct rt6_info *rt;
1866
+ int plen;
1867
+};
1868
+
1869
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1870
+{
1871
+ struct fib6_nh_excptn_arg *arg = _arg;
1872
+ int err;
1873
+
1874
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1875
+ if (err == 0)
1876
+ return 1;
1877
+
1878
+ return 0;
1879
+}
1880
+
1881
+static int rt6_remove_exception_rt(struct rt6_info *rt)
1882
+{
1883
+ struct fib6_info *from;
1884
+
1885
+ from = rcu_dereference(rt->from);
1886
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
1887
+ return -EINVAL;
1888
+
1889
+ if (from->nh) {
1890
+ struct fib6_nh_excptn_arg arg = {
1891
+ .rt = rt,
1892
+ .plen = from->fib6_src.plen
1893
+ };
1894
+ int rc;
1895
+
1896
+ /* rc = 1 means an entry was found */
1897
+ rc = nexthop_for_each_fib6_nh(from->nh,
1898
+ rt6_nh_remove_exception_rt,
1899
+ &arg);
1900
+ return rc ? 0 : -ENOENT;
1901
+ }
1902
+
1903
+ return fib6_nh_remove_exception(from->fib6_nh,
1904
+ from->fib6_src.plen, rt);
1905
+}
1906
+
16541907 /* Find rt6_ex which contains the passed in rt cache and
16551908 * refresh its stamp
16561909 */
1657
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1910
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1911
+ const struct rt6_info *rt)
16581912 {
1913
+ const struct in6_addr *src_key = NULL;
16591914 struct rt6_exception_bucket *bucket;
1660
- struct in6_addr *src_key = NULL;
16611915 struct rt6_exception *rt6_ex;
1662
- struct fib6_info *from;
16631916
1664
- rcu_read_lock();
1665
- from = rcu_dereference(rt->from);
1666
- if (!from || !(rt->rt6i_flags & RTF_CACHE))
1667
- goto unlock;
1668
-
1669
- bucket = rcu_dereference(from->rt6i_exception_bucket);
1670
-
1917
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
16711918 #ifdef CONFIG_IPV6_SUBTREES
16721919 /* rt6i_src.plen != 0 indicates 'from' is in subtree
16731920 * and exception table is indexed by a hash of
....@@ -1675,36 +1922,65 @@
16751922 * Otherwise, the exception table is indexed by
16761923 * a hash of only rt6i_dst.
16771924 */
1678
- if (from->fib6_src.plen)
1925
+ if (plen)
16791926 src_key = &rt->rt6i_src.addr;
16801927 #endif
1681
- rt6_ex = __rt6_find_exception_rcu(&bucket,
1682
- &rt->rt6i_dst.addr,
1683
- src_key);
1928
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
16841929 if (rt6_ex)
16851930 rt6_ex->stamp = jiffies;
1686
-
1687
-unlock:
1688
- rcu_read_unlock();
16891931 }
16901932
1691
-static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1933
+struct fib6_nh_match_arg {
1934
+ const struct net_device *dev;
1935
+ const struct in6_addr *gw;
1936
+ struct fib6_nh *match;
1937
+};
1938
+
1939
+/* determine if fib6_nh has given device and gateway */
1940
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
16921941 {
1693
- struct rt6_exception_bucket *bucket;
1694
- struct rt6_exception *rt6_ex;
1695
- int i;
1942
+ struct fib6_nh_match_arg *arg = _arg;
16961943
1697
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698
- lockdep_is_held(&rt6_exception_lock));
1944
+ if (arg->dev != nh->fib_nh_dev ||
1945
+ (arg->gw && !nh->fib_nh_gw_family) ||
1946
+ (!arg->gw && nh->fib_nh_gw_family) ||
1947
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1948
+ return 0;
16991949
1700
- if (bucket) {
1701
- for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702
- hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1703
- rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1704
- }
1705
- bucket++;
1706
- }
1950
+ arg->match = nh;
1951
+
1952
+ /* found a match, break the loop */
1953
+ return 1;
1954
+}
1955
+
1956
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1957
+{
1958
+ struct fib6_info *from;
1959
+ struct fib6_nh *fib6_nh;
1960
+
1961
+ rcu_read_lock();
1962
+
1963
+ from = rcu_dereference(rt->from);
1964
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
1965
+ goto unlock;
1966
+
1967
+ if (from->nh) {
1968
+ struct fib6_nh_match_arg arg = {
1969
+ .dev = rt->dst.dev,
1970
+ .gw = &rt->rt6i_gateway,
1971
+ };
1972
+
1973
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1974
+
1975
+ if (!arg.match)
1976
+ goto unlock;
1977
+ fib6_nh = arg.match;
1978
+ } else {
1979
+ fib6_nh = from->fib6_nh;
17071980 }
1981
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1982
+unlock:
1983
+ rcu_read_unlock();
17081984 }
17091985
17101986 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
....@@ -1730,15 +2006,13 @@
17302006 }
17312007
17322008 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1733
- struct fib6_info *rt, int mtu)
2009
+ const struct fib6_nh *nh, int mtu)
17342010 {
17352011 struct rt6_exception_bucket *bucket;
17362012 struct rt6_exception *rt6_ex;
17372013 int i;
17382014
1739
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1740
- lockdep_is_held(&rt6_exception_lock));
1741
-
2015
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
17422016 if (!bucket)
17432017 return;
17442018
....@@ -1760,21 +2034,19 @@
17602034
17612035 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
17622036
1763
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1764
- struct in6_addr *gateway)
2037
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2038
+ const struct in6_addr *gateway)
17652039 {
17662040 struct rt6_exception_bucket *bucket;
17672041 struct rt6_exception *rt6_ex;
17682042 struct hlist_node *tmp;
17692043 int i;
17702044
1771
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
2045
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
17722046 return;
17732047
17742048 spin_lock_bh(&rt6_exception_lock);
1775
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1776
- lockdep_is_held(&rt6_exception_lock));
1777
-
2049
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
17782050 if (bucket) {
17792051 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
17802052 hlist_for_each_entry_safe(rt6_ex, tmp,
....@@ -1839,23 +2111,21 @@
18392111 gc_args->more++;
18402112 }
18412113
1842
-void rt6_age_exceptions(struct fib6_info *rt,
1843
- struct fib6_gc_args *gc_args,
1844
- unsigned long now)
2114
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2115
+ struct fib6_gc_args *gc_args,
2116
+ unsigned long now)
18452117 {
18462118 struct rt6_exception_bucket *bucket;
18472119 struct rt6_exception *rt6_ex;
18482120 struct hlist_node *tmp;
18492121 int i;
18502122
1851
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
2123
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
18522124 return;
18532125
18542126 rcu_read_lock_bh();
18552127 spin_lock(&rt6_exception_lock);
1856
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1857
- lockdep_is_held(&rt6_exception_lock));
1858
-
2128
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
18592129 if (bucket) {
18602130 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
18612131 hlist_for_each_entry_safe(rt6_ex, tmp,
....@@ -1870,12 +2140,41 @@
18702140 rcu_read_unlock_bh();
18712141 }
18722142
2143
+struct fib6_nh_age_excptn_arg {
2144
+ struct fib6_gc_args *gc_args;
2145
+ unsigned long now;
2146
+};
2147
+
2148
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2149
+{
2150
+ struct fib6_nh_age_excptn_arg *arg = _arg;
2151
+
2152
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2153
+ return 0;
2154
+}
2155
+
2156
+void rt6_age_exceptions(struct fib6_info *f6i,
2157
+ struct fib6_gc_args *gc_args,
2158
+ unsigned long now)
2159
+{
2160
+ if (f6i->nh) {
2161
+ struct fib6_nh_age_excptn_arg arg = {
2162
+ .gc_args = gc_args,
2163
+ .now = now
2164
+ };
2165
+
2166
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2167
+ &arg);
2168
+ } else {
2169
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2170
+ }
2171
+}
2172
+
18732173 /* must be called with rcu lock held */
1874
-struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1875
- int oif, struct flowi6 *fl6, int strict)
2174
+int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2175
+ struct flowi6 *fl6, struct fib6_result *res, int strict)
18762176 {
18772177 struct fib6_node *fn, *saved_fn;
1878
- struct fib6_info *f6i;
18792178
18802179 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
18812180 saved_fn = fn;
....@@ -1884,8 +2183,8 @@
18842183 oif = 0;
18852184
18862185 redo_rt6_select:
1887
- f6i = rt6_select(net, fn, oif, strict);
1888
- if (f6i == net->ipv6.fib6_null_entry) {
2186
+ rt6_select(net, fn, oif, res, strict);
2187
+ if (res->f6i == net->ipv6.fib6_null_entry) {
18892188 fn = fib6_backtrack(fn, &fl6->saddr);
18902189 if (fn)
18912190 goto redo_rt6_select;
....@@ -1897,18 +2196,21 @@
18972196 }
18982197 }
18992198
1900
- trace_fib6_table_lookup(net, f6i, table, fl6);
2199
+ trace_fib6_table_lookup(net, res, table, fl6);
19012200
1902
- return f6i;
2201
+ return 0;
19032202 }
19042203
19052204 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
19062205 int oif, struct flowi6 *fl6,
19072206 const struct sk_buff *skb, int flags)
19082207 {
1909
- struct fib6_info *f6i;
1910
- struct rt6_info *rt;
2208
+ struct fib6_result res = {};
2209
+ struct rt6_info *rt = NULL;
19112210 int strict = 0;
2211
+
2212
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
2213
+ !rcu_read_lock_held());
19122214
19132215 strict |= flags & RT6_LOOKUP_F_IFACE;
19142216 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
....@@ -1917,70 +2219,59 @@
19172219
19182220 rcu_read_lock();
19192221
1920
- f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1921
- if (f6i->fib6_nsiblings)
1922
- f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
2222
+ fib6_table_lookup(net, table, oif, fl6, &res, strict);
2223
+ if (res.f6i == net->ipv6.fib6_null_entry)
2224
+ goto out;
19232225
1924
- if (f6i == net->ipv6.fib6_null_entry) {
1925
- rt = net->ipv6.ip6_null_entry;
1926
- rcu_read_unlock();
1927
- dst_hold(&rt->dst);
1928
- return rt;
1929
- }
2226
+ fib6_select_path(net, &res, fl6, oif, false, skb, strict);
19302227
19312228 /*Search through exception table */
1932
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
2229
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
19332230 if (rt) {
1934
- if (ip6_hold_safe(net, &rt, true))
1935
- dst_use_noref(&rt->dst, jiffies);
1936
-
1937
- rcu_read_unlock();
1938
- return rt;
2231
+ goto out;
19392232 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1940
- !(f6i->fib6_flags & RTF_GATEWAY))) {
2233
+ !res.nh->fib_nh_gw_family)) {
19412234 /* Create a RTF_CACHE clone which will not be
19422235 * owned by the fib6 tree. It is for the special case where
19432236 * the daddr in the skb during the neighbor look-up is different
19442237 * from the fl6->daddr used to look-up route here.
19452238 */
1946
- struct rt6_info *uncached_rt;
2239
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
19472240
1948
- uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1949
-
1950
- rcu_read_unlock();
1951
-
1952
- if (uncached_rt) {
1953
- /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1954
- * No need for another dst_hold()
2241
+ if (rt) {
2242
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
2243
+ * As rt6_uncached_list_add() does not consume refcnt,
2244
+ * this refcnt is always returned to the caller even
2245
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
19552246 */
1956
- rt6_uncached_list_add(uncached_rt);
2247
+ rt6_uncached_list_add(rt);
19572248 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1958
- } else {
1959
- uncached_rt = net->ipv6.ip6_null_entry;
1960
- dst_hold(&uncached_rt->dst);
1961
- }
2249
+ rcu_read_unlock();
19622250
1963
- return uncached_rt;
2251
+ return rt;
2252
+ }
19642253 } else {
19652254 /* Get a percpu copy */
1966
-
1967
- struct rt6_info *pcpu_rt;
1968
-
19692255 local_bh_disable();
1970
- pcpu_rt = rt6_get_pcpu_route(f6i);
2256
+ rt = rt6_get_pcpu_route(&res);
19712257
1972
- if (!pcpu_rt)
1973
- pcpu_rt = rt6_make_pcpu_route(net, f6i);
2258
+ if (!rt)
2259
+ rt = rt6_make_pcpu_route(net, &res);
19742260
19752261 local_bh_enable();
1976
- rcu_read_unlock();
1977
-
1978
- return pcpu_rt;
19792262 }
2263
+out:
2264
+ if (!rt)
2265
+ rt = net->ipv6.ip6_null_entry;
2266
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
2267
+ ip6_hold_safe(net, &rt);
2268
+ rcu_read_unlock();
2269
+
2270
+ return rt;
19802271 }
19812272 EXPORT_SYMBOL_GPL(ip6_pol_route);
19822273
1983
-static struct rt6_info *ip6_pol_route_input(struct net *net,
2274
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
19842275 struct fib6_table *table,
19852276 struct flowi6 *fl6,
19862277 const struct sk_buff *skb,
....@@ -2022,10 +2313,7 @@
20222313 if (!icmph)
20232314 goto out;
20242315
2025
- if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2026
- icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2027
- icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2028
- icmph->icmp6_type != ICMPV6_PARAMPROB)
2316
+ if (!icmpv6_is_err(icmph->icmp6_type))
20292317 goto out;
20302318
20312319 inner_iph = skb_header_pointer(skb,
....@@ -2101,17 +2389,54 @@
21012389 hash_keys.basic.ip_proto = fl6->flowi6_proto;
21022390 }
21032391 break;
2392
+ case 2:
2393
+ memset(&hash_keys, 0, sizeof(hash_keys));
2394
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2395
+ if (skb) {
2396
+ struct flow_keys keys;
2397
+
2398
+ if (!flkeys) {
2399
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
2400
+ flkeys = &keys;
2401
+ }
2402
+
2403
+ /* Inner can be v4 or v6 */
2404
+ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2405
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2406
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2407
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2408
+ } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2409
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2410
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2411
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2412
+ hash_keys.tags.flow_label = flkeys->tags.flow_label;
2413
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2414
+ } else {
2415
+ /* Same as case 0 */
2416
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2417
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2418
+ }
2419
+ } else {
2420
+ /* Same as case 0 */
2421
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2422
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
2423
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
2424
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2425
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
2426
+ }
2427
+ break;
21042428 }
21052429 mhash = flow_hash_from_keys(&hash_keys);
21062430
21072431 return mhash >> 1;
21082432 }
21092433
2434
+/* Called with rcu held */
21102435 void ip6_route_input(struct sk_buff *skb)
21112436 {
21122437 const struct ipv6hdr *iph = ipv6_hdr(skb);
21132438 struct net *net = dev_net(skb->dev);
2114
- int flags = RT6_LOOKUP_F_HAS_SADDR;
2439
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
21152440 struct ip_tunnel_info *tun_info;
21162441 struct flowi6 fl6 = {
21172442 .flowi6_iif = skb->dev->ifindex,
....@@ -2133,11 +2458,11 @@
21332458 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
21342459 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
21352460 skb_dst_drop(skb);
2136
- skb_dst_set(skb,
2137
- ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2461
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2462
+ &fl6, skb, flags));
21382463 }
21392464
2140
-static struct rt6_info *ip6_pol_route_output(struct net *net,
2465
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
21412466 struct fib6_table *table,
21422467 struct flowi6 *fl6,
21432468 const struct sk_buff *skb,
....@@ -2146,14 +2471,17 @@
21462471 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
21472472 }
21482473
2149
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2150
- struct flowi6 *fl6, int flags)
2474
+struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2475
+ const struct sock *sk,
2476
+ struct flowi6 *fl6, int flags)
21512477 {
21522478 bool any_src;
21532479
2154
- if (rt6_need_strict(&fl6->daddr)) {
2480
+ if (ipv6_addr_type(&fl6->daddr) &
2481
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
21552482 struct dst_entry *dst;
21562483
2484
+ /* This function does not take refcnt on the dst */
21572485 dst = l3mdev_link_scope_lookup(net, fl6);
21582486 if (dst)
21592487 return dst;
....@@ -2161,6 +2489,7 @@
21612489
21622490 fl6->flowi6_iif = LOOPBACK_IFINDEX;
21632491
2492
+ flags |= RT6_LOOKUP_F_DST_NOREF;
21642493 any_src = ipv6_addr_any(&fl6->saddr);
21652494 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
21662495 (fl6->flowi6_oif && any_src))
....@@ -2172,6 +2501,28 @@
21722501 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
21732502
21742503 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2504
+}
2505
+EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
2506
+
2507
+struct dst_entry *ip6_route_output_flags(struct net *net,
2508
+ const struct sock *sk,
2509
+ struct flowi6 *fl6,
2510
+ int flags)
2511
+{
2512
+ struct dst_entry *dst;
2513
+ struct rt6_info *rt6;
2514
+
2515
+ rcu_read_lock();
2516
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2517
+ rt6 = (struct rt6_info *)dst;
2518
+ /* For dst cached in uncached_list, refcnt is already taken. */
2519
+ if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
2520
+ dst = &net->ipv6.ip6_null_entry->dst;
2521
+ dst_hold(dst);
2522
+ }
2523
+ rcu_read_unlock();
2524
+
2525
+ return dst;
21752526 }
21762527 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
21772528
....@@ -2260,6 +2611,9 @@
22602611 struct rt6_info *rt;
22612612
22622613 rt = container_of(dst, struct rt6_info, dst);
2614
+
2615
+ if (rt->sernum)
2616
+ return rt6_is_valid(rt) ? dst : NULL;
22632617
22642618 rcu_read_lock();
22652619
....@@ -2354,14 +2708,8 @@
23542708
23552709 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
23562710 {
2357
- bool from_set;
2358
-
2359
- rcu_read_lock();
2360
- from_set = !!rcu_dereference(rt->from);
2361
- rcu_read_unlock();
2362
-
23632711 return !(rt->rt6i_flags & RTF_CACHE) &&
2364
- (rt->rt6i_flags & RTF_PCPU || from_set);
2712
+ (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
23652713 }
23662714
23672715 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
....@@ -2390,7 +2738,8 @@
23902738 if (confirm_neigh)
23912739 dst_confirm_neigh(dst, daddr);
23922740
2393
- mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2741
+ if (mtu < IPV6_MIN_MTU)
2742
+ return;
23942743 if (mtu >= dst_mtu(dst))
23952744 return;
23962745
....@@ -2400,21 +2749,44 @@
24002749 if (rt6->rt6i_flags & RTF_CACHE)
24012750 rt6_update_exception_stamp_rt(rt6);
24022751 } else if (daddr) {
2403
- struct fib6_info *from;
2752
+ struct fib6_result res = {};
24042753 struct rt6_info *nrt6;
24052754
24062755 rcu_read_lock();
2407
- from = rcu_dereference(rt6->from);
2408
- if (!from) {
2409
- rcu_read_unlock();
2410
- return;
2756
+ res.f6i = rcu_dereference(rt6->from);
2757
+ if (!res.f6i)
2758
+ goto out_unlock;
2759
+
2760
+ res.fib6_flags = res.f6i->fib6_flags;
2761
+ res.fib6_type = res.f6i->fib6_type;
2762
+
2763
+ if (res.f6i->nh) {
2764
+ struct fib6_nh_match_arg arg = {
2765
+ .dev = dst->dev,
2766
+ .gw = &rt6->rt6i_gateway,
2767
+ };
2768
+
2769
+ nexthop_for_each_fib6_nh(res.f6i->nh,
2770
+ fib6_nh_find_match, &arg);
2771
+
2772
+ /* fib6_info uses a nexthop that does not have fib6_nh
2773
+ * using the dst->dev + gw. Should be impossible.
2774
+ */
2775
+ if (!arg.match)
2776
+ goto out_unlock;
2777
+
2778
+ res.nh = arg.match;
2779
+ } else {
2780
+ res.nh = res.f6i->fib6_nh;
24112781 }
2412
- nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2782
+
2783
+ nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
24132784 if (nrt6) {
24142785 rt6_do_update_pmtu(nrt6, mtu);
2415
- if (rt6_insert_exception(nrt6, from))
2786
+ if (rt6_insert_exception(nrt6, &res))
24162787 dst_release_immediate(&nrt6->dst);
24172788 }
2789
+out_unlock:
24182790 rcu_read_unlock();
24192791 }
24202792 }
....@@ -2432,15 +2804,14 @@
24322804 {
24332805 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
24342806 struct dst_entry *dst;
2435
- struct flowi6 fl6;
2436
-
2437
- memset(&fl6, 0, sizeof(fl6));
2438
- fl6.flowi6_oif = oif;
2439
- fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2440
- fl6.daddr = iph->daddr;
2441
- fl6.saddr = iph->saddr;
2442
- fl6.flowlabel = ip6_flowinfo(iph);
2443
- fl6.flowi6_uid = uid;
2807
+ struct flowi6 fl6 = {
2808
+ .flowi6_oif = oif,
2809
+ .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2810
+ .daddr = iph->daddr,
2811
+ .saddr = iph->saddr,
2812
+ .flowlabel = ip6_flowinfo(iph),
2813
+ .flowi6_uid = uid,
2814
+ };
24442815
24452816 dst = ip6_route_output(net, NULL, &fl6);
24462817 if (!dst->error)
....@@ -2488,20 +2859,72 @@
24882859 NULL);
24892860 }
24902861
2862
+static bool ip6_redirect_nh_match(const struct fib6_result *res,
2863
+ struct flowi6 *fl6,
2864
+ const struct in6_addr *gw,
2865
+ struct rt6_info **ret)
2866
+{
2867
+ const struct fib6_nh *nh = res->nh;
2868
+
2869
+ if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2870
+ fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2871
+ return false;
2872
+
2873
+ /* rt_cache's gateway might be different from its 'parent'
2874
+ * in the case of an ip redirect.
2875
+ * So we keep searching in the exception table if the gateway
2876
+ * is different.
2877
+ */
2878
+ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2879
+ struct rt6_info *rt_cache;
2880
+
2881
+ rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2882
+ if (rt_cache &&
2883
+ ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2884
+ *ret = rt_cache;
2885
+ return true;
2886
+ }
2887
+ return false;
2888
+ }
2889
+ return true;
2890
+}
2891
+
2892
+struct fib6_nh_rd_arg {
2893
+ struct fib6_result *res;
2894
+ struct flowi6 *fl6;
2895
+ const struct in6_addr *gw;
2896
+ struct rt6_info **ret;
2897
+};
2898
+
2899
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
2900
+{
2901
+ struct fib6_nh_rd_arg *arg = _arg;
2902
+
2903
+ arg->res->nh = nh;
2904
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
2905
+}
2906
+
24912907 /* Handle redirects */
24922908 struct ip6rd_flowi {
24932909 struct flowi6 fl6;
24942910 struct in6_addr gateway;
24952911 };
24962912
2497
-static struct rt6_info *__ip6_route_redirect(struct net *net,
2913
+INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
24982914 struct fib6_table *table,
24992915 struct flowi6 *fl6,
25002916 const struct sk_buff *skb,
25012917 int flags)
25022918 {
25032919 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2504
- struct rt6_info *ret = NULL, *rt_cache;
2920
+ struct rt6_info *ret = NULL;
2921
+ struct fib6_result res = {};
2922
+ struct fib6_nh_rd_arg arg = {
2923
+ .res = &res,
2924
+ .fl6 = fl6,
2925
+ .gw = &rdfl->gateway,
2926
+ .ret = &ret
2927
+ };
25052928 struct fib6_info *rt;
25062929 struct fib6_node *fn;
25072930
....@@ -2525,34 +2948,25 @@
25252948 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
25262949 restart:
25272950 for_each_fib6_node_rt_rcu(fn) {
2528
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2529
- continue;
2951
+ res.f6i = rt;
25302952 if (fib6_check_expired(rt))
25312953 continue;
25322954 if (rt->fib6_flags & RTF_REJECT)
25332955 break;
2534
- if (!(rt->fib6_flags & RTF_GATEWAY))
2535
- continue;
2536
- if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2537
- continue;
2538
- /* rt_cache's gateway might be different from its 'parent'
2539
- * in the case of an ip redirect.
2540
- * So we keep searching in the exception table if the gateway
2541
- * is different.
2542
- */
2543
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2544
- rt_cache = rt6_find_cached_rt(rt,
2545
- &fl6->daddr,
2546
- &fl6->saddr);
2547
- if (rt_cache &&
2548
- ipv6_addr_equal(&rdfl->gateway,
2549
- &rt_cache->rt6i_gateway)) {
2550
- ret = rt_cache;
2551
- break;
2552
- }
2553
- continue;
2956
+ if (unlikely(rt->nh)) {
2957
+ if (nexthop_is_blackhole(rt->nh))
2958
+ continue;
2959
+ /* on match, res->nh is filled in and potentially ret */
2960
+ if (nexthop_for_each_fib6_nh(rt->nh,
2961
+ fib6_nh_redirect_match,
2962
+ &arg))
2963
+ goto out;
2964
+ } else {
2965
+ res.nh = rt->fib6_nh;
2966
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
2967
+ &ret))
2968
+ goto out;
25542969 }
2555
- break;
25562970 }
25572971
25582972 if (!rt)
....@@ -2568,15 +2982,20 @@
25682982 goto restart;
25692983 }
25702984
2985
+ res.f6i = rt;
2986
+ res.nh = rt->fib6_nh;
25712987 out:
2572
- if (ret)
2573
- ip6_hold_safe(net, &ret, true);
2574
- else
2575
- ret = ip6_create_rt_rcu(rt);
2988
+ if (ret) {
2989
+ ip6_hold_safe(net, &ret);
2990
+ } else {
2991
+ res.fib6_flags = res.f6i->fib6_flags;
2992
+ res.fib6_type = res.f6i->fib6_type;
2993
+ ret = ip6_create_rt_rcu(&res);
2994
+ }
25762995
25772996 rcu_read_unlock();
25782997
2579
- trace_fib6_table_lookup(net, rt, table, fl6);
2998
+ trace_fib6_table_lookup(net, &res, table, fl6);
25802999 return ret;
25813000 };
25823001
....@@ -2600,16 +3019,15 @@
26003019 {
26013020 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
26023021 struct dst_entry *dst;
2603
- struct flowi6 fl6;
2604
-
2605
- memset(&fl6, 0, sizeof(fl6));
2606
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
2607
- fl6.flowi6_oif = oif;
2608
- fl6.flowi6_mark = mark;
2609
- fl6.daddr = iph->daddr;
2610
- fl6.saddr = iph->saddr;
2611
- fl6.flowlabel = ip6_flowinfo(iph);
2612
- fl6.flowi6_uid = uid;
3022
+ struct flowi6 fl6 = {
3023
+ .flowi6_iif = LOOPBACK_IFINDEX,
3024
+ .flowi6_oif = oif,
3025
+ .flowi6_mark = mark,
3026
+ .daddr = iph->daddr,
3027
+ .saddr = iph->saddr,
3028
+ .flowlabel = ip6_flowinfo(iph),
3029
+ .flowi6_uid = uid,
3030
+ };
26133031
26143032 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
26153033 rt6_do_redirect(dst, NULL, skb);
....@@ -2617,21 +3035,18 @@
26173035 }
26183036 EXPORT_SYMBOL_GPL(ip6_redirect);
26193037
2620
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2621
- u32 mark)
3038
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
26223039 {
26233040 const struct ipv6hdr *iph = ipv6_hdr(skb);
26243041 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
26253042 struct dst_entry *dst;
2626
- struct flowi6 fl6;
2627
-
2628
- memset(&fl6, 0, sizeof(fl6));
2629
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
2630
- fl6.flowi6_oif = oif;
2631
- fl6.flowi6_mark = mark;
2632
- fl6.daddr = msg->dest;
2633
- fl6.saddr = iph->daddr;
2634
- fl6.flowi6_uid = sock_net_uid(net, NULL);
3043
+ struct flowi6 fl6 = {
3044
+ .flowi6_iif = LOOPBACK_IFINDEX,
3045
+ .flowi6_oif = oif,
3046
+ .daddr = msg->dest,
3047
+ .saddr = iph->daddr,
3048
+ .flowi6_uid = sock_net_uid(net, NULL),
3049
+ };
26353050
26363051 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
26373052 rt6_do_redirect(dst, NULL, skb);
....@@ -2698,9 +3113,12 @@
26983113 * based on ip6_dst_mtu_forward and exception logic of
26993114 * rt6_find_cached_rt; called with rcu_read_lock
27003115 */
2701
-u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2702
- struct in6_addr *saddr)
3116
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3117
+ const struct in6_addr *daddr,
3118
+ const struct in6_addr *saddr)
27033119 {
3120
+ const struct fib6_nh *nh = res->nh;
3121
+ struct fib6_info *f6i = res->f6i;
27043122 struct inet6_dev *idev;
27053123 struct rt6_info *rt;
27063124 u32 mtu = 0;
....@@ -2711,11 +3129,11 @@
27113129 goto out;
27123130 }
27133131
2714
- rt = rt6_find_cached_rt(f6i, daddr, saddr);
3132
+ rt = rt6_find_cached_rt(res, daddr, saddr);
27153133 if (unlikely(rt)) {
27163134 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
27173135 } else {
2718
- struct net_device *dev = fib6_info_nh_dev(f6i);
3136
+ struct net_device *dev = nh->fib_nh_dev;
27193137
27203138 mtu = IPV6_MIN_MTU;
27213139 idev = __in6_dev_get(dev);
....@@ -2725,7 +3143,7 @@
27253143
27263144 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
27273145 out:
2728
- return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
3146
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
27293147 }
27303148
27313149 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
....@@ -2746,7 +3164,6 @@
27463164 goto out;
27473165 }
27483166
2749
- rt->dst.flags |= DST_HOST;
27503167 rt->dst.input = ip6_input;
27513168 rt->dst.output = ip6_output;
27523169 rt->rt6i_gateway = fl6->daddr;
....@@ -2778,6 +3195,9 @@
27783195 int entries;
27793196
27803197 entries = dst_entries_get_fast(ops);
3198
+ if (entries > rt_max_size)
3199
+ entries = dst_entries_get_slow(ops);
3200
+
27813201 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
27823202 entries <= rt_max_size)
27833203 goto out;
....@@ -2792,28 +3212,9 @@
27923212 return entries > rt_max_size;
27933213 }
27943214
2795
-static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2796
- struct fib6_config *cfg)
2797
-{
2798
- struct dst_metrics *p;
2799
-
2800
- if (!cfg->fc_mx)
2801
- return 0;
2802
-
2803
- p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2804
- if (unlikely(!p))
2805
- return -ENOMEM;
2806
-
2807
- refcount_set(&p->refcnt, 1);
2808
- rt->fib6_metrics = p;
2809
-
2810
- return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2811
-}
2812
-
2813
-static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2814
- struct fib6_config *cfg,
2815
- const struct in6_addr *gw_addr,
2816
- u32 tbid, int flags)
3215
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3216
+ const struct in6_addr *gw_addr, u32 tbid,
3217
+ int flags, struct fib6_result *res)
28173218 {
28183219 struct flowi6 fl6 = {
28193220 .flowi6_oif = cfg->fc_ifindex,
....@@ -2821,25 +3222,23 @@
28213222 .saddr = cfg->fc_prefsrc,
28223223 };
28233224 struct fib6_table *table;
2824
- struct rt6_info *rt;
3225
+ int err;
28253226
28263227 table = fib6_get_table(net, tbid);
28273228 if (!table)
2828
- return NULL;
3229
+ return -EINVAL;
28293230
28303231 if (!ipv6_addr_any(&cfg->fc_prefsrc))
28313232 flags |= RT6_LOOKUP_F_HAS_SADDR;
28323233
28333234 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2834
- rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
28353235
2836
- /* if table lookup failed, fall back to full lookup */
2837
- if (rt == net->ipv6.ip6_null_entry) {
2838
- ip6_rt_put(rt);
2839
- rt = NULL;
2840
- }
3236
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3237
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
3238
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3239
+ cfg->fc_ifindex != 0, NULL, flags);
28413240
2842
- return rt;
3241
+ return err;
28433242 }
28443243
28453244 static int ip6_route_check_nh_onlink(struct net *net,
....@@ -2847,29 +3246,19 @@
28473246 const struct net_device *dev,
28483247 struct netlink_ext_ack *extack)
28493248 {
2850
- u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
3249
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
28513250 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2852
- u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2853
- struct fib6_info *from;
2854
- struct rt6_info *grt;
3251
+ struct fib6_result res = {};
28553252 int err;
28563253
2857
- err = 0;
2858
- grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2859
- if (grt) {
2860
- rcu_read_lock();
2861
- from = rcu_dereference(grt->from);
2862
- if (!grt->dst.error &&
2863
- /* ignore match if it is the default route */
2864
- from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2865
- (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2866
- NL_SET_ERR_MSG(extack,
2867
- "Nexthop has invalid gateway or device mismatch");
2868
- err = -EINVAL;
2869
- }
2870
- rcu_read_unlock();
2871
-
2872
- ip6_rt_put(grt);
3254
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3255
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
3256
+ /* ignore match if it is the default route */
3257
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3258
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3259
+ NL_SET_ERR_MSG(extack,
3260
+ "Nexthop has invalid gateway or device mismatch");
3261
+ err = -EINVAL;
28733262 }
28743263
28753264 return err;
....@@ -2882,47 +3271,50 @@
28823271 {
28833272 const struct in6_addr *gw_addr = &cfg->fc_gateway;
28843273 struct net_device *dev = _dev ? *_dev : NULL;
2885
- struct rt6_info *grt = NULL;
3274
+ int flags = RT6_LOOKUP_F_IFACE;
3275
+ struct fib6_result res = {};
28863276 int err = -EHOSTUNREACH;
28873277
28883278 if (cfg->fc_table) {
2889
- int flags = RT6_LOOKUP_F_IFACE;
2890
-
2891
- grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2892
- cfg->fc_table, flags);
2893
- if (grt) {
2894
- if (grt->rt6i_flags & RTF_GATEWAY ||
2895
- (dev && dev != grt->dst.dev)) {
2896
- ip6_rt_put(grt);
2897
- grt = NULL;
2898
- }
2899
- }
3279
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
3280
+ cfg->fc_table, flags, &res);
3281
+ /* gw_addr can not require a gateway or resolve to a reject
3282
+ * route. If a device is given, it must match the result.
3283
+ */
3284
+ if (err || res.fib6_flags & RTF_REJECT ||
3285
+ res.nh->fib_nh_gw_family ||
3286
+ (dev && dev != res.nh->fib_nh_dev))
3287
+ err = -EHOSTUNREACH;
29003288 }
29013289
2902
- if (!grt)
2903
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
3290
+ if (err < 0) {
3291
+ struct flowi6 fl6 = {
3292
+ .flowi6_oif = cfg->fc_ifindex,
3293
+ .daddr = *gw_addr,
3294
+ };
29043295
2905
- if (!grt)
2906
- goto out;
3296
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
3297
+ if (err || res.fib6_flags & RTF_REJECT ||
3298
+ res.nh->fib_nh_gw_family)
3299
+ err = -EHOSTUNREACH;
29073300
3301
+ if (err)
3302
+ return err;
3303
+
3304
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
3305
+ cfg->fc_ifindex != 0, NULL, flags);
3306
+ }
3307
+
3308
+ err = 0;
29083309 if (dev) {
2909
- if (dev != grt->dst.dev) {
2910
- ip6_rt_put(grt);
2911
- goto out;
2912
- }
3310
+ if (dev != res.nh->fib_nh_dev)
3311
+ err = -EHOSTUNREACH;
29133312 } else {
2914
- *_dev = dev = grt->dst.dev;
2915
- *idev = grt->rt6i_idev;
3313
+ *_dev = dev = res.nh->fib_nh_dev;
29163314 dev_hold(dev);
2917
- in6_dev_hold(grt->rt6i_idev);
3315
+ *idev = in6_dev_get(dev);
29183316 }
29193317
2920
- if (!(grt->rt6i_flags & RTF_GATEWAY))
2921
- err = 0;
2922
-
2923
- ip6_rt_put(grt);
2924
-
2925
-out:
29263318 return err;
29273319 }
29283320
....@@ -2963,10 +3355,14 @@
29633355 goto out;
29643356 }
29653357
3358
+ rcu_read_lock();
3359
+
29663360 if (cfg->fc_flags & RTNH_F_ONLINK)
29673361 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
29683362 else
29693363 err = ip6_route_check_nh(net, cfg, _dev, idev);
3364
+
3365
+ rcu_read_unlock();
29703366
29713367 if (err)
29723368 goto out;
....@@ -2999,17 +3395,192 @@
29993395 return err;
30003396 }
30013397
3398
+static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3399
+{
3400
+ if ((flags & RTF_REJECT) ||
3401
+ (dev && (dev->flags & IFF_LOOPBACK) &&
3402
+ !(addr_type & IPV6_ADDR_LOOPBACK) &&
3403
+ !(flags & (RTF_ANYCAST | RTF_LOCAL))))
3404
+ return true;
3405
+
3406
+ return false;
3407
+}
3408
+
3409
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3410
+ struct fib6_config *cfg, gfp_t gfp_flags,
3411
+ struct netlink_ext_ack *extack)
3412
+{
3413
+ struct net_device *dev = NULL;
3414
+ struct inet6_dev *idev = NULL;
3415
+ int addr_type;
3416
+ int err;
3417
+
3418
+ fib6_nh->fib_nh_family = AF_INET6;
3419
+#ifdef CONFIG_IPV6_ROUTER_PREF
3420
+ fib6_nh->last_probe = jiffies;
3421
+#endif
3422
+ if (cfg->fc_is_fdb) {
3423
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3424
+ fib6_nh->fib_nh_gw_family = AF_INET6;
3425
+ return 0;
3426
+ }
3427
+
3428
+ err = -ENODEV;
3429
+ if (cfg->fc_ifindex) {
3430
+ dev = dev_get_by_index(net, cfg->fc_ifindex);
3431
+ if (!dev)
3432
+ goto out;
3433
+ idev = in6_dev_get(dev);
3434
+ if (!idev)
3435
+ goto out;
3436
+ }
3437
+
3438
+ if (cfg->fc_flags & RTNH_F_ONLINK) {
3439
+ if (!dev) {
3440
+ NL_SET_ERR_MSG(extack,
3441
+ "Nexthop device required for onlink");
3442
+ goto out;
3443
+ }
3444
+
3445
+ if (!(dev->flags & IFF_UP)) {
3446
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3447
+ err = -ENETDOWN;
3448
+ goto out;
3449
+ }
3450
+
3451
+ fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3452
+ }
3453
+
3454
+ fib6_nh->fib_nh_weight = 1;
3455
+
3456
+ /* We cannot add true routes via loopback here,
3457
+ * they would result in kernel looping; promote them to reject routes
3458
+ */
3459
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
3460
+ if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3461
+ /* hold loopback dev/idev if we haven't done so. */
3462
+ if (dev != net->loopback_dev) {
3463
+ if (dev) {
3464
+ dev_put(dev);
3465
+ in6_dev_put(idev);
3466
+ }
3467
+ dev = net->loopback_dev;
3468
+ dev_hold(dev);
3469
+ idev = in6_dev_get(dev);
3470
+ if (!idev) {
3471
+ err = -ENODEV;
3472
+ goto out;
3473
+ }
3474
+ }
3475
+ goto pcpu_alloc;
3476
+ }
3477
+
3478
+ if (cfg->fc_flags & RTF_GATEWAY) {
3479
+ err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3480
+ if (err)
3481
+ goto out;
3482
+
3483
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3484
+ fib6_nh->fib_nh_gw_family = AF_INET6;
3485
+ }
3486
+
3487
+ err = -ENODEV;
3488
+ if (!dev)
3489
+ goto out;
3490
+
3491
+ if (idev->cnf.disable_ipv6) {
3492
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3493
+ err = -EACCES;
3494
+ goto out;
3495
+ }
3496
+
3497
+ if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3498
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3499
+ err = -ENETDOWN;
3500
+ goto out;
3501
+ }
3502
+
3503
+ if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3504
+ !netif_carrier_ok(dev))
3505
+ fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3506
+
3507
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
3508
+ cfg->fc_encap_type, cfg, gfp_flags, extack);
3509
+ if (err)
3510
+ goto out;
3511
+
3512
+pcpu_alloc:
3513
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3514
+ if (!fib6_nh->rt6i_pcpu) {
3515
+ err = -ENOMEM;
3516
+ goto out;
3517
+ }
3518
+
3519
+ fib6_nh->fib_nh_dev = dev;
3520
+ fib6_nh->fib_nh_oif = dev->ifindex;
3521
+ err = 0;
3522
+out:
3523
+ if (idev)
3524
+ in6_dev_put(idev);
3525
+
3526
+ if (err) {
3527
+ lwtstate_put(fib6_nh->fib_nh_lws);
3528
+ fib6_nh->fib_nh_lws = NULL;
3529
+ if (dev)
3530
+ dev_put(dev);
3531
+ }
3532
+
3533
+ return err;
3534
+}
3535
+
3536
+void fib6_nh_release(struct fib6_nh *fib6_nh)
3537
+{
3538
+ struct rt6_exception_bucket *bucket;
3539
+
3540
+ rcu_read_lock();
3541
+
3542
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
3543
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3544
+ if (bucket) {
3545
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3546
+ kfree(bucket);
3547
+ }
3548
+
3549
+ rcu_read_unlock();
3550
+
3551
+ if (fib6_nh->rt6i_pcpu) {
3552
+ int cpu;
3553
+
3554
+ for_each_possible_cpu(cpu) {
3555
+ struct rt6_info **ppcpu_rt;
3556
+ struct rt6_info *pcpu_rt;
3557
+
3558
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3559
+ pcpu_rt = *ppcpu_rt;
3560
+ if (pcpu_rt) {
3561
+ dst_dev_put(&pcpu_rt->dst);
3562
+ dst_release(&pcpu_rt->dst);
3563
+ *ppcpu_rt = NULL;
3564
+ }
3565
+ }
3566
+
3567
+ free_percpu(fib6_nh->rt6i_pcpu);
3568
+ }
3569
+
3570
+ fib_nh_common_release(&fib6_nh->nh_common);
3571
+}
3572
+
30023573 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
30033574 gfp_t gfp_flags,
30043575 struct netlink_ext_ack *extack)
30053576 {
30063577 struct net *net = cfg->fc_nlinfo.nl_net;
30073578 struct fib6_info *rt = NULL;
3008
- struct net_device *dev = NULL;
3009
- struct inet6_dev *idev = NULL;
3579
+ struct nexthop *nh = NULL;
30103580 struct fib6_table *table;
3011
- int addr_type;
3581
+ struct fib6_nh *fib6_nh;
30123582 int err = -EINVAL;
3583
+ int addr_type;
30133584
30143585 /* RTF_PCPU is an internal flag; can not be set by userspace */
30153586 if (cfg->fc_flags & RTF_PCPU) {
....@@ -3043,32 +3614,15 @@
30433614 goto out;
30443615 }
30453616 #endif
3046
- if (cfg->fc_ifindex) {
3047
- err = -ENODEV;
3048
- dev = dev_get_by_index(net, cfg->fc_ifindex);
3049
- if (!dev)
3050
- goto out;
3051
- idev = in6_dev_get(dev);
3052
- if (!idev)
3053
- goto out;
3054
- }
3055
-
3056
- if (cfg->fc_metric == 0)
3057
- cfg->fc_metric = IP6_RT_PRIO_USER;
3058
-
3059
- if (cfg->fc_flags & RTNH_F_ONLINK) {
3060
- if (!dev) {
3061
- NL_SET_ERR_MSG(extack,
3062
- "Nexthop device required for onlink");
3063
- err = -ENODEV;
3617
+ if (cfg->fc_nh_id) {
3618
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3619
+ if (!nh) {
3620
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
30643621 goto out;
30653622 }
3066
-
3067
- if (!(dev->flags & IFF_UP)) {
3068
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3069
- err = -ENETDOWN;
3623
+ err = fib6_check_nexthop(nh, cfg, extack);
3624
+ if (err)
30703625 goto out;
3071
- }
30723626 }
30733627
30743628 err = -ENOBUFS;
....@@ -3087,19 +3641,21 @@
30873641 goto out;
30883642
30893643 err = -ENOMEM;
3090
- rt = fib6_info_alloc(gfp_flags);
3644
+ rt = fib6_info_alloc(gfp_flags, !nh);
30913645 if (!rt)
30923646 goto out;
30933647
3094
-#ifdef CONFIG_IPV6_ROUTER_PREF
3095
- rt->last_probe = jiffies;
3096
-#endif
3648
+ rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3649
+ extack);
3650
+ if (IS_ERR(rt->fib6_metrics)) {
3651
+ err = PTR_ERR(rt->fib6_metrics);
3652
+ /* Do not leave garbage there. */
3653
+ rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3654
+ goto out_free;
3655
+ }
3656
+
30973657 if (cfg->fc_flags & RTF_ADDRCONF)
30983658 rt->dst_nocount = true;
3099
-
3100
- err = ip6_convert_metrics(net, rt, cfg);
3101
- if (err < 0)
3102
- goto out;
31033659
31043660 if (cfg->fc_flags & RTF_EXPIRES)
31053661 fib6_set_expires(rt, jiffies +
....@@ -3111,84 +3667,48 @@
31113667 cfg->fc_protocol = RTPROT_BOOT;
31123668 rt->fib6_protocol = cfg->fc_protocol;
31133669
3114
- addr_type = ipv6_addr_type(&cfg->fc_dst);
3115
-
3116
- if (cfg->fc_encap) {
3117
- struct lwtunnel_state *lwtstate;
3118
-
3119
- err = lwtunnel_build_state(cfg->fc_encap_type,
3120
- cfg->fc_encap, AF_INET6, cfg,
3121
- &lwtstate, extack);
3122
- if (err)
3123
- goto out;
3124
- rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3125
- }
3670
+ rt->fib6_table = table;
3671
+ rt->fib6_metric = cfg->fc_metric;
3672
+ rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3673
+ rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
31263674
31273675 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
31283676 rt->fib6_dst.plen = cfg->fc_dst_len;
3129
- if (rt->fib6_dst.plen == 128)
3130
- rt->dst_host = true;
31313677
31323678 #ifdef CONFIG_IPV6_SUBTREES
31333679 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
31343680 rt->fib6_src.plen = cfg->fc_src_len;
31353681 #endif
3136
-
3137
- rt->fib6_metric = cfg->fc_metric;
3138
- rt->fib6_nh.nh_weight = 1;
3139
-
3140
- rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3141
-
3142
- /* We cannot add true routes via loopback here,
3143
- they would result in kernel looping; promote them to reject routes
3144
- */
3145
- if ((cfg->fc_flags & RTF_REJECT) ||
3146
- (dev && (dev->flags & IFF_LOOPBACK) &&
3147
- !(addr_type & IPV6_ADDR_LOOPBACK) &&
3148
- !(cfg->fc_flags & RTF_LOCAL))) {
3149
- /* hold loopback dev/idev if we haven't done so. */
3150
- if (dev != net->loopback_dev) {
3151
- if (dev) {
3152
- dev_put(dev);
3153
- in6_dev_put(idev);
3154
- }
3155
- dev = net->loopback_dev;
3156
- dev_hold(dev);
3157
- idev = in6_dev_get(dev);
3158
- if (!idev) {
3159
- err = -ENODEV;
3160
- goto out;
3161
- }
3682
+ if (nh) {
3683
+ if (rt->fib6_src.plen) {
3684
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3685
+ goto out_free;
31623686 }
3163
- rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3164
- goto install_route;
3165
- }
3166
-
3167
- if (cfg->fc_flags & RTF_GATEWAY) {
3168
- err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3687
+ if (!nexthop_get(nh)) {
3688
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3689
+ goto out_free;
3690
+ }
3691
+ rt->nh = nh;
3692
+ fib6_nh = nexthop_fib6_nh(rt->nh);
3693
+ } else {
3694
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
31693695 if (err)
31703696 goto out;
31713697
3172
- rt->fib6_nh.nh_gw = cfg->fc_gateway;
3173
- }
3698
+ fib6_nh = rt->fib6_nh;
31743699
3175
- err = -ENODEV;
3176
- if (!dev)
3177
- goto out;
3178
-
3179
- if (idev->cnf.disable_ipv6) {
3180
- NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3181
- err = -EACCES;
3182
- goto out;
3183
- }
3184
-
3185
- if (!(dev->flags & IFF_UP)) {
3186
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3187
- err = -ENETDOWN;
3188
- goto out;
3700
+ /* We cannot add true routes via loopback here, they would
3701
+ * result in kernel looping; promote them to reject routes
3702
+ */
3703
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
3704
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3705
+ addr_type))
3706
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
31893707 }
31903708
31913709 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3710
+ struct net_device *dev = fib6_nh->fib_nh_dev;
3711
+
31923712 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
31933713 NL_SET_ERR_MSG(extack, "Invalid source address");
31943714 err = -EINVAL;
....@@ -3199,29 +3719,13 @@
31993719 } else
32003720 rt->fib6_prefsrc.plen = 0;
32013721
3202
- rt->fib6_flags = cfg->fc_flags;
3203
-
3204
-install_route:
3205
- if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3206
- !netif_carrier_ok(dev))
3207
- rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3208
- rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3209
- rt->fib6_nh.nh_dev = dev;
3210
- rt->fib6_table = table;
3211
-
3212
- cfg->fc_nlinfo.nl_net = dev_net(dev);
3213
-
3214
- if (idev)
3215
- in6_dev_put(idev);
3216
-
32173722 return rt;
32183723 out:
3219
- if (dev)
3220
- dev_put(dev);
3221
- if (idev)
3222
- in6_dev_put(idev);
3223
-
32243724 fib6_info_release(rt);
3725
+ return ERR_PTR(err);
3726
+out_free:
3727
+ ip_fib_metrics_put(rt->fib6_metrics);
3728
+ kfree(rt);
32253729 return ERR_PTR(err);
32263730 }
32273731
....@@ -3262,9 +3766,12 @@
32623766 return err;
32633767 }
32643768
3265
-int ip6_del_rt(struct net *net, struct fib6_info *rt)
3769
+int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
32663770 {
3267
- struct nl_info info = { .nl_net = net };
3771
+ struct nl_info info = {
3772
+ .nl_net = net,
3773
+ .skip_notify = skip_notify
3774
+ };
32683775
32693776 return __ip6_del_rt(rt, &info);
32703777 }
....@@ -3284,6 +3791,7 @@
32843791
32853792 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
32863793 struct fib6_info *sibling, *next_sibling;
3794
+ struct fib6_node *fn;
32873795
32883796 /* prefer to send a single notification with all hops */
32893797 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
....@@ -3299,6 +3807,32 @@
32993807 info->skip_notify = 1;
33003808 }
33013809
3810
+ /* 'rt' points to the first sibling route. If it is not the
3811
+ * leaf, then we do not need to send a notification. Otherwise,
3812
+ * we need to check if the last sibling has a next route or not
3813
+ * and emit a replace or delete notification, respectively.
3814
+ */
3815
+ info->skip_notify_kernel = 1;
3816
+ fn = rcu_dereference_protected(rt->fib6_node,
3817
+ lockdep_is_held(&table->tb6_lock));
3818
+ if (rcu_access_pointer(fn->leaf) == rt) {
3819
+ struct fib6_info *last_sibling, *replace_rt;
3820
+
3821
+ last_sibling = list_last_entry(&rt->fib6_siblings,
3822
+ struct fib6_info,
3823
+ fib6_siblings);
3824
+ replace_rt = rcu_dereference_protected(
3825
+ last_sibling->fib6_next,
3826
+ lockdep_is_held(&table->tb6_lock));
3827
+ if (replace_rt)
3828
+ call_fib6_entry_notifiers_replace(net,
3829
+ replace_rt);
3830
+ else
3831
+ call_fib6_multipath_entry_notifiers(net,
3832
+ FIB_EVENT_ENTRY_DEL,
3833
+ rt, rt->fib6_nsiblings,
3834
+ NULL);
3835
+ }
33023836 list_for_each_entry_safe(sibling, next_sibling,
33033837 &rt->fib6_siblings,
33043838 fib6_siblings) {
....@@ -3321,7 +3855,7 @@
33213855 return err;
33223856 }
33233857
3324
-static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3858
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
33253859 {
33263860 int rc = -ESRCH;
33273861
....@@ -3337,10 +3871,49 @@
33373871 return rc;
33383872 }
33393873
3874
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3875
+ struct fib6_nh *nh)
3876
+{
3877
+ struct fib6_result res = {
3878
+ .f6i = rt,
3879
+ .nh = nh,
3880
+ };
3881
+ struct rt6_info *rt_cache;
3882
+
3883
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3884
+ if (rt_cache)
3885
+ return __ip6_del_cached_rt(rt_cache, cfg);
3886
+
3887
+ return 0;
3888
+}
3889
+
3890
+struct fib6_nh_del_cached_rt_arg {
3891
+ struct fib6_config *cfg;
3892
+ struct fib6_info *f6i;
3893
+};
3894
+
3895
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
3896
+{
3897
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
3898
+ int rc;
3899
+
3900
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
3901
+ return rc != -ESRCH ? rc : 0;
3902
+}
3903
+
3904
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
3905
+{
3906
+ struct fib6_nh_del_cached_rt_arg arg = {
3907
+ .cfg = cfg,
3908
+ .f6i = f6i
3909
+ };
3910
+
3911
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
3912
+}
3913
+
33403914 static int ip6_route_del(struct fib6_config *cfg,
33413915 struct netlink_ext_ack *extack)
33423916 {
3343
- struct rt6_info *rt_cache;
33443917 struct fib6_table *table;
33453918 struct fib6_info *rt;
33463919 struct fib6_node *fn;
....@@ -3361,30 +3934,53 @@
33613934
33623935 if (fn) {
33633936 for_each_fib6_node_rt_rcu(fn) {
3364
- if (cfg->fc_flags & RTF_CACHE) {
3365
- int rc;
3937
+ struct fib6_nh *nh;
33663938
3367
- rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3368
- &cfg->fc_src);
3369
- if (rt_cache) {
3370
- rc = ip6_del_cached_rt(rt_cache, cfg);
3371
- if (rc != -ESRCH) {
3372
- rcu_read_unlock();
3373
- return rc;
3374
- }
3939
+ if (rt->nh && cfg->fc_nh_id &&
3940
+ rt->nh->id != cfg->fc_nh_id)
3941
+ continue;
3942
+
3943
+ if (cfg->fc_flags & RTF_CACHE) {
3944
+ int rc = 0;
3945
+
3946
+ if (rt->nh) {
3947
+ rc = ip6_del_cached_rt_nh(cfg, rt);
3948
+ } else if (cfg->fc_nh_id) {
3949
+ continue;
3950
+ } else {
3951
+ nh = rt->fib6_nh;
3952
+ rc = ip6_del_cached_rt(cfg, rt, nh);
3953
+ }
3954
+ if (rc != -ESRCH) {
3955
+ rcu_read_unlock();
3956
+ return rc;
33753957 }
33763958 continue;
33773959 }
3378
- if (cfg->fc_ifindex &&
3379
- (!rt->fib6_nh.nh_dev ||
3380
- rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3381
- continue;
3382
- if (cfg->fc_flags & RTF_GATEWAY &&
3383
- !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3384
- continue;
3960
+
33853961 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
33863962 continue;
3387
- if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3963
+ if (cfg->fc_protocol &&
3964
+ cfg->fc_protocol != rt->fib6_protocol)
3965
+ continue;
3966
+
3967
+ if (rt->nh) {
3968
+ if (!fib6_info_hold_safe(rt))
3969
+ continue;
3970
+ rcu_read_unlock();
3971
+
3972
+ return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3973
+ }
3974
+ if (cfg->fc_nh_id)
3975
+ continue;
3976
+
3977
+ nh = rt->fib6_nh;
3978
+ if (cfg->fc_ifindex &&
3979
+ (!nh->fib_nh_dev ||
3980
+ nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3981
+ continue;
3982
+ if (cfg->fc_flags & RTF_GATEWAY &&
3983
+ !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
33883984 continue;
33893985 if (!fib6_info_hold_safe(rt))
33903986 continue;
....@@ -3406,10 +4002,10 @@
34064002 {
34074003 struct netevent_redirect netevent;
34084004 struct rt6_info *rt, *nrt = NULL;
4005
+ struct fib6_result res = {};
34094006 struct ndisc_options ndopts;
34104007 struct inet6_dev *in6_dev;
34114008 struct neighbour *neigh;
3412
- struct fib6_info *from;
34134009 struct rd_msg *msg;
34144010 int optlen, on_link;
34154011 u8 *lladdr;
....@@ -3492,11 +4088,32 @@
34924088 NDISC_REDIRECT, &ndopts);
34934089
34944090 rcu_read_lock();
3495
- from = rcu_dereference(rt->from);
3496
- if (!from)
4091
+ res.f6i = rcu_dereference(rt->from);
4092
+ if (!res.f6i)
34974093 goto out;
34984094
3499
- nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
4095
+ if (res.f6i->nh) {
4096
+ struct fib6_nh_match_arg arg = {
4097
+ .dev = dst->dev,
4098
+ .gw = &rt->rt6i_gateway,
4099
+ };
4100
+
4101
+ nexthop_for_each_fib6_nh(res.f6i->nh,
4102
+ fib6_nh_find_match, &arg);
4103
+
4104
+ /* fib6_info uses a nexthop that does not have fib6_nh
4105
+ * using the dst->dev. Should be impossible
4106
+ */
4107
+ if (!arg.match)
4108
+ goto out;
4109
+ res.nh = arg.match;
4110
+ } else {
4111
+ res.nh = res.f6i->fib6_nh;
4112
+ }
4113
+
4114
+ res.fib6_flags = res.f6i->fib6_flags;
4115
+ res.fib6_type = res.f6i->fib6_type;
4116
+ nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
35004117 if (!nrt)
35014118 goto out;
35024119
....@@ -3507,7 +4124,7 @@
35074124 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
35084125
35094126 /* rt6_insert_exception() will take care of duplicated exceptions */
3510
- if (rt6_insert_exception(nrt, from)) {
4127
+ if (rt6_insert_exception(nrt, &res)) {
35114128 dst_release_immediate(&nrt->dst);
35124129 goto out;
35134130 }
....@@ -3545,11 +4162,15 @@
35454162 goto out;
35464163
35474164 for_each_fib6_node_rt_rcu(fn) {
3548
- if (rt->fib6_nh.nh_dev->ifindex != ifindex)
4165
+ /* these routes do not use nexthops */
4166
+ if (rt->nh)
35494167 continue;
3550
- if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
4168
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
35514169 continue;
3552
- if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
4170
+ if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4171
+ !rt->fib6_nh->fib_nh_gw_family)
4172
+ continue;
4173
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
35534174 continue;
35544175 if (!fib6_info_hold_safe(rt))
35554176 continue;
....@@ -3579,7 +4200,7 @@
35794200 .fc_nlinfo.nl_net = net,
35804201 };
35814202
3582
- cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
4203
+ cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
35834204 cfg.fc_dst = *prefix;
35844205 cfg.fc_gateway = *gwaddr;
35854206
....@@ -3607,9 +4228,16 @@
36074228
36084229 rcu_read_lock();
36094230 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3610
- if (dev == rt->fib6_nh.nh_dev &&
4231
+ struct fib6_nh *nh;
4232
+
4233
+ /* RA routes do not use nexthops */
4234
+ if (rt->nh)
4235
+ continue;
4236
+
4237
+ nh = rt->fib6_nh;
4238
+ if (dev == nh->fib_nh_dev &&
36114239 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3612
- ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
4240
+ ipv6_addr_equal(&nh->fib_nh_gw6, addr))
36134241 break;
36144242 }
36154243 if (rt && !fib6_info_hold_safe(rt))
....@@ -3673,60 +4301,48 @@
36734301 struct in6_rtmsg *rtmsg,
36744302 struct fib6_config *cfg)
36754303 {
3676
- memset(cfg, 0, sizeof(*cfg));
4304
+ *cfg = (struct fib6_config){
4305
+ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4306
+ : RT6_TABLE_MAIN,
4307
+ .fc_ifindex = rtmsg->rtmsg_ifindex,
4308
+ .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
4309
+ .fc_expires = rtmsg->rtmsg_info,
4310
+ .fc_dst_len = rtmsg->rtmsg_dst_len,
4311
+ .fc_src_len = rtmsg->rtmsg_src_len,
4312
+ .fc_flags = rtmsg->rtmsg_flags,
4313
+ .fc_type = rtmsg->rtmsg_type,
36774314
3678
- cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3679
- : RT6_TABLE_MAIN;
3680
- cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3681
- cfg->fc_metric = rtmsg->rtmsg_metric;
3682
- cfg->fc_expires = rtmsg->rtmsg_info;
3683
- cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3684
- cfg->fc_src_len = rtmsg->rtmsg_src_len;
3685
- cfg->fc_flags = rtmsg->rtmsg_flags;
3686
- cfg->fc_type = rtmsg->rtmsg_type;
4315
+ .fc_nlinfo.nl_net = net,
36874316
3688
- cfg->fc_nlinfo.nl_net = net;
3689
-
3690
- cfg->fc_dst = rtmsg->rtmsg_dst;
3691
- cfg->fc_src = rtmsg->rtmsg_src;
3692
- cfg->fc_gateway = rtmsg->rtmsg_gateway;
4317
+ .fc_dst = rtmsg->rtmsg_dst,
4318
+ .fc_src = rtmsg->rtmsg_src,
4319
+ .fc_gateway = rtmsg->rtmsg_gateway,
4320
+ };
36934321 }
36944322
3695
-int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4323
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
36964324 {
36974325 struct fib6_config cfg;
3698
- struct in6_rtmsg rtmsg;
36994326 int err;
37004327
4328
+ if (cmd != SIOCADDRT && cmd != SIOCDELRT)
4329
+ return -EINVAL;
4330
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4331
+ return -EPERM;
4332
+
4333
+ rtmsg_to_fib6_config(net, rtmsg, &cfg);
4334
+
4335
+ rtnl_lock();
37014336 switch (cmd) {
3702
- case SIOCADDRT: /* Add a route */
3703
- case SIOCDELRT: /* Delete a route */
3704
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3705
- return -EPERM;
3706
- err = copy_from_user(&rtmsg, arg,
3707
- sizeof(struct in6_rtmsg));
3708
- if (err)
3709
- return -EFAULT;
3710
-
3711
- rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3712
-
3713
- rtnl_lock();
3714
- switch (cmd) {
3715
- case SIOCADDRT:
3716
- err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3717
- break;
3718
- case SIOCDELRT:
3719
- err = ip6_route_del(&cfg, NULL);
3720
- break;
3721
- default:
3722
- err = -EINVAL;
3723
- }
3724
- rtnl_unlock();
3725
-
3726
- return err;
4337
+ case SIOCADDRT:
4338
+ err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4339
+ break;
4340
+ case SIOCDELRT:
4341
+ err = ip6_route_del(&cfg, NULL);
4342
+ break;
37274343 }
3728
-
3729
- return -EINVAL;
4344
+ rtnl_unlock();
4345
+ return err;
37304346 }
37314347
37324348 /*
....@@ -3735,23 +4351,34 @@
37354351
37364352 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
37374353 {
3738
- int type;
37394354 struct dst_entry *dst = skb_dst(skb);
4355
+ struct net *net = dev_net(dst->dev);
4356
+ struct inet6_dev *idev;
4357
+ int type;
4358
+
4359
+ if (netif_is_l3_master(skb->dev) ||
4360
+ dst->dev == net->loopback_dev)
4361
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4362
+ else
4363
+ idev = ip6_dst_idev(dst);
4364
+
37404365 switch (ipstats_mib_noroutes) {
37414366 case IPSTATS_MIB_INNOROUTES:
37424367 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
37434368 if (type == IPV6_ADDR_ANY) {
3744
- IP6_INC_STATS(dev_net(dst->dev),
3745
- __in6_dev_get_safely(skb->dev),
3746
- IPSTATS_MIB_INADDRERRORS);
4369
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
37474370 break;
37484371 }
3749
- /* FALLTHROUGH */
4372
+ fallthrough;
37504373 case IPSTATS_MIB_OUTNOROUTES:
3751
- IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3752
- ipstats_mib_noroutes);
4374
+ IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
37534375 break;
37544376 }
4377
+
4378
+ /* Start over by dropping the dst for l3mdev case */
4379
+ if (netif_is_l3_master(skb->dev))
4380
+ skb_dst_drop(skb);
4381
+
37554382 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
37564383 kfree_skb(skb);
37574384 return 0;
....@@ -3788,33 +4415,35 @@
37884415 const struct in6_addr *addr,
37894416 bool anycast, gfp_t gfp_flags)
37904417 {
3791
- u32 tb_id;
3792
- struct net_device *dev = idev->dev;
4418
+ struct fib6_config cfg = {
4419
+ .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4420
+ .fc_ifindex = idev->dev->ifindex,
4421
+ .fc_flags = RTF_UP | RTF_NONEXTHOP,
4422
+ .fc_dst = *addr,
4423
+ .fc_dst_len = 128,
4424
+ .fc_protocol = RTPROT_KERNEL,
4425
+ .fc_nlinfo.nl_net = net,
4426
+ .fc_ignore_dev_down = true,
4427
+ };
37934428 struct fib6_info *f6i;
37944429
3795
- f6i = fib6_info_alloc(gfp_flags);
3796
- if (!f6i)
3797
- return ERR_PTR(-ENOMEM);
3798
-
3799
- f6i->dst_nocount = true;
3800
- f6i->dst_host = true;
3801
- f6i->fib6_protocol = RTPROT_KERNEL;
3802
- f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
38034430 if (anycast) {
3804
- f6i->fib6_type = RTN_ANYCAST;
3805
- f6i->fib6_flags |= RTF_ANYCAST;
4431
+ cfg.fc_type = RTN_ANYCAST;
4432
+ cfg.fc_flags |= RTF_ANYCAST;
38064433 } else {
3807
- f6i->fib6_type = RTN_LOCAL;
3808
- f6i->fib6_flags |= RTF_LOCAL;
4434
+ cfg.fc_type = RTN_LOCAL;
4435
+ cfg.fc_flags |= RTF_LOCAL;
38094436 }
38104437
3811
- f6i->fib6_nh.nh_gw = *addr;
3812
- dev_hold(dev);
3813
- f6i->fib6_nh.nh_dev = dev;
3814
- f6i->fib6_dst.addr = *addr;
3815
- f6i->fib6_dst.plen = 128;
3816
- tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3817
- f6i->fib6_table = fib6_get_table(net, tb_id);
4438
+ f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
4439
+ if (!IS_ERR(f6i)) {
4440
+ f6i->dst_nocount = true;
4441
+
4442
+ if (!anycast &&
4443
+ (net->ipv6.devconf_all->disable_policy ||
4444
+ idev->cnf.disable_policy))
4445
+ f6i->dst_nopolicy = true;
4446
+ }
38184447
38194448 return f6i;
38204449 }
....@@ -3832,14 +4461,13 @@
38324461 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
38334462 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
38344463
3835
- if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
4464
+ if (!rt->nh &&
4465
+ ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
38364466 rt != net->ipv6.fib6_null_entry &&
38374467 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
38384468 spin_lock_bh(&rt6_exception_lock);
38394469 /* remove prefsrc entry */
38404470 rt->fib6_prefsrc.plen = 0;
3841
- /* need to update cache as well */
3842
- rt6_exceptions_remove_prefsrc(rt);
38434471 spin_unlock_bh(&rt6_exception_lock);
38444472 }
38454473 return 0;
....@@ -3856,23 +4484,28 @@
38564484 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
38574485 }
38584486
3859
-#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
4487
+#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
38604488
38614489 /* Remove routers and update dst entries when gateway turn into host. */
38624490 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
38634491 {
38644492 struct in6_addr *gateway = (struct in6_addr *)arg;
4493
+ struct fib6_nh *nh;
38654494
4495
+ /* RA routes do not use nexthops */
4496
+ if (rt->nh)
4497
+ return 0;
4498
+
4499
+ nh = rt->fib6_nh;
38664500 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3867
- ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
4501
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
38684502 return -1;
3869
- }
38704503
38714504 /* Further clean up cached routes in exception table.
38724505 * This is needed because cached route may have a different
38734506 * gateway than its 'parent' in the case of an ip redirect.
38744507 */
3875
- rt6_exceptions_clean_tohost(rt, gateway);
4508
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
38764509
38774510 return 0;
38784511 }
....@@ -3885,7 +4518,7 @@
38854518 struct arg_netdev_event {
38864519 const struct net_device *dev;
38874520 union {
3888
- unsigned int nh_flags;
4521
+ unsigned char nh_flags;
38894522 unsigned long event;
38904523 };
38914524 };
....@@ -3910,11 +4543,12 @@
39104543 return NULL;
39114544 }
39124545
4546
+/* only called for fib entries with builtin fib6_nh */
39134547 static bool rt6_is_dead(const struct fib6_info *rt)
39144548 {
3915
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3916
- (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3917
- fib6_ignore_linkdown(rt)))
4549
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4550
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4551
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
39184552 return true;
39194553
39204554 return false;
....@@ -3926,11 +4560,11 @@
39264560 int total = 0;
39274561
39284562 if (!rt6_is_dead(rt))
3929
- total += rt->fib6_nh.nh_weight;
4563
+ total += rt->fib6_nh->fib_nh_weight;
39304564
39314565 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
39324566 if (!rt6_is_dead(iter))
3933
- total += iter->fib6_nh.nh_weight;
4567
+ total += iter->fib6_nh->fib_nh_weight;
39344568 }
39354569
39364570 return total;
....@@ -3941,11 +4575,11 @@
39414575 int upper_bound = -1;
39424576
39434577 if (!rt6_is_dead(rt)) {
3944
- *weight += rt->fib6_nh.nh_weight;
4578
+ *weight += rt->fib6_nh->fib_nh_weight;
39454579 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
39464580 total) - 1;
39474581 }
3948
- atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
4582
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
39494583 }
39504584
39514585 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
....@@ -3988,8 +4622,9 @@
39884622 const struct arg_netdev_event *arg = p_arg;
39894623 struct net *net = dev_net(arg->dev);
39904624
3991
- if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3992
- rt->fib6_nh.nh_flags &= ~arg->nh_flags;
4625
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4626
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
4627
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
39934628 fib6_update_sernum_upto_root(net, rt);
39944629 rt6_multipath_rebalance(rt);
39954630 }
....@@ -3997,7 +4632,7 @@
39974632 return 0;
39984633 }
39994634
4000
-void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
4635
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
40014636 {
40024637 struct arg_netdev_event arg = {
40034638 .dev = dev,
....@@ -4012,15 +4647,16 @@
40124647 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
40134648 }
40144649
4650
+/* only called for fib entries with inline fib6_nh */
40154651 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
40164652 const struct net_device *dev)
40174653 {
40184654 struct fib6_info *iter;
40194655
4020
- if (rt->fib6_nh.nh_dev == dev)
4656
+ if (rt->fib6_nh->fib_nh_dev == dev)
40214657 return true;
40224658 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4023
- if (iter->fib6_nh.nh_dev == dev)
4659
+ if (iter->fib6_nh->fib_nh_dev == dev)
40244660 return true;
40254661
40264662 return false;
....@@ -4041,12 +4677,12 @@
40414677 struct fib6_info *iter;
40424678 unsigned int dead = 0;
40434679
4044
- if (rt->fib6_nh.nh_dev == down_dev ||
4045
- rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4680
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
4681
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
40464682 dead++;
40474683 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4048
- if (iter->fib6_nh.nh_dev == down_dev ||
4049
- iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4684
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
4685
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
40504686 dead++;
40514687
40524688 return dead;
....@@ -4054,15 +4690,15 @@
40544690
40554691 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
40564692 const struct net_device *dev,
4057
- unsigned int nh_flags)
4693
+ unsigned char nh_flags)
40584694 {
40594695 struct fib6_info *iter;
40604696
4061
- if (rt->fib6_nh.nh_dev == dev)
4062
- rt->fib6_nh.nh_flags |= nh_flags;
4697
+ if (rt->fib6_nh->fib_nh_dev == dev)
4698
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
40634699 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4064
- if (iter->fib6_nh.nh_dev == dev)
4065
- iter->fib6_nh.nh_flags |= nh_flags;
4700
+ if (iter->fib6_nh->fib_nh_dev == dev)
4701
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
40664702 }
40674703
40684704 /* called with write lock held for table with rt */
....@@ -4072,17 +4708,17 @@
40724708 const struct net_device *dev = arg->dev;
40734709 struct net *net = dev_net(dev);
40744710
4075
- if (rt == net->ipv6.fib6_null_entry)
4711
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
40764712 return 0;
40774713
40784714 switch (arg->event) {
40794715 case NETDEV_UNREGISTER:
4080
- return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4716
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
40814717 case NETDEV_DOWN:
40824718 if (rt->should_flush)
40834719 return -1;
40844720 if (!rt->fib6_nsiblings)
4085
- return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4721
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
40864722 if (rt6_multipath_uses_dev(rt, dev)) {
40874723 unsigned int count;
40884724
....@@ -4098,10 +4734,10 @@
40984734 }
40994735 return -2;
41004736 case NETDEV_CHANGE:
4101
- if (rt->fib6_nh.nh_dev != dev ||
4737
+ if (rt->fib6_nh->fib_nh_dev != dev ||
41024738 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
41034739 break;
4104
- rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4740
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
41054741 rt6_multipath_rebalance(rt);
41064742 break;
41074743 }
....@@ -4117,8 +4753,12 @@
41174753 .event = event,
41184754 },
41194755 };
4756
+ struct net *net = dev_net(dev);
41204757
4121
- fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4758
+ if (net->ipv6.sysctl.skip_notify_on_dev_down)
4759
+ fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4760
+ else
4761
+ fib6_clean_all(net, fib6_ifdown, &arg);
41224762 }
41234763
41244764 void rt6_disable_ip(struct net_device *dev, unsigned long event)
....@@ -4131,9 +4771,36 @@
41314771 struct rt6_mtu_change_arg {
41324772 struct net_device *dev;
41334773 unsigned int mtu;
4774
+ struct fib6_info *f6i;
41344775 };
41354776
4136
-static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4777
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4778
+{
4779
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4780
+ struct fib6_info *f6i = arg->f6i;
4781
+
4782
+ /* For administrative MTU increase, there is no way to discover
4783
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
4784
+ * Since RFC 1981 doesn't include administrative MTU increase
4785
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
4786
+ */
4787
+ if (nh->fib_nh_dev == arg->dev) {
4788
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
4789
+ u32 mtu = f6i->fib6_pmtu;
4790
+
4791
+ if (mtu >= arg->mtu ||
4792
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4793
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4794
+
4795
+ spin_lock_bh(&rt6_exception_lock);
4796
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4797
+ spin_unlock_bh(&rt6_exception_lock);
4798
+ }
4799
+
4800
+ return 0;
4801
+}
4802
+
4803
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
41374804 {
41384805 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
41394806 struct inet6_dev *idev;
....@@ -4148,24 +4815,17 @@
41484815 if (!idev)
41494816 return 0;
41504817
4151
- /* For administrative MTU increase, there is no way to discover
4152
- IPv6 PMTU increase, so PMTU increase should be updated here.
4153
- Since RFC 1981 doesn't include administrative MTU increase
4154
- update PMTU increase is a MUST. (i.e. jumbo frame)
4155
- */
4156
- if (rt->fib6_nh.nh_dev == arg->dev &&
4157
- !fib6_metric_locked(rt, RTAX_MTU)) {
4158
- u32 mtu = rt->fib6_pmtu;
4818
+ if (fib6_metric_locked(f6i, RTAX_MTU))
4819
+ return 0;
41594820
4160
- if (mtu >= arg->mtu ||
4161
- (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4162
- fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4163
-
4164
- spin_lock_bh(&rt6_exception_lock);
4165
- rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4166
- spin_unlock_bh(&rt6_exception_lock);
4821
+ arg->f6i = f6i;
4822
+ if (f6i->nh) {
4823
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
4824
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4825
+ arg);
41674826 }
4168
- return 0;
4827
+
4828
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
41694829 }
41704830
41714831 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
....@@ -4179,6 +4839,7 @@
41794839 }
41804840
41814841 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4842
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
41824843 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
41834844 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
41844845 [RTA_OIF] = { .type = NLA_U32 },
....@@ -4196,6 +4857,7 @@
41964857 [RTA_IP_PROTO] = { .type = NLA_U8 },
41974858 [RTA_SPORT] = { .type = NLA_U16 },
41984859 [RTA_DPORT] = { .type = NLA_U16 },
4860
+ [RTA_NH_ID] = { .type = NLA_U32 },
41994861 };
42004862
42014863 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
....@@ -4207,21 +4869,26 @@
42074869 unsigned int pref;
42084870 int err;
42094871
4210
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4211
- NULL);
4872
+ err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4873
+ rtm_ipv6_policy, extack);
42124874 if (err < 0)
42134875 goto errout;
42144876
42154877 err = -EINVAL;
42164878 rtm = nlmsg_data(nlh);
4217
- memset(cfg, 0, sizeof(*cfg));
42184879
4219
- cfg->fc_table = rtm->rtm_table;
4220
- cfg->fc_dst_len = rtm->rtm_dst_len;
4221
- cfg->fc_src_len = rtm->rtm_src_len;
4222
- cfg->fc_flags = RTF_UP;
4223
- cfg->fc_protocol = rtm->rtm_protocol;
4224
- cfg->fc_type = rtm->rtm_type;
4880
+ *cfg = (struct fib6_config){
4881
+ .fc_table = rtm->rtm_table,
4882
+ .fc_dst_len = rtm->rtm_dst_len,
4883
+ .fc_src_len = rtm->rtm_src_len,
4884
+ .fc_flags = RTF_UP,
4885
+ .fc_protocol = rtm->rtm_protocol,
4886
+ .fc_type = rtm->rtm_type,
4887
+
4888
+ .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4889
+ .fc_nlinfo.nlh = nlh,
4890
+ .fc_nlinfo.nl_net = sock_net(skb->sk),
4891
+ };
42254892
42264893 if (rtm->rtm_type == RTN_UNREACHABLE ||
42274894 rtm->rtm_type == RTN_BLACKHOLE ||
....@@ -4237,9 +4904,15 @@
42374904
42384905 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
42394906
4240
- cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4241
- cfg->fc_nlinfo.nlh = nlh;
4242
- cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4907
+ if (tb[RTA_NH_ID]) {
4908
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
4909
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
4910
+ NL_SET_ERR_MSG(extack,
4911
+ "Nexthop specification and nexthop id are mutually exclusive");
4912
+ goto errout;
4913
+ }
4914
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
4915
+ }
42434916
42444917 if (tb[RTA_GATEWAY]) {
42454918 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
....@@ -4334,17 +5007,6 @@
43345007 struct list_head next;
43355008 };
43365009
4337
-static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4338
-{
4339
- struct rt6_nh *nh;
4340
-
4341
- list_for_each_entry(nh, rt6_nh_list, next) {
4342
- pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4343
- &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4344
- nh->r_cfg.fc_ifindex);
4345
- }
4346
-}
4347
-
43485010 static int ip6_route_info_append(struct net *net,
43495011 struct list_head *rt6_nh_list,
43505012 struct fib6_info *rt,
....@@ -4388,6 +5050,32 @@
43885050
43895051 if (rt)
43905052 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5053
+}
5054
+
5055
+static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
5056
+{
5057
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
5058
+ bool should_notify = false;
5059
+ struct fib6_info *leaf;
5060
+ struct fib6_node *fn;
5061
+
5062
+ rcu_read_lock();
5063
+ fn = rcu_dereference(rt->fib6_node);
5064
+ if (!fn)
5065
+ goto out;
5066
+
5067
+ leaf = rcu_dereference(fn->leaf);
5068
+ if (!leaf)
5069
+ goto out;
5070
+
5071
+ if (rt == leaf ||
5072
+ (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
5073
+ rt6_qualify_for_ecmp(leaf)))
5074
+ should_notify = true;
5075
+out:
5076
+ rcu_read_unlock();
5077
+
5078
+ return should_notify;
43915079 }
43925080
43935081 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
....@@ -4451,6 +5139,10 @@
44515139 r_cfg.fc_flags |= RTF_GATEWAY;
44525140 }
44535141 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
5142
+
5143
+ /* RTA_ENCAP_TYPE length checked in
5144
+ * lwtunnel_valid_encap_type_attr
5145
+ */
44545146 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
44555147 if (nla)
44565148 r_cfg.fc_encap_type = nla_get_u16(nla);
....@@ -4471,7 +5163,7 @@
44715163 goto cleanup;
44725164 }
44735165
4474
- rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
5166
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
44755167
44765168 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
44775169 rt, &r_cfg);
....@@ -4483,11 +5175,22 @@
44835175 rtnh = rtnh_next(rtnh, &remaining);
44845176 }
44855177
5178
+ if (list_empty(&rt6_nh_list)) {
5179
+ NL_SET_ERR_MSG(extack,
5180
+ "Invalid nexthop configuration - no valid nexthops");
5181
+ return -EINVAL;
5182
+ }
5183
+
44865184 /* for add and replace send one notification with all nexthops.
44875185 * Skip the notification in fib6_add_rt2node and send one with
44885186 * the full route when done
44895187 */
44905188 info->skip_notify = 1;
5189
+
5190
+ /* For add and replace, send one notification with all nexthops. For
5191
+ * append, send one notification with all appended nexthops.
5192
+ */
5193
+ info->skip_notify_kernel = 1;
44915194
44925195 err_nh = NULL;
44935196 list_for_each_entry(nh, &rt6_nh_list, next) {
....@@ -4507,7 +5210,8 @@
45075210 nh->fib6_info = NULL;
45085211 if (err) {
45095212 if (replace && nhn)
4510
- ip6_print_replace_route_err(&rt6_nh_list);
5213
+ NL_SET_ERR_MSG_MOD(extack,
5214
+ "multipath route replace failed (check consistency of installed routes)");
45115215 err_nh = nh;
45125216 goto add_errout;
45135217 }
....@@ -4525,6 +5229,29 @@
45255229 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
45265230 }
45275231 nhn++;
5232
+ }
5233
+
5234
+ /* An in-kernel notification should only be sent in case the new
5235
+ * multipath route is added as the first route in the node, or if
5236
+ * it was appended to it. We pass 'rt_notif' since it is the first
5237
+ * sibling and might allow us to skip some checks in the replace case.
5238
+ */
5239
+ if (ip6_route_mpath_should_notify(rt_notif)) {
5240
+ enum fib_event_type fib_event;
5241
+
5242
+ if (rt_notif->fib6_nsiblings != nhn - 1)
5243
+ fib_event = FIB_EVENT_ENTRY_APPEND;
5244
+ else
5245
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
5246
+
5247
+ err = call_fib6_multipath_entry_notifiers(info->nl_net,
5248
+ fib_event, rt_notif,
5249
+ nhn - 1, extack);
5250
+ if (err) {
5251
+ /* Delete all the siblings that were just added */
5252
+ err_nh = NULL;
5253
+ goto add_errout;
5254
+ }
45285255 }
45295256
45305257 /* success ... tell user about new route */
....@@ -4562,9 +5289,10 @@
45625289 {
45635290 struct fib6_config r_cfg;
45645291 struct rtnexthop *rtnh;
5292
+ int last_err = 0;
45655293 int remaining;
45665294 int attrlen;
4567
- int err = 1, last_err = 0;
5295
+ int err;
45685296
45695297 remaining = cfg->fc_mp_len;
45705298 rtnh = (struct rtnexthop *)cfg->fc_mp;
....@@ -4612,6 +5340,12 @@
46125340 if (err < 0)
46135341 return err;
46145342
5343
+ if (cfg.fc_nh_id &&
5344
+ !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
5345
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
5346
+ return -EINVAL;
5347
+ }
5348
+
46155349 if (cfg.fc_mp)
46165350 return ip6_route_multipath_del(&cfg, extack);
46175351 else {
....@@ -4630,23 +5364,56 @@
46305364 if (err < 0)
46315365 return err;
46325366
5367
+ if (cfg.fc_metric == 0)
5368
+ cfg.fc_metric = IP6_RT_PRIO_USER;
5369
+
46335370 if (cfg.fc_mp)
46345371 return ip6_route_multipath_add(&cfg, extack);
46355372 else
46365373 return ip6_route_add(&cfg, GFP_KERNEL, extack);
46375374 }
46385375
4639
-static size_t rt6_nlmsg_size(struct fib6_info *rt)
5376
+/* add the overhead of this fib6_nh to nexthop_len */
5377
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
46405378 {
4641
- int nexthop_len = 0;
5379
+ int *nexthop_len = arg;
46425380
4643
- if (rt->fib6_nsiblings) {
4644
- nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4645
- + NLA_ALIGN(sizeof(struct rtnexthop))
4646
- + nla_total_size(16) /* RTA_GATEWAY */
4647
- + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
5381
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
5382
+ + NLA_ALIGN(sizeof(struct rtnexthop))
5383
+ + nla_total_size(16); /* RTA_GATEWAY */
46485384
4649
- nexthop_len *= rt->fib6_nsiblings;
5385
+ if (nh->fib_nh_lws) {
5386
+ /* RTA_ENCAP_TYPE */
5387
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5388
+ /* RTA_ENCAP */
5389
+ *nexthop_len += nla_total_size(2);
5390
+ }
5391
+
5392
+ return 0;
5393
+}
5394
+
5395
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
5396
+{
5397
+ int nexthop_len;
5398
+
5399
+ if (f6i->nh) {
5400
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
5401
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
5402
+ &nexthop_len);
5403
+ } else {
5404
+ struct fib6_info *sibling, *next_sibling;
5405
+ struct fib6_nh *nh = f6i->fib6_nh;
5406
+
5407
+ nexthop_len = 0;
5408
+ if (f6i->fib6_nsiblings) {
5409
+ rt6_nh_nlmsg_size(nh, &nexthop_len);
5410
+
5411
+ list_for_each_entry_safe(sibling, next_sibling,
5412
+ &f6i->fib6_siblings, fib6_siblings) {
5413
+ rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
5414
+ }
5415
+ }
5416
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
46505417 }
46515418
46525419 return NLMSG_ALIGN(sizeof(struct rtmsg))
....@@ -4662,70 +5429,31 @@
46625429 + nla_total_size(sizeof(struct rta_cacheinfo))
46635430 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
46645431 + nla_total_size(1) /* RTA_PREF */
4665
- + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
46665432 + nexthop_len;
46675433 }
46685434
4669
-static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4670
- unsigned int *flags, bool skip_oif)
5435
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
5436
+ unsigned char *flags)
46715437 {
4672
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4673
- *flags |= RTNH_F_DEAD;
5438
+ if (nexthop_is_multipath(nh)) {
5439
+ struct nlattr *mp;
46745440
4675
- if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4676
- *flags |= RTNH_F_LINKDOWN;
5441
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5442
+ if (!mp)
5443
+ goto nla_put_failure;
46775444
4678
- rcu_read_lock();
4679
- if (fib6_ignore_linkdown(rt))
4680
- *flags |= RTNH_F_DEAD;
4681
- rcu_read_unlock();
4682
- }
5445
+ if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
5446
+ goto nla_put_failure;
46835447
4684
- if (rt->fib6_flags & RTF_GATEWAY) {
4685
- if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
5448
+ nla_nest_end(skb, mp);
5449
+ } else {
5450
+ struct fib6_nh *fib6_nh;
5451
+
5452
+ fib6_nh = nexthop_fib6_nh(nh);
5453
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
5454
+ flags, false) < 0)
46865455 goto nla_put_failure;
46875456 }
4688
-
4689
- *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4690
- if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4691
- *flags |= RTNH_F_OFFLOAD;
4692
-
4693
- /* not needed for multipath encoding b/c it has a rtnexthop struct */
4694
- if (!skip_oif && rt->fib6_nh.nh_dev &&
4695
- nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4696
- goto nla_put_failure;
4697
-
4698
- if (rt->fib6_nh.nh_lwtstate &&
4699
- lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4700
- goto nla_put_failure;
4701
-
4702
- return 0;
4703
-
4704
-nla_put_failure:
4705
- return -EMSGSIZE;
4706
-}
4707
-
4708
-/* add multipath next hop */
4709
-static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4710
-{
4711
- const struct net_device *dev = rt->fib6_nh.nh_dev;
4712
- struct rtnexthop *rtnh;
4713
- unsigned int flags = 0;
4714
-
4715
- rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4716
- if (!rtnh)
4717
- goto nla_put_failure;
4718
-
4719
- rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4720
- rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4721
-
4722
- if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4723
- goto nla_put_failure;
4724
-
4725
- rtnh->rtnh_flags = flags;
4726
-
4727
- /* length of rtnetlink header + attributes */
4728
- rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
47295457
47305458 return 0;
47315459
....@@ -4742,6 +5470,7 @@
47425470 struct rt6_info *rt6 = (struct rt6_info *)dst;
47435471 struct rt6key *rt6_dst, *rt6_src;
47445472 u32 *pmetrics, table, rt6_flags;
5473
+ unsigned char nh_flags = 0;
47455474 struct nlmsghdr *nlh;
47465475 struct rtmsg *rtm;
47475476 long expires = 0;
....@@ -4845,28 +5574,54 @@
48455574 struct fib6_info *sibling, *next_sibling;
48465575 struct nlattr *mp;
48475576
4848
- mp = nla_nest_start(skb, RTA_MULTIPATH);
5577
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
48495578 if (!mp)
48505579 goto nla_put_failure;
48515580
4852
- if (rt6_add_nexthop(skb, rt) < 0)
5581
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
5582
+ rt->fib6_nh->fib_nh_weight, AF_INET6,
5583
+ 0) < 0)
48535584 goto nla_put_failure;
48545585
48555586 list_for_each_entry_safe(sibling, next_sibling,
48565587 &rt->fib6_siblings, fib6_siblings) {
4857
- if (rt6_add_nexthop(skb, sibling) < 0)
5588
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
5589
+ sibling->fib6_nh->fib_nh_weight,
5590
+ AF_INET6, 0) < 0)
48585591 goto nla_put_failure;
48595592 }
48605593
48615594 nla_nest_end(skb, mp);
4862
- } else {
4863
- if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
5595
+ } else if (rt->nh) {
5596
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
48645597 goto nla_put_failure;
5598
+
5599
+ if (nexthop_is_blackhole(rt->nh))
5600
+ rtm->rtm_type = RTN_BLACKHOLE;
5601
+
5602
+ if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
5603
+ rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
5604
+ goto nla_put_failure;
5605
+
5606
+ rtm->rtm_flags |= nh_flags;
5607
+ } else {
5608
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
5609
+ &nh_flags, false) < 0)
5610
+ goto nla_put_failure;
5611
+
5612
+ rtm->rtm_flags |= nh_flags;
48655613 }
48665614
48675615 if (rt6_flags & RTF_EXPIRES) {
48685616 expires = dst ? dst->expires : rt->expires;
48695617 expires -= jiffies;
5618
+ }
5619
+
5620
+ if (!dst) {
5621
+ if (rt->offload)
5622
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
5623
+ if (rt->trap)
5624
+ rtm->rtm_flags |= RTM_F_TRAP;
48705625 }
48715626
48725627 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
....@@ -4884,28 +5639,235 @@
48845639 return -EMSGSIZE;
48855640 }
48865641
4887
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
/*
 * Per-nexthop callback used with nexthop_for_each_fib6_nh() by
 * fib6_info_uses_dev(). @arg is the net_device being looked for.
 *
 * Returns 1 when @nh's fib_nh_dev is that device, 0 otherwise.
 */
5642
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
48885643 {
4889
- struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4890
- struct net *net = arg->net;
5644
+ const struct net_device *dev = arg;
48915645
4892
- if (rt == net->ipv6.fib6_null_entry)
4893
- return 0;
5646
+ if (nh->fib_nh_dev == dev)
5647
+ return 1;
48945648
4895
- if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4896
- struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
5649
+ return 0;
5650
+}
48975651
4898
- /* user wants prefix routes only */
4899
- if (rtm->rtm_flags & RTM_F_PREFIX &&
4900
- !(rt->fib6_flags & RTF_PREFIX_RT)) {
4901
- /* success since this is not a prefix route */
4902
- return 1;
/*
 * Report whether route @f6i has @dev as the fib_nh_dev of any of its
 * nexthops.
 *
 * When the route points at a nexthop object (f6i->nh), every fib6_nh of that
 * object is checked through fib6_info_nh_uses_dev(). Otherwise the route's
 * own fib6_nh is checked and, if the route has siblings, each sibling's
 * fib6_nh as well.
 *
 * Returns true on the first match, false when no nexthop uses @dev.
 */
5652
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
5653
+ const struct net_device *dev)
5654
+{
5655
+ if (f6i->nh) {
/* const is dropped only because the walker callback takes a void *arg */
5656
+ struct net_device *_dev = (struct net_device *)dev;
5657
+
5658
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
5659
+ fib6_info_nh_uses_dev,
5660
+ _dev);
5661
+ }
5662
+
5663
+ if (f6i->fib6_nh->fib_nh_dev == dev)
5664
+ return true;
5665
+
5666
+ if (f6i->fib6_nsiblings) {
5667
+ struct fib6_info *sibling, *next_sibling;
5668
+
5669
+ list_for_each_entry_safe(sibling, next_sibling,
5670
+ &f6i->fib6_siblings, fib6_siblings) {
5671
+ if (sibling->fib6_nh->fib_nh_dev == dev)
5672
+ return true;
49035673 }
49045674 }
49055675
4906
- return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4907
- RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4908
- arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
5676
+ return false;
5677
+}
5678
+
/*
 * State handed to rt6_nh_dump_exceptions() through its void *arg.
 *
 * @dump:  dump context; its net, skb and cb members are used when filling
 *         each message
 * @rt:    fib6_info passed to rt6_fill_node() for every exception emitted
 * @flags: netlink message flags passed to rt6_fill_node()
 * @skip:  number of exception entries still to be passed over before
 *         anything is emitted; decremented as entries are skipped
 * @count: number of entries handled by the walk so far (emitted, or found
 *         expired); skipped entries are not counted
 */
5679
+struct fib6_nh_exception_dump_walker {
5680
+ struct rt6_rtnl_dump_arg *dump;
5681
+ struct fib6_info *rt;
5682
+ unsigned int flags;
5683
+ unsigned int skip;
5684
+ unsigned int count;
5685
+};
5686
+
/*
 * Dump the exception entries attached to one fib6_nh into the dump skb.
 *
 * @nh:  nexthop whose exception buckets are walked
 * @arg: struct fib6_nh_exception_dump_walker with the dump context and the
 *       skip/count bookkeeping
 *
 * All FIB6_EXCEPTION_BUCKET_SIZE buckets are visited in order. While
 * w->skip is non-zero, entries are passed over and w->skip is decremented.
 * Expired entries are counted in w->count but not emitted. Every other
 * entry is written with rt6_fill_node() as RTM_NEWROUTE and then counted.
 *
 * Returns 0 when the nexthop has no bucket array or when every entry was
 * processed; otherwise the non-zero value returned by rt6_fill_node(),
 * leaving w->count at the number of entries handled before the failure.
 */
5687
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
5688
+{
5689
+ struct fib6_nh_exception_dump_walker *w = arg;
5690
+ struct rt6_rtnl_dump_arg *dump = w->dump;
5691
+ struct rt6_exception_bucket *bucket;
5692
+ struct rt6_exception *rt6_ex;
5693
+ int i, err;
5694
+
5695
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
5696
+ if (!bucket)
5697
+ return 0;
5698
+
5699
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
5700
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
5701
+ if (w->skip) {
5702
+ w->skip--;
5703
+ continue;
5704
+ }
5705
+
5706
+ /* Expiration of entries doesn't bump sernum, insertion
5707
+ * does. Removal is triggered by insertion, so we can
5708
+ * rely on the fact that if entries change between two
5709
+ * partial dumps, this node is scanned again completely,
5710
+ * see rt6_insert_exception() and fib6_dump_table().
5711
+ *
5712
+ * Count expired entries we go through as handled
5713
+ * entries that we'll skip next time, in case of partial
5714
+ * node dump. Otherwise, if entries expire meanwhile,
5715
+ * we'll skip the wrong amount.
5716
+ */
5717
+ if (rt6_check_expired(rt6_ex->rt6i)) {
5718
+ w->count++;
5719
+ continue;
5720
+ }
5721
+
5722
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
5723
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
5724
+ RTM_NEWROUTE,
5725
+ NETLINK_CB(dump->cb->skb).portid,
5726
+ dump->cb->nlh->nlmsg_seq, w->flags);
5727
+ if (err)
5728
+ return err;
5729
+
5730
+ w->count++;
5731
+ }
/* buckets are laid out as an array: step to the next one */
5732
+ bucket++;
5733
+ }
5734
+
5735
+ return 0;
5736
+}
5737
+
/*
 * Emit one FIB entry, and optionally its exception entries, into a route
 * dump.
 *
 * @rt:    route being visited
 * @p_arg: struct rt6_rtnl_dump_arg providing net, skb, cb and the filter
 * @skip:  number of leading entries for this route to pass over instead of
 *         emitting
 *
 * The per-netns fib6_null_entry and any route rejected by the filter
 * (RTM_F_PREFIX, route type, device, protocol) are reported as done.
 * NLM_F_DUMP_FILTERED is added to the message flags when a filter is set or
 * when either routes or exceptions are excluded from the dump.
 *
 * Returns -1 when the node is finished. Returns 0 when the route itself
 * could not be written. When the exception walk stops early, returns the
 * number of entries handled in this call: the route, if it was emitted,
 * plus the entries counted by the exception walker.
 */
5738
+/* Return -1 if done with node, number of handled routes on partial dump */
5739
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
5740
+{
5741
+ struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
5742
+ struct fib_dump_filter *filter = &arg->filter;
5743
+ unsigned int flags = NLM_F_MULTI;
5744
+ struct net *net = arg->net;
5745
+ int count = 0;
5746
+
5747
+ if (rt == net->ipv6.fib6_null_entry)
5748
+ return -1;
5749
+
5750
+ if ((filter->flags & RTM_F_PREFIX) &&
5751
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
5752
+ /* success since this is not a prefix route */
5753
+ return -1;
5754
+ }
5755
+ if (filter->filter_set &&
5756
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
5757
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
5758
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
5759
+ return -1;
5760
+ }
5761
+
5762
+ if (filter->filter_set ||
5763
+ !filter->dump_routes || !filter->dump_exceptions) {
5764
+ flags |= NLM_F_DUMP_FILTERED;
5765
+ }
5766
+
5767
+ if (filter->dump_routes) {
/* the route itself occupies the first skip slot of this node */
5768
+ if (skip) {
5769
+ skip--;
5770
+ } else {
5771
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
5772
+ 0, RTM_NEWROUTE,
5773
+ NETLINK_CB(arg->cb->skb).portid,
5774
+ arg->cb->nlh->nlmsg_seq, flags)) {
/* nothing was written for this node: zero entries handled */
5775
+ return 0;
5776
+ }
5777
+ count++;
5778
+ }
5779
+ }
5780
+
5781
+ if (filter->dump_exceptions) {
/* skip here is what remains after the route itself was accounted for */
5782
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
5783
+ .rt = rt,
5784
+ .flags = flags,
5785
+ .skip = skip,
5786
+ .count = 0 };
5787
+ int err;
5788
+
5789
+ rcu_read_lock();
5790
+ if (rt->nh) {
5791
+ err = nexthop_for_each_fib6_nh(rt->nh,
5792
+ rt6_nh_dump_exceptions,
5793
+ &w);
5794
+ } else {
5795
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
5796
+ }
5797
+ rcu_read_unlock();
5798
+
/* walk stopped early: report how many entries this call handled */
5799
+ if (err)
5800
+ return count += w.count;
5801
+ }
5802
+
5803
+ return -1;
5804
+}
5805
+
/*
 * Validate a get-route request and parse its attributes into @tb.
 *
 * @skb:    request skb, used to decide whether strict checking applies
 * @nlh:    netlink header of the request
 * @tb:     attribute table filled by the parser (indexed up to RTA_MAX)
 * @extack: extended ack used to report the reason for a rejection
 *
 * The message must be large enough to hold a struct rtmsg. When
 * netlink_strict_get_check() is false, the request is parsed with
 * nlmsg_parse_deprecated() and no further checks are made.
 *
 * Under strict checking:
 *  - rtm_src_len and rtm_dst_len must each be 0 or 128;
 *  - rtm_table, rtm_protocol, rtm_scope and rtm_type must be zero;
 *  - RTM_F_FIB_MATCH is the only flag accepted in rtm_flags;
 *  - RTA_SRC / RTA_DST may be present only with a non-zero matching length;
 *  - only RTA_SRC, RTA_DST, RTA_IIF, RTA_OIF, RTA_MARK, RTA_UID, RTA_SPORT,
 *    RTA_DPORT and RTA_IP_PROTO attributes are accepted.
 *
 * Returns 0 on success, -EINVAL for a request that fails one of the checks
 * above, or the error returned by the attribute parser.
 */
5806
+static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5807
+ const struct nlmsghdr *nlh,
5808
+ struct nlattr **tb,
5809
+ struct netlink_ext_ack *extack)
5810
+{
5811
+ struct rtmsg *rtm;
5812
+ int i, err;
5813
+
5814
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5815
+ NL_SET_ERR_MSG_MOD(extack,
5816
+ "Invalid header for get route request");
5817
+ return -EINVAL;
5818
+ }
5819
+
5820
+ if (!netlink_strict_get_check(skb))
5821
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5822
+ rtm_ipv6_policy, extack);
5823
+
5824
+ rtm = nlmsg_data(nlh);
5825
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5826
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5827
+ rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
5828
+ rtm->rtm_type) {
5829
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
5830
+ return -EINVAL;
5831
+ }
5832
+ if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
5833
+ NL_SET_ERR_MSG_MOD(extack,
5834
+ "Invalid flags for get route request");
5835
+ return -EINVAL;
5836
+ }
5837
+
5838
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
5839
+ rtm_ipv6_policy, extack);
5840
+ if (err)
5841
+ return err;
5842
+
5843
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
5844
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
5845
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
5846
+ return -EINVAL;
5847
+ }
5848
+
/* reject any attribute that is not in the accepted list below */
5849
+ for (i = 0; i <= RTA_MAX; i++) {
5850
+ if (!tb[i])
5851
+ continue;
5852
+
5853
+ switch (i) {
5854
+ case RTA_SRC:
5855
+ case RTA_DST:
5856
+ case RTA_IIF:
5857
+ case RTA_OIF:
5858
+ case RTA_MARK:
5859
+ case RTA_UID:
5860
+ case RTA_SPORT:
5861
+ case RTA_DPORT:
5862
+ case RTA_IP_PROTO:
5863
+ break;
5864
+ default:
5865
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
5866
+ return -EINVAL;
5867
+ }
5868
+ }
5869
+
5870
+ return 0;
49095871 }
49105872
49115873 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
....@@ -4919,16 +5881,14 @@
49195881 struct rt6_info *rt;
49205882 struct sk_buff *skb;
49215883 struct rtmsg *rtm;
4922
- struct flowi6 fl6;
5884
+ struct flowi6 fl6 = {};
49235885 bool fibmatch;
49245886
4925
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4926
- extack);
5887
+ err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
49275888 if (err < 0)
49285889 goto errout;
49295890
49305891 err = -EINVAL;
4931
- memset(&fl6, 0, sizeof(fl6));
49325892 rtm = nlmsg_data(nlh);
49335893 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
49345894 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
....@@ -5085,6 +6045,38 @@
50856045 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
50866046 }
50876047
/*
 * Announce a replaced route to in-kernel listeners and to userspace.
 *
 * @net:  network namespace the route belongs to
 * @rt:   route being announced
 * @info: supplies the destination portid and, when info->nlh is set, the
 *        netlink sequence number (0 is used otherwise)
 *
 * Calls call_fib6_entry_notifiers() with FIB_EVENT_ENTRY_REPLACE, then
 * builds an RTM_NEWROUTE message flagged NLM_F_REPLACE in an skb sized by
 * rt6_nlmsg_size() and sends it to RTNLGRP_IPV6_ROUTE with rtnl_notify().
 *
 * If the skb cannot be allocated (-ENOBUFS) or rt6_fill_node() fails, the
 * error is reported to the group with rtnl_set_sk_err() instead.
 */
6048
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
6049
+ struct nl_info *info)
6050
+{
6051
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6052
+ struct sk_buff *skb;
6053
+ int err = -ENOBUFS;
6054
+
6055
+ /* call_fib6_entry_notifiers will be removed when in-kernel notifier
6056
+ * is implemented and supported for nexthop objects
6057
+ */
6058
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
6059
+
6060
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6061
+ if (!skb)
6062
+ goto errout;
6063
+
6064
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6065
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
6066
+ if (err < 0) {
6067
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6068
+ WARN_ON(err == -EMSGSIZE);
6069
+ kfree_skb(skb);
6070
+ goto errout;
6071
+ }
6072
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6073
+ info->nlh, gfp_any());
6074
+ return;
6075
+errout:
6076
+ if (err < 0)
6077
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6078
+}
6079
+
50886080 static int ip6_route_dev_notify(struct notifier_block *this,
50896081 unsigned long event, void *ptr)
50906082 {
....@@ -5095,7 +6087,7 @@
50956087 return NOTIFY_OK;
50966088
50976089 if (event == NETDEV_REGISTER) {
5098
- net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
6090
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
50996091 net->ipv6.ip6_null_entry->dst.dev = dev;
51006092 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
51016093 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
....@@ -5142,23 +6134,26 @@
51426134
51436135 #ifdef CONFIG_SYSCTL
51446136
5145
-static
5146
-int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5147
- void __user *buffer, size_t *lenp, loff_t *ppos)
/*
 * Write-only sysctl handler that triggers fib6_run_gc().
 *
 * @ctl:    table entry; ctl->extra1 carries the struct net to operate on
 * @write:  must be non-zero; a read is rejected with -EINVAL
 * @buffer, @lenp, @ppos: passed straight to proc_dointvec()
 *
 * Returns -EINVAL on read, the proc_dointvec() error if parsing the write
 * fails (no gc is run in that case), and 0 after fib6_run_gc() was called.
 */
6137
+static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
6138
+ void *buffer, size_t *lenp, loff_t *ppos)
51486139 {
51496140 struct net *net;
51506141 int delay;
6142
+ int ret;
51516143 if (!write)
51526144 return -EINVAL;
51536145
51546146 net = (struct net *)ctl->extra1;
/*
 * NOTE(review): delay is sampled before proc_dointvec() runs, so the gc call
 * below uses the flush_delay value from before this write. This assumes
 * ctl->data points at this net's flush_delay; confirm against the table setup.
 */
51556147 delay = net->ipv6.sysctl.flush_delay;
5156
- proc_dointvec(ctl, write, buffer, lenp, ppos);
6148
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
6149
+ if (ret)
6150
+ return ret;
6151
+
51576152 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
51586153 return 0;
51596154 }
51606155
5161
-struct ctl_table ipv6_route_table_template[] = {
6156
+static struct ctl_table ipv6_route_table_template[] = {
51626157 {
51636158 .procname = "flush",
51646159 .data = &init_net.ipv6.sysctl.flush_delay,
....@@ -5229,6 +6224,15 @@
52296224 .mode = 0644,
52306225 .proc_handler = proc_dointvec_ms_jiffies,
52316226 },
6227
+ {
6228
+ .procname = "skip_notify_on_dev_down",
6229
+ .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
6230
+ .maxlen = sizeof(int),
6231
+ .mode = 0644,
6232
+ .proc_handler = proc_dointvec_minmax,
6233
+ .extra1 = SYSCTL_ZERO,
6234
+ .extra2 = SYSCTL_ONE,
6235
+ },
52326236 { }
52336237 };
52346238
....@@ -5252,6 +6256,7 @@
52526256 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
52536257 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
52546258 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6259
+ table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
52556260
52566261 /* Don't export sysctls to unprivileged users */
52576262 if (net->user_ns != &init_user_ns)
....@@ -5272,11 +6277,11 @@
52726277 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
52736278 goto out_ip6_dst_ops;
52746279
5275
- net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5276
- sizeof(*net->ipv6.fib6_null_entry),
5277
- GFP_KERNEL);
6280
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
52786281 if (!net->ipv6.fib6_null_entry)
52796282 goto out_ip6_dst_entries;
6283
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
6284
+ sizeof(*net->ipv6.fib6_null_entry));
52806285
52816286 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
52826287 sizeof(*net->ipv6.ip6_null_entry),
....@@ -5286,6 +6291,7 @@
52866291 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
52876292 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
52886293 ip6_template_metrics, true);
6294
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
52896295
52906296 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
52916297 net->ipv6.fib6_has_custom_rules = false;
....@@ -5297,6 +6303,7 @@
52976303 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
52986304 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
52996305 ip6_template_metrics, true);
6306
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
53006307
53016308 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
53026309 sizeof(*net->ipv6.ip6_blk_hole_entry),
....@@ -5306,6 +6313,10 @@
53066313 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
53076314 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
53086315 ip6_template_metrics, true);
6316
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
6317
+#ifdef CONFIG_IPV6_SUBTREES
6318
+ net->ipv6.fib6_routes_require_src = 0;
6319
+#endif
53096320 #endif
53106321
53116322 net->ipv6.sysctl.flush_delay = 0;
....@@ -5316,6 +6327,7 @@
53166327 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
53176328 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
53186329 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
6330
+ net->ipv6.sysctl.skip_notify_on_dev_down = 0;
53196331
53206332 net->ipv6.ip6_rt_gc_expire = 30*HZ;
53216333
....@@ -5351,10 +6363,16 @@
/*
 * Late per-netns init: with CONFIG_PROC_FS, create the "ipv6_route" and
 * "rt6_stats" entries under net->proc_net.
 *
 * Returns -ENOMEM if either entry cannot be created; when "rt6_stats" fails,
 * the already-created "ipv6_route" entry is removed first. Returns 0
 * otherwise, including when CONFIG_PROC_FS is not defined.
 */
53516363 static int __net_init ip6_route_net_init_late(struct net *net)
53526364 {
53536365 #ifdef CONFIG_PROC_FS
5354
- proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5355
- sizeof(struct ipv6_route_iter));
5356
- proc_create_net_single("rt6_stats", 0444, net->proc_net,
5357
- rt6_stats_seq_show, NULL);
6366
+ if (!proc_create_net("ipv6_route", 0, net->proc_net,
6367
+ &ipv6_route_seq_ops,
6368
+ sizeof(struct ipv6_route_iter)))
6369
+ return -ENOMEM;
6370
+
6371
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
6372
+ rt6_stats_seq_show, NULL)) {
6373
+ remove_proc_entry("ipv6_route", net->proc_net);
6374
+ return -ENOMEM;
6375
+ }
53586376 #endif
53596377 return 0;
53606378 }
....@@ -5412,7 +6430,7 @@
54126430 /* Registering of the loopback is done before this portion of code,
54136431 * the loopback reference in rt6_info will not be taken, do it
54146432 * manually for init_net */
5415
- init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
6433
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
54166434 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
54176435 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
54186436 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
....@@ -5422,6 +6440,43 @@
54226440 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
54236441 #endif
54246442 }
6443
+
6444
+#if IS_BUILTIN(CONFIG_IPV6)
6445
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6446
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
6447
+
6448
+BTF_ID_LIST(btf_fib6_info_id)
6449
+BTF_ID(struct, fib6_info)
6450
+
/*
 * seq_file description referenced by ipv6_route_reg_info: it reuses
 * ipv6_route_seq_ops with a private area of sizeof(struct ipv6_route_iter),
 * set up and torn down by bpf_iter_init_seq_net() / bpf_iter_fini_seq_net().
 */
6451
+static const struct bpf_iter_seq_info ipv6_route_seq_info = {
6452
+ .seq_ops = &ipv6_route_seq_ops,
6453
+ .init_seq_private = bpf_iter_init_seq_net,
6454
+ .fini_seq_private = bpf_iter_fini_seq_net,
6455
+ .seq_priv_size = sizeof(struct ipv6_route_iter),
6456
+};
6457
+
/*
 * Registration record for the "ipv6_route" BPF iterator target. It declares
 * one context argument, the rt member of struct bpf_iter__ipv6_route, typed
 * PTR_TO_BTF_ID_OR_NULL. The argument's btf_id is not set here; it is filled
 * in by bpf_iter_register() before the target is registered.
 */
6458
+static struct bpf_iter_reg ipv6_route_reg_info = {
6459
+ .target = "ipv6_route",
6460
+ .ctx_arg_info_size = 1,
6461
+ .ctx_arg_info = {
6462
+ { offsetof(struct bpf_iter__ipv6_route, rt),
6463
+ PTR_TO_BTF_ID_OR_NULL },
6464
+ },
6465
+ .seq_info = &ipv6_route_seq_info,
6466
+};
6467
+
/*
 * Set the BTF id of the iterator's context argument from btf_fib6_info_id
 * (declared above for struct fib6_info) and register the "ipv6_route"
 * iterator target.
 *
 * Returns the result of bpf_iter_reg_target().
 */
6468
+static int __init bpf_iter_register(void)
6469
+{
6470
+ ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
6471
+ return bpf_iter_reg_target(&ipv6_route_reg_info);
6472
+}
6473
+
/* Unregister the "ipv6_route" BPF iterator target set up by bpf_iter_register(). */
6474
+static void bpf_iter_unregister(void)
6475
+{
6476
+ bpf_iter_unreg_target(&ipv6_route_reg_info);
6477
+}
6478
+#endif
6479
+#endif
54256480
54266481 int __init ip6_route_init(void)
54276482 {
....@@ -5485,6 +6540,14 @@
54856540 if (ret)
54866541 goto out_register_late_subsys;
54876542
6543
+#if IS_BUILTIN(CONFIG_IPV6)
6544
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6545
+ ret = bpf_iter_register();
6546
+ if (ret)
6547
+ goto out_register_late_subsys;
6548
+#endif
6549
+#endif
6550
+
54886551 for_each_possible_cpu(cpu) {
54896552 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
54906553
....@@ -5517,6 +6580,11 @@
55176580
55186581 void ip6_route_cleanup(void)
55196582 {
6583
+#if IS_BUILTIN(CONFIG_IPV6)
6584
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6585
+ bpf_iter_unregister();
6586
+#endif
6587
+#endif
55206588 unregister_netdevice_notifier(&ip6_route_dev_notifier);
55216589 unregister_pernet_subsys(&ip6_route_net_late_ops);
55226590 fib6_rules_cleanup();