hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/net/ipv6/ip6_fib.c
....@@ -1,14 +1,10 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Linux INET6 implementation
34 * Forwarding Information Database
45 *
56 * Authors:
67 * Pedro Roque <roque@di.fc.ul.pt>
7
- *
8
- * This program is free software; you can redistribute it and/or
9
- * modify it under the terms of the GNU General Public License
10
- * as published by the Free Software Foundation; either version
11
- * 2 of the License, or (at your option) any later version.
128 *
139 * Changes:
1410 * Yuji SEKIYA @USAGI: Support default route on router node;
....@@ -29,6 +25,7 @@
2925 #include <linux/list.h>
3026 #include <linux/slab.h>
3127
28
+#include <net/ip.h>
3229 #include <net/ipv6.h>
3330 #include <net/ndisc.h>
3431 #include <net/addrconf.h>
....@@ -46,6 +43,7 @@
4643 int (*func)(struct fib6_info *, void *arg);
4744 int sernum;
4845 void *arg;
46
+ bool skip_notify;
4947 };
5048
5149 #ifdef CONFIG_IPV6_SUBTREES
....@@ -145,24 +143,21 @@
145143 addr[fn_bit >> 5];
146144 }
147145
148
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
146
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
149147 {
150148 struct fib6_info *f6i;
149
+ size_t sz = sizeof(*f6i);
151150
152
- f6i = kzalloc(sizeof(*f6i), gfp_flags);
151
+ if (with_fib6_nh)
152
+ sz += sizeof(struct fib6_nh);
153
+
154
+ f6i = kzalloc(sz, gfp_flags);
153155 if (!f6i)
154156 return NULL;
155157
156
- f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
157
- if (!f6i->rt6i_pcpu) {
158
- kfree(f6i);
159
- return NULL;
160
- }
161
-
158
+ /* fib6_siblings is a union with nh_list, so this initializes both */
162159 INIT_LIST_HEAD(&f6i->fib6_siblings);
163
- f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
164
-
165
- atomic_inc(&f6i->fib6_ref);
160
+ refcount_set(&f6i->fib6_ref, 1);
166161
167162 return f6i;
168163 }
....@@ -170,45 +165,15 @@
170165 void fib6_info_destroy_rcu(struct rcu_head *head)
171166 {
172167 struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
173
- struct rt6_exception_bucket *bucket;
174
- struct dst_metrics *m;
175168
176169 WARN_ON(f6i->fib6_node);
177170
178
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
179
- if (bucket) {
180
- f6i->rt6i_exception_bucket = NULL;
181
- kfree(bucket);
182
- }
171
+ if (f6i->nh)
172
+ nexthop_put(f6i->nh);
173
+ else
174
+ fib6_nh_release(f6i->fib6_nh);
183175
184
- if (f6i->rt6i_pcpu) {
185
- int cpu;
186
-
187
- for_each_possible_cpu(cpu) {
188
- struct rt6_info **ppcpu_rt;
189
- struct rt6_info *pcpu_rt;
190
-
191
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
192
- pcpu_rt = *ppcpu_rt;
193
- if (pcpu_rt) {
194
- dst_dev_put(&pcpu_rt->dst);
195
- dst_release(&pcpu_rt->dst);
196
- *ppcpu_rt = NULL;
197
- }
198
- }
199
-
200
- free_percpu(f6i->rt6i_pcpu);
201
- }
202
-
203
- lwtstate_put(f6i->fib6_nh.nh_lwtstate);
204
-
205
- if (f6i->fib6_nh.nh_dev)
206
- dev_put(f6i->fib6_nh.nh_dev);
207
-
208
- m = f6i->fib6_metrics;
209
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
210
- kfree(m);
211
-
176
+ ip_fib_metrics_put(f6i->fib6_metrics);
212177 kfree(f6i);
213178 }
214179 EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
....@@ -349,21 +314,24 @@
349314 {
350315 struct rt6_info *rt;
351316
352
- rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
317
+ rt = pol_lookup_func(lookup,
318
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
353319 if (rt->dst.error == -EAGAIN) {
354
- ip6_rt_put(rt);
320
+ ip6_rt_put_flags(rt, flags);
355321 rt = net->ipv6.ip6_null_entry;
356
- dst_hold(&rt->dst);
322
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
323
+ dst_hold(&rt->dst);
357324 }
358325
359326 return &rt->dst;
360327 }
361328
362329 /* called with rcu lock held; no reference taken on fib6_info */
363
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
364
- int flags)
330
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
331
+ struct fib6_result *res, int flags)
365332 {
366
- return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
333
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
334
+ res, flags);
367335 }
368336
369337 static void __net_init fib6_tables_init(struct net *net)
....@@ -390,21 +358,38 @@
390358 return fib_seq;
391359 }
392360
393
-static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
361
+static int call_fib6_entry_notifier(struct notifier_block *nb,
394362 enum fib_event_type event_type,
395
- struct fib6_info *rt)
363
+ struct fib6_info *rt,
364
+ struct netlink_ext_ack *extack)
396365 {
397366 struct fib6_entry_notifier_info info = {
367
+ .info.extack = extack,
398368 .rt = rt,
399369 };
400370
401
- return call_fib6_notifier(nb, net, event_type, &info.info);
371
+ return call_fib6_notifier(nb, event_type, &info.info);
402372 }
403373
404
-static int call_fib6_entry_notifiers(struct net *net,
405
- enum fib_event_type event_type,
406
- struct fib6_info *rt,
407
- struct netlink_ext_ack *extack)
374
+static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
375
+ enum fib_event_type event_type,
376
+ struct fib6_info *rt,
377
+ unsigned int nsiblings,
378
+ struct netlink_ext_ack *extack)
379
+{
380
+ struct fib6_entry_notifier_info info = {
381
+ .info.extack = extack,
382
+ .rt = rt,
383
+ .nsiblings = nsiblings,
384
+ };
385
+
386
+ return call_fib6_notifier(nb, event_type, &info.info);
387
+}
388
+
389
+int call_fib6_entry_notifiers(struct net *net,
390
+ enum fib_event_type event_type,
391
+ struct fib6_info *rt,
392
+ struct netlink_ext_ack *extack)
408393 {
409394 struct fib6_entry_notifier_info info = {
410395 .info.extack = extack,
....@@ -415,43 +400,88 @@
415400 return call_fib6_notifiers(net, event_type, &info.info);
416401 }
417402
403
+int call_fib6_multipath_entry_notifiers(struct net *net,
404
+ enum fib_event_type event_type,
405
+ struct fib6_info *rt,
406
+ unsigned int nsiblings,
407
+ struct netlink_ext_ack *extack)
408
+{
409
+ struct fib6_entry_notifier_info info = {
410
+ .info.extack = extack,
411
+ .rt = rt,
412
+ .nsiblings = nsiblings,
413
+ };
414
+
415
+ rt->fib6_table->fib_seq++;
416
+ return call_fib6_notifiers(net, event_type, &info.info);
417
+}
418
+
419
+int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
420
+{
421
+ struct fib6_entry_notifier_info info = {
422
+ .rt = rt,
423
+ .nsiblings = rt->fib6_nsiblings,
424
+ };
425
+
426
+ rt->fib6_table->fib_seq++;
427
+ return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
428
+}
429
+
418430 struct fib6_dump_arg {
419431 struct net *net;
420432 struct notifier_block *nb;
433
+ struct netlink_ext_ack *extack;
421434 };
422435
423
-static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
436
+static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
424437 {
425
- if (rt == arg->net->ipv6.fib6_null_entry)
426
- return;
427
- call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
438
+ enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
439
+ int err;
440
+
441
+ if (!rt || rt == arg->net->ipv6.fib6_null_entry)
442
+ return 0;
443
+
444
+ if (rt->fib6_nsiblings)
445
+ err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
446
+ rt,
447
+ rt->fib6_nsiblings,
448
+ arg->extack);
449
+ else
450
+ err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
451
+ arg->extack);
452
+
453
+ return err;
428454 }
429455
430456 static int fib6_node_dump(struct fib6_walker *w)
431457 {
432
- struct fib6_info *rt;
458
+ int err;
433459
434
- for_each_fib6_walker_rt(w)
435
- fib6_rt_dump(rt, w->args);
460
+ err = fib6_rt_dump(w->leaf, w->args);
436461 w->leaf = NULL;
437
- return 0;
462
+ return err;
438463 }
439464
440
-static void fib6_table_dump(struct net *net, struct fib6_table *tb,
441
- struct fib6_walker *w)
465
+static int fib6_table_dump(struct net *net, struct fib6_table *tb,
466
+ struct fib6_walker *w)
442467 {
468
+ int err;
469
+
443470 w->root = &tb->tb6_root;
444471 spin_lock_bh(&tb->tb6_lock);
445
- fib6_walk(net, w);
472
+ err = fib6_walk(net, w);
446473 spin_unlock_bh(&tb->tb6_lock);
474
+ return err;
447475 }
448476
449477 /* Called with rcu_read_lock() */
450
-int fib6_tables_dump(struct net *net, struct notifier_block *nb)
478
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
479
+ struct netlink_ext_ack *extack)
451480 {
452481 struct fib6_dump_arg arg;
453482 struct fib6_walker *w;
454483 unsigned int h;
484
+ int err = 0;
455485
456486 w = kzalloc(sizeof(*w), GFP_ATOMIC);
457487 if (!w)
....@@ -460,19 +490,24 @@
460490 w->func = fib6_node_dump;
461491 arg.net = net;
462492 arg.nb = nb;
493
+ arg.extack = extack;
463494 w->args = &arg;
464495
465496 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
466497 struct hlist_head *head = &net->ipv6.fib_table_hash[h];
467498 struct fib6_table *tb;
468499
469
- hlist_for_each_entry_rcu(tb, head, tb6_hlist)
470
- fib6_table_dump(net, tb, w);
500
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
501
+ err = fib6_table_dump(net, tb, w);
502
+ if (err < 0)
503
+ goto out;
504
+ }
471505 }
472506
507
+out:
473508 kfree(w);
474509
475
- return 0;
510
+ return err;
476511 }
477512
478513 static int fib6_dump_node(struct fib6_walker *w)
....@@ -481,12 +516,19 @@
481516 struct fib6_info *rt;
482517
483518 for_each_fib6_walker_rt(w) {
484
- res = rt6_dump_route(rt, w->args);
485
- if (res < 0) {
519
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
520
+ if (res >= 0) {
486521 /* Frame is full, suspend walking */
487522 w->leaf = rt;
523
+
524
+ /* We'll restart from this node, so if some routes were
525
+ * already dumped, skip them next time.
526
+ */
527
+ w->skip_in_node += res;
528
+
488529 return 1;
489530 }
531
+ w->skip_in_node = 0;
490532
491533 /* Multipath routes are dumped in one route with the
492534 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
....@@ -538,6 +580,7 @@
538580 if (cb->args[4] == 0) {
539581 w->count = 0;
540582 w->skip = 0;
583
+ w->skip_in_node = 0;
541584
542585 spin_lock_bh(&table->tb6_lock);
543586 res = fib6_walk(net, w);
....@@ -554,6 +597,7 @@
554597 w->state = FWS_INIT;
555598 w->node = w->root;
556599 w->skip = w->count;
600
+ w->skip_in_node = 0;
557601 } else
558602 w->skip = 0;
559603
....@@ -571,17 +615,29 @@
571615
572616 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
573617 {
618
+ struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
619
+ .filter.dump_routes = true };
620
+ const struct nlmsghdr *nlh = cb->nlh;
574621 struct net *net = sock_net(skb->sk);
575622 unsigned int h, s_h;
576623 unsigned int e = 0, s_e;
577
- struct rt6_rtnl_dump_arg arg;
578624 struct fib6_walker *w;
579625 struct fib6_table *tb;
580626 struct hlist_head *head;
581627 int res = 0;
582628
583
- s_h = cb->args[0];
584
- s_e = cb->args[1];
629
+ if (cb->strict_check) {
630
+ int err;
631
+
632
+ err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
633
+ if (err < 0)
634
+ return err;
635
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
636
+ struct rtmsg *rtm = nlmsg_data(nlh);
637
+
638
+ if (rtm->rtm_flags & RTM_F_PREFIX)
639
+ arg.filter.flags = RTM_F_PREFIX;
640
+ }
585641
586642 w = (void *)cb->args[2];
587643 if (!w) {
....@@ -607,6 +663,27 @@
607663 arg.net = net;
608664 w->args = &arg;
609665
666
+ if (arg.filter.table_id) {
667
+ tb = fib6_get_table(net, arg.filter.table_id);
668
+ if (!tb) {
669
+ if (rtnl_msg_family(cb->nlh) != PF_INET6)
670
+ goto out;
671
+
672
+ NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
673
+ return -ENOENT;
674
+ }
675
+
676
+ if (!cb->args[0]) {
677
+ res = fib6_dump_table(tb, skb, cb);
678
+ if (!res)
679
+ cb->args[0] = 1;
680
+ }
681
+ goto out;
682
+ }
683
+
684
+ s_h = cb->args[0];
685
+ s_e = cb->args[1];
686
+
610687 rcu_read_lock();
611688 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
612689 e = 0;
....@@ -616,16 +693,16 @@
616693 goto next;
617694 res = fib6_dump_table(tb, skb, cb);
618695 if (res != 0)
619
- goto out;
696
+ goto out_unlock;
620697 next:
621698 e++;
622699 }
623700 }
624
-out:
701
+out_unlock:
625702 rcu_read_unlock();
626703 cb->args[1] = e;
627704 cb->args[0] = h;
628
-
705
+out:
629706 res = res < 0 ? res : skb->len;
630707 if (res <= 0)
631708 fib6_dump_end(cb);
....@@ -820,8 +897,8 @@
820897
821898 RCU_INIT_POINTER(in->parent, pn);
822899 in->leaf = fn->leaf;
823
- atomic_inc(&rcu_dereference_protected(in->leaf,
824
- lockdep_is_held(&table->tb6_lock))->fib6_ref);
900
+ fib6_info_hold(rcu_dereference_protected(in->leaf,
901
+ lockdep_is_held(&table->tb6_lock)));
825902
826903 /* update parent pointer */
827904 if (dir)
....@@ -873,16 +950,14 @@
873950 return ln;
874951 }
875952
876
-static void fib6_drop_pcpu_from(struct fib6_info *f6i,
877
- const struct fib6_table *table)
953
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
954
+ const struct fib6_info *match,
955
+ const struct fib6_table *table)
878956 {
879957 int cpu;
880958
881
- /* Make sure rt6_make_pcpu_route() wont add other percpu routes
882
- * while we are cleaning them here.
883
- */
884
- f6i->fib6_destroying = 1;
885
- mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
959
+ if (!fib6_nh->rt6i_pcpu)
960
+ return;
886961
887962 /* release the reference to this fib entry from
888963 * all of its cached pcpu routes
....@@ -891,14 +966,58 @@
891966 struct rt6_info **ppcpu_rt;
892967 struct rt6_info *pcpu_rt;
893968
894
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
969
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
895970 pcpu_rt = *ppcpu_rt;
896
- if (pcpu_rt) {
971
+
972
+ /* only dropping the 'from' reference if the cached route
973
+ * is using 'match'. The cached pcpu_rt->from only changes
974
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
975
+ * change from one fib6_info reference to another
976
+ */
977
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
897978 struct fib6_info *from;
898979
899980 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
900981 fib6_info_release(from);
901982 }
983
+ }
984
+}
985
+
986
+struct fib6_nh_pcpu_arg {
987
+ struct fib6_info *from;
988
+ const struct fib6_table *table;
989
+};
990
+
991
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
992
+{
993
+ struct fib6_nh_pcpu_arg *arg = _arg;
994
+
995
+ __fib6_drop_pcpu_from(nh, arg->from, arg->table);
996
+ return 0;
997
+}
998
+
999
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
1000
+ const struct fib6_table *table)
1001
+{
1002
+ /* Make sure rt6_make_pcpu_route() wont add other percpu routes
1003
+ * while we are cleaning them here.
1004
+ */
1005
+ f6i->fib6_destroying = 1;
1006
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
1007
+
1008
+ if (f6i->nh) {
1009
+ struct fib6_nh_pcpu_arg arg = {
1010
+ .from = f6i,
1011
+ .table = table
1012
+ };
1013
+
1014
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
1015
+ &arg);
1016
+ } else {
1017
+ struct fib6_nh *fib6_nh;
1018
+
1019
+ fib6_nh = f6i->fib6_nh;
1020
+ __fib6_drop_pcpu_from(fib6_nh, f6i, table);
9021021 }
9031022 }
9041023
....@@ -909,10 +1028,12 @@
9091028
9101029 /* Flush all cached dst in exception table */
9111030 rt6_flush_exceptions(rt);
912
- if (rt->rt6i_pcpu)
913
- fib6_drop_pcpu_from(rt, table);
1031
+ fib6_drop_pcpu_from(rt, table);
9141032
915
- if (atomic_read(&rt->fib6_ref) != 1) {
1033
+ if (rt->nh && !list_empty(&rt->nh_list))
1034
+ list_del_init(&rt->nh_list);
1035
+
1036
+ if (refcount_read(&rt->fib6_ref) != 1) {
9161037 /* This route is used as dummy address holder in some split
9171038 * nodes. It is not leaked, but it still holds other resources,
9181039 * which must be released in time. So, scan ascendant nodes
....@@ -925,7 +1046,7 @@
9251046 struct fib6_info *new_leaf;
9261047 if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
9271048 new_leaf = fib6_find_prefix(net, table, fn);
928
- atomic_inc(&new_leaf->fib6_ref);
1049
+ fib6_info_hold(new_leaf);
9291050
9301051 rcu_assign_pointer(fn->leaf, new_leaf);
9311052 fib6_info_release(rt);
....@@ -955,6 +1076,7 @@
9551076 (info->nlh->nlmsg_flags & NLM_F_CREATE));
9561077 int found = 0;
9571078 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1079
+ bool notify_sibling_rt = false;
9581080 u16 nlflags = NLM_F_EXCL;
9591081 int err;
9601082
....@@ -1047,6 +1169,7 @@
10471169
10481170 /* Find the first route that have the same metric */
10491171 sibling = leaf;
1172
+ notify_sibling_rt = true;
10501173 while (sibling) {
10511174 if (sibling->fib6_metric == rt->fib6_metric &&
10521175 rt6_qualify_for_ecmp(sibling)) {
....@@ -1056,6 +1179,7 @@
10561179 }
10571180 sibling = rcu_dereference_protected(sibling->fib6_next,
10581181 lockdep_is_held(&rt->fib6_table->tb6_lock));
1182
+ notify_sibling_rt = false;
10591183 }
10601184 /* For each sibling in the list, increment the counter of
10611185 * siblings. BUG() if counters does not match, list of siblings
....@@ -1082,30 +1206,43 @@
10821206 add:
10831207 nlflags |= NLM_F_CREATE;
10841208
1085
- err = call_fib6_entry_notifiers(info->nl_net,
1086
- FIB_EVENT_ENTRY_ADD,
1087
- rt, extack);
1088
- if (err) {
1089
- struct fib6_info *sibling, *next_sibling;
1209
+ /* The route should only be notified if it is the first
1210
+ * route in the node or if it is added as a sibling
1211
+ * route to the first route in the node.
1212
+ */
1213
+ if (!info->skip_notify_kernel &&
1214
+ (notify_sibling_rt || ins == &fn->leaf)) {
1215
+ enum fib_event_type fib_event;
10901216
1091
- /* If the route has siblings, then it first
1092
- * needs to be unlinked from them.
1093
- */
1094
- if (!rt->fib6_nsiblings)
1217
+ if (notify_sibling_rt)
1218
+ fib_event = FIB_EVENT_ENTRY_APPEND;
1219
+ else
1220
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
1221
+ err = call_fib6_entry_notifiers(info->nl_net,
1222
+ fib_event, rt,
1223
+ extack);
1224
+ if (err) {
1225
+ struct fib6_info *sibling, *next_sibling;
1226
+
1227
+ /* If the route has siblings, then it first
1228
+ * needs to be unlinked from them.
1229
+ */
1230
+ if (!rt->fib6_nsiblings)
1231
+ return err;
1232
+
1233
+ list_for_each_entry_safe(sibling, next_sibling,
1234
+ &rt->fib6_siblings,
1235
+ fib6_siblings)
1236
+ sibling->fib6_nsiblings--;
1237
+ rt->fib6_nsiblings = 0;
1238
+ list_del_init(&rt->fib6_siblings);
1239
+ rt6_multipath_rebalance(next_sibling);
10951240 return err;
1096
-
1097
- list_for_each_entry_safe(sibling, next_sibling,
1098
- &rt->fib6_siblings,
1099
- fib6_siblings)
1100
- sibling->fib6_nsiblings--;
1101
- rt->fib6_nsiblings = 0;
1102
- list_del_init(&rt->fib6_siblings);
1103
- rt6_multipath_rebalance(next_sibling);
1104
- return err;
1241
+ }
11051242 }
11061243
11071244 rcu_assign_pointer(rt->fib6_next, iter);
1108
- atomic_inc(&rt->fib6_ref);
1245
+ fib6_info_hold(rt);
11091246 rcu_assign_pointer(rt->fib6_node, fn);
11101247 rcu_assign_pointer(*ins, rt);
11111248 if (!info->skip_notify)
....@@ -1127,13 +1264,15 @@
11271264 return -ENOENT;
11281265 }
11291266
1130
- err = call_fib6_entry_notifiers(info->nl_net,
1131
- FIB_EVENT_ENTRY_REPLACE,
1132
- rt, extack);
1133
- if (err)
1134
- return err;
1267
+ if (!info->skip_notify_kernel && ins == &fn->leaf) {
1268
+ err = call_fib6_entry_notifiers(info->nl_net,
1269
+ FIB_EVENT_ENTRY_REPLACE,
1270
+ rt, extack);
1271
+ if (err)
1272
+ return err;
1273
+ }
11351274
1136
- atomic_inc(&rt->fib6_ref);
1275
+ fib6_info_hold(rt);
11371276 rcu_assign_pointer(rt->fib6_node, fn);
11381277 rt->fib6_next = iter->fib6_next;
11391278 rcu_assign_pointer(*ins, rt);
....@@ -1215,6 +1354,14 @@
12151354 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
12161355 }
12171356
1357
+/* allow ipv4 to update sernum via ipv6_stub */
1358
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1359
+{
1360
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
1361
+ fib6_update_sernum_upto_root(net, f6i);
1362
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1363
+}
1364
+
12181365 /*
12191366 * Add routing information to the routing tree.
12201367 * <destination addr>/<source addr>
....@@ -1230,7 +1377,6 @@
12301377 int err = -ENOMEM;
12311378 int allow_create = 1;
12321379 int replace_required = 0;
1233
- int sernum = fib6_new_sernum(info->nl_net);
12341380
12351381 if (info->nlh) {
12361382 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
....@@ -1275,7 +1421,7 @@
12751421 if (!sfn)
12761422 goto failure;
12771423
1278
- atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
1424
+ fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
12791425 rcu_assign_pointer(sfn->leaf,
12801426 info->nl_net->ipv6.fib6_null_entry);
12811427 sfn->fn_flags = RTN_ROOT;
....@@ -1318,7 +1464,7 @@
13181464 rcu_assign_pointer(fn->leaf,
13191465 info->nl_net->ipv6.fib6_null_entry);
13201466 } else {
1321
- atomic_inc(&rt->fib6_ref);
1467
+ fib6_info_hold(rt);
13221468 rcu_assign_pointer(fn->leaf, rt);
13231469 }
13241470 }
....@@ -1328,7 +1474,9 @@
13281474
13291475 err = fib6_add_rt2node(fn, rt, info, extack);
13301476 if (!err) {
1331
- __fib6_update_sernum_upto_root(rt, sernum);
1477
+ if (rt->nh)
1478
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
1479
+ __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
13321480 fib6_start_gc(info->nl_net, rt);
13331481 }
13341482
....@@ -1364,6 +1512,8 @@
13641512 }
13651513 #endif
13661514 goto failure;
1515
+ } else if (fib6_requires_src(rt)) {
1516
+ fib6_routes_require_src_inc(info->nl_net);
13671517 }
13681518 return err;
13691519
....@@ -1664,10 +1814,14 @@
16641814
16651815 children = 0;
16661816 child = NULL;
1667
- if (fn_r)
1668
- child = fn_r, children |= 1;
1669
- if (fn_l)
1670
- child = fn_l, children |= 2;
1817
+ if (fn_r) {
1818
+ child = fn_r;
1819
+ children |= 1;
1820
+ }
1821
+ if (fn_l) {
1822
+ child = fn_l;
1823
+ children |= 2;
1824
+ }
16711825
16721826 if (children == 3 || FIB6_SUBTREE(fn)
16731827 #ifdef CONFIG_IPV6_SUBTREES
....@@ -1746,12 +1900,28 @@
17461900 static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
17471901 struct fib6_info __rcu **rtp, struct nl_info *info)
17481902 {
1903
+ struct fib6_info *leaf, *replace_rt = NULL;
17491904 struct fib6_walker *w;
17501905 struct fib6_info *rt = rcu_dereference_protected(*rtp,
17511906 lockdep_is_held(&table->tb6_lock));
17521907 struct net *net = info->nl_net;
1908
+ bool notify_del = false;
17531909
17541910 RT6_TRACE("fib6_del_route\n");
1911
+
1912
+ /* If the deleted route is the first in the node and it is not part of
1913
+ * a multipath route, then we need to replace it with the next route
1914
+ * in the node, if exists.
1915
+ */
1916
+ leaf = rcu_dereference_protected(fn->leaf,
1917
+ lockdep_is_held(&table->tb6_lock));
1918
+ if (leaf == rt && !rt->fib6_nsiblings) {
1919
+ if (rcu_access_pointer(rt->fib6_next))
1920
+ replace_rt = rcu_dereference_protected(rt->fib6_next,
1921
+ lockdep_is_held(&table->tb6_lock));
1922
+ else
1923
+ notify_del = true;
1924
+ }
17551925
17561926 /* Unlink it */
17571927 *rtp = rt->fib6_next;
....@@ -1767,6 +1937,14 @@
17671937 if (rt->fib6_nsiblings) {
17681938 struct fib6_info *sibling, *next_sibling;
17691939
1940
+ /* The route is deleted from a multipath route. If this
1941
+ * multipath route is the first route in the node, then we need
1942
+ * to emit a delete notification. Otherwise, we need to skip
1943
+ * the notification.
1944
+ */
1945
+ if (rt->fib6_metric == leaf->fib6_metric &&
1946
+ rt6_qualify_for_ecmp(leaf))
1947
+ notify_del = true;
17701948 list_for_each_entry_safe(sibling, next_sibling,
17711949 &rt->fib6_siblings, fib6_siblings)
17721950 sibling->fib6_nsiblings--;
....@@ -1802,9 +1980,16 @@
18021980
18031981 fib6_purge_rt(rt, fn, net);
18041982
1805
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
1983
+ if (!info->skip_notify_kernel) {
1984
+ if (notify_del)
1985
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
1986
+ rt, NULL);
1987
+ else if (replace_rt)
1988
+ call_fib6_entry_notifiers_replace(net, replace_rt);
1989
+ }
18061990 if (!info->skip_notify)
18071991 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1992
+
18081993 fib6_info_release(rt);
18091994 }
18101995
....@@ -1836,6 +2021,8 @@
18362021 struct fib6_info *cur = rcu_dereference_protected(*rtp,
18372022 lockdep_is_held(&table->tb6_lock));
18382023 if (rt == cur) {
2024
+ if (fib6_requires_src(cur))
2025
+ fib6_routes_require_src_dec(info->nl_net);
18392026 fib6_del_route(table, fn, rtp, info);
18402027 return 0;
18412028 }
....@@ -1890,8 +2077,8 @@
18902077 continue;
18912078 }
18922079 w->state = FWS_L;
2080
+ fallthrough;
18932081 #endif
1894
- /* fall through */
18952082 case FWS_L:
18962083 left = rcu_dereference_protected(fn->left, 1);
18972084 if (left) {
....@@ -1900,7 +2087,7 @@
19002087 continue;
19012088 }
19022089 w->state = FWS_R;
1903
- /* fall through */
2090
+ fallthrough;
19042091 case FWS_R:
19052092 right = rcu_dereference_protected(fn->right, 1);
19062093 if (right) {
....@@ -1910,7 +2097,7 @@
19102097 }
19112098 w->state = FWS_C;
19122099 w->leaf = rcu_dereference_protected(fn->leaf, 1);
1913
- /* fall through */
2100
+ fallthrough;
19142101 case FWS_C:
19152102 if (w->leaf && fn->fn_flags & RTN_RTINFO) {
19162103 int err;
....@@ -1929,7 +2116,7 @@
19292116 }
19302117 skip:
19312118 w->state = FWS_U;
1932
- /* fall through */
2119
+ fallthrough;
19332120 case FWS_U:
19342121 if (fn == w->root)
19352122 return 0;
....@@ -1981,6 +2168,7 @@
19812168 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
19822169 struct nl_info info = {
19832170 .nl_net = c->net,
2171
+ .skip_notify = c->skip_notify,
19842172 };
19852173
19862174 if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
....@@ -2032,7 +2220,7 @@
20322220
20332221 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
20342222 int (*func)(struct fib6_info *, void *arg),
2035
- int sernum, void *arg)
2223
+ int sernum, void *arg, bool skip_notify)
20362224 {
20372225 struct fib6_cleaner c;
20382226
....@@ -2040,17 +2228,19 @@
20402228 c.w.func = fib6_clean_node;
20412229 c.w.count = 0;
20422230 c.w.skip = 0;
2231
+ c.w.skip_in_node = 0;
20432232 c.func = func;
20442233 c.sernum = sernum;
20452234 c.arg = arg;
20462235 c.net = net;
2236
+ c.skip_notify = skip_notify;
20472237
20482238 fib6_walk(net, &c.w);
20492239 }
20502240
20512241 static void __fib6_clean_all(struct net *net,
20522242 int (*func)(struct fib6_info *, void *),
2053
- int sernum, void *arg)
2243
+ int sernum, void *arg, bool skip_notify)
20542244 {
20552245 struct fib6_table *table;
20562246 struct hlist_head *head;
....@@ -2062,7 +2252,7 @@
20622252 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
20632253 spin_lock_bh(&table->tb6_lock);
20642254 fib6_clean_tree(net, &table->tb6_root,
2065
- func, sernum, arg);
2255
+ func, sernum, arg, skip_notify);
20662256 spin_unlock_bh(&table->tb6_lock);
20672257 }
20682258 }
....@@ -2072,14 +2262,21 @@
20722262 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
20732263 void *arg)
20742264 {
2075
- __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
2265
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2266
+}
2267
+
2268
+void fib6_clean_all_skip_notify(struct net *net,
2269
+ int (*func)(struct fib6_info *, void *),
2270
+ void *arg)
2271
+{
2272
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
20762273 }
20772274
20782275 static void fib6_flush_trees(struct net *net)
20792276 {
20802277 int new_sernum = fib6_new_sernum(net);
20812278
2082
- __fib6_clean_all(net, NULL, new_sernum, NULL);
2279
+ __fib6_clean_all(net, NULL, new_sernum, NULL, false);
20832280 }
20842281
20852282 /*
....@@ -2279,11 +2476,16 @@
22792476 }
22802477
22812478 #ifdef CONFIG_PROC_FS
2282
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2479
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
22832480 {
22842481 struct fib6_info *rt = v;
22852482 struct ipv6_route_iter *iter = seq->private;
2483
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
2484
+ unsigned int flags = rt->fib6_flags;
22862485 const struct net_device *dev;
2486
+
2487
+ if (rt->nh)
2488
+ fib6_nh = nexthop_fib6_nh_bh(rt->nh);
22872489
22882490 seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
22892491
....@@ -2292,15 +2494,17 @@
22922494 #else
22932495 seq_puts(seq, "00000000000000000000000000000000 00 ");
22942496 #endif
2295
- if (rt->fib6_flags & RTF_GATEWAY)
2296
- seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
2297
- else
2497
+ if (fib6_nh->fib_nh_gw_family) {
2498
+ flags |= RTF_GATEWAY;
2499
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2500
+ } else {
22982501 seq_puts(seq, "00000000000000000000000000000000");
2502
+ }
22992503
2300
- dev = rt->fib6_nh.nh_dev;
2504
+ dev = fib6_nh->fib_nh_dev;
23012505 seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2302
- rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
2303
- rt->fib6_flags, dev ? dev->name : "");
2506
+ rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2507
+ flags, dev ? dev->name : "");
23042508 iter->w.leaf = NULL;
23052509 return 0;
23062510 }
....@@ -2434,7 +2638,7 @@
24342638 return w->node && !(w->state == FWS_U && w->node == w->root);
24352639 }
24362640
2437
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2641
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
24382642 __releases(RCU_BH)
24392643 {
24402644 struct net *net = seq_file_net(seq);
....@@ -2446,6 +2650,62 @@
24462650 rcu_read_unlock_bh();
24472651 }
24482652
2653
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
2654
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
2655
+ struct bpf_iter_meta *meta,
2656
+ void *v)
2657
+{
2658
+ struct bpf_iter__ipv6_route ctx;
2659
+
2660
+ ctx.meta = meta;
2661
+ ctx.rt = v;
2662
+ return bpf_iter_run_prog(prog, &ctx);
2663
+}
2664
+
2665
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2666
+{
2667
+ struct ipv6_route_iter *iter = seq->private;
2668
+ struct bpf_iter_meta meta;
2669
+ struct bpf_prog *prog;
2670
+ int ret;
2671
+
2672
+ meta.seq = seq;
2673
+ prog = bpf_iter_get_info(&meta, false);
2674
+ if (!prog)
2675
+ return ipv6_route_native_seq_show(seq, v);
2676
+
2677
+ ret = ipv6_route_prog_seq_show(prog, &meta, v);
2678
+ iter->w.leaf = NULL;
2679
+
2680
+ return ret;
2681
+}
2682
+
2683
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2684
+{
2685
+ struct bpf_iter_meta meta;
2686
+ struct bpf_prog *prog;
2687
+
2688
+ if (!v) {
2689
+ meta.seq = seq;
2690
+ prog = bpf_iter_get_info(&meta, true);
2691
+ if (prog)
2692
+ (void)ipv6_route_prog_seq_show(prog, &meta, v);
2693
+ }
2694
+
2695
+ ipv6_route_native_seq_stop(seq, v);
2696
+}
2697
+#else
2698
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2699
+{
2700
+ return ipv6_route_native_seq_show(seq, v);
2701
+}
2702
+
2703
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2704
+{
2705
+ ipv6_route_native_seq_stop(seq, v);
2706
+}
2707
+#endif
2708
+
24492709 const struct seq_operations ipv6_route_seq_ops = {
24502710 .start = ipv6_route_seq_start,
24512711 .next = ipv6_route_seq_next,