hc
2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/net/ipv4/route.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * INET An implementation of the TCP/IP protocol suite for the LINUX
34 * operating system. INET is implemented using the BSD Socket
....@@ -55,11 +56,6 @@
5556 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
5657 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
5758 * Ilia Sotnikov : Removed TOS from hash calculations
58
- *
59
- * This program is free software; you can redistribute it and/or
60
- * modify it under the terms of the GNU General Public License
61
- * as published by the Free Software Foundation; either version
62
- * 2 of the License, or (at your option) any later version.
6359 */
6460
6561 #define pr_fmt(fmt) "IPv4: " fmt
....@@ -70,7 +66,7 @@
7066 #include <linux/types.h>
7167 #include <linux/kernel.h>
7268 #include <linux/mm.h>
73
-#include <linux/bootmem.h>
69
+#include <linux/memblock.h>
7470 #include <linux/string.h>
7571 #include <linux/socket.h>
7672 #include <linux/sockios.h>
....@@ -100,6 +96,7 @@
10096 #include <net/inetpeer.h>
10197 #include <net/sock.h>
10298 #include <net/ip_fib.h>
99
+#include <net/nexthop.h>
103100 #include <net/arp.h>
104101 #include <net/tcp.h>
105102 #include <net/icmp.h>
....@@ -241,11 +238,11 @@
241238 return seq_open(file, &rt_cache_seq_ops);
242239 }
243240
244
-static const struct file_operations rt_cache_seq_fops = {
245
- .open = rt_cache_seq_open,
246
- .read = seq_read,
247
- .llseek = seq_lseek,
248
- .release = seq_release,
241
+static const struct proc_ops rt_cache_proc_ops = {
242
+ .proc_open = rt_cache_seq_open,
243
+ .proc_read = seq_read,
244
+ .proc_lseek = seq_lseek,
245
+ .proc_release = seq_release,
249246 };
250247
251248
....@@ -332,11 +329,11 @@
332329 return seq_open(file, &rt_cpu_seq_ops);
333330 }
334331
335
-static const struct file_operations rt_cpu_seq_fops = {
336
- .open = rt_cpu_seq_open,
337
- .read = seq_read,
338
- .llseek = seq_lseek,
339
- .release = seq_release,
332
+static const struct proc_ops rt_cpu_proc_ops = {
333
+ .proc_open = rt_cpu_seq_open,
334
+ .proc_read = seq_read,
335
+ .proc_lseek = seq_lseek,
336
+ .proc_release = seq_release,
340337 };
341338
342339 #ifdef CONFIG_IP_ROUTE_CLASSID
....@@ -370,12 +367,12 @@
370367 struct proc_dir_entry *pde;
371368
372369 pde = proc_create("rt_cache", 0444, net->proc_net,
373
- &rt_cache_seq_fops);
370
+ &rt_cache_proc_ops);
374371 if (!pde)
375372 goto err1;
376373
377374 pde = proc_create("rt_cache", 0444,
378
- net->proc_net_stat, &rt_cpu_seq_fops);
375
+ net->proc_net_stat, &rt_cpu_proc_ops);
379376 if (!pde)
380377 goto err2;
381378
....@@ -437,37 +434,46 @@
437434 struct sk_buff *skb,
438435 const void *daddr)
439436 {
437
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
440438 struct net_device *dev = dst->dev;
441
- const __be32 *pkey = daddr;
442
- const struct rtable *rt;
443439 struct neighbour *n;
444440
445
- rt = (const struct rtable *) dst;
446
- if (rt->rt_gateway)
447
- pkey = (const __be32 *) &rt->rt_gateway;
448
- else if (skb)
449
- pkey = &ip_hdr(skb)->daddr;
441
+ rcu_read_lock_bh();
450442
451
- n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
452
- if (n)
453
- return n;
454
- return neigh_create(&arp_tbl, pkey, dev);
443
+ if (likely(rt->rt_gw_family == AF_INET)) {
444
+ n = ip_neigh_gw4(dev, rt->rt_gw4);
445
+ } else if (rt->rt_gw_family == AF_INET6) {
446
+ n = ip_neigh_gw6(dev, &rt->rt_gw6);
447
+ } else {
448
+ __be32 pkey;
449
+
450
+ pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451
+ n = ip_neigh_gw4(dev, pkey);
452
+ }
453
+
454
+ if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455
+ n = NULL;
456
+
457
+ rcu_read_unlock_bh();
458
+
459
+ return n;
455460 }
456461
457462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
458463 {
464
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
459465 struct net_device *dev = dst->dev;
460466 const __be32 *pkey = daddr;
461
- const struct rtable *rt;
462467
463
- rt = (const struct rtable *)dst;
464
- if (rt->rt_gateway)
465
- pkey = (const __be32 *)&rt->rt_gateway;
466
- else if (!daddr ||
468
+ if (rt->rt_gw_family == AF_INET) {
469
+ pkey = (const __be32 *)&rt->rt_gw4;
470
+ } else if (rt->rt_gw_family == AF_INET6) {
471
+ return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472
+ } else if (!daddr ||
467473 (rt->rt_flags &
468
- (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
474
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
469475 return;
470
-
476
+ }
471477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
472478 }
473479
....@@ -522,6 +528,15 @@
522528 iph->id = htons(id);
523529 }
524530 EXPORT_SYMBOL(__ip_select_ident);
531
+
532
+static void ip_rt_fix_tos(struct flowi4 *fl4)
533
+{
534
+ __u8 tos = RT_FL_TOS(fl4);
535
+
536
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537
+ fl4->flowi4_scope = tos & RTO_ONLINK ?
538
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539
+}
525540
526541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
527542 const struct sock *sk,
....@@ -643,13 +658,15 @@
643658
644659 if (fnhe->fnhe_gw) {
645660 rt->rt_flags |= RTCF_REDIRECTED;
646
- rt->rt_gateway = fnhe->fnhe_gw;
647661 rt->rt_uses_gateway = 1;
662
+ rt->rt_gw_family = AF_INET;
663
+ rt->rt_gw4 = fnhe->fnhe_gw;
648664 }
649665 }
650666
651
-static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
652
- u32 pmtu, bool lock, unsigned long expires)
667
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668
+ __be32 gw, u32 pmtu, bool lock,
669
+ unsigned long expires)
653670 {
654671 struct fnhe_hash_bucket *hash;
655672 struct fib_nh_exception *fnhe;
....@@ -658,17 +675,17 @@
658675 unsigned int i;
659676 int depth;
660677
661
- genid = fnhe_genid(dev_net(nh->nh_dev));
678
+ genid = fnhe_genid(dev_net(nhc->nhc_dev));
662679 hval = fnhe_hashfun(daddr);
663680
664681 spin_lock_bh(&fnhe_lock);
665682
666
- hash = rcu_dereference(nh->nh_exceptions);
683
+ hash = rcu_dereference(nhc->nhc_exceptions);
667684 if (!hash) {
668685 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
669686 if (!hash)
670687 goto out_unlock;
671
- rcu_assign_pointer(nh->nh_exceptions, hash);
688
+ rcu_assign_pointer(nhc->nhc_exceptions, hash);
672689 }
673690
674691 hash += hval;
....@@ -727,13 +744,13 @@
727744 * stale, so anyone caching it rechecks if this exception
728745 * applies to them.
729746 */
730
- rt = rcu_dereference(nh->nh_rth_input);
747
+ rt = rcu_dereference(nhc->nhc_rth_input);
731748 if (rt)
732749 rt->dst.obsolete = DST_OBSOLETE_KILL;
733750
734751 for_each_possible_cpu(i) {
735752 struct rtable __rcu **prt;
736
- prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
753
+ prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
737754 rt = rcu_dereference(*prt);
738755 if (rt)
739756 rt->dst.obsolete = DST_OBSOLETE_KILL;
....@@ -768,7 +785,7 @@
768785 return;
769786 }
770787
771
- if (rt->rt_gateway != old_gw)
788
+ if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
772789 return;
773790
774791 in_dev = __in_dev_get_rcu(dev);
....@@ -799,11 +816,11 @@
799816 neigh_event_send(n, NULL);
800817 } else {
801818 if (fib_lookup(net, fl4, &res, 0) == 0) {
802
- struct fib_nh *nh;
819
+ struct fib_nh_common *nhc;
803820
804821 fib_select_path(net, &res, fl4, skb);
805
- nh = &FIB_RES_NH(res);
806
- update_or_create_fnhe(nh, fl4->daddr, new_gw,
822
+ nhc = FIB_RES_NHC(res);
823
+ update_or_create_fnhe(nhc, fl4->daddr, new_gw,
807824 0, false,
808825 jiffies + ip_rt_gc_timeout);
809826 }
....@@ -845,6 +862,7 @@
845862 rt = (struct rtable *) dst;
846863
847864 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865
+ ip_rt_fix_tos(&fl4);
848866 __ip_do_redirect(rt, skb, &fl4, true);
849867 }
850868
....@@ -1029,13 +1047,14 @@
10291047 {
10301048 struct dst_entry *dst = &rt->dst;
10311049 struct net *net = dev_net(dst->dev);
1032
- u32 old_mtu = ipv4_mtu(dst);
10331050 struct fib_result res;
10341051 bool lock = false;
1052
+ u32 old_mtu;
10351053
10361054 if (ip_mtu_locked(dst))
10371055 return;
10381056
1057
+ old_mtu = ipv4_mtu(dst);
10391058 if (old_mtu < mtu)
10401059 return;
10411060
....@@ -1050,11 +1069,11 @@
10501069
10511070 rcu_read_lock();
10521071 if (fib_lookup(net, fl4, &res, 0) == 0) {
1053
- struct fib_nh *nh;
1072
+ struct fib_nh_common *nhc;
10541073
10551074 fib_select_path(net, &res, fl4, NULL);
1056
- nh = &FIB_RES_NH(res);
1057
- update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1075
+ nhc = FIB_RES_NHC(res);
1076
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
10581077 jiffies + ip_rt_mtu_expires);
10591078 }
10601079 rcu_read_unlock();
....@@ -1068,21 +1087,25 @@
10681087 struct flowi4 fl4;
10691088
10701089 ip_rt_build_flow_key(&fl4, sk, skb);
1090
+ ip_rt_fix_tos(&fl4);
1091
+
1092
+ /* Don't make lookup fail for bridged encapsulations */
1093
+ if (skb && netif_is_any_bridge_port(skb->dev))
1094
+ fl4.flowi4_oif = 0;
1095
+
10711096 __ip_rt_update_pmtu(rt, &fl4, mtu);
10721097 }
10731098
10741099 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1075
- int oif, u32 mark, u8 protocol, int flow_flags)
1100
+ int oif, u8 protocol)
10761101 {
1077
- const struct iphdr *iph = (const struct iphdr *) skb->data;
1102
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
10781103 struct flowi4 fl4;
10791104 struct rtable *rt;
1080
-
1081
- if (!mark)
1082
- mark = IP4_REPLY_MARK(net, skb->mark);
1105
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
10831106
10841107 __build_flow_key(net, &fl4, NULL, iph, oif,
1085
- RT_TOS(iph->tos), protocol, mark, flow_flags);
1108
+ RT_TOS(iph->tos), protocol, mark, 0);
10861109 rt = __ip_route_output_key(net, &fl4);
10871110 if (!IS_ERR(rt)) {
10881111 __ip_rt_update_pmtu(rt, &fl4, mtu);
....@@ -1093,7 +1116,7 @@
10931116
10941117 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
10951118 {
1096
- const struct iphdr *iph = (const struct iphdr *) skb->data;
1119
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
10971120 struct flowi4 fl4;
10981121 struct rtable *rt;
10991122
....@@ -1111,7 +1134,7 @@
11111134
11121135 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
11131136 {
1114
- const struct iphdr *iph = (const struct iphdr *) skb->data;
1137
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
11151138 struct flowi4 fl4;
11161139 struct rtable *rt;
11171140 struct dst_entry *odst = NULL;
....@@ -1139,9 +1162,11 @@
11391162 goto out;
11401163
11411164 new = true;
1165
+ } else {
1166
+ ip_rt_fix_tos(&fl4);
11421167 }
11431168
1144
- __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1169
+ __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
11451170
11461171 if (!dst_check(&rt->dst, 0)) {
11471172 if (new)
....@@ -1164,14 +1189,14 @@
11641189 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
11651190
11661191 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1167
- int oif, u32 mark, u8 protocol, int flow_flags)
1192
+ int oif, u8 protocol)
11681193 {
1169
- const struct iphdr *iph = (const struct iphdr *) skb->data;
1194
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
11701195 struct flowi4 fl4;
11711196 struct rtable *rt;
11721197
11731198 __build_flow_key(net, &fl4, NULL, iph, oif,
1174
- RT_TOS(iph->tos), protocol, mark, flow_flags);
1199
+ RT_TOS(iph->tos), protocol, 0, 0);
11751200 rt = __ip_route_output_key(net, &fl4);
11761201 if (!IS_ERR(rt)) {
11771202 __ip_do_redirect(rt, skb, &fl4, false);
....@@ -1182,7 +1207,7 @@
11821207
11831208 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
11841209 {
1185
- const struct iphdr *iph = (const struct iphdr *) skb->data;
1210
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
11861211 struct flowi4 fl4;
11871212 struct rtable *rt;
11881213 struct net *net = sock_net(sk);
....@@ -1206,7 +1231,7 @@
12061231 *
12071232 * When a PMTU/redirect information update invalidates a route,
12081233 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1209
- * DST_OBSOLETE_DEAD by dst_free().
1234
+ * DST_OBSOLETE_DEAD.
12101235 */
12111236 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
12121237 return NULL;
....@@ -1215,6 +1240,7 @@
12151240
12161241 static void ipv4_send_dest_unreach(struct sk_buff *skb)
12171242 {
1243
+ struct net_device *dev;
12181244 struct ip_options opt;
12191245 int res;
12201246
....@@ -1232,7 +1258,8 @@
12321258 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
12331259
12341260 rcu_read_lock();
1235
- res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1261
+ dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1262
+ res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
12361263 rcu_read_unlock();
12371264
12381265 if (res)
....@@ -1279,22 +1306,19 @@
12791306 src = ip_hdr(skb)->saddr;
12801307 else {
12811308 struct fib_result res;
1282
- struct flowi4 fl4;
1283
- struct iphdr *iph;
1284
-
1285
- iph = ip_hdr(skb);
1286
-
1287
- memset(&fl4, 0, sizeof(fl4));
1288
- fl4.daddr = iph->daddr;
1289
- fl4.saddr = iph->saddr;
1290
- fl4.flowi4_tos = RT_TOS(iph->tos);
1291
- fl4.flowi4_oif = rt->dst.dev->ifindex;
1292
- fl4.flowi4_iif = skb->dev->ifindex;
1293
- fl4.flowi4_mark = skb->mark;
1309
+ struct iphdr *iph = ip_hdr(skb);
1310
+ struct flowi4 fl4 = {
1311
+ .daddr = iph->daddr,
1312
+ .saddr = iph->saddr,
1313
+ .flowi4_tos = RT_TOS(iph->tos),
1314
+ .flowi4_oif = rt->dst.dev->ifindex,
1315
+ .flowi4_iif = skb->dev->ifindex,
1316
+ .flowi4_mark = skb->mark,
1317
+ };
12941318
12951319 rcu_read_lock();
12961320 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1297
- src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1321
+ src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
12981322 else
12991323 src = inet_select_addr(rt->dst.dev,
13001324 rt_nexthop(rt, iph->daddr),
....@@ -1325,7 +1349,7 @@
13251349
13261350 static unsigned int ipv4_mtu(const struct dst_entry *dst)
13271351 {
1328
- const struct rtable *rt = (const struct rtable *) dst;
1352
+ const struct rtable *rt = (const struct rtable *)dst;
13291353 unsigned int mtu = rt->rt_pmtu;
13301354
13311355 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
....@@ -1347,7 +1371,7 @@
13471371 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
13481372 }
13491373
1350
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1374
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
13511375 {
13521376 struct fnhe_hash_bucket *hash;
13531377 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
....@@ -1355,7 +1379,7 @@
13551379
13561380 spin_lock_bh(&fnhe_lock);
13571381
1358
- hash = rcu_dereference_protected(nh->nh_exceptions,
1382
+ hash = rcu_dereference_protected(nhc->nhc_exceptions,
13591383 lockdep_is_held(&fnhe_lock));
13601384 hash += hval;
13611385
....@@ -1381,9 +1405,10 @@
13811405 spin_unlock_bh(&fnhe_lock);
13821406 }
13831407
1384
-static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1408
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1409
+ __be32 daddr)
13851410 {
1386
- struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1411
+ struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
13871412 struct fib_nh_exception *fnhe;
13881413 u32 hval;
13891414
....@@ -1397,7 +1422,7 @@
13971422 if (fnhe->fnhe_daddr == daddr) {
13981423 if (fnhe->fnhe_expires &&
13991424 time_after(jiffies, fnhe->fnhe_expires)) {
1400
- ip_del_fnhe(nh, daddr);
1425
+ ip_del_fnhe(nhc, daddr);
14011426 break;
14021427 }
14031428 return fnhe;
....@@ -1414,19 +1439,19 @@
14141439
14151440 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
14161441 {
1442
+ struct fib_nh_common *nhc = res->nhc;
1443
+ struct net_device *dev = nhc->nhc_dev;
14171444 struct fib_info *fi = res->fi;
1418
- struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1419
- struct net_device *dev = nh->nh_dev;
14201445 u32 mtu = 0;
14211446
1422
- if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1447
+ if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
14231448 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
14241449 mtu = fi->fib_mtu;
14251450
14261451 if (likely(!mtu)) {
14271452 struct fib_nh_exception *fnhe;
14281453
1429
- fnhe = find_exception(nh, daddr);
1454
+ fnhe = find_exception(nhc, daddr);
14301455 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
14311456 mtu = fnhe->fnhe_pmtu;
14321457 }
....@@ -1434,7 +1459,7 @@
14341459 if (likely(!mtu))
14351460 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
14361461
1437
- return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1462
+ return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
14381463 }
14391464
14401465 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
....@@ -1465,8 +1490,10 @@
14651490 orig = NULL;
14661491 }
14671492 fill_route_from_fnhe(rt, fnhe);
1468
- if (!rt->rt_gateway)
1469
- rt->rt_gateway = daddr;
1493
+ if (!rt->rt_gw4) {
1494
+ rt->rt_gw4 = daddr;
1495
+ rt->rt_gw_family = AF_INET;
1496
+ }
14701497
14711498 if (do_cache) {
14721499 dst_hold(&rt->dst);
....@@ -1485,15 +1512,15 @@
14851512 return ret;
14861513 }
14871514
1488
-static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1515
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
14891516 {
14901517 struct rtable *orig, *prev, **p;
14911518 bool ret = true;
14921519
14931520 if (rt_is_input_route(rt)) {
1494
- p = (struct rtable **)&nh->nh_rth_input;
1521
+ p = (struct rtable **)&nhc->nhc_rth_input;
14951522 } else {
1496
- p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1523
+ p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
14971524 }
14981525 orig = *p;
14991526
....@@ -1546,18 +1573,14 @@
15461573
15471574 static void ipv4_dst_destroy(struct dst_entry *dst)
15481575 {
1549
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
15501576 struct rtable *rt = (struct rtable *)dst;
15511577
1552
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1553
- kfree(p);
1554
-
1578
+ ip_dst_metrics_put(dst);
15551579 rt_del_uncached_list(rt);
15561580 }
15571581
15581582 void rt_flush_dev(struct net_device *dev)
15591583 {
1560
- struct net *net = dev_net(dev);
15611584 struct rtable *rt;
15621585 int cpu;
15631586
....@@ -1568,7 +1591,7 @@
15681591 list_for_each_entry(rt, &ul->head, rt_uncached) {
15691592 if (rt->dst.dev != dev)
15701593 continue;
1571
- rt->dst.dev = net->loopback_dev;
1594
+ rt->dst.dev = blackhole_netdev;
15721595 dev_hold(rt->dst.dev);
15731596 dev_put(dev);
15741597 }
....@@ -1592,33 +1615,43 @@
15921615 bool cached = false;
15931616
15941617 if (fi) {
1595
- struct fib_nh *nh = &FIB_RES_NH(*res);
1618
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
15961619
1597
- if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1598
- rt->rt_gateway = nh->nh_gw;
1620
+ if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
15991621 rt->rt_uses_gateway = 1;
1622
+ rt->rt_gw_family = nhc->nhc_gw_family;
1623
+ /* only INET and INET6 are supported */
1624
+ if (likely(nhc->nhc_gw_family == AF_INET))
1625
+ rt->rt_gw4 = nhc->nhc_gw.ipv4;
1626
+ else
1627
+ rt->rt_gw6 = nhc->nhc_gw.ipv6;
16001628 }
1601
- dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1602
- if (fi->fib_metrics != &dst_default_metrics) {
1603
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1604
- refcount_inc(&fi->fib_metrics->refcnt);
1605
- }
1629
+
1630
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1631
+
16061632 #ifdef CONFIG_IP_ROUTE_CLASSID
1607
- rt->dst.tclassid = nh->nh_tclassid;
1633
+ if (nhc->nhc_family == AF_INET) {
1634
+ struct fib_nh *nh;
1635
+
1636
+ nh = container_of(nhc, struct fib_nh, nh_common);
1637
+ rt->dst.tclassid = nh->nh_tclassid;
1638
+ }
16081639 #endif
1609
- rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1640
+ rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
16101641 if (unlikely(fnhe))
16111642 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
16121643 else if (do_cache)
1613
- cached = rt_cache_route(nh, rt);
1644
+ cached = rt_cache_route(nhc, rt);
16141645 if (unlikely(!cached)) {
16151646 /* Routes we intend to cache in nexthop exception or
16161647 * FIB nexthop have the DST_NOCACHE bit clear.
16171648 * However, if we are unsuccessful at storing this
16181649 * route into the cache we really need to set it.
16191650 */
1620
- if (!rt->rt_gateway)
1621
- rt->rt_gateway = daddr;
1651
+ if (!rt->rt_gw4) {
1652
+ rt->rt_gw_family = AF_INET;
1653
+ rt->rt_gw4 = daddr;
1654
+ }
16221655 rt_add_uncached_list(rt);
16231656 }
16241657 } else
....@@ -1634,12 +1667,11 @@
16341667
16351668 struct rtable *rt_dst_alloc(struct net_device *dev,
16361669 unsigned int flags, u16 type,
1637
- bool nopolicy, bool noxfrm, bool will_cache)
1670
+ bool nopolicy, bool noxfrm)
16381671 {
16391672 struct rtable *rt;
16401673
16411674 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1642
- (will_cache ? 0 : DST_HOST) |
16431675 (nopolicy ? DST_NOPOLICY : 0) |
16441676 (noxfrm ? DST_NOXFRM : 0));
16451677
....@@ -1651,8 +1683,9 @@
16511683 rt->rt_iif = 0;
16521684 rt->rt_pmtu = 0;
16531685 rt->rt_mtu_locked = 0;
1654
- rt->rt_gateway = 0;
16551686 rt->rt_uses_gateway = 0;
1687
+ rt->rt_gw_family = 0;
1688
+ rt->rt_gw4 = 0;
16561689 INIT_LIST_HEAD(&rt->rt_uncached);
16571690
16581691 rt->dst.output = ip_output;
....@@ -1663,6 +1696,38 @@
16631696 return rt;
16641697 }
16651698 EXPORT_SYMBOL(rt_dst_alloc);
1699
+
1700
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1701
+{
1702
+ struct rtable *new_rt;
1703
+
1704
+ new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1705
+ rt->dst.flags);
1706
+
1707
+ if (new_rt) {
1708
+ new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1709
+ new_rt->rt_flags = rt->rt_flags;
1710
+ new_rt->rt_type = rt->rt_type;
1711
+ new_rt->rt_is_input = rt->rt_is_input;
1712
+ new_rt->rt_iif = rt->rt_iif;
1713
+ new_rt->rt_pmtu = rt->rt_pmtu;
1714
+ new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1715
+ new_rt->rt_gw_family = rt->rt_gw_family;
1716
+ if (rt->rt_gw_family == AF_INET)
1717
+ new_rt->rt_gw4 = rt->rt_gw4;
1718
+ else if (rt->rt_gw_family == AF_INET6)
1719
+ new_rt->rt_gw6 = rt->rt_gw6;
1720
+ INIT_LIST_HEAD(&new_rt->rt_uncached);
1721
+
1722
+ new_rt->dst.input = rt->dst.input;
1723
+ new_rt->dst.output = rt->dst.output;
1724
+ new_rt->dst.error = rt->dst.error;
1725
+ new_rt->dst.lastuse = jiffies;
1726
+ new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1727
+ }
1728
+ return new_rt;
1729
+}
1730
+EXPORT_SYMBOL(rt_dst_clone);
16661731
16671732 /* called in rcu_read_lock() section */
16681733 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
....@@ -1683,7 +1748,8 @@
16831748 return -EINVAL;
16841749
16851750 if (ipv4_is_zeronet(saddr)) {
1686
- if (!ipv4_is_local_multicast(daddr))
1751
+ if (!ipv4_is_local_multicast(daddr) &&
1752
+ ip_hdr(skb)->protocol != IPPROTO_IGMP)
16871753 return -EINVAL;
16881754 } else {
16891755 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
....@@ -1712,7 +1778,7 @@
17121778 flags |= RTCF_LOCAL;
17131779
17141780 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1715
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1781
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
17161782 if (!rth)
17171783 return -ENOBUFS;
17181784
....@@ -1728,6 +1794,7 @@
17281794 #endif
17291795 RT_CACHE_STAT_INC(in_slow_mc);
17301796
1797
+ skb_dst_drop(skb);
17311798 skb_dst_set(skb, &rth->dst);
17321799 return 0;
17331800 }
....@@ -1752,7 +1819,7 @@
17521819 print_hex_dump(KERN_WARNING, "ll header: ",
17531820 DUMP_PREFIX_OFFSET, 16, 1,
17541821 skb_mac_header(skb),
1755
- dev->hard_header_len, true);
1822
+ dev->hard_header_len, false);
17561823 }
17571824 }
17581825 #endif
....@@ -1764,6 +1831,8 @@
17641831 struct in_device *in_dev,
17651832 __be32 daddr, __be32 saddr, u32 tos)
17661833 {
1834
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1835
+ struct net_device *dev = nhc->nhc_dev;
17671836 struct fib_nh_exception *fnhe;
17681837 struct rtable *rth;
17691838 int err;
....@@ -1772,7 +1841,7 @@
17721841 u32 itag = 0;
17731842
17741843 /* get a working reference to the output device */
1775
- out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1844
+ out_dev = __in_dev_get_rcu(dev);
17761845 if (!out_dev) {
17771846 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
17781847 return -EINVAL;
....@@ -1789,10 +1858,14 @@
17891858
17901859 do_cache = res->fi && !itag;
17911860 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1792
- skb->protocol == htons(ETH_P_IP) &&
1793
- (IN_DEV_SHARED_MEDIA(out_dev) ||
1794
- inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1795
- IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1861
+ skb->protocol == htons(ETH_P_IP)) {
1862
+ __be32 gw;
1863
+
1864
+ gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1865
+ if (IN_DEV_SHARED_MEDIA(out_dev) ||
1866
+ inet_addr_onlink(out_dev, saddr, gw))
1867
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1868
+ }
17961869
17971870 if (skb->protocol != htons(ETH_P_IP)) {
17981871 /* Not IP (i.e. ARP). Do not create route, if it is
....@@ -1809,12 +1882,12 @@
18091882 }
18101883 }
18111884
1812
- fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1885
+ fnhe = find_exception(nhc, daddr);
18131886 if (do_cache) {
18141887 if (fnhe)
18151888 rth = rcu_dereference(fnhe->fnhe_rth_input);
18161889 else
1817
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1890
+ rth = rcu_dereference(nhc->nhc_rth_input);
18181891 if (rt_cache_valid(rth)) {
18191892 skb_dst_set_noref(skb, &rth->dst);
18201893 goto out;
....@@ -1822,8 +1895,8 @@
18221895 }
18231896
18241897 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1825
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
1826
- IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1898
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
1899
+ IN_DEV_ORCONF(out_dev, NOXFRM));
18271900 if (!rth) {
18281901 err = -ENOBUFS;
18291902 goto cleanup;
....@@ -1869,10 +1942,7 @@
18691942 if (!icmph)
18701943 goto out;
18711944
1872
- if (icmph->type != ICMP_DEST_UNREACH &&
1873
- icmph->type != ICMP_REDIRECT &&
1874
- icmph->type != ICMP_TIME_EXCEEDED &&
1875
- icmph->type != ICMP_PARAMETERPROB)
1945
+ if (!icmp_is_err(icmph->type))
18761946 goto out;
18771947
18781948 inner_iph = skb_header_pointer(skb,
....@@ -1891,6 +1961,7 @@
18911961 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
18921962 const struct sk_buff *skb, struct flow_keys *flkeys)
18931963 {
1964
+ u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
18941965 struct flow_keys hash_keys;
18951966 u32 mhash;
18961967
....@@ -1938,8 +2009,41 @@
19382009 hash_keys.basic.ip_proto = fl4->flowi4_proto;
19392010 }
19402011 break;
2012
+ case 2:
2013
+ memset(&hash_keys, 0, sizeof(hash_keys));
2014
+ /* skb is currently provided only when forwarding */
2015
+ if (skb) {
2016
+ struct flow_keys keys;
2017
+
2018
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
2019
+ /* Inner can be v4 or v6 */
2020
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2021
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2023
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2024
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2025
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2027
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2028
+ hash_keys.tags.flow_label = keys.tags.flow_label;
2029
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
2030
+ } else {
2031
+ /* Same as case 0 */
2032
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2033
+ ip_multipath_l3_keys(skb, &hash_keys);
2034
+ }
2035
+ } else {
2036
+ /* Same as case 0 */
2037
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2038
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
2039
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
2040
+ }
2041
+ break;
19412042 }
19422043 mhash = flow_hash_from_keys(&hash_keys);
2044
+
2045
+ if (multipath_hash)
2046
+ mhash = jhash_2words(mhash, multipath_hash, 0);
19432047
19442048 return mhash >> 1;
19452049 }
....@@ -1952,10 +2056,11 @@
19522056 struct flow_keys *hkeys)
19532057 {
19542058 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1955
- if (res->fi && res->fi->fib_nhs > 1) {
2059
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
19562060 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
19572061
19582062 fib_select_multipath(res, h);
2063
+ IPCB(skb)->flags |= IPSKB_MULTIPATH;
19592064 }
19602065 #endif
19612066
....@@ -1963,10 +2068,65 @@
19632068 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
19642069 }
19652070
2071
+/* Implements all the saddr-related checks as ip_route_input_slow(),
2072
+ * assuming daddr is valid and the destination is not a local broadcast one.
2073
+ * Uses the provided hint instead of performing a route lookup.
2074
+ */
2075
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2076
+ u8 tos, struct net_device *dev,
2077
+ const struct sk_buff *hint)
2078
+{
2079
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
2080
+ struct rtable *rt = skb_rtable(hint);
2081
+ struct net *net = dev_net(dev);
2082
+ int err = -EINVAL;
2083
+ u32 tag = 0;
2084
+
2085
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2086
+ goto martian_source;
2087
+
2088
+ if (ipv4_is_zeronet(saddr))
2089
+ goto martian_source;
2090
+
2091
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2092
+ goto martian_source;
2093
+
2094
+ if (rt->rt_type != RTN_LOCAL)
2095
+ goto skip_validate_source;
2096
+
2097
+ tos &= IPTOS_RT_MASK;
2098
+ err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2099
+ if (err < 0)
2100
+ goto martian_source;
2101
+
2102
+skip_validate_source:
2103
+ skb_dst_copy(skb, hint);
2104
+ return 0;
2105
+
2106
+martian_source:
2107
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2108
+ return err;
2109
+}
2110
+
2111
+/* get device for dst_alloc with local routes */
2112
+static struct net_device *ip_rt_get_dev(struct net *net,
2113
+ const struct fib_result *res)
2114
+{
2115
+ struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2116
+ struct net_device *dev = NULL;
2117
+
2118
+ if (nhc)
2119
+ dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2120
+
2121
+ return dev ? : net->loopback_dev;
2122
+}
2123
+
19662124 /*
19672125 * NOTE. We drop all the packets that has local source
19682126 * addresses, because every properly looped back packet
19692127 * must have correct destination already attached by output routine.
2128
+ * Changes in the enforced policies must be applied also to
2129
+ * ip_route_use_hint().
19702130 *
19712131 * Such approach solves two big problems:
19722132 * 1. Not simplex devices are handled properly.
....@@ -2045,6 +2205,7 @@
20452205 fl4.daddr = daddr;
20462206 fl4.saddr = saddr;
20472207 fl4.flowi4_uid = sock_net_uid(net, NULL);
2208
+ fl4.flowi4_multipath_hash = 0;
20482209
20492210 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
20502211 flkeys = &_flkeys;
....@@ -2106,7 +2267,9 @@
21062267 local_input:
21072268 do_cache &= res->fi && !itag;
21082269 if (do_cache) {
2109
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2270
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2271
+
2272
+ rth = rcu_dereference(nhc->nhc_rth_input);
21102273 if (rt_cache_valid(rth)) {
21112274 skb_dst_set_noref(skb, &rth->dst);
21122275 err = 0;
....@@ -2114,9 +2277,9 @@
21142277 }
21152278 }
21162279
2117
- rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2280
+ rth = rt_dst_alloc(ip_rt_get_dev(net, res),
21182281 flags | RTCF_LOCAL, res->type,
2119
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2282
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
21202283 if (!rth)
21212284 goto e_nobufs;
21222285
....@@ -2134,16 +2297,16 @@
21342297 }
21352298
21362299 if (do_cache) {
2137
- struct fib_nh *nh = &FIB_RES_NH(*res);
2300
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
21382301
2139
- rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2302
+ rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
21402303 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
21412304 WARN_ON(rth->dst.input == lwtunnel_input);
21422305 rth->dst.lwtstate->orig_input = rth->dst.input;
21432306 rth->dst.input = lwtunnel_input;
21442307 }
21452308
2146
- if (unlikely(!rt_cache_route(nh, rth)))
2309
+ if (unlikely(!rt_cache_route(nhc, rth)))
21472310 rt_add_uncached_list(rth);
21482311 }
21492312 skb_dst_set(skb, &rth->dst);
....@@ -2314,10 +2477,10 @@
23142477 fnhe = NULL;
23152478 do_cache &= fi != NULL;
23162479 if (fi) {
2480
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
23172481 struct rtable __rcu **prth;
2318
- struct fib_nh *nh = &FIB_RES_NH(*res);
23192482
2320
- fnhe = find_exception(nh, fl4->daddr);
2483
+ fnhe = find_exception(nhc, fl4->daddr);
23212484 if (!do_cache)
23222485 goto add;
23232486 if (fnhe) {
....@@ -2325,12 +2488,12 @@
23252488 } else {
23262489 if (unlikely(fl4->flowi4_flags &
23272490 FLOWI_FLAG_KNOWN_NH &&
2328
- !(nh->nh_gw &&
2329
- nh->nh_scope == RT_SCOPE_LINK))) {
2491
+ !(nhc->nhc_gw_family &&
2492
+ nhc->nhc_scope == RT_SCOPE_LINK))) {
23302493 do_cache = false;
23312494 goto add;
23322495 }
2333
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2496
+ prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
23342497 }
23352498 rth = rcu_dereference(*prth);
23362499 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
....@@ -2339,9 +2502,8 @@
23392502
23402503 add:
23412504 rth = rt_dst_alloc(dev_out, flags, type,
2342
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
2343
- IN_DEV_CONF_GET(in_dev, NOXFRM),
2344
- do_cache);
2505
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
2506
+ IN_DEV_ORCONF(in_dev, NOXFRM));
23452507 if (!rth)
23462508 return ERR_PTR(-ENOBUFS);
23472509
....@@ -2379,7 +2541,6 @@
23792541 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
23802542 const struct sk_buff *skb)
23812543 {
2382
- __u8 tos = RT_FL_TOS(fl4);
23832544 struct fib_result res = {
23842545 .type = RTN_UNSPEC,
23852546 .fi = NULL,
....@@ -2389,9 +2550,7 @@
23892550 struct rtable *rth;
23902551
23912552 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2392
- fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2393
- fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2394
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2553
+ ip_rt_fix_tos(fl4);
23952554
23962555 rcu_read_lock();
23972556 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
....@@ -2572,44 +2731,15 @@
25722731 return rth;
25732732 }
25742733
2575
-static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2576
-{
2577
- return NULL;
2578
-}
2579
-
2580
-static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2581
-{
2582
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2583
-
2584
- return mtu ? : dst->dev->mtu;
2585
-}
2586
-
2587
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2588
- struct sk_buff *skb, u32 mtu,
2589
- bool confirm_neigh)
2590
-{
2591
-}
2592
-
2593
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2594
- struct sk_buff *skb)
2595
-{
2596
-}
2597
-
2598
-static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2599
- unsigned long old)
2600
-{
2601
- return NULL;
2602
-}
2603
-
26042734 static struct dst_ops ipv4_dst_blackhole_ops = {
2605
- .family = AF_INET,
2606
- .check = ipv4_blackhole_dst_check,
2607
- .mtu = ipv4_blackhole_mtu,
2608
- .default_advmss = ipv4_default_advmss,
2609
- .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2610
- .redirect = ipv4_rt_blackhole_redirect,
2611
- .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2612
- .neigh_lookup = ipv4_neigh_lookup,
2735
+ .family = AF_INET,
2736
+ .default_advmss = ipv4_default_advmss,
2737
+ .neigh_lookup = ipv4_neigh_lookup,
2738
+ .check = dst_blackhole_check,
2739
+ .cow_metrics = dst_blackhole_cow_metrics,
2740
+ .update_pmtu = dst_blackhole_update_pmtu,
2741
+ .redirect = dst_blackhole_redirect,
2742
+ .mtu = dst_blackhole_mtu,
26132743 };
26142744
26152745 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
....@@ -2637,8 +2767,12 @@
26372767 rt->rt_genid = rt_genid_ipv4(net);
26382768 rt->rt_flags = ort->rt_flags;
26392769 rt->rt_type = ort->rt_type;
2640
- rt->rt_gateway = ort->rt_gateway;
26412770 rt->rt_uses_gateway = ort->rt_uses_gateway;
2771
+ rt->rt_gw_family = ort->rt_gw_family;
2772
+ if (rt->rt_gw_family == AF_INET)
2773
+ rt->rt_gw4 = ort->rt_gw4;
2774
+ else if (rt->rt_gw_family == AF_INET6)
2775
+ rt->rt_gw6 = ort->rt_gw6;
26422776
26432777 INIT_LIST_HEAD(&rt->rt_uncached);
26442778 }
....@@ -2667,10 +2801,59 @@
26672801 }
26682802 EXPORT_SYMBOL_GPL(ip_route_output_flow);
26692803
2804
+struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2805
+ struct net_device *dev,
2806
+ struct net *net, __be32 *saddr,
2807
+ const struct ip_tunnel_info *info,
2808
+ u8 protocol, bool use_cache)
2809
+{
2810
+#ifdef CONFIG_DST_CACHE
2811
+ struct dst_cache *dst_cache;
2812
+#endif
2813
+ struct rtable *rt = NULL;
2814
+ struct flowi4 fl4;
2815
+ __u8 tos;
2816
+
2817
+#ifdef CONFIG_DST_CACHE
2818
+ dst_cache = (struct dst_cache *)&info->dst_cache;
2819
+ if (use_cache) {
2820
+ rt = dst_cache_get_ip4(dst_cache, saddr);
2821
+ if (rt)
2822
+ return rt;
2823
+ }
2824
+#endif
2825
+ memset(&fl4, 0, sizeof(fl4));
2826
+ fl4.flowi4_mark = skb->mark;
2827
+ fl4.flowi4_proto = protocol;
2828
+ fl4.daddr = info->key.u.ipv4.dst;
2829
+ fl4.saddr = info->key.u.ipv4.src;
2830
+ tos = info->key.tos;
2831
+ fl4.flowi4_tos = RT_TOS(tos);
2832
+
2833
+ rt = ip_route_output_key(net, &fl4);
2834
+ if (IS_ERR(rt)) {
2835
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2836
+ return ERR_PTR(-ENETUNREACH);
2837
+ }
2838
+ if (rt->dst.dev == dev) { /* is this necessary? */
2839
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2840
+ ip_rt_put(rt);
2841
+ return ERR_PTR(-ELOOP);
2842
+ }
2843
+#ifdef CONFIG_DST_CACHE
2844
+ if (use_cache)
2845
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2846
+#endif
2847
+ *saddr = fl4.saddr;
2848
+ return rt;
2849
+}
2850
+EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2851
+
26702852 /* called with rcu_read_lock held */
26712853 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
26722854 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2673
- struct sk_buff *skb, u32 portid, u32 seq)
2855
+ struct sk_buff *skb, u32 portid, u32 seq,
2856
+ unsigned int flags)
26742857 {
26752858 struct rtmsg *r;
26762859 struct nlmsghdr *nlh;
....@@ -2678,7 +2861,7 @@
26782861 u32 error;
26792862 u32 metrics[RTAX_MAX];
26802863
2681
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2864
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
26822865 if (!nlh)
26832866 return -EMSGSIZE;
26842867
....@@ -2686,7 +2869,7 @@
26862869 r->rtm_family = AF_INET;
26872870 r->rtm_dst_len = 32;
26882871 r->rtm_src_len = 0;
2689
- r->rtm_tos = fl4->flowi4_tos;
2872
+ r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
26902873 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
26912874 if (nla_put_u32(skb, RTA_TABLE, table_id))
26922875 goto nla_put_failure;
....@@ -2714,14 +2897,29 @@
27142897 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
27152898 goto nla_put_failure;
27162899 #endif
2717
- if (!rt_is_input_route(rt) &&
2900
+ if (fl4 && !rt_is_input_route(rt) &&
27182901 fl4->saddr != src) {
27192902 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
27202903 goto nla_put_failure;
27212904 }
2722
- if (rt->rt_uses_gateway &&
2723
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2724
- goto nla_put_failure;
2905
+ if (rt->rt_uses_gateway) {
2906
+ if (rt->rt_gw_family == AF_INET &&
2907
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2908
+ goto nla_put_failure;
2909
+ } else if (rt->rt_gw_family == AF_INET6) {
2910
+ int alen = sizeof(struct in6_addr);
2911
+ struct nlattr *nla;
2912
+ struct rtvia *via;
2913
+
2914
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
2915
+ if (!nla)
2916
+ goto nla_put_failure;
2917
+
2918
+ via = nla_data(nla);
2919
+ via->rtvia_family = AF_INET6;
2920
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2921
+ }
2922
+ }
27252923
27262924 expires = rt->dst.expires;
27272925 if (expires) {
....@@ -2741,35 +2939,39 @@
27412939 if (rtnetlink_put_metrics(skb, metrics) < 0)
27422940 goto nla_put_failure;
27432941
2744
- if (fl4->flowi4_mark &&
2745
- nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2746
- goto nla_put_failure;
2942
+ if (fl4) {
2943
+ if (fl4->flowi4_mark &&
2944
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2945
+ goto nla_put_failure;
27472946
2748
- if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2749
- nla_put_u32(skb, RTA_UID,
2750
- from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2751
- goto nla_put_failure;
2947
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2948
+ nla_put_u32(skb, RTA_UID,
2949
+ from_kuid_munged(current_user_ns(),
2950
+ fl4->flowi4_uid)))
2951
+ goto nla_put_failure;
2952
+
2953
+ if (rt_is_input_route(rt)) {
2954
+#ifdef CONFIG_IP_MROUTE
2955
+ if (ipv4_is_multicast(dst) &&
2956
+ !ipv4_is_local_multicast(dst) &&
2957
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2958
+ int err = ipmr_get_route(net, skb,
2959
+ fl4->saddr, fl4->daddr,
2960
+ r, portid);
2961
+
2962
+ if (err <= 0) {
2963
+ if (err == 0)
2964
+ return 0;
2965
+ goto nla_put_failure;
2966
+ }
2967
+ } else
2968
+#endif
2969
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2970
+ goto nla_put_failure;
2971
+ }
2972
+ }
27522973
27532974 error = rt->dst.error;
2754
-
2755
- if (rt_is_input_route(rt)) {
2756
-#ifdef CONFIG_IP_MROUTE
2757
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2758
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2759
- int err = ipmr_get_route(net, skb,
2760
- fl4->saddr, fl4->daddr,
2761
- r, portid);
2762
-
2763
- if (err <= 0) {
2764
- if (err == 0)
2765
- return 0;
2766
- goto nla_put_failure;
2767
- }
2768
- } else
2769
-#endif
2770
- if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2771
- goto nla_put_failure;
2772
- }
27732975
27742976 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
27752977 goto nla_put_failure;
....@@ -2780,6 +2982,81 @@
27802982 nla_put_failure:
27812983 nlmsg_cancel(skb, nlh);
27822984 return -EMSGSIZE;
2985
+}
2986
+
2987
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2988
+ struct netlink_callback *cb, u32 table_id,
2989
+ struct fnhe_hash_bucket *bucket, int genid,
2990
+ int *fa_index, int fa_start, unsigned int flags)
2991
+{
2992
+ int i;
2993
+
2994
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
2995
+ struct fib_nh_exception *fnhe;
2996
+
2997
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2998
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
2999
+ struct rtable *rt;
3000
+ int err;
3001
+
3002
+ if (*fa_index < fa_start)
3003
+ goto next;
3004
+
3005
+ if (fnhe->fnhe_genid != genid)
3006
+ goto next;
3007
+
3008
+ if (fnhe->fnhe_expires &&
3009
+ time_after(jiffies, fnhe->fnhe_expires))
3010
+ goto next;
3011
+
3012
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
3013
+ if (!rt)
3014
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
3015
+ if (!rt)
3016
+ goto next;
3017
+
3018
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3019
+ table_id, NULL, skb,
3020
+ NETLINK_CB(cb->skb).portid,
3021
+ cb->nlh->nlmsg_seq, flags);
3022
+ if (err)
3023
+ return err;
3024
+next:
3025
+ (*fa_index)++;
3026
+ }
3027
+ }
3028
+
3029
+ return 0;
3030
+}
3031
+
3032
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3033
+ u32 table_id, struct fib_info *fi,
3034
+ int *fa_index, int fa_start, unsigned int flags)
3035
+{
3036
+ struct net *net = sock_net(cb->skb->sk);
3037
+ int nhsel, genid = fnhe_genid(net);
3038
+
3039
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3040
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3041
+ struct fnhe_hash_bucket *bucket;
3042
+ int err;
3043
+
3044
+ if (nhc->nhc_flags & RTNH_F_DEAD)
3045
+ continue;
3046
+
3047
+ rcu_read_lock();
3048
+ bucket = rcu_dereference(nhc->nhc_exceptions);
3049
+ err = 0;
3050
+ if (bucket)
3051
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3052
+ genid, fa_index, fa_start,
3053
+ flags);
3054
+ rcu_read_unlock();
3055
+ if (err)
3056
+ return err;
3057
+ }
3058
+
3059
+ return 0;
27833060 }
27843061
27853062 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
....@@ -2843,6 +3120,75 @@
28433120 return skb;
28443121 }
28453122
3123
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3124
+ const struct nlmsghdr *nlh,
3125
+ struct nlattr **tb,
3126
+ struct netlink_ext_ack *extack)
3127
+{
3128
+ struct rtmsg *rtm;
3129
+ int i, err;
3130
+
3131
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3132
+ NL_SET_ERR_MSG(extack,
3133
+ "ipv4: Invalid header for route get request");
3134
+ return -EINVAL;
3135
+ }
3136
+
3137
+ if (!netlink_strict_get_check(skb))
3138
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3139
+ rtm_ipv4_policy, extack);
3140
+
3141
+ rtm = nlmsg_data(nlh);
3142
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3143
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3144
+ rtm->rtm_table || rtm->rtm_protocol ||
3145
+ rtm->rtm_scope || rtm->rtm_type) {
3146
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3147
+ return -EINVAL;
3148
+ }
3149
+
3150
+ if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3151
+ RTM_F_LOOKUP_TABLE |
3152
+ RTM_F_FIB_MATCH)) {
3153
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3154
+ return -EINVAL;
3155
+ }
3156
+
3157
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3158
+ rtm_ipv4_policy, extack);
3159
+ if (err)
3160
+ return err;
3161
+
3162
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3163
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3164
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3165
+ return -EINVAL;
3166
+ }
3167
+
3168
+ for (i = 0; i <= RTA_MAX; i++) {
3169
+ if (!tb[i])
3170
+ continue;
3171
+
3172
+ switch (i) {
3173
+ case RTA_IIF:
3174
+ case RTA_OIF:
3175
+ case RTA_SRC:
3176
+ case RTA_DST:
3177
+ case RTA_IP_PROTO:
3178
+ case RTA_SPORT:
3179
+ case RTA_DPORT:
3180
+ case RTA_MARK:
3181
+ case RTA_UID:
3182
+ break;
3183
+ default:
3184
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3185
+ return -EINVAL;
3186
+ }
3187
+ }
3188
+
3189
+ return 0;
3190
+}
3191
+
28463192 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
28473193 struct netlink_ext_ack *extack)
28483194 {
....@@ -2855,7 +3201,7 @@
28553201 struct rtable *rt = NULL;
28563202 struct sk_buff *skb;
28573203 struct rtmsg *rtm;
2858
- struct flowi4 fl4;
3204
+ struct flowi4 fl4 = {};
28593205 __be32 dst = 0;
28603206 __be32 src = 0;
28613207 kuid_t uid;
....@@ -2863,8 +3209,7 @@
28633209 int err;
28643210 int mark;
28653211
2866
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2867
- extack);
3212
+ err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
28683213 if (err < 0)
28693214 return err;
28703215
....@@ -2895,7 +3240,6 @@
28953240 if (!skb)
28963241 return -ENOBUFS;
28973242
2898
- memset(&fl4, 0, sizeof(fl4));
28993243 fl4.daddr = dst;
29003244 fl4.saddr = src;
29013245 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
....@@ -2931,6 +3275,7 @@
29313275 err = -rt->dst.error;
29323276 } else {
29333277 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3278
+ skb->dev = net->loopback_dev;
29343279 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
29353280 err = 0;
29363281 if (IS_ERR(rt))
....@@ -2955,19 +3300,45 @@
29553300 skb_reset_mac_header(skb);
29563301
29573302 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3303
+ struct fib_rt_info fri;
3304
+
29583305 if (!res.fi) {
29593306 err = fib_props[res.type].error;
29603307 if (!err)
29613308 err = -EHOSTUNREACH;
29623309 goto errout_rcu;
29633310 }
3311
+ fri.fi = res.fi;
3312
+ fri.tb_id = table_id;
3313
+ fri.dst = res.prefix;
3314
+ fri.dst_len = res.prefixlen;
3315
+ fri.tos = fl4.flowi4_tos;
3316
+ fri.type = rt->rt_type;
3317
+ fri.offload = 0;
3318
+ fri.trap = 0;
3319
+ if (res.fa_head) {
3320
+ struct fib_alias *fa;
3321
+
3322
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3323
+ u8 slen = 32 - fri.dst_len;
3324
+
3325
+ if (fa->fa_slen == slen &&
3326
+ fa->tb_id == fri.tb_id &&
3327
+ fa->fa_tos == fri.tos &&
3328
+ fa->fa_info == res.fi &&
3329
+ fa->fa_type == fri.type) {
3330
+ fri.offload = fa->offload;
3331
+ fri.trap = fa->trap;
3332
+ break;
3333
+ }
3334
+ }
3335
+ }
29643336 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2965
- nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2966
- rt->rt_type, res.prefix, res.prefixlen,
2967
- fl4.flowi4_tos, res.fi, 0);
3337
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
29683338 } else {
29693339 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2970
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
3340
+ NETLINK_CB(in_skb).portid,
3341
+ nlh->nlmsg_seq, 0);
29713342 }
29723343 if (err < 0)
29733344 goto errout_rcu;
....@@ -2996,8 +3367,7 @@
29963367 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
29973368
29983369 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2999
- void __user *buffer,
3000
- size_t *lenp, loff_t *ppos)
3370
+ void *buffer, size_t *lenp, loff_t *ppos)
30013371 {
30023372 struct net *net = (struct net *)__ctl->extra1;
30033373
....@@ -3122,9 +3492,11 @@
31223492 { }
31233493 };
31243494
3495
+static const char ipv4_route_flush_procname[] = "flush";
3496
+
31253497 static struct ctl_table ipv4_route_flush_table[] = {
31263498 {
3127
- .procname = "flush",
3499
+ .procname = ipv4_route_flush_procname,
31283500 .maxlen = sizeof(int),
31293501 .mode = 0200,
31303502 .proc_handler = ipv4_sysctl_rtcache_flush,
....@@ -3142,9 +3514,11 @@
31423514 if (!tbl)
31433515 goto err_dup;
31443516
3145
- /* Don't export sysctls to unprivileged users */
3146
- if (net->user_ns != &init_user_ns)
3147
- tbl[0].procname = NULL;
3517
+ /* Don't export non-whitelisted sysctls to unprivileged users */
3518
+ if (net->user_ns != &init_user_ns) {
3519
+ if (tbl[0].procname != ipv4_route_flush_procname)
3520
+ tbl[0].procname = NULL;
3521
+ }
31483522 }
31493523 tbl[0].extra1 = net;
31503524