From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/net/ipv4/route.c | 845 ++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 608 insertions(+), 237 deletions(-)
diff --git a/kernel/net/ipv4/route.c b/kernel/net/ipv4/route.c
index b41d4ac..216f76b 100644
--- a/kernel/net/ipv4/route.c
+++ b/kernel/net/ipv4/route.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -55,11 +56,6 @@
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
* Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
* Ilia Sotnikov : Removed TOS from hash calculations
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "IPv4: " fmt
@@ -70,7 +66,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -100,6 +96,7 @@
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
@@ -241,11 +238,11 @@
return seq_open(file, &rt_cache_seq_ops);
}
-static const struct file_operations rt_cache_seq_fops = {
- .open = rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cache_proc_ops = {
+ .proc_open = rt_cache_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
@@ -332,11 +329,11 @@
return seq_open(file, &rt_cpu_seq_ops);
}
-static const struct file_operations rt_cpu_seq_fops = {
- .open = rt_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cpu_proc_ops = {
+ .proc_open = rt_cpu_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -370,12 +367,12 @@
struct proc_dir_entry *pde;
pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_seq_fops);
+ &rt_cache_proc_ops);
if (!pde)
goto err1;
pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_seq_fops);
+ net->proc_net_stat, &rt_cpu_proc_ops);
if (!pde)
goto err2;
@@ -437,37 +434,46 @@
struct sk_buff *skb,
const void *daddr)
{
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
struct net_device *dev = dst->dev;
- const __be32 *pkey = daddr;
- const struct rtable *rt;
struct neighbour *n;
- rt = (const struct rtable *) dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *) &rt->rt_gateway;
- else if (skb)
- pkey = &ip_hdr(skb)->daddr;
+ rcu_read_lock_bh();
- n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
- if (n)
- return n;
- return neigh_create(&arp_tbl, pkey, dev);
+ if (likely(rt->rt_gw_family == AF_INET)) {
+ n = ip_neigh_gw4(dev, rt->rt_gw4);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ n = ip_neigh_gw6(dev, &rt->rt_gw6);
+ } else {
+ __be32 pkey;
+
+ pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+ n = ip_neigh_gw4(dev, pkey);
+ }
+
+ if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
+
+ rcu_read_unlock_bh();
+
+ return n;
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
- const struct rtable *rt;
- rt = (const struct rtable *)dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *)&rt->rt_gateway;
- else if (!daddr ||
+ if (rt->rt_gw_family == AF_INET) {
+ pkey = (const __be32 *)&rt->rt_gw4;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
+ } else if (!daddr ||
(rt->rt_flags &
- (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
return;
-
+ }
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
@@ -522,6 +528,15 @@
iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
+
+static void ip_rt_fix_tos(struct flowi4 *fl4)
+{
+ __u8 tos = RT_FL_TOS(fl4);
+
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4->flowi4_scope = tos & RTO_ONLINK ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+}
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
const struct sock *sk,
@@ -643,13 +658,15 @@
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
- rt->rt_gateway = fnhe->fnhe_gw;
rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = fnhe->fnhe_gw;
}
}
-static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
- u32 pmtu, bool lock, unsigned long expires)
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
+ __be32 gw, u32 pmtu, bool lock,
+ unsigned long expires)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
@@ -658,17 +675,17 @@
unsigned int i;
int depth;
- genid = fnhe_genid(dev_net(nh->nh_dev));
+ genid = fnhe_genid(dev_net(nhc->nhc_dev));
hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
- hash = rcu_dereference(nh->nh_exceptions);
+ hash = rcu_dereference(nhc->nhc_exceptions);
if (!hash) {
hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
if (!hash)
goto out_unlock;
- rcu_assign_pointer(nh->nh_exceptions, hash);
+ rcu_assign_pointer(nhc->nhc_exceptions, hash);
}
hash += hval;
@@ -727,13 +744,13 @@
* stale, so anyone caching it rechecks if this exception
* applies to them.
*/
- rt = rcu_dereference(nh->nh_rth_input);
+ rt = rcu_dereference(nhc->nhc_rth_input);
if (rt)
rt->dst.obsolete = DST_OBSOLETE_KILL;
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
- prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+ prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -768,7 +785,7 @@
return;
}
- if (rt->rt_gateway != old_gw)
+ if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
return;
in_dev = __in_dev_get_rcu(dev);
@@ -799,11 +816,11 @@
neigh_event_send(n, NULL);
} else {
if (fib_lookup(net, fl4, &res, 0) == 0) {
- struct fib_nh *nh;
+ struct fib_nh_common *nhc;
fib_select_path(net, &res, fl4, skb);
- nh = &FIB_RES_NH(res);
- update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, new_gw,
0, false,
jiffies + ip_rt_gc_timeout);
}
@@ -845,6 +862,7 @@
rt = (struct rtable *) dst;
__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
+ ip_rt_fix_tos(&fl4);
__ip_do_redirect(rt, skb, &fl4, true);
}
@@ -1029,13 +1047,14 @@
{
struct dst_entry *dst = &rt->dst;
struct net *net = dev_net(dst->dev);
- u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
+ u32 old_mtu;
if (ip_mtu_locked(dst))
return;
+ old_mtu = ipv4_mtu(dst);
if (old_mtu < mtu)
return;
@@ -1050,11 +1069,11 @@
rcu_read_lock();
if (fib_lookup(net, fl4, &res, 0) == 0) {
- struct fib_nh *nh;
+ struct fib_nh_common *nhc;
fib_select_path(net, &res, fl4, NULL);
- nh = &FIB_RES_NH(res);
- update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + ip_rt_mtu_expires);
}
rcu_read_unlock();
@@ -1068,21 +1087,25 @@
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
+ ip_rt_fix_tos(&fl4);
+
+ /* Don't make lookup fail for bridged encapsulations */
+ if (skb && netif_is_any_bridge_port(skb->dev))
+ fl4.flowi4_oif = 0;
+
__ip_rt_update_pmtu(rt, &fl4, mtu);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
-
- if (!mark)
- mark = IP4_REPLY_MARK(net, skb->mark);
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, mark, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1093,7 +1116,7 @@
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1111,7 +1134,7 @@
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct dst_entry *odst = NULL;
@@ -1139,9 +1162,11 @@
goto out;
new = true;
+ } else {
+ ip_rt_fix_tos(&fl4);
}
- __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
+ __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
@@ -1164,14 +1189,14 @@
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
@@ -1182,7 +1207,7 @@
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct net *net = sock_net(sk);
@@ -1206,7 +1231,7 @@
*
* When a PMTU/redirect information update invalidates a route,
* this is indicated by setting obsolete to DST_OBSOLETE_KILL or
- * DST_OBSOLETE_DEAD by dst_free().
+ * DST_OBSOLETE_DEAD.
*/
if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
return NULL;
@@ -1279,22 +1304,19 @@
src = ip_hdr(skb)->saddr;
else {
struct fib_result res;
- struct flowi4 fl4;
- struct iphdr *iph;
-
- iph = ip_hdr(skb);
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = iph->daddr;
- fl4.saddr = iph->saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = rt->dst.dev->ifindex;
- fl4.flowi4_iif = skb->dev->ifindex;
- fl4.flowi4_mark = skb->mark;
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = rt->dst.dev->ifindex,
+ .flowi4_iif = skb->dev->ifindex,
+ .flowi4_mark = skb->mark,
+ };
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
- src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+ src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
else
src = inet_select_addr(rt->dst.dev,
rt_nexthop(rt, iph->daddr),
@@ -1325,7 +1347,7 @@
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- const struct rtable *rt = (const struct rtable *) dst;
+ const struct rtable *rt = (const struct rtable *)dst;
unsigned int mtu = rt->rt_pmtu;
if (!mtu || time_after_eq(jiffies, rt->dst.expires))
@@ -1347,7 +1369,7 @@
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe, __rcu **fnhe_p;
@@ -1355,7 +1377,7 @@
spin_lock_bh(&fnhe_lock);
- hash = rcu_dereference_protected(nh->nh_exceptions,
+ hash = rcu_dereference_protected(nhc->nhc_exceptions,
lockdep_is_held(&fnhe_lock));
hash += hval;
@@ -1381,9 +1403,10 @@
spin_unlock_bh(&fnhe_lock);
}
-static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
+ __be32 daddr)
{
- struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
+ struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
struct fib_nh_exception *fnhe;
u32 hval;
@@ -1397,7 +1420,7 @@
if (fnhe->fnhe_daddr == daddr) {
if (fnhe->fnhe_expires &&
time_after(jiffies, fnhe->fnhe_expires)) {
- ip_del_fnhe(nh, daddr);
+ ip_del_fnhe(nhc, daddr);
break;
}
return fnhe;
@@ -1414,19 +1437,19 @@
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
+ struct fib_nh_common *nhc = res->nhc;
+ struct net_device *dev = nhc->nhc_dev;
struct fib_info *fi = res->fi;
- struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
- struct net_device *dev = nh->nh_dev;
u32 mtu = 0;
- if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+ if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
mtu = fi->fib_mtu;
if (likely(!mtu)) {
struct fib_nh_exception *fnhe;
- fnhe = find_exception(nh, daddr);
+ fnhe = find_exception(nhc, daddr);
if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
mtu = fnhe->fnhe_pmtu;
}
@@ -1434,7 +1457,7 @@
if (likely(!mtu))
mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
- return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+ return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
@@ -1465,8 +1488,10 @@
orig = NULL;
}
fill_route_from_fnhe(rt, fnhe);
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw4 = daddr;
+ rt->rt_gw_family = AF_INET;
+ }
if (do_cache) {
dst_hold(&rt->dst);
@@ -1485,15 +1510,15 @@
return ret;
}
-static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
struct rtable *orig, *prev, **p;
bool ret = true;
if (rt_is_input_route(rt)) {
- p = (struct rtable **)&nh->nh_rth_input;
+ p = (struct rtable **)&nhc->nhc_rth_input;
} else {
- p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
}
orig = *p;
@@ -1546,18 +1571,14 @@
static void ipv4_dst_destroy(struct dst_entry *dst)
{
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *)dst;
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
- kfree(p);
-
+ ip_dst_metrics_put(dst);
rt_del_uncached_list(rt);
}
void rt_flush_dev(struct net_device *dev)
{
- struct net *net = dev_net(dev);
struct rtable *rt;
int cpu;
@@ -1568,7 +1589,7 @@
list_for_each_entry(rt, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
- rt->dst.dev = net->loopback_dev;
+ rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
@@ -1592,33 +1613,43 @@
bool cached = false;
if (fi) {
- struct fib_nh *nh = &FIB_RES_NH(*res);
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
- if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
- rt->rt_gateway = nh->nh_gw;
+ if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = nhc->nhc_gw_family;
+ /* only INET and INET6 are supported */
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ rt->rt_gw4 = nhc->nhc_gw.ipv4;
+ else
+ rt->rt_gw6 = nhc->nhc_gw.ipv6;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
- if (fi->fib_metrics != &dst_default_metrics) {
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- refcount_inc(&fi->fib_metrics->refcnt);
- }
+
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
#ifdef CONFIG_IP_ROUTE_CLASSID
- rt->dst.tclassid = nh->nh_tclassid;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ rt->dst.tclassid = nh->nh_tclassid;
+ }
#endif
- rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
else if (do_cache)
- cached = rt_cache_route(nh, rt);
+ cached = rt_cache_route(nhc, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
* FIB nexthop have the DST_NOCACHE bit clear.
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = daddr;
+ }
rt_add_uncached_list(rt);
}
} else
@@ -1634,12 +1665,11 @@
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+ bool nopolicy, bool noxfrm)
{
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1651,8 +1681,9 @@
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
- rt->rt_gateway = 0;
rt->rt_uses_gateway = 0;
+ rt->rt_gw_family = 0;
+ rt->rt_gw4 = 0;
INIT_LIST_HEAD(&rt->rt_uncached);
rt->dst.output = ip_output;
@@ -1663,6 +1694,38 @@
return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
+
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+ struct rtable *new_rt;
+
+ new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+ rt->dst.flags);
+
+ if (new_rt) {
+ new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ new_rt->rt_flags = rt->rt_flags;
+ new_rt->rt_type = rt->rt_type;
+ new_rt->rt_is_input = rt->rt_is_input;
+ new_rt->rt_iif = rt->rt_iif;
+ new_rt->rt_pmtu = rt->rt_pmtu;
+ new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+ new_rt->rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ new_rt->rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ new_rt->rt_gw6 = rt->rt_gw6;
+ INIT_LIST_HEAD(&new_rt->rt_uncached);
+
+ new_rt->dst.input = rt->dst.input;
+ new_rt->dst.output = rt->dst.output;
+ new_rt->dst.error = rt->dst.error;
+ new_rt->dst.lastuse = jiffies;
+ new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+ }
+ return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -1683,7 +1746,8 @@
return -EINVAL;
if (ipv4_is_zeronet(saddr)) {
- if (!ipv4_is_local_multicast(daddr))
+ if (!ipv4_is_local_multicast(daddr) &&
+ ip_hdr(skb)->protocol != IPPROTO_IGMP)
return -EINVAL;
} else {
err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
@@ -1712,7 +1776,7 @@
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
if (!rth)
return -ENOBUFS;
@@ -1728,6 +1792,7 @@
#endif
RT_CACHE_STAT_INC(in_slow_mc);
+ skb_dst_drop(skb);
skb_dst_set(skb, &rth->dst);
return 0;
}
@@ -1752,7 +1817,7 @@
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
skb_mac_header(skb),
- dev->hard_header_len, true);
+ dev->hard_header_len, false);
}
}
#endif
@@ -1764,6 +1829,8 @@
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+ struct net_device *dev = nhc->nhc_dev;
struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
@@ -1772,7 +1839,7 @@
u32 itag = 0;
/* get a working reference to the output device */
- out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+ out_dev = __in_dev_get_rcu(dev);
if (!out_dev) {
net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
@@ -1789,10 +1856,14 @@
do_cache = res->fi && !itag;
if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
- skb->protocol == htons(ETH_P_IP) &&
- (IN_DEV_SHARED_MEDIA(out_dev) ||
- inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
- IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+ skb->protocol == htons(ETH_P_IP)) {
+ __be32 gw;
+
+ gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+ if (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, gw))
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+ }
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
@@ -1809,12 +1880,12 @@
}
}
- fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ fnhe = find_exception(nhc, daddr);
if (do_cache) {
if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
else
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ rth = rcu_dereference(nhc->nhc_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1822,8 +1893,8 @@
}
rth = rt_dst_alloc(out_dev->dev, 0, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
+ IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
@@ -1869,10 +1940,7 @@
if (!icmph)
goto out;
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_REDIRECT &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB)
+ if (!icmp_is_err(icmph->type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -1891,6 +1959,7 @@
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
+ u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
struct flow_keys hash_keys;
u32 mhash;
@@ -1938,8 +2007,41 @@
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ /* Inner can be v4 or v6 */
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ ip_multipath_l3_keys(skb, &hash_keys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ break;
}
mhash = flow_hash_from_keys(&hash_keys);
+
+ if (multipath_hash)
+ mhash = jhash_2words(mhash, multipath_hash, 0);
return mhash >> 1;
}
@@ -1952,7 +2054,7 @@
struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
fib_select_multipath(res, h);
@@ -1963,10 +2065,65 @@
return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ const struct sk_buff *hint)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rt = skb_rtable(hint);
+ struct net *net = dev_net(dev);
+ int err = -EINVAL;
+ u32 tag = 0;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ goto martian_source;
+
+ if (ipv4_is_zeronet(saddr))
+ goto martian_source;
+
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+
+ if (rt->rt_type != RTN_LOCAL)
+ goto skip_validate_source;
+
+ tos &= IPTOS_RT_MASK;
+ err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+ if (err < 0)
+ goto martian_source;
+
+skip_validate_source:
+ skb_dst_copy(skb, hint);
+ return 0;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ return err;
+}
+
+/* get device for dst_alloc with local routes */
+static struct net_device *ip_rt_get_dev(struct net *net,
+ const struct fib_result *res)
+{
+ struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
+ struct net_device *dev = NULL;
+
+ if (nhc)
+ dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
+
+ return dev ? : net->loopback_dev;
+}
+
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
@@ -2045,6 +2202,7 @@
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_multipath_hash = 0;
if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
flkeys = &_flkeys;
@@ -2106,7 +2264,9 @@
local_input:
do_cache &= res->fi && !itag;
if (do_cache) {
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+ rth = rcu_dereference(nhc->nhc_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
err = 0;
@@ -2114,9 +2274,9 @@
}
}
- rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
+ rth = rt_dst_alloc(ip_rt_get_dev(net, res),
flags | RTCF_LOCAL, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ IN_DEV_ORCONF(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2134,16 +2294,16 @@
}
if (do_cache) {
- struct fib_nh *nh = &FIB_RES_NH(*res);
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
- rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
WARN_ON(rth->dst.input == lwtunnel_input);
rth->dst.lwtstate->orig_input = rth->dst.input;
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth)))
+ if (unlikely(!rt_cache_route(nhc, rth)))
rt_add_uncached_list(rth);
}
skb_dst_set(skb, &rth->dst);
@@ -2314,10 +2474,10 @@
fnhe = NULL;
do_cache &= fi != NULL;
if (fi) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
struct rtable __rcu **prth;
- struct fib_nh *nh = &FIB_RES_NH(*res);
- fnhe = find_exception(nh, fl4->daddr);
+ fnhe = find_exception(nhc, fl4->daddr);
if (!do_cache)
goto add;
if (fnhe) {
@@ -2325,12 +2485,12 @@
} else {
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
+ !(nhc->nhc_gw_family &&
+ nhc->nhc_scope == RT_SCOPE_LINK))) {
do_cache = false;
goto add;
}
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
}
rth = rcu_dereference(*prth);
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
@@ -2339,9 +2499,8 @@
add:
rth = rt_dst_alloc(dev_out, flags, type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM),
- do_cache);
+ IN_DEV_ORCONF(in_dev, NOPOLICY),
+ IN_DEV_ORCONF(in_dev, NOXFRM));
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -2379,7 +2538,6 @@
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
const struct sk_buff *skb)
{
- __u8 tos = RT_FL_TOS(fl4);
struct fib_result res = {
.type = RTN_UNSPEC,
.fi = NULL,
@@ -2389,9 +2547,7 @@
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos = tos & IPTOS_RT_MASK;
- fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+ ip_rt_fix_tos(fl4);
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -2572,44 +2728,15 @@
return rth;
}
-static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
-{
- return NULL;
-}
-
-static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
-{
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
-}
-
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu,
- bool confirm_neigh)
-{
-}
-
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
-{
-}
-
-static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
- unsigned long old)
-{
- return NULL;
-}
-
static struct dst_ops ipv4_dst_blackhole_ops = {
- .family = AF_INET,
- .check = ipv4_blackhole_dst_check,
- .mtu = ipv4_blackhole_mtu,
- .default_advmss = ipv4_default_advmss,
- .update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .redirect = ipv4_rt_blackhole_redirect,
- .cow_metrics = ipv4_rt_blackhole_cow_metrics,
- .neigh_lookup = ipv4_neigh_lookup,
+ .family = AF_INET,
+ .default_advmss = ipv4_default_advmss,
+ .neigh_lookup = ipv4_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@ -2637,8 +2764,12 @@
rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
- rt->rt_gateway = ort->rt_gateway;
rt->rt_uses_gateway = ort->rt_uses_gateway;
+ rt->rt_gw_family = ort->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ rt->rt_gw4 = ort->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ rt->rt_gw6 = ort->rt_gw6;
INIT_LIST_HEAD(&rt->rt_uncached);
}
@@ -2667,10 +2798,59 @@
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
+struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net, __be32 *saddr,
+ const struct ip_tunnel_info *info,
+ u8 protocol, bool use_cache)
+{
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache *dst_cache;
+#endif
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __u8 tos;
+
+#ifdef CONFIG_DST_CACHE
+ dst_cache = (struct dst_cache *)&info->dst_cache;
+ if (use_cache) {
+ rt = dst_cache_get_ip4(dst_cache, saddr);
+ if (rt)
+ return rt;
+ }
+#endif
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = protocol;
+ fl4.daddr = info->key.u.ipv4.dst;
+ fl4.saddr = info->key.u.ipv4.src;
+ tos = info->key.tos;
+ fl4.flowi4_tos = RT_TOS(tos);
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (rt->dst.dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+ ip_rt_put(rt);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (use_cache)
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+ *saddr = fl4.saddr;
+ return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
+
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq)
+ struct sk_buff *skb, u32 portid, u32 seq,
+ unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2678,7 +2858,7 @@
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
if (!nlh)
return -EMSGSIZE;
@@ -2686,7 +2866,7 @@
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
@@ -2714,14 +2894,29 @@
nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
goto nla_put_failure;
#endif
- if (!rt_is_input_route(rt) &&
+ if (fl4 && !rt_is_input_route(rt) &&
fl4->saddr != src) {
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_uses_gateway &&
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
- goto nla_put_failure;
+ if (rt->rt_uses_gateway) {
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
+ goto nla_put_failure;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
+ }
expires = rt->dst.expires;
if (expires) {
@@ -2741,35 +2936,39 @@
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (fl4->flowi4_mark &&
- nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
- goto nla_put_failure;
+ if (fl4) {
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
- if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
- nla_put_u32(skb, RTA_UID,
- from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
- goto nla_put_failure;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(),
+ fl4->flowi4_uid)))
+ goto nla_put_failure;
+
+ if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_is_multicast(dst) &&
+ !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, portid);
+
+ if (err <= 0) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ }
+ } else
+#endif
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+ goto nla_put_failure;
+ }
+ }
error = rt->dst.error;
-
- if (rt_is_input_route(rt)) {
-#ifdef CONFIG_IP_MROUTE
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb,
- fl4->saddr, fl4->daddr,
- r, portid);
-
- if (err <= 0) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- }
- } else
-#endif
- if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
- goto nla_put_failure;
- }
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
@@ -2780,6 +2979,81 @@
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
+}
+
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, u32 table_id,
+ struct fnhe_hash_bucket *bucket, int genid,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ struct rtable *rt;
+ int err;
+
+ if (*fa_index < fa_start)
+ goto next;
+
+ if (fnhe->fnhe_genid != genid)
+ goto next;
+
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires))
+ goto next;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (!rt)
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (!rt)
+ goto next;
+
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+ table_id, NULL, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err)
+ return err;
+next:
+ (*fa_index)++;
+ }
+ }
+
+ return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+ u32 table_id, struct fib_info *fi,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ int nhsel, genid = fnhe_genid(net);
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+ struct fnhe_hash_bucket *bucket;
+ int err;
+
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ continue;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(nhc->nhc_exceptions);
+ err = 0;
+ if (bucket)
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
+ genid, fa_index, fa_start,
+ flags);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
@@ -2843,6 +3117,75 @@
return skb;
}
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG(extack,
+ "ipv4: Invalid header for route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
+ RTM_F_LOOKUP_TABLE |
+ RTM_F_FIB_MATCH)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IP_PROTO:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_MARK:
+ case RTA_UID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2855,7 +3198,7 @@
struct rtable *rt = NULL;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi4 fl4;
+ struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
kuid_t uid;
@@ -2863,8 +3206,7 @@
int err;
int mark;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
- extack);
+ err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
@@ -2895,7 +3237,6 @@
if (!skb)
return -ENOBUFS;
- memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
@@ -2931,6 +3272,7 @@
err = -rt->dst.error;
} else {
fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ skb->dev = net->loopback_dev;
rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
@@ -2955,19 +3297,45 @@
skb_reset_mac_header(skb);
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ struct fib_rt_info fri;
+
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
goto errout_rcu;
}
+ fri.fi = res.fi;
+ fri.tb_id = table_id;
+ fri.dst = res.prefix;
+ fri.dst_len = res.prefixlen;
+ fri.tos = fl4.flowi4_tos;
+ fri.type = rt->rt_type;
+ fri.offload = 0;
+ fri.trap = 0;
+ if (res.fa_head) {
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+ u8 slen = 32 - fri.dst_len;
+
+ if (fa->fa_slen == slen &&
+ fa->tb_id == fri.tb_id &&
+ fa->fa_tos == fri.tos &&
+ fa->fa_info == res.fi &&
+ fa->fa_type == fri.type) {
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
+ break;
+ }
+ }
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
- rt->rt_type, res.prefix, res.prefixlen,
- fl4.flowi4_tos, res.fi, 0);
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
} else {
err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
if (err < 0)
goto errout_rcu;
@@ -2996,8 +3364,7 @@
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)__ctl->extra1;
@@ -3122,9 +3489,11 @@
{ }
};
+static const char ipv4_route_flush_procname[] = "flush";
+
static struct ctl_table ipv4_route_flush_table[] = {
{
- .procname = "flush",
+ .procname = ipv4_route_flush_procname,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = ipv4_sysctl_rtcache_flush,
@@ -3142,9 +3511,11 @@
if (!tbl)
goto err_dup;
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
+ /* Don't export non-whitelisted sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ if (tbl[0].procname != ipv4_route_flush_procname)
+ tbl[0].procname = NULL;
+ }
}
tbl[0].extra1 = net;
--
Gitblit v1.6.2