From 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Thu, 16 May 2024 03:11:33 +0000 Subject: [PATCH] AX88772C_eeprom and ax8872c build together --- kernel/net/ipv4/route.c | 850 ++++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 612 insertions(+), 238 deletions(-) diff --git a/kernel/net/ipv4/route.c b/kernel/net/ipv4/route.c index b41d4ac..3159ad0 100644 --- a/kernel/net/ipv4/route.c +++ b/kernel/net/ipv4/route.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -55,11 +56,6 @@ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv4: " fmt @@ -70,7 +66,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -100,6 +96,7 @@ #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/nexthop.h> #include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> @@ -241,11 +238,11 @@ return seq_open(file, &rt_cache_seq_ops); } -static const struct file_operations rt_cache_seq_fops = { - .open = rt_cache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops rt_cache_proc_ops = { + .proc_open = rt_cache_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; @@ -332,11 +329,11 @@ return seq_open(file, &rt_cpu_seq_ops); } -static const struct file_operations rt_cpu_seq_fops = { - .open = rt_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops rt_cpu_proc_ops = { + .proc_open = rt_cpu_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; #ifdef CONFIG_IP_ROUTE_CLASSID @@ -370,12 +367,12 @@ struct proc_dir_entry *pde; pde = proc_create("rt_cache", 0444, net->proc_net, - &rt_cache_seq_fops); + &rt_cache_proc_ops); if (!pde) goto err1; pde = proc_create("rt_cache", 0444, - net->proc_net_stat, &rt_cpu_seq_fops); + net->proc_net_stat, &rt_cpu_proc_ops); if (!pde) goto err2; @@ -437,37 +434,46 @@ struct sk_buff *skb, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; - const __be32 *pkey = daddr; - const struct rtable *rt; struct neighbour *n; - rt = (const struct rtable *) dst; - if (rt->rt_gateway) - pkey = (const __be32 *) &rt->rt_gateway; - else if (skb) - pkey = &ip_hdr(skb)->daddr; + rcu_read_lock_bh(); - n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); - if (n) - return n; - return neigh_create(&arp_tbl, pkey, dev); + if (likely(rt->rt_gw_family == AF_INET)) { + n = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + n = ip_neigh_gw6(dev, &rt->rt_gw6); + } else { + __be32 pkey; + + pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); + n = ip_neigh_gw4(dev, pkey); + } + + if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; + + rcu_read_unlock_bh(); + + return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { + const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; - const struct rtable *rt; - rt = (const struct rtable *)dst; - if (rt->rt_gateway) - pkey = (const __be32 *)&rt->rt_gateway; - else if (!daddr || + if (rt->rt_gw_family == AF_INET) { + pkey = (const __be32 *)&rt->rt_gw4; + } else if (rt->rt_gw_family == AF_INET6) { + return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); + } else if (!daddr || (rt->rt_flags & - (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; - + } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } @@ -522,6 +528,15 @@ iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); + +static void ip_rt_fix_tos(struct flowi4 *fl4) +{ + __u8 tos = RT_FL_TOS(fl4); + + fl4->flowi4_tos = tos & IPTOS_RT_MASK; + fl4->flowi4_scope = tos & RTO_ONLINK ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; +} static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, @@ -643,13 +658,15 @@ if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = fnhe->fnhe_gw; rt->rt_uses_gateway = 1; + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = fnhe->fnhe_gw; } } -static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, - u32 pmtu, bool lock, unsigned long expires) +static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, + __be32 gw, u32 pmtu, bool lock, + unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; @@ -658,17 +675,17 @@ unsigned int i; int depth; - genid = fnhe_genid(dev_net(nh->nh_dev)); + genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); - hash = rcu_dereference(nh->nh_exceptions); + hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; - rcu_assign_pointer(nh->nh_exceptions, hash); + rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; @@ -727,13 +744,13 @@ * stale, so anyone caching it rechecks if this exception * applies to them. */ - rt = rcu_dereference(nh->nh_rth_input); + rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; - prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i); + prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -768,7 +785,7 @@ return; } - if (rt->rt_gateway != old_gw) + if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); @@ -799,11 +816,11 @@ neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh *nh; + struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); - nh = &FIB_RES_NH(res); - update_or_create_fnhe(nh, fl4->daddr, new_gw, + nhc = FIB_RES_NHC(res); + update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } @@ -845,6 +862,7 @@ rt = (struct rtable *) dst; __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); + ip_rt_fix_tos(&fl4); __ip_do_redirect(rt, skb, &fl4, true); } @@ -1029,13 +1047,14 @@ { struct dst_entry *dst = &rt->dst; struct net *net = dev_net(dst->dev); - u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; + u32 old_mtu; if (ip_mtu_locked(dst)) return; + old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; @@ -1050,11 +1069,11 @@ rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { - struct fib_nh *nh; + struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); - nh = &FIB_RES_NH(res); - update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock, + nhc = FIB_RES_NHC(res); + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); @@ -1068,21 +1087,25 @@ struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); + ip_rt_fix_tos(&fl4); + + /* Don't make lookup fail for bridged encapsulations */ + if (skb && netif_is_any_bridge_port(skb->dev)) + fl4.flowi4_oif = 0; + __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; - - if (!mark) - mark = IP4_REPLY_MARK(net, skb->mark); + u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); @@ -1093,7 +1116,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; @@ -1111,7 +1134,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; @@ -1139,9 +1162,11 @@ goto out; new = true; + } else { + ip_rt_fix_tos(&fl4); } - __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1164,14 +1189,14 @@ EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, - int oif, u32 mark, u8 protocol, int flow_flags) + int oif, u8 protocol) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, - RT_TOS(iph->tos), protocol, mark, flow_flags); + RT_TOS(iph->tos), protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); @@ -1182,7 +1207,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); @@ -1206,7 +1231,7 @@ * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or - * DST_OBSOLETE_DEAD by dst_free(). + * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; @@ -1215,6 +1240,7 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct net_device *dev; struct ip_options opt; int res; @@ -1232,7 +1258,8 @@ opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; + res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) @@ -1279,22 +1306,19 @@ src = ip_hdr(skb)->saddr; else { struct fib_result res; - struct flowi4 fl4; - struct iphdr *iph; - - iph = ip_hdr(skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = iph->daddr; - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = rt->dst.dev->ifindex; - fl4.flowi4_iif = skb->dev->ifindex; - fl4.flowi4_mark = skb->mark; + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->dst.dev->ifindex, + .flowi4_iif = skb->dev->ifindex, + .flowi4_mark = skb->mark, + }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) - src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); + src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), @@ -1325,7 +1349,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *) dst; + const struct rtable *rt = (const struct rtable *)dst; unsigned int mtu = rt->rt_pmtu; if (!mtu || time_after_eq(jiffies, rt->dst.expires)) @@ -1347,7 +1371,7 @@ return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } -static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; @@ -1355,7 +1379,7 @@ spin_lock_bh(&fnhe_lock); - hash = rcu_dereference_protected(nh->nh_exceptions, + hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; @@ -1381,9 +1405,10 @@ spin_unlock_bh(&fnhe_lock); } -static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) +static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, + __be32 daddr) { - struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); + struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; @@ -1397,7 +1422,7 @@ if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { - ip_del_fnhe(nh, daddr); + ip_del_fnhe(nhc, daddr); break; } return fnhe; @@ -1414,19 +1439,19 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { + struct fib_nh_common *nhc = res->nhc; + struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; - struct fib_nh *nh = &fi->fib_nh[res->nh_sel]; - struct net_device *dev = nh->nh_dev; u32 mtu = 0; - if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu || + if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; - fnhe = find_exception(nh, daddr); + fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } @@ -1434,7 +1459,7 @@ if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu); + return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, @@ -1465,8 +1490,10 @@ orig = NULL; } fill_route_from_fnhe(rt, fnhe); - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw4 = daddr; + rt->rt_gw_family = AF_INET; + } if (do_cache) { dst_hold(&rt->dst); @@ -1485,15 +1512,15 @@ return ret; } -static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { - p = (struct rtable **)&nh->nh_rth_input; + p = (struct rtable **)&nhc->nhc_rth_input; } else { - p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output); + p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; @@ -1546,18 +1573,14 @@ static void ipv4_dst_destroy(struct dst_entry *dst) { - struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *)dst; - if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) - kfree(p); - + ip_dst_metrics_put(dst); rt_del_uncached_list(rt); } void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1568,7 +1591,7 @@ list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } @@ -1592,33 +1615,43 @@ bool cached = false; if (fi) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { - rt->rt_gateway = nh->nh_gw; + if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; + rt->rt_gw_family = nhc->nhc_gw_family; + /* only INET and INET6 are supported */ + if (likely(nhc->nhc_gw_family == AF_INET)) + rt->rt_gw4 = nhc->nhc_gw.ipv4; + else + rt->rt_gw6 = nhc->nhc_gw.ipv6; } - dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); - if (fi->fib_metrics != &dst_default_metrics) { - rt->dst._metrics |= DST_METRICS_REFCOUNTED; - refcount_inc(&fi->fib_metrics->refcnt); - } + + ip_dst_init_metrics(&rt->dst, fi->fib_metrics); + #ifdef CONFIG_IP_ROUTE_CLASSID - rt->dst.tclassid = nh->nh_tclassid; + if (nhc->nhc_family == AF_INET) { + struct fib_nh *nh; + + nh = container_of(nhc, struct fib_nh, nh_common); + rt->dst.tclassid = nh->nh_tclassid; + } #endif - rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) - cached = rt_cache_route(nh, rt); + cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - if (!rt->rt_gateway) - rt->rt_gateway = daddr; + if (!rt->rt_gw4) { + rt->rt_gw_family = AF_INET; + rt->rt_gw4 = daddr; + } rt_add_uncached_list(rt); } } else @@ -1634,12 +1667,11 @@ struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) + bool nopolicy, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1651,8 +1683,9 @@ rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; - rt->rt_gateway = 0; rt->rt_uses_gateway = 0; + rt->rt_gw_family = 0; + rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1663,6 +1696,38 @@ return rt; } EXPORT_SYMBOL(rt_dst_alloc); + +struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) +{ + struct rtable *new_rt; + + new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + rt->dst.flags); + + if (new_rt) { + new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); + new_rt->rt_flags = rt->rt_flags; + new_rt->rt_type = rt->rt_type; + new_rt->rt_is_input = rt->rt_is_input; + new_rt->rt_iif = rt->rt_iif; + new_rt->rt_pmtu = rt->rt_pmtu; + new_rt->rt_mtu_locked = rt->rt_mtu_locked; + new_rt->rt_gw_family = rt->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + new_rt->rt_gw4 = rt->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + new_rt->rt_gw6 = rt->rt_gw6; + INIT_LIST_HEAD(&new_rt->rt_uncached); + + new_rt->dst.input = rt->dst.input; + new_rt->dst.output = rt->dst.output; + new_rt->dst.error = rt->dst.error; + new_rt->dst.lastuse = jiffies; + new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); + } + return new_rt; +} +EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -1683,7 +1748,8 @@ return -EINVAL; if (ipv4_is_zeronet(saddr)) { - if (!ipv4_is_local_multicast(daddr)) + if (!ipv4_is_local_multicast(daddr) && + ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, @@ -1712,7 +1778,7 @@ flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) return -ENOBUFS; @@ -1728,6 +1794,7 @@ #endif RT_CACHE_STAT_INC(in_slow_mc); + skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return 0; } @@ -1752,7 +1819,7 @@ print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), - dev->hard_header_len, true); + dev->hard_header_len, false); } } #endif @@ -1764,6 +1831,8 @@ struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; @@ -1772,7 +1841,7 @@ u32 itag = 0; /* get a working reference to the output device */ - out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return -EINVAL; @@ -1789,10 +1858,14 @@ do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && - skb->protocol == htons(ETH_P_IP) && - (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) - IPCB(skb)->flags |= IPSKB_DOREDIRECT; + skb->protocol == htons(ETH_P_IP)) { + __be32 gw; + + gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; + if (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, gw)) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1809,12 +1882,12 @@ } } - fnhe = find_exception(&FIB_RES_NH(*res), daddr); + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -1822,8 +1895,8 @@ } rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -1869,10 +1942,7 @@ if (!icmph) goto out; - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_REDIRECT && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) + if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, @@ -1891,6 +1961,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { + u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash; @@ -1938,8 +2009,41 @@ hash_keys.basic.ip_proto = fl4->flowi4_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + /* skb is currently provided only when forwarding */ + if (skb) { + struct flow_keys keys; + + skb_flow_dissect_flow_keys(skb, &keys, 0); + /* Inner can be v4 or v6 */ + if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; + hash_keys.tags.flow_label = keys.tags.flow_label; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + ip_multipath_l3_keys(skb, &hash_keys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; } mhash = flow_hash_from_keys(&hash_keys); + + if (multipath_hash) + mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } @@ -1952,10 +2056,11 @@ struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1) { + if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); + IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif @@ -1963,10 +2068,65 @@ return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } +/* Implements all the saddr-related checks as ip_route_input_slow(), + * assuming daddr is valid and the destination is not a local broadcast one. + * Uses the provided hint instead of performing a route lookup. + */ +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + const struct sk_buff *hint) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt = skb_rtable(hint); + struct net *net = dev_net(dev); + int err = -EINVAL; + u32 tag = 0; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + goto martian_source; + + if (ipv4_is_zeronet(saddr)) + goto martian_source; + + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + goto martian_source; + + if (rt->rt_type != RTN_LOCAL) + goto skip_validate_source; + + tos &= IPTOS_RT_MASK; + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + if (err < 0) + goto martian_source; + +skip_validate_source: + skb_dst_copy(skb, hint); + return 0; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + return err; +} + +/* get device for dst_alloc with local routes */ +static struct net_device *ip_rt_get_dev(struct net *net, + const struct fib_result *res) +{ + struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; + struct net_device *dev = NULL; + + if (nhc) + dev = l3mdev_master_dev_rcu(nhc->nhc_dev); + + return dev ? : net->loopback_dev; +} + /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. + * Changes in the enforced policies must be applied also to + * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. @@ -2045,6 +2205,7 @@ fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; @@ -2106,7 +2267,9 @@ local_input: do_cache &= res->fi && !itag; if (do_cache) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); + + rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -2114,9 +2277,9 @@ } } - rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, + rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); + IN_DEV_ORCONF(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; @@ -2134,16 +2297,16 @@ } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(*res); + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); + rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } - if (unlikely(!rt_cache_route(nh, rth))) + if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); @@ -2314,10 +2477,10 @@ fnhe = NULL; do_cache &= fi != NULL; if (fi) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; - struct fib_nh *nh = &FIB_RES_NH(*res); - fnhe = find_exception(nh, fl4->daddr); + fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { @@ -2325,12 +2488,12 @@ } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { + !(nhc->nhc_gw_family && + nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); + prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) @@ -2339,9 +2502,8 @@ add: rth = rt_dst_alloc(dev_out, flags, type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM), - do_cache); + IN_DEV_ORCONF(in_dev, NOPOLICY), + IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); @@ -2379,7 +2541,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { - __u8 tos = RT_FL_TOS(fl4); struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, @@ -2389,9 +2550,7 @@ struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + ip_rt_fix_tos(fl4); rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -2572,44 +2731,15 @@ return rth; } -static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) -{ - return NULL; -} - -static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) -{ - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - - return mtu ? : dst->dev->mtu; -} - -static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu, - bool confirm_neigh) -{ -} - -static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb) -{ -} - -static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, - unsigned long old) -{ - return NULL; -} - static struct dst_ops ipv4_dst_blackhole_ops = { - .family = AF_INET, - .check = ipv4_blackhole_dst_check, - .mtu = ipv4_blackhole_mtu, - .default_advmss = ipv4_default_advmss, - .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .redirect = ipv4_rt_blackhole_redirect, - .cow_metrics = ipv4_rt_blackhole_cow_metrics, - .neigh_lookup = ipv4_neigh_lookup, + .family = AF_INET, + .default_advmss = ipv4_default_advmss, + .neigh_lookup = ipv4_neigh_lookup, + .check = dst_blackhole_check, + .cow_metrics = dst_blackhole_cow_metrics, + .update_pmtu = dst_blackhole_update_pmtu, + .redirect = dst_blackhole_redirect, + .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2637,8 +2767,12 @@ rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; - rt->rt_gateway = ort->rt_gateway; rt->rt_uses_gateway = ort->rt_uses_gateway; + rt->rt_gw_family = ort->rt_gw_family; + if (rt->rt_gw_family == AF_INET) + rt->rt_gw4 = ort->rt_gw4; + else if (rt->rt_gw_family == AF_INET6) + rt->rt_gw6 = ort->rt_gw6; INIT_LIST_HEAD(&rt->rt_uncached); } @@ -2667,10 +2801,59 @@ } EXPORT_SYMBOL_GPL(ip_route_output_flow); +struct rtable *ip_route_output_tunnel(struct sk_buff *skb, + struct net_device *dev, + struct net *net, __be32 *saddr, + const struct ip_tunnel_info *info, + u8 protocol, bool use_cache) +{ +#ifdef CONFIG_DST_CACHE + struct dst_cache *dst_cache; +#endif + struct rtable *rt = NULL; + struct flowi4 fl4; + __u8 tos; + +#ifdef CONFIG_DST_CACHE + dst_cache = (struct dst_cache *)&info->dst_cache; + if (use_cache) { + rt = dst_cache_get_ip4(dst_cache, saddr); + if (rt) + return rt; + } +#endif + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_mark = skb->mark; + fl4.flowi4_proto = protocol; + fl4.daddr = info->key.u.ipv4.dst; + fl4.saddr = info->key.u.ipv4.src; + tos = info->key.tos; + fl4.flowi4_tos = RT_TOS(tos); + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); + return ERR_PTR(-ENETUNREACH); + } + if (rt->dst.dev == dev) { /* is this necessary? */ + netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); + ip_rt_put(rt); + return ERR_PTR(-ELOOP); + } +#ifdef CONFIG_DST_CACHE + if (use_cache) + dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); +#endif + *saddr = fl4.saddr; + return rt; +} +EXPORT_SYMBOL_GPL(ip_route_output_tunnel); + /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, struct flowi4 *fl4, - struct sk_buff *skb, u32 portid, u32 seq) + struct sk_buff *skb, u32 portid, u32 seq, + unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; @@ -2678,7 +2861,7 @@ u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; @@ -2686,7 +2869,7 @@ r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = fl4->flowi4_tos; + r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -2714,14 +2897,29 @@ nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif - if (!rt_is_input_route(rt) && + if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_uses_gateway && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) - goto nla_put_failure; + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { + goto nla_put_failure; + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } + } expires = rt->dst.expires; if (expires) { @@ -2741,35 +2939,39 @@ if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; - if (fl4->flowi4_mark && - nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) - goto nla_put_failure; + if (fl4) { + if (fl4->flowi4_mark && + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) + goto nla_put_failure; - if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && - nla_put_u32(skb, RTA_UID, - from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) - goto nla_put_failure; + if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && + nla_put_u32(skb, RTA_UID, + from_kuid_munged(current_user_ns(), + fl4->flowi4_uid))) + goto nla_put_failure; + + if (rt_is_input_route(rt)) { +#ifdef CONFIG_IP_MROUTE + if (ipv4_is_multicast(dst) && + !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, + fl4->saddr, fl4->daddr, + r, portid); + + if (err <= 0) { + if (err == 0) + return 0; + goto nla_put_failure; + } + } else +#endif + if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) + goto nla_put_failure; + } + } error = rt->dst.error; - - if (rt_is_input_route(rt)) { -#ifdef CONFIG_IP_MROUTE - if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { - int err = ipmr_get_route(net, skb, - fl4->saddr, fl4->daddr, - r, portid); - - if (err <= 0) { - if (err == 0) - return 0; - goto nla_put_failure; - } - } else -#endif - if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) - goto nla_put_failure; - } if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; @@ -2780,6 +2982,81 @@ nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; +} + +static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, u32 table_id, + struct fnhe_hash_bucket *bucket, int genid, + int *fa_index, int fa_start, unsigned int flags) +{ + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference(bucket[i].chain); fnhe; + fnhe = rcu_dereference(fnhe->fnhe_next)) { + struct rtable *rt; + int err; + + if (*fa_index < fa_start) + goto next; + + if (fnhe->fnhe_genid != genid) + goto next; + + if (fnhe->fnhe_expires && + time_after(jiffies, fnhe->fnhe_expires)) + goto next; + + rt = rcu_dereference(fnhe->fnhe_rth_input); + if (!rt) + rt = rcu_dereference(fnhe->fnhe_rth_output); + if (!rt) + goto next; + + err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, + table_id, NULL, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, flags); + if (err) + return err; +next: + (*fa_index)++; + } + } + + return 0; +} + +int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, + u32 table_id, struct fib_info *fi, + int *fa_index, int fa_start, unsigned int flags) +{ + struct net *net = sock_net(cb->skb->sk); + int nhsel, genid = fnhe_genid(net); + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { + struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); + struct fnhe_hash_bucket *bucket; + int err; + + if (nhc->nhc_flags & RTNH_F_DEAD) + continue; + + rcu_read_lock(); + bucket = rcu_dereference(nhc->nhc_exceptions); + err = 0; + if (bucket) + err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, + genid, fa_index, fa_start, + flags); + rcu_read_unlock(); + if (err) + return err; + } + + return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, @@ -2843,6 +3120,75 @@ return skb; } +static int inet_rtm_valid_getroute_req(struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + int i, err; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, + "ipv4: Invalid header for route get request"); + return -EINVAL; + } + + if (!netlink_strict_get_check(skb)) + return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + + rtm = nlmsg_data(nlh); + if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || + (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || + rtm->rtm_table || rtm->rtm_protocol || + rtm->rtm_scope || rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); + return -EINVAL; + } + + if (rtm->rtm_flags & ~(RTM_F_NOTIFY | + RTM_F_LOOKUP_TABLE | + RTM_F_FIB_MATCH)) { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); + return -EINVAL; + } + + err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err) + return err; + + if ((tb[RTA_SRC] && !rtm->rtm_src_len) || + (tb[RTA_DST] && !rtm->rtm_dst_len)) { + NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); + return -EINVAL; + } + + for (i = 0; i <= RTA_MAX; i++) { + if (!tb[i]) + continue; + + switch (i) { + case RTA_IIF: + case RTA_OIF: + case RTA_SRC: + case RTA_DST: + case RTA_IP_PROTO: + case RTA_SPORT: + case RTA_DPORT: + case RTA_MARK: + case RTA_UID: + break; + default: + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2855,7 +3201,7 @@ struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi4 fl4; + struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; @@ -2863,8 +3209,7 @@ int err; int mark; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, - extack); + err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; @@ -2895,7 +3240,6 @@ if (!skb) return -ENOBUFS; - memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; @@ -2931,6 +3275,7 @@ err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; + skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) @@ -2955,19 +3300,45 @@ skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + struct fib_rt_info fri; + if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } + fri.fi = res.fi; + fri.tb_id = table_id; + fri.dst = res.prefix; + fri.dst_len = res.prefixlen; + fri.tos = fl4.flowi4_tos; + fri.type = rt->rt_type; + fri.offload = 0; + fri.trap = 0; + if (res.fa_head) { + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { + u8 slen = 32 - fri.dst_len; + + if (fa->fa_slen == slen && + fa->tb_id == fri.tb_id && + fa->fa_tos == fri.tos && + fa->fa_info == res.fi && + fa->fa_type == fri.type) { + fri.offload = fa->offload; + fri.trap = fa->trap; + break; + } + } + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWROUTE, table_id, - rt->rt_type, res.prefix, res.prefixlen, - fl4.flowi4_tos, res.fi, 0); + nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; @@ -2996,8 +3367,7 @@ static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; @@ -3122,9 +3492,11 @@ { } }; +static const char ipv4_route_flush_procname[] = "flush"; + static struct ctl_table ipv4_route_flush_table[] = { { - .procname = "flush", + .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, @@ -3142,9 +3514,11 @@ if (!tbl) goto err_dup; - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - tbl[0].procname = NULL; + /* Don't export non-whitelisted sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) { + if (tbl[0].procname != ipv4_route_flush_procname) + tbl[0].procname = NULL; + } } tbl[0].extra1 = net; -- Gitblit v1.6.2