| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * INET An implementation of the TCP/IP protocol suite for the LINUX |
|---|
| 3 | 4 | * operating system. INET is implemented using the BSD Socket |
|---|
| .. | .. |
|---|
| 13 | 14 | * Stefan Becker, <stefanb@yello.ping.de> |
|---|
| 14 | 15 | * Jorge Cwik, <jorge@laser.satlink.net> |
|---|
| 15 | 16 | * Arnt Gulbrandsen, <agulbra@nvg.unit.no> |
|---|
| 16 | | - * |
|---|
| 17 | 17 | * |
|---|
| 18 | 18 | * Fixes: |
|---|
| 19 | 19 | * Alan Cox : Commented a couple of minor bits of surplus code |
|---|
| .. | .. |
|---|
| 96 | 96 | * Jos Vos : Do accounting *before* call_in_firewall |
|---|
| 97 | 97 | * Willy Konynenberg : Transparent proxying support |
|---|
| 98 | 98 | * |
|---|
| 99 | | - * |
|---|
| 100 | | - * |
|---|
| 101 | 99 | * To Fix: |
|---|
| 102 | 100 | * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient |
|---|
| 103 | 101 | * and could be made very efficient with the addition of some virtual memory hacks to permit |
|---|
| .. | .. |
|---|
| 106 | 104 | * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet |
|---|
| 107 | 105 | * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause |
|---|
| 108 | 106 | * fragmentation anyway. |
|---|
| 109 | | - * |
|---|
| 110 | | - * This program is free software; you can redistribute it and/or |
|---|
| 111 | | - * modify it under the terms of the GNU General Public License |
|---|
| 112 | | - * as published by the Free Software Foundation; either version |
|---|
| 113 | | - * 2 of the License, or (at your option) any later version. |
|---|
| 114 | 107 | */ |
|---|
| 115 | 108 | |
|---|
| 116 | 109 | #define pr_fmt(fmt) "IPv4: " fmt |
|---|
| .. | .. |
|---|
| 130 | 123 | #include <linux/inetdevice.h> |
|---|
| 131 | 124 | #include <linux/netdevice.h> |
|---|
| 132 | 125 | #include <linux/etherdevice.h> |
|---|
| 126 | +#include <linux/indirect_call_wrapper.h> |
|---|
| 133 | 127 | |
|---|
| 134 | 128 | #include <net/snmp.h> |
|---|
| 135 | 129 | #include <net/ip.h> |
|---|
| .. | .. |
|---|
| 188 | 182 | return false; |
|---|
| 189 | 183 | } |
|---|
| 190 | 184 | |
|---|
| 185 | +INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); |
|---|
| 186 | +INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); |
|---|
| 187 | +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) |
|---|
| 188 | +{ |
|---|
| 189 | + const struct net_protocol *ipprot; |
|---|
| 190 | + int raw, ret; |
|---|
| 191 | + |
|---|
| 192 | +resubmit: |
|---|
| 193 | + raw = raw_local_deliver(skb, protocol); |
|---|
| 194 | + |
|---|
| 195 | + ipprot = rcu_dereference(inet_protos[protocol]); |
|---|
| 196 | + if (ipprot) { |
|---|
| 197 | + if (!ipprot->no_policy) { |
|---|
| 198 | + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
|---|
| 199 | + kfree_skb(skb); |
|---|
| 200 | + return; |
|---|
| 201 | + } |
|---|
| 202 | + nf_reset_ct(skb); |
|---|
| 203 | + } |
|---|
| 204 | + ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, |
|---|
| 205 | + skb); |
|---|
| 206 | + if (ret < 0) { |
|---|
| 207 | + protocol = -ret; |
|---|
| 208 | + goto resubmit; |
|---|
| 209 | + } |
|---|
| 210 | + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); |
|---|
| 211 | + } else { |
|---|
| 212 | + if (!raw) { |
|---|
| 213 | + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
|---|
| 214 | + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); |
|---|
| 215 | + icmp_send(skb, ICMP_DEST_UNREACH, |
|---|
| 216 | + ICMP_PROT_UNREACH, 0); |
|---|
| 217 | + } |
|---|
| 218 | + kfree_skb(skb); |
|---|
| 219 | + } else { |
|---|
| 220 | + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); |
|---|
| 221 | + consume_skb(skb); |
|---|
| 222 | + } |
|---|
| 223 | + } |
|---|
| 224 | +} |
|---|
| 225 | + |
|---|
| 191 | 226 | static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) |
|---|
| 192 | 227 | { |
|---|
| 193 | 228 | __skb_pull(skb, skb_network_header_len(skb)); |
|---|
| 194 | 229 | |
|---|
| 195 | 230 | rcu_read_lock(); |
|---|
| 196 | | - { |
|---|
| 197 | | - int protocol = ip_hdr(skb)->protocol; |
|---|
| 198 | | - const struct net_protocol *ipprot; |
|---|
| 199 | | - int raw; |
|---|
| 200 | | - |
|---|
| 201 | | - resubmit: |
|---|
| 202 | | - raw = raw_local_deliver(skb, protocol); |
|---|
| 203 | | - |
|---|
| 204 | | - ipprot = rcu_dereference(inet_protos[protocol]); |
|---|
| 205 | | - if (ipprot) { |
|---|
| 206 | | - int ret; |
|---|
| 207 | | - |
|---|
| 208 | | - if (!ipprot->no_policy) { |
|---|
| 209 | | - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
|---|
| 210 | | - kfree_skb(skb); |
|---|
| 211 | | - goto out; |
|---|
| 212 | | - } |
|---|
| 213 | | - nf_reset(skb); |
|---|
| 214 | | - } |
|---|
| 215 | | - ret = ipprot->handler(skb); |
|---|
| 216 | | - if (ret < 0) { |
|---|
| 217 | | - protocol = -ret; |
|---|
| 218 | | - goto resubmit; |
|---|
| 219 | | - } |
|---|
| 220 | | - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); |
|---|
| 221 | | - } else { |
|---|
| 222 | | - if (!raw) { |
|---|
| 223 | | - if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { |
|---|
| 224 | | - __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); |
|---|
| 225 | | - icmp_send(skb, ICMP_DEST_UNREACH, |
|---|
| 226 | | - ICMP_PROT_UNREACH, 0); |
|---|
| 227 | | - } |
|---|
| 228 | | - kfree_skb(skb); |
|---|
| 229 | | - } else { |
|---|
| 230 | | - __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); |
|---|
| 231 | | - consume_skb(skb); |
|---|
| 232 | | - } |
|---|
| 233 | | - } |
|---|
| 234 | | - } |
|---|
| 235 | | - out: |
|---|
| 231 | + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); |
|---|
| 236 | 232 | rcu_read_unlock(); |
|---|
| 237 | 233 | |
|---|
| 238 | 234 | return 0; |
|---|
| .. | .. |
|---|
| 306 | 302 | return true; |
|---|
| 307 | 303 | } |
|---|
| 308 | 304 | |
|---|
| 305 | +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, |
|---|
| 306 | + const struct sk_buff *hint) |
|---|
| 307 | +{ |
|---|
| 308 | + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && |
|---|
| 309 | + ip_hdr(hint)->tos == iph->tos; |
|---|
| 310 | +} |
|---|
| 311 | + |
|---|
| 312 | +int tcp_v4_early_demux(struct sk_buff *skb); |
|---|
| 313 | +int udp_v4_early_demux(struct sk_buff *skb); |
|---|
| 309 | 314 | static int ip_rcv_finish_core(struct net *net, struct sock *sk, |
|---|
| 310 | | - struct sk_buff *skb, struct net_device *dev) |
|---|
| 315 | + struct sk_buff *skb, struct net_device *dev, |
|---|
| 316 | + const struct sk_buff *hint) |
|---|
| 311 | 317 | { |
|---|
| 312 | 318 | const struct iphdr *iph = ip_hdr(skb); |
|---|
| 313 | | - int (*edemux)(struct sk_buff *skb); |
|---|
| 314 | 319 | struct rtable *rt; |
|---|
| 315 | 320 | int err; |
|---|
| 316 | 321 | |
|---|
| 317 | | - if (net->ipv4.sysctl_ip_early_demux && |
|---|
| 322 | + if (ip_can_use_hint(skb, iph, hint)) { |
|---|
| 323 | + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, |
|---|
| 324 | + dev, hint); |
|---|
| 325 | + if (unlikely(err)) |
|---|
| 326 | + goto drop_error; |
|---|
| 327 | + } |
|---|
| 328 | + |
|---|
| 329 | + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && |
|---|
| 318 | 330 | !skb_dst(skb) && |
|---|
| 319 | 331 | !skb->sk && |
|---|
| 320 | 332 | !ip_is_fragment(iph)) { |
|---|
| 321 | | - const struct net_protocol *ipprot; |
|---|
| 322 | | - int protocol = iph->protocol; |
|---|
| 333 | + switch (iph->protocol) { |
|---|
| 334 | + case IPPROTO_TCP: |
|---|
| 335 | + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { |
|---|
| 336 | + tcp_v4_early_demux(skb); |
|---|
| 323 | 337 | |
|---|
| 324 | | - ipprot = rcu_dereference(inet_protos[protocol]); |
|---|
| 325 | | - if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { |
|---|
| 326 | | - err = edemux(skb); |
|---|
| 327 | | - if (unlikely(err)) |
|---|
| 328 | | - goto drop_error; |
|---|
| 329 | | - /* must reload iph, skb->head might have changed */ |
|---|
| 330 | | - iph = ip_hdr(skb); |
|---|
| 338 | + /* must reload iph, skb->head might have changed */ |
|---|
| 339 | + iph = ip_hdr(skb); |
|---|
| 340 | + } |
|---|
| 341 | + break; |
|---|
| 342 | + case IPPROTO_UDP: |
|---|
| 343 | + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { |
|---|
| 344 | + err = udp_v4_early_demux(skb); |
|---|
| 345 | + if (unlikely(err)) |
|---|
| 346 | + goto drop_error; |
|---|
| 347 | + |
|---|
| 348 | + /* must reload iph, skb->head might have changed */ |
|---|
| 349 | + iph = ip_hdr(skb); |
|---|
| 350 | + } |
|---|
| 351 | + break; |
|---|
| 331 | 352 | } |
|---|
| 332 | 353 | } |
|---|
| 333 | 354 | |
|---|
| .. | .. |
|---|
| 409 | 430 | if (!skb) |
|---|
| 410 | 431 | return NET_RX_SUCCESS; |
|---|
| 411 | 432 | |
|---|
| 412 | | - ret = ip_rcv_finish_core(net, sk, skb, dev); |
|---|
| 433 | + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); |
|---|
| 413 | 434 | if (ret != NET_RX_DROP) |
|---|
| 414 | 435 | ret = dst_input(skb); |
|---|
| 415 | 436 | return ret; |
|---|
| .. | .. |
|---|
| 428 | 449 | */ |
|---|
| 429 | 450 | if (skb->pkt_type == PACKET_OTHERHOST) |
|---|
| 430 | 451 | goto drop; |
|---|
| 431 | | - |
|---|
| 432 | 452 | |
|---|
| 433 | 453 | __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); |
|---|
| 434 | 454 | |
|---|
| .. | .. |
|---|
| 496 | 516 | IPCB(skb)->iif = skb->skb_iif; |
|---|
| 497 | 517 | |
|---|
| 498 | 518 | /* Must drop socket now because of tproxy. */ |
|---|
| 499 | | - skb_orphan(skb); |
|---|
| 519 | + if (!skb_sk_is_prefetched(skb)) |
|---|
| 520 | + skb_orphan(skb); |
|---|
| 500 | 521 | |
|---|
| 501 | 522 | return skb; |
|---|
| 502 | 523 | |
|---|
| .. | .. |
|---|
| 521 | 542 | skb = ip_rcv_core(skb, net); |
|---|
| 522 | 543 | if (skb == NULL) |
|---|
| 523 | 544 | return NET_RX_DROP; |
|---|
| 545 | + |
|---|
| 524 | 546 | return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, |
|---|
| 525 | 547 | net, NULL, skb, dev, NULL, |
|---|
| 526 | 548 | ip_rcv_finish); |
|---|
| .. | .. |
|---|
| 531 | 553 | struct sk_buff *skb, *next; |
|---|
| 532 | 554 | |
|---|
| 533 | 555 | list_for_each_entry_safe(skb, next, head, list) { |
|---|
| 534 | | - list_del(&skb->list); |
|---|
| 535 | | - /* Handle ip{6}_forward case, as sch_direct_xmit have |
|---|
| 536 | | - * another kind of SKB-list usage (see validate_xmit_skb_list) |
|---|
| 537 | | - */ |
|---|
| 538 | | - skb->next = NULL; |
|---|
| 556 | + skb_list_del_init(skb); |
|---|
| 539 | 557 | dst_input(skb); |
|---|
| 540 | 558 | } |
|---|
| 559 | +} |
|---|
| 560 | + |
|---|
| 561 | +static struct sk_buff *ip_extract_route_hint(const struct net *net, |
|---|
| 562 | + struct sk_buff *skb, int rt_type) |
|---|
| 563 | +{ |
|---|
| 564 | + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) |
|---|
| 565 | + return NULL; |
|---|
| 566 | + |
|---|
| 567 | + return skb; |
|---|
| 541 | 568 | } |
|---|
| 542 | 569 | |
|---|
| 543 | 570 | static void ip_list_rcv_finish(struct net *net, struct sock *sk, |
|---|
| 544 | 571 | struct list_head *head) |
|---|
| 545 | 572 | { |
|---|
| 573 | + struct sk_buff *skb, *next, *hint = NULL; |
|---|
| 546 | 574 | struct dst_entry *curr_dst = NULL; |
|---|
| 547 | | - struct sk_buff *skb, *next; |
|---|
| 548 | 575 | struct list_head sublist; |
|---|
| 549 | 576 | |
|---|
| 550 | 577 | INIT_LIST_HEAD(&sublist); |
|---|
| .. | .. |
|---|
| 559 | 586 | skb = l3mdev_ip_rcv(skb); |
|---|
| 560 | 587 | if (!skb) |
|---|
| 561 | 588 | continue; |
|---|
| 562 | | - if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) |
|---|
| 589 | + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) |
|---|
| 563 | 590 | continue; |
|---|
| 564 | 591 | |
|---|
| 565 | 592 | dst = skb_dst(skb); |
|---|
| 566 | 593 | if (curr_dst != dst) { |
|---|
| 594 | + hint = ip_extract_route_hint(net, skb, |
|---|
| 595 | + ((struct rtable *)dst)->rt_type); |
|---|
| 596 | + |
|---|
| 567 | 597 | /* dispatch old sublist */ |
|---|
| 568 | 598 | if (!list_empty(&sublist)) |
|---|
| 569 | 599 | ip_sublist_rcv_finish(&sublist); |
|---|
| .. | .. |
|---|
| 616 | 646 | list_add_tail(&skb->list, &sublist); |
|---|
| 617 | 647 | } |
|---|
| 618 | 648 | /* dispatch final sublist */ |
|---|
| 619 | | - ip_sublist_rcv(&sublist, curr_dev, curr_net); |
|---|
| 649 | + if (!list_empty(&sublist)) |
|---|
| 650 | + ip_sublist_rcv(&sublist, curr_dev, curr_net); |
|---|
| 620 | 651 | } |
|---|