@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
 */
 
 #include <linux/kernel.h>
@@ -16,6 +8,9 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -44,16 +39,17 @@
 {
 	int ret;
 
-	/* Preempt disable and BH disable are needed to protect per-cpu
+	/* Migration disable and BH disable are needed to protect per-cpu
 	 * redirect_info between BPF prog and skb_do_redirect().
 	 */
-	preempt_disable();
+	migrate_disable();
 	local_bh_disable();
 	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
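To see how the new BPF_LWT_REROUTE verdict is meant to be used: a minimal sketch of a program for the lwt xmit hook that pushes a prebuilt outer IPv4 header via the bpf_lwt_push_encap() helper and then asks the stack to re-route the encapsulated packet. Function and section names and the header contents are illustrative, not taken from this patch:

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

SEC("lwt_xmit")
int do_reroute(struct __sk_buff *skb)
{
	struct iphdr hdr = {
		.version  = 4,
		.ihl      = 5,
		.ttl      = 64,
		.protocol = IPPROTO_IPIP,
		/* saddr, daddr, tot_len and check are left to the program */
	};

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
		return BPF_DROP;

	/* headers changed, so re-route instead of returning BPF_OK */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";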
---|
@@ -82,9 +78,38 @@
 	}
 
 	local_bh_enable();
-	preempt_enable();
+	migrate_enable();
 
 	return ret;
+}
+
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct net_device *dev = skb_dst(skb)->dev;
+		struct iphdr *iph = ip_hdr(skb);
+
+		dev_hold(dev);
+		skb_dst_drop(skb);
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		dev_put(dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		skb_dst_drop(skb);
+		err = ipv6_stub->ipv6_route_input(skb);
+	} else {
+		err = -EAFNOSUPPORT;
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
 }
 
 static int bpf_input(struct sk_buff *skb)
@@ -98,11 +123,11 @@
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
-		pr_warn_once("orig_input not set on dst for prog %s\n",
-			     bpf->out.name);
 		kfree_skb(skb);
 		return -EINVAL;
 	}
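In bpf_lwt_input_reroute() above, the dev_hold()/dev_put() pair keeps the device alive across ip_route_input_noref(), since skb_dst_drop() may release the last reference that was pinning it. Once the lookup succeeds, dst_input() hands the packet back to the stack; as a rough paraphrase (for orientation, not code from this patch):

/* dst_input() essentially invokes the input handler of the freshly
 * assigned dst, e.g. ip_local_deliver() or ip_forward():
 */
return skb_dst(skb)->input(skb);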
---|
@@ -133,10 +158,8 @@
 	return dst->lwtstate->orig_output(net, sk, skb);
 }
 
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
 {
-	int hh_len = skb_dst(skb)->dev->hard_header_len;
-
 	if (skb_headroom(skb) < hh_len) {
 		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
 
@@ -147,6 +170,100 @@
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	int err = -EAFNOSUPPORT;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv4 = true;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		ipv4 = false;
+	else
+		goto err;
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			goto err;
+		}
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto err;
+		}
+	}
+	if (unlikely(dst->error)) {
+		err = dst->error;
+		dst_release(dst);
+		goto err;
+	}
+
+	/* Although the skb headroom was reserved in bpf_lwt_push_ip_encap(),
+	 * that was done for the previous dst, so do it here again in case
+	 * the new dst needs more space. The call below is a no-op if there
+	 * is already enough header space in the skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		goto err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
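bpf_lwt_xmit_reroute() relies on its callers treating LWTUNNEL_XMIT_DONE as "skb consumed". Roughly, ip_finish_output2() and its IPv6 counterpart handle the lwtunnel verdicts like this (a paraphrase for orientation, not code from this patch):

if (lwtunnel_xmit_redirect(dst->lwtstate)) {
	int res = lwtunnel_xmit(skb);

	/* res < 0: error, skb already freed;
	 * LWTUNNEL_XMIT_DONE: skb was sent or re-routed, as above;
	 * LWTUNNEL_XMIT_CONTINUE: fall through to neighbour output.
	 */
	if (res < 0 || res == LWTUNNEL_XMIT_DONE)
		return res;
}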
---|
@@ -154,21 +271,33 @@
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		int hh_len = dst->dev->hard_header_len;
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
 			/* If the header was expanded, headroom might be too
 			 * small for the L2 header to come, so expand as needed.
 			 */
-			ret = xmit_check_hhlen(skb);
+			ret = xmit_check_hhlen(skb, hh_len);
 			if (unlikely(ret))
 				return ret;
 
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
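The new skb->protocol check in the BPF_OK case means a program that changes the address family must return BPF_LWT_REROUTE rather than BPF_OK. A sketch of the distinction, reusing the includes from the earlier example plus <linux/ipv6.h> for struct ipv6hdr (names and header contents again illustrative):

SEC("lwt_xmit")
int v6_over_v4(struct __sk_buff *skb)
{
	struct ipv6hdr hdr = {
		.version = 6,
		/* nexthdr, hop_limit, payload_len, saddr and daddr are
		 * left to the program
		 */
	};

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
		return BPF_DROP;

	/* skb->protocol is now ETH_P_IPV6; returning BPF_OK here would
	 * get the packet dropped with -EINVAL by the check above.
	 */
	return BPF_LWT_REROUTE;
}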
---|
@@ -208,8 +337,8 @@
 	int ret;
 	u32 fd;
 
-	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
-			       NULL);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
+					  bpf_prog_policy, NULL);
 	if (ret < 0)
 		return ret;
 
@@ -237,7 +366,7 @@
 	[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
@@ -250,7 +379,8 @@
 	if (family != AF_INET && family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
+					  extack);
 	if (ret < 0)
 		return ret;
 
@@ -318,7 +448,7 @@
 	if (!prog->prog)
 		return 0;
 
-	nest = nla_nest_start(skb, attr);
+	nest = nla_nest_start_noflag(skb, attr);
 	if (!nest)
 		return -EMSGSIZE;
 
@@ -390,6 +520,135 @@
 	.owner = THIS_MODULE,
 };
 
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+			   int encap_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	gso_type |= SKB_GSO_DODGY;
+	shinfo->gso_type |= gso_type;
+	skb_decrease_gso_size(shinfo, encap_len);
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+	int next_hdr_offset;
+	void *next_hdr;
+	__u8 protocol;
+
+	/* SCTP and UDP_L4 gso need more nuanced handling than what
+	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+	 * So at the moment only TCP GSO packets are let through.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+		return -ENOTSUPP;
+
+	if (ipv4) {
+		protocol = ip_hdr(skb)->protocol;
+		next_hdr_offset = sizeof(struct iphdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	} else {
+		protocol = ipv6_hdr(skb)->nexthdr;
+		next_hdr_offset = sizeof(struct ipv6hdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	}
+
+	switch (protocol) {
+	case IPPROTO_GRE:
+		next_hdr_offset += sizeof(struct gre_base_hdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+	case IPPROTO_UDP:
+		next_hdr_offset += sizeof(struct udphdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct udphdr *)next_hdr)->check)
+			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+	case IPPROTO_IP:
+	case IPPROTO_IPV6:
+		if (ipv4)
+			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+		else
+			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb_reset_inner_mac_header(skb);  /* mac header is not yet set */
+	skb_set_inner_protocol(skb, skb->protocol);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+	skb_clear_hash(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	if (skb_is_gso(skb))
+		return handle_gso_encap(skb, ipv4, len);
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);