2024-05-10 23fa18eaa71266feff7ba8d83022d9e1cc83c65a
kernel/net/core/lwt_bpf.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 
 #include <linux/kernel.h>
@@ -16,6 +8,9 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -44,16 +39,17 @@
 {
 	int ret;
 
-	/* Preempt disable and BH disable are needed to protect per-cpu
+	/* Migration disable and BH disable are needed to protect per-cpu
 	 * redirect_info between BPF prog and skb_do_redirect().
 	 */
-	preempt_disable();
+	migrate_disable();
 	local_bh_disable();
 	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -63,9 +59,8 @@
 			ret = BPF_OK;
 		} else {
 			skb_reset_mac_header(skb);
-			ret = skb_do_redirect(skb);
-			if (ret == 0)
-				ret = BPF_REDIRECT;
+			skb_do_redirect(skb);
+			ret = BPF_REDIRECT;
 		}
 		break;
 
@@ -82,9 +77,38 @@
 	}
 
 	local_bh_enable();
-	preempt_enable();
+	migrate_enable();
 
 	return ret;
+}
+
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct net_device *dev = skb_dst(skb)->dev;
+		struct iphdr *iph = ip_hdr(skb);
+
+		dev_hold(dev);
+		skb_dst_drop(skb);
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		dev_put(dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		skb_dst_drop(skb);
+		err = ipv6_stub->ipv6_route_input(skb);
+	} else {
+		err = -EAFNOSUPPORT;
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
 }
 
 static int bpf_input(struct sk_buff *skb)
@@ -98,11 +122,11 @@
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
-		pr_warn_once("orig_input not set on dst for prog %s\n",
-			     bpf->out.name);
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -133,10 +157,8 @@
 	return dst->lwtstate->orig_output(net, sk, skb);
 }
 
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
 {
-	int hh_len = skb_dst(skb)->dev->hard_header_len;
-
 	if (skb_headroom(skb) < hh_len) {
 		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
 
@@ -147,6 +169,100 @@
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	int err = -EAFNOSUPPORT;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv4 = true;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		ipv4 = false;
+	else
+		goto err;
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			goto err;
+		}
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto err;
+		}
+	}
+	if (unlikely(dst->error)) {
+		err = dst->error;
+		dst_release(dst);
+		goto err;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		goto err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return net_xmit_errno(err);
+
+	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -154,21 +270,33 @@
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		int hh_len = dst->dev->hard_header_len;
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
 			/* If the header was expanded, headroom might be too
 			 * small for L2 header to come, expand as needed.
 			 */
-			ret = xmit_check_hhlen(skb);
+			ret = xmit_check_hhlen(skb, hh_len);
 			if (unlikely(ret))
 				return ret;
 
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
@@ -208,8 +336,8 @@
 	int ret;
 	u32 fd;
 
-	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
-			       NULL);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
+					  bpf_prog_policy, NULL);
 	if (ret < 0)
 		return ret;
 
@@ -237,7 +365,7 @@
 	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
@@ -250,7 +378,8 @@
 	if (family != AF_INET && family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
+					  extack);
 	if (ret < 0)
 		return ret;
 
@@ -318,7 +447,7 @@
 	if (!prog->prog)
 		return 0;
 
-	nest = nla_nest_start(skb, attr);
+	nest = nla_nest_start_noflag(skb, attr);
 	if (!nest)
 		return -EMSGSIZE;
 
@@ -390,6 +519,135 @@
 	.owner		= THIS_MODULE,
 };
 
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+			   int encap_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	gso_type |= SKB_GSO_DODGY;
+	shinfo->gso_type |= gso_type;
+	skb_decrease_gso_size(shinfo, encap_len);
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+	int next_hdr_offset;
+	void *next_hdr;
+	__u8 protocol;
+
+	/* SCTP and UDP_L4 gso need more nuanced handling than what
+	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+	 * So at the moment only TCP GSO packets are let through.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+		return -ENOTSUPP;
+
+	if (ipv4) {
+		protocol = ip_hdr(skb)->protocol;
+		next_hdr_offset = sizeof(struct iphdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	} else {
+		protocol = ipv6_hdr(skb)->nexthdr;
+		next_hdr_offset = sizeof(struct ipv6hdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	}
+
+	switch (protocol) {
+	case IPPROTO_GRE:
+		next_hdr_offset += sizeof(struct gre_base_hdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+	case IPPROTO_UDP:
+		next_hdr_offset += sizeof(struct udphdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct udphdr *)next_hdr)->check)
+			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+	case IPPROTO_IP:
+	case IPPROTO_IPV6:
+		if (ipv4)
+			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+		else
+			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb_reset_inner_mac_header(skb);  /* mac header is not yet set */
+	skb_set_inner_protocol(skb, skb->protocol);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+	skb_clear_hash(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	if (skb_is_gso(skb))
+		return handle_gso_encap(skb, ipv4, len);
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
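
The kernel side above is driven from a BPF program attached to a lightweight-tunnel route. Below is a minimal sketch of such an lwt_xmit program: it pushes an outer IPv4 header with bpf_lwt_push_encap(BPF_LWT_ENCAP_IP), which ends up in bpf_lwt_push_ip_encap() from the patch, and returns BPF_LWT_REROUTE so that bpf_lwt_xmit_reroute() looks up a dst for the now-encapsulated packet. The helper, encap mode, section name and return codes are the existing kernel/libbpf API; the program name, addresses and the plain-IPIP choice are illustrative only.

// SPDX-License-Identifier: GPL-2.0
/* Sketch of an lwt_xmit program using BPF_LWT_ENCAP_IP + BPF_LWT_REROUTE
 * (not part of the patch above; addresses are examples).
 */
#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("lwt_xmit")
int ipip_encap_reroute(struct __sk_buff *skb)
{
	struct iphdr hdr = {
		.version  = 4,
		.ihl      = 5,			/* 20-byte header, no options */
		.ttl      = 64,
		.protocol = IPPROTO_IPIP,	/* IPv4 in IPv4 */
		.tot_len  = bpf_htons(skb->len + sizeof(struct iphdr)),
		.saddr    = bpf_htonl(0x0a000001),	/* 10.0.0.1, example */
		.daddr    = bpf_htonl(0x0a000002),	/* 10.0.0.2, example */
	};

	/* checksum left at 0: bpf_lwt_push_ip_encap() computes it */
	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
		return BPF_DROP;

	/* the cached dst matched the inner header; ask the stack to re-route */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";

Attached with something along the lines of iproute2's "ip route add <prefix> encap bpf xmit obj <prog.o> section lwt_xmit dev <dev>", the program runs at transmit time; BPF_OK would instead keep the original dst (subject to the new skb->protocol check in bpf_xmit()), and on ingress the analogous path goes through bpf_lwt_input_reroute().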