2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/net/core/lwt_bpf.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
 */
 
 #include <linux/kernel.h>
@@ -16,6 +8,9 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -44,16 +39,17 @@
 {
 	int ret;
 
-	/* Preempt disable and BH disable are needed to protect per-cpu
+	/* Migration disable and BH disable are needed to protect per-cpu
 	 * redirect_info between BPF prog and skb_do_redirect().
 	 */
-	preempt_disable();
+	migrate_disable();
 	local_bh_disable();
 	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
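
A side note on the hunk above: migrate_disable() pins the task to its current CPU without disabling preemption outright (which also keeps this section valid on PREEMPT_RT), while local_bh_disable() keeps softirq code away from the same per-CPU slot; together they make the per-cpu redirect_info stable between the BPF program and skb_do_redirect(). The out-of-tree sketch below only illustrates that pairing; the names are invented for the example.

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>

/* Stand-in for per-cpu state such as redirect_info (illustrative only). */
static DEFINE_PER_CPU(int, demo_scratch);

static int demo_use_per_cpu_scratch(void)
{
	int val;

	migrate_disable();	/* stay on this CPU; preemption stays enabled */
	local_bh_disable();	/* keep softirqs off the same per-CPU slot */

	this_cpu_write(demo_scratch, 42);
	val = this_cpu_read(demo_scratch);	/* same CPU as the write above */

	local_bh_enable();
	migrate_enable();

	return val;
}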
@@ -82,9 +78,38 @@
 	}
 
 	local_bh_enable();
-	preempt_enable();
+	migrate_enable();
 
 	return ret;
+}
+
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct net_device *dev = skb_dst(skb)->dev;
+		struct iphdr *iph = ip_hdr(skb);
+
+		dev_hold(dev);
+		skb_dst_drop(skb);
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		dev_put(dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		skb_dst_drop(skb);
+		err = ipv6_stub->ipv6_route_input(skb);
+	} else {
+		err = -EAFNOSUPPORT;
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
 }
 
 static int bpf_input(struct sk_buff *skb)
@@ -98,11 +123,11 @@
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
-		pr_warn_once("orig_input not set on dst for prog %s\n",
-			     bpf->out.name);
 		kfree_skb(skb);
 		return -EINVAL;
 	}
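
For reference, this is roughly what a program driving the new ingress reroute path could look like, loosely modeled on the kernel's test_lwt_ip_encap selftest. It is a sketch only: the section name, the addresses, and the IPIP outer protocol are illustrative assumptions rather than part of this patch, and non-GSO traffic is assumed.

#include <linux/bpf.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("lwt_in")
int encap_ipip_and_reroute(struct __sk_buff *skb)
{
	struct iphdr outer = {};

	outer.version	= 4;
	outer.ihl	= 5;
	outer.ttl	= 64;
	outer.protocol	= IPPROTO_IPIP;			/* inner packet is IPv4 */
	outer.tot_len	= bpf_htons(skb->len + sizeof(outer));
	outer.saddr	= bpf_htonl(0x0a000001);	/* 10.0.0.1, placeholder */
	outer.daddr	= bpf_htonl(0x0a000002);	/* 10.0.0.2, placeholder */
	/* outer.check stays 0; bpf_lwt_push_ip_encap() computes it */

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &outer, sizeof(outer)))
		return BPF_DROP;

	/* ask bpf_input() to revalidate the route for the new outer header */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";

Such a program would typically be attached with something along the lines of "ip route add <prefix> encap bpf in obj prog.o section lwt_in dev <dev>"; exact iproute2 syntax may vary between versions.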
@@ -133,10 +158,8 @@
 	return dst->lwtstate->orig_output(net, sk, skb);
 }
 
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
 {
-	int hh_len = skb_dst(skb)->dev->hard_header_len;
-
 	if (skb_headroom(skb) < hh_len) {
 		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
 
@@ -147,6 +170,100 @@
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	int err = -EAFNOSUPPORT;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv4 = true;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		ipv4 = false;
+	else
+		goto err;
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			goto err;
+		}
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto err;
+		}
+	}
+	if (unlikely(dst->error)) {
+		err = dst->error;
+		dst_release(dst);
+		goto err;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		goto err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -154,21 +271,33 @@
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		int hh_len = dst->dev->hard_header_len;
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
			/* If the header was expanded, headroom might be too
 			 * small for L2 header to come, expand as needed.
 			 */
-			ret = xmit_check_hhlen(skb);
+			ret = xmit_check_hhlen(skb, hh_len);
 			if (unlikely(ret))
 				return ret;
 
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
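
To see why the skb->protocol check above matters, consider a hypothetical lwt_xmit program that wraps an IPv4 packet in a placeholder IPv6 header: skb->protocol changes from ETH_P_IP to ETH_P_IPV6, so returning BPF_OK would now get the packet dropped with -EINVAL, and the program has to return BPF_LWT_REROUTE so that bpf_lwt_xmit_reroute() looks up a route for the new outer header. A sketch, with placeholder addresses and non-GSO traffic assumed:

#include <linux/bpf.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("lwt_xmit")
int encap_ip6_and_reroute(struct __sk_buff *skb)
{
	struct ipv6hdr outer = {};

	outer.version	  = 6;
	outer.payload_len = bpf_htons(skb->len);	/* inner IPv4 packet */
	outer.nexthdr	  = IPPROTO_IPIP;		/* IPv4-in-IPv6 */
	outer.hop_limit	  = 64;
	outer.saddr.s6_addr[0]	= 0xfc;			/* fc00::1, placeholder */
	outer.saddr.s6_addr[15]	= 0x01;
	outer.daddr.s6_addr[0]	= 0xfc;			/* fc00::2, placeholder */
	outer.daddr.s6_addr[15]	= 0x02;

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &outer, sizeof(outer)))
		return BPF_DROP;

	/* BPF_OK would be -EINVAL here because skb->protocol changed */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";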
@@ -208,8 +337,8 @@
 	int ret;
 	u32 fd;
 
-	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
-			       NULL);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
+					  bpf_prog_policy, NULL);
 	if (ret < 0)
 		return ret;
 
@@ -237,7 +366,7 @@
 	[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
@@ -250,7 +379,8 @@
 	if (family != AF_INET && family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
+	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
+					  extack);
 	if (ret < 0)
 		return ret;
 
@@ -318,7 +448,7 @@
 	if (!prog->prog)
 		return 0;
 
-	nest = nla_nest_start(skb, attr);
+	nest = nla_nest_start_noflag(skb, attr);
 	if (!nest)
 		return -EMSGSIZE;
 
@@ -390,6 +520,135 @@
 	.owner = THIS_MODULE,
 };
 
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+			   int encap_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	gso_type |= SKB_GSO_DODGY;
+	shinfo->gso_type |= gso_type;
+	skb_decrease_gso_size(shinfo, encap_len);
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+	int next_hdr_offset;
+	void *next_hdr;
+	__u8 protocol;
+
+	/* SCTP and UDP_L4 gso need more nuanced handling than what
+	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+	 * So at the moment only TCP GSO packets are let through.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+		return -ENOTSUPP;
+
+	if (ipv4) {
+		protocol = ip_hdr(skb)->protocol;
+		next_hdr_offset = sizeof(struct iphdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	} else {
+		protocol = ipv6_hdr(skb)->nexthdr;
+		next_hdr_offset = sizeof(struct ipv6hdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	}
+
+	switch (protocol) {
+	case IPPROTO_GRE:
+		next_hdr_offset += sizeof(struct gre_base_hdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+	case IPPROTO_UDP:
+		next_hdr_offset += sizeof(struct udphdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct udphdr *)next_hdr)->check)
+			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+	case IPPROTO_IP:
+	case IPPROTO_IPV6:
+		if (ipv4)
+			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+		else
+			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb_reset_inner_mac_header(skb); /* mac header is not yet set */
+	skb_set_inner_protocol(skb, skb->protocol);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+	skb_clear_hash(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	if (skb_is_gso(skb))
+		return handle_gso_encap(skb, ipv4, len);
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
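
One note on the GSO handling in the last hunk: handle_gso_encap() only lets TCP GSO packets through, and only when it can classify the pushed headers, e.g. an outer IPv4 header immediately followed by a base GRE header (SKB_GSO_GRE, or SKB_GSO_GRE_CSUM when the GRE_CSUM flag is set). The sketch below, modeled on the test_lwt_ip_encap selftest, builds exactly that layout from a BPF program; the addresses are placeholders and the GRE header struct is defined locally for the example.

#include <linux/bpf.h>
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct grehdr {			/* base GRE header: flags + protocol */
	__be16 flags;
	__be16 protocol;
};

struct encap_gre_hdr {		/* layout accepted by handle_gso_encap() */
	struct iphdr ip;
	struct grehdr gre;
} __attribute__((packed));

SEC("lwt_xmit")
int encap_gre(struct __sk_buff *skb)
{
	struct encap_gre_hdr hdr = {};

	hdr.ip.version	= 4;
	hdr.ip.ihl	= 5;
	hdr.ip.ttl	= 64;
	hdr.ip.protocol	= IPPROTO_GRE;
	hdr.ip.tot_len	= bpf_htons(skb->len + sizeof(hdr));
	hdr.ip.saddr	= bpf_htonl(0x0a000001);	/* 10.0.0.1, placeholder */
	hdr.ip.daddr	= bpf_htonl(0x0a000002);	/* 10.0.0.2, placeholder */

	/* flags stay 0 (no GRE checksum), so GSO skbs get SKB_GSO_GRE;
	 * the low 16 bits of skb->protocol already hold the __be16 EtherType.
	 */
	hdr.gre.protocol = skb->protocol;

	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)))
		return BPF_DROP;

	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";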