hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/kernel/bpf/inode.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Minimal file system backend for holding eBPF maps and programs,
34 * used by bpf(2) object pinning.
....@@ -5,10 +6,6 @@
56 * Authors:
67 *
78 * Daniel Borkmann <daniel@iogearbox.net>
8
- *
9
- * This program is free software; you can redistribute it and/or
10
- * modify it under the terms of the GNU General Public License
11
- * version 2 as published by the Free Software Foundation.
129 */
1310
1411 #include <linux/init.h>
....@@ -17,26 +14,32 @@
1714 #include <linux/mount.h>
1815 #include <linux/namei.h>
1916 #include <linux/fs.h>
17
+#include <linux/fs_context.h>
18
+#include <linux/fs_parser.h>
2019 #include <linux/kdev_t.h>
21
-#include <linux/parser.h>
2220 #include <linux/filter.h>
2321 #include <linux/bpf.h>
2422 #include <linux/bpf_trace.h>
23
+#include "preload/bpf_preload.h"
2524
2625 enum bpf_type {
2726 BPF_TYPE_UNSPEC = 0,
2827 BPF_TYPE_PROG,
2928 BPF_TYPE_MAP,
29
+ BPF_TYPE_LINK,
3030 };
3131
3232 static void *bpf_any_get(void *raw, enum bpf_type type)
3333 {
3434 switch (type) {
3535 case BPF_TYPE_PROG:
36
- raw = bpf_prog_inc(raw);
36
+ bpf_prog_inc(raw);
3737 break;
3838 case BPF_TYPE_MAP:
39
- raw = bpf_map_inc(raw, true);
39
+ bpf_map_inc_with_uref(raw);
40
+ break;
41
+ case BPF_TYPE_LINK:
42
+ bpf_link_inc(raw);
4043 break;
4144 default:
4245 WARN_ON_ONCE(1);
....@@ -55,6 +58,9 @@
5558 case BPF_TYPE_MAP:
5659 bpf_map_put_with_uref(raw);
5760 break;
61
+ case BPF_TYPE_LINK:
62
+ bpf_link_put(raw);
63
+ break;
5864 default:
5965 WARN_ON_ONCE(1);
6066 break;
....@@ -65,20 +71,32 @@
6571 {
6672 void *raw;
6773
68
- *type = BPF_TYPE_MAP;
6974 raw = bpf_map_get_with_uref(ufd);
70
- if (IS_ERR(raw)) {
71
- *type = BPF_TYPE_PROG;
72
- raw = bpf_prog_get(ufd);
75
+ if (!IS_ERR(raw)) {
76
+ *type = BPF_TYPE_MAP;
77
+ return raw;
7378 }
7479
75
- return raw;
80
+ raw = bpf_prog_get(ufd);
81
+ if (!IS_ERR(raw)) {
82
+ *type = BPF_TYPE_PROG;
83
+ return raw;
84
+ }
85
+
86
+ raw = bpf_link_get_from_fd(ufd);
87
+ if (!IS_ERR(raw)) {
88
+ *type = BPF_TYPE_LINK;
89
+ return raw;
90
+ }
91
+
92
+ return ERR_PTR(-EINVAL);
7693 }
7794
7895 static const struct inode_operations bpf_dir_iops;
7996
8097 static const struct inode_operations bpf_prog_iops = { };
8198 static const struct inode_operations bpf_map_iops = { };
99
+static const struct inode_operations bpf_link_iops = { };
82100
83101 static struct inode *bpf_get_inode(struct super_block *sb,
84102 const struct inode *dir,
....@@ -116,6 +134,8 @@
116134 *type = BPF_TYPE_PROG;
117135 else if (inode->i_op == &bpf_map_iops)
118136 *type = BPF_TYPE_MAP;
137
+ else if (inode->i_op == &bpf_link_iops)
138
+ *type = BPF_TYPE_LINK;
119139 else
120140 return -EACCES;
121141
....@@ -339,13 +359,23 @@
339359 &bpffs_map_fops : &bpffs_obj_fops);
340360 }
341361
362
+static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
363
+{
364
+ struct bpf_link *link = arg;
365
+
366
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
367
+ bpf_link_is_iter(link) ?
368
+ &bpf_iter_fops : &bpffs_obj_fops);
369
+}
370
+
342371 static struct dentry *
343372 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
344373 {
345374 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
346
- * extensions.
375
+ * extensions. That allows populate_bpffs() to create special files.
347376 */
348
- if (strchr(dentry->d_name.name, '.'))
377
+ if ((dir->i_mode & S_IALLUGO) &&
378
+ strchr(dentry->d_name.name, '.'))
349379 return ERR_PTR(-EPERM);
350380
351381 return simple_lookup(dir, dentry, flags);
....@@ -383,7 +413,28 @@
383413 .unlink = simple_unlink,
384414 };
385415
386
-static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
416
+/* pin iterator link into bpffs */
417
+static int bpf_iter_link_pin_kernel(struct dentry *parent,
418
+ const char *name, struct bpf_link *link)
419
+{
420
+ umode_t mode = S_IFREG | S_IRUSR;
421
+ struct dentry *dentry;
422
+ int ret;
423
+
424
+ inode_lock(parent->d_inode);
425
+ dentry = lookup_one_len(name, parent, strlen(name));
426
+ if (IS_ERR(dentry)) {
427
+ inode_unlock(parent->d_inode);
428
+ return PTR_ERR(dentry);
429
+ }
430
+ ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
431
+ &bpf_iter_fops);
432
+ dput(dentry);
433
+ inode_unlock(parent->d_inode);
434
+ return ret;
435
+}
436
+
437
+static int bpf_obj_do_pin(const char __user *pathname, void *raw,
387438 enum bpf_type type)
388439 {
389440 struct dentry *dentry;
....@@ -392,7 +443,7 @@
392443 umode_t mode;
393444 int ret;
394445
395
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
446
+ dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
396447 if (IS_ERR(dentry))
397448 return PTR_ERR(dentry);
398449
....@@ -415,6 +466,9 @@
415466 case BPF_TYPE_MAP:
416467 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
417468 break;
469
+ case BPF_TYPE_LINK:
470
+ ret = vfs_mkobj(dentry, mode, bpf_mklink, raw);
471
+ break;
418472 default:
419473 ret = -EPERM;
420474 }
....@@ -425,30 +479,22 @@
425479
426480 int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
427481 {
428
- struct filename *pname;
429482 enum bpf_type type;
430483 void *raw;
431484 int ret;
432485
433
- pname = getname(pathname);
434
- if (IS_ERR(pname))
435
- return PTR_ERR(pname);
436
-
437486 raw = bpf_fd_probe_obj(ufd, &type);
438
- if (IS_ERR(raw)) {
439
- ret = PTR_ERR(raw);
440
- goto out;
441
- }
487
+ if (IS_ERR(raw))
488
+ return PTR_ERR(raw);
442489
443
- ret = bpf_obj_do_pin(pname, raw, type);
490
+ ret = bpf_obj_do_pin(pathname, raw, type);
444491 if (ret != 0)
445492 bpf_any_put(raw, type);
446
-out:
447
- putname(pname);
493
+
448494 return ret;
449495 }
450496
451
-static void *bpf_obj_do_get(const struct filename *pathname,
497
+static void *bpf_obj_do_get(const char __user *pathname,
452498 enum bpf_type *type, int flags)
453499 {
454500 struct inode *inode;
....@@ -456,7 +502,7 @@
456502 void *raw;
457503 int ret;
458504
459
- ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
505
+ ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
460506 if (ret)
461507 return ERR_PTR(ret);
462508
....@@ -483,36 +529,29 @@
483529 int bpf_obj_get_user(const char __user *pathname, int flags)
484530 {
485531 enum bpf_type type = BPF_TYPE_UNSPEC;
486
- struct filename *pname;
487
- int ret = -ENOENT;
488532 int f_flags;
489533 void *raw;
534
+ int ret;
490535
491536 f_flags = bpf_get_file_flag(flags);
492537 if (f_flags < 0)
493538 return f_flags;
494539
495
- pname = getname(pathname);
496
- if (IS_ERR(pname))
497
- return PTR_ERR(pname);
498
-
499
- raw = bpf_obj_do_get(pname, &type, f_flags);
500
- if (IS_ERR(raw)) {
501
- ret = PTR_ERR(raw);
502
- goto out;
503
- }
540
+ raw = bpf_obj_do_get(pathname, &type, f_flags);
541
+ if (IS_ERR(raw))
542
+ return PTR_ERR(raw);
504543
505544 if (type == BPF_TYPE_PROG)
506545 ret = bpf_prog_new_fd(raw);
507546 else if (type == BPF_TYPE_MAP)
508547 ret = bpf_map_new_fd(raw, f_flags);
548
+ else if (type == BPF_TYPE_LINK)
549
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
509550 else
510
- goto out;
551
+ return -ENOENT;
511552
512553 if (ret < 0)
513554 bpf_any_put(raw, type);
514
-out:
515
- putname(pname);
516555 return ret;
517556 }
518557
....@@ -524,6 +563,8 @@
524563 return ERR_PTR(ret);
525564
526565 if (inode->i_op == &bpf_map_iops)
566
+ return ERR_PTR(-EINVAL);
567
+ if (inode->i_op == &bpf_link_iops)
527568 return ERR_PTR(-EINVAL);
528569 if (inode->i_op != &bpf_prog_iops)
529570 return ERR_PTR(-EACCES);
....@@ -537,7 +578,8 @@
537578 if (!bpf_prog_get_ok(prog, &type, false))
538579 return ERR_PTR(-EINVAL);
539580
540
- return bpf_prog_inc(prog);
581
+ bpf_prog_inc(prog);
582
+ return prog;
541583 }
542584
543585 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
....@@ -567,9 +609,8 @@
567609 return 0;
568610 }
569611
570
-static void bpf_destroy_inode_deferred(struct rcu_head *head)
612
+static void bpf_free_inode(struct inode *inode)
571613 {
572
- struct inode *inode = container_of(head, struct inode, i_rcu);
573614 enum bpf_type type;
574615
575616 if (S_ISLNK(inode->i_mode))
....@@ -579,71 +620,140 @@
579620 free_inode_nonrcu(inode);
580621 }
581622
582
-static void bpf_destroy_inode(struct inode *inode)
583
-{
584
- call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
585
-}
586
-
587623 static const struct super_operations bpf_super_ops = {
588624 .statfs = simple_statfs,
589625 .drop_inode = generic_delete_inode,
590626 .show_options = bpf_show_options,
591
- .destroy_inode = bpf_destroy_inode,
627
+ .free_inode = bpf_free_inode,
592628 };
593629
594630 enum {
595631 OPT_MODE,
596
- OPT_ERR,
597632 };
598633
599
-static const match_table_t bpf_mount_tokens = {
600
- { OPT_MODE, "mode=%o" },
601
- { OPT_ERR, NULL },
634
+static const struct fs_parameter_spec bpf_fs_parameters[] = {
635
+ fsparam_u32oct ("mode", OPT_MODE),
636
+ {}
602637 };
603638
604639 struct bpf_mount_opts {
605640 umode_t mode;
606641 };
607642
608
-static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
643
+static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
609644 {
610
- substring_t args[MAX_OPT_ARGS];
611
- int option, token;
612
- char *ptr;
645
+ struct bpf_mount_opts *opts = fc->fs_private;
646
+ struct fs_parse_result result;
647
+ int opt;
613648
614
- opts->mode = S_IRWXUGO;
615
-
616
- while ((ptr = strsep(&data, ",")) != NULL) {
617
- if (!*ptr)
618
- continue;
619
-
620
- token = match_token(ptr, bpf_mount_tokens, args);
621
- switch (token) {
622
- case OPT_MODE:
623
- if (match_octal(&args[0], &option))
624
- return -EINVAL;
625
- opts->mode = option & S_IALLUGO;
626
- break;
649
+ opt = fs_parse(fc, bpf_fs_parameters, param, &result);
650
+ if (opt < 0)
627651 /* We might like to report bad mount options here, but
628652 * traditionally we've ignored all mount options, so we'd
629653 * better continue to ignore non-existing options for bpf.
630654 */
631
- }
655
+ return opt == -ENOPARAM ? 0 : opt;
656
+
657
+ switch (opt) {
658
+ case OPT_MODE:
659
+ opts->mode = result.uint_32 & S_IALLUGO;
660
+ break;
632661 }
633662
634663 return 0;
635664 }
636665
637
-static int bpf_fill_super(struct super_block *sb, void *data, int silent)
666
+struct bpf_preload_ops *bpf_preload_ops;
667
+EXPORT_SYMBOL_GPL(bpf_preload_ops);
668
+
669
+static bool bpf_preload_mod_get(void)
670
+{
671
+ /* If bpf_preload.ko wasn't loaded earlier then load it now.
672
+ * When bpf_preload is built into vmlinux the module's __init
673
+ * function will populate it.
674
+ */
675
+ if (!bpf_preload_ops) {
676
+ request_module("bpf_preload");
677
+ if (!bpf_preload_ops)
678
+ return false;
679
+ }
680
+ /* And grab the reference, so the module doesn't disappear while the
681
+ * kernel is interacting with the kernel module and its UMD.
682
+ */
683
+ if (!try_module_get(bpf_preload_ops->owner)) {
684
+ pr_err("bpf_preload module get failed.\n");
685
+ return false;
686
+ }
687
+ return true;
688
+}
689
+
690
+static void bpf_preload_mod_put(void)
691
+{
692
+ if (bpf_preload_ops)
693
+ /* now user can "rmmod bpf_preload" if necessary */
694
+ module_put(bpf_preload_ops->owner);
695
+}
696
+
697
+static DEFINE_MUTEX(bpf_preload_lock);
698
+
699
+static int populate_bpffs(struct dentry *parent)
700
+{
701
+ struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
702
+ struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
703
+ int err = 0, i;
704
+
705
+ /* grab the mutex to make sure the kernel interactions with bpf_preload
706
+ * UMD are serialized
707
+ */
708
+ mutex_lock(&bpf_preload_lock);
709
+
710
+ /* if bpf_preload.ko wasn't built into vmlinux then load it */
711
+ if (!bpf_preload_mod_get())
712
+ goto out;
713
+
714
+ if (!bpf_preload_ops->info.tgid) {
715
+ /* preload() will start UMD that will load BPF iterator programs */
716
+ err = bpf_preload_ops->preload(objs);
717
+ if (err)
718
+ goto out_put;
719
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
720
+ links[i] = bpf_link_by_id(objs[i].link_id);
721
+ if (IS_ERR(links[i])) {
722
+ err = PTR_ERR(links[i]);
723
+ goto out_put;
724
+ }
725
+ }
726
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
727
+ err = bpf_iter_link_pin_kernel(parent,
728
+ objs[i].link_name, links[i]);
729
+ if (err)
730
+ goto out_put;
731
+ /* do not unlink successfully pinned links even
732
+ * if a later link fails to pin
733
+ */
734
+ links[i] = NULL;
735
+ }
736
+ /* finish() will tell UMD process to exit */
737
+ err = bpf_preload_ops->finish();
738
+ if (err)
739
+ goto out_put;
740
+ }
741
+out_put:
742
+ bpf_preload_mod_put();
743
+out:
744
+ mutex_unlock(&bpf_preload_lock);
745
+ for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
746
+ if (!IS_ERR_OR_NULL(links[i]))
747
+ bpf_link_put(links[i]);
748
+ return err;
749
+}
750
+
751
+static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
638752 {
639753 static const struct tree_descr bpf_rfiles[] = { { "" } };
640
- struct bpf_mount_opts opts;
754
+ struct bpf_mount_opts *opts = fc->fs_private;
641755 struct inode *inode;
642756 int ret;
643
-
644
- ret = bpf_parse_options(data, &opts);
645
- if (ret)
646
- return ret;
647757
648758 ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
649759 if (ret)
....@@ -654,21 +764,50 @@
654764 inode = sb->s_root->d_inode;
655765 inode->i_op = &bpf_dir_iops;
656766 inode->i_mode &= ~S_IALLUGO;
657
- inode->i_mode |= S_ISVTX | opts.mode;
658
-
767
+ populate_bpffs(sb->s_root);
768
+ inode->i_mode |= S_ISVTX | opts->mode;
659769 return 0;
660770 }
661771
662
-static struct dentry *bpf_mount(struct file_system_type *type, int flags,
663
- const char *dev_name, void *data)
772
+static int bpf_get_tree(struct fs_context *fc)
664773 {
665
- return mount_nodev(type, flags, data, bpf_fill_super);
774
+ return get_tree_nodev(fc, bpf_fill_super);
775
+}
776
+
777
+static void bpf_free_fc(struct fs_context *fc)
778
+{
779
+ kfree(fc->fs_private);
780
+}
781
+
782
+static const struct fs_context_operations bpf_context_ops = {
783
+ .free = bpf_free_fc,
784
+ .parse_param = bpf_parse_param,
785
+ .get_tree = bpf_get_tree,
786
+};
787
+
788
+/*
789
+ * Set up the filesystem mount context.
790
+ */
791
+static int bpf_init_fs_context(struct fs_context *fc)
792
+{
793
+ struct bpf_mount_opts *opts;
794
+
795
+ opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
796
+ if (!opts)
797
+ return -ENOMEM;
798
+
799
+ opts->mode = S_IRWXUGO;
800
+
801
+ fc->fs_private = opts;
802
+ fc->ops = &bpf_context_ops;
803
+ return 0;
666804 }
667805
668806 static struct file_system_type bpf_fs_type = {
669807 .owner = THIS_MODULE,
670808 .name = "bpf",
671
- .mount = bpf_mount,
809
+ .init_fs_context = bpf_init_fs_context,
810
+ .parameters = bpf_fs_parameters,
672811 .kill_sb = kill_litter_super,
673812 };
674813
....@@ -676,6 +815,8 @@
676815 {
677816 int ret;
678817
818
+ mutex_init(&bpf_preload_lock);
819
+
679820 ret = sysfs_create_mount_point(fs_kobj, "bpf");
680821 if (ret)
681822 return ret;