hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/kernel/bpf/syscall.c
....@@ -1,17 +1,10 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2
- *
3
- * This program is free software; you can redistribute it and/or
4
- * modify it under the terms of version 2 of the GNU General Public
5
- * License as published by the Free Software Foundation.
6
- *
7
- * This program is distributed in the hope that it will be useful, but
8
- * WITHOUT ANY WARRANTY; without even the implied warranty of
9
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10
- * General Public License for more details.
113 */
124 #include <linux/bpf.h>
135 #include <linux/bpf_trace.h>
146 #include <linux/bpf_lirc.h>
7
+#include <linux/bpf_verifier.h>
158 #include <linux/btf.h>
169 #include <linux/syscalls.h>
1710 #include <linux/slab.h>
....@@ -30,15 +23,24 @@
3023 #include <linux/cred.h>
3124 #include <linux/timekeeping.h>
3225 #include <linux/ctype.h>
33
-#include <linux/btf.h>
3426 #include <linux/nospec.h>
27
+#include <linux/audit.h>
28
+#include <uapi/linux/btf.h>
29
+#include <linux/pgtable.h>
30
+#include <linux/bpf_lsm.h>
31
+#include <linux/poll.h>
32
+#include <linux/bpf-netns.h>
33
+#include <linux/rcupdate_trace.h>
3534
36
-#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
37
- (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
38
- (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
39
- (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
35
+#include <trace/hooks/syscall_check.h>
36
+
37
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
38
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
39
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
40
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
4041 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
41
-#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
42
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
43
+ IS_FD_HASH(map))
4244
4345 #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
4446
....@@ -47,17 +49,21 @@
4749 static DEFINE_SPINLOCK(prog_idr_lock);
4850 static DEFINE_IDR(map_idr);
4951 static DEFINE_SPINLOCK(map_idr_lock);
52
+static DEFINE_IDR(link_idr);
53
+static DEFINE_SPINLOCK(link_idr_lock);
5054
5155 int sysctl_unprivileged_bpf_disabled __read_mostly =
5256 IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
5357
5458 static const struct bpf_map_ops * const bpf_map_types[] = {
55
-#define BPF_PROG_TYPE(_id, _ops)
59
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
5660 #define BPF_MAP_TYPE(_id, _ops) \
5761 [_id] = &_ops,
62
+#define BPF_LINK_TYPE(_id, _name)
5863 #include <linux/bpf_types.h>
5964 #undef BPF_PROG_TYPE
6065 #undef BPF_MAP_TYPE
66
+#undef BPF_LINK_TYPE
6167 };
6268
6369 /*
....@@ -73,35 +79,23 @@
7379 size_t expected_size,
7480 size_t actual_size)
7581 {
76
- unsigned char __user *addr;
77
- unsigned char __user *end;
78
- unsigned char val;
79
- int err;
82
+ unsigned char __user *addr = uaddr + expected_size;
83
+ int res;
8084
8185 if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
8286 return -E2BIG;
8387
84
- if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
85
- return -EFAULT;
86
-
8788 if (actual_size <= expected_size)
8889 return 0;
8990
90
- addr = uaddr + expected_size;
91
- end = uaddr + actual_size;
92
-
93
- for (; addr < end; addr++) {
94
- err = get_user(val, addr);
95
- if (err)
96
- return err;
97
- if (val)
98
- return -E2BIG;
99
- }
100
-
101
- return 0;
91
+ res = check_zeroed_user(addr, actual_size - expected_size);
92
+ if (res < 0)
93
+ return res;
94
+ return res ? 0 : -E2BIG;
10295 }
10396
10497 const struct bpf_map_ops bpf_map_offload_ops = {
98
+ .map_meta_equal = bpf_map_meta_equal,
10599 .map_alloc = bpf_map_offload_map_alloc,
106100 .map_free = bpf_map_offload_map_free,
107101 .map_check_btf = map_check_no_btf,
....@@ -136,28 +130,223 @@
136130 return map;
137131 }
138132
139
-void *bpf_map_area_alloc(size_t size, int numa_node)
133
+static void bpf_map_write_active_inc(struct bpf_map *map)
140134 {
141
- /* We definitely need __GFP_NORETRY, so OOM killer doesn't
142
- * trigger under memory pressure as we really just want to
143
- * fail instead.
135
+ atomic64_inc(&map->writecnt);
136
+}
137
+
138
+static void bpf_map_write_active_dec(struct bpf_map *map)
139
+{
140
+ atomic64_dec(&map->writecnt);
141
+}
142
+
143
+bool bpf_map_write_active(const struct bpf_map *map)
144
+{
145
+ return atomic64_read(&map->writecnt) != 0;
146
+}
147
+
148
+static u32 bpf_map_value_size(struct bpf_map *map)
149
+{
150
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
151
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
152
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
153
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
154
+ return round_up(map->value_size, 8) * num_possible_cpus();
155
+ else if (IS_FD_MAP(map))
156
+ return sizeof(u32);
157
+ else
158
+ return map->value_size;
159
+}
160
+
161
+static void maybe_wait_bpf_programs(struct bpf_map *map)
162
+{
163
+ /* Wait for any running BPF programs to complete so that
164
+ * userspace, when we return to it, knows that all programs
165
+ * that could be running use the new map value.
144166 */
145
- const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
167
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
168
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
169
+ synchronize_rcu();
170
+}
171
+
172
+static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
173
+ void *value, __u64 flags)
174
+{
175
+ int err;
176
+
177
+ /* Need to create a kthread, thus must support schedule */
178
+ if (bpf_map_is_dev_bound(map)) {
179
+ return bpf_map_offload_update_elem(map, key, value, flags);
180
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
181
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
182
+ return map->ops->map_update_elem(map, key, value, flags);
183
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
184
+ map->map_type == BPF_MAP_TYPE_SOCKMAP) {
185
+ return sock_map_update_elem_sys(map, key, value, flags);
186
+ } else if (IS_FD_PROG_ARRAY(map)) {
187
+ return bpf_fd_array_map_update_elem(map, f.file, key, value,
188
+ flags);
189
+ }
190
+
191
+ bpf_disable_instrumentation();
192
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
193
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
194
+ err = bpf_percpu_hash_update(map, key, value, flags);
195
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
196
+ err = bpf_percpu_array_update(map, key, value, flags);
197
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
198
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
199
+ flags);
200
+ } else if (IS_FD_ARRAY(map)) {
201
+ rcu_read_lock();
202
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
203
+ flags);
204
+ rcu_read_unlock();
205
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
206
+ rcu_read_lock();
207
+ err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
208
+ flags);
209
+ rcu_read_unlock();
210
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
211
+ /* rcu_read_lock() is not needed */
212
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
213
+ flags);
214
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
215
+ map->map_type == BPF_MAP_TYPE_STACK) {
216
+ err = map->ops->map_push_elem(map, value, flags);
217
+ } else {
218
+ rcu_read_lock();
219
+ err = map->ops->map_update_elem(map, key, value, flags);
220
+ rcu_read_unlock();
221
+ }
222
+ bpf_enable_instrumentation();
223
+ maybe_wait_bpf_programs(map);
224
+
225
+ return err;
226
+}
227
+
228
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
229
+ __u64 flags)
230
+{
231
+ void *ptr;
232
+ int err;
233
+
234
+ if (bpf_map_is_dev_bound(map))
235
+ return bpf_map_offload_lookup_elem(map, key, value);
236
+
237
+ bpf_disable_instrumentation();
238
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
239
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
240
+ err = bpf_percpu_hash_copy(map, key, value);
241
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
242
+ err = bpf_percpu_array_copy(map, key, value);
243
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
244
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
245
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
246
+ err = bpf_stackmap_copy(map, key, value);
247
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
248
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
249
+ } else if (IS_FD_HASH(map)) {
250
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
251
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
252
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
253
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
254
+ map->map_type == BPF_MAP_TYPE_STACK) {
255
+ err = map->ops->map_peek_elem(map, value);
256
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
257
+ /* struct_ops map requires directly updating "value" */
258
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
259
+ } else {
260
+ rcu_read_lock();
261
+ if (map->ops->map_lookup_elem_sys_only)
262
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
263
+ else
264
+ ptr = map->ops->map_lookup_elem(map, key);
265
+ if (IS_ERR(ptr)) {
266
+ err = PTR_ERR(ptr);
267
+ } else if (!ptr) {
268
+ err = -ENOENT;
269
+ } else {
270
+ err = 0;
271
+ if (flags & BPF_F_LOCK)
272
+ /* lock 'ptr' and copy everything but lock */
273
+ copy_map_value_locked(map, value, ptr, true);
274
+ else
275
+ copy_map_value(map, value, ptr);
276
+ /* mask lock, since value wasn't zero inited */
277
+ check_and_init_map_lock(map, value);
278
+ }
279
+ rcu_read_unlock();
280
+ }
281
+
282
+ bpf_enable_instrumentation();
283
+ maybe_wait_bpf_programs(map);
284
+
285
+ return err;
286
+}
287
+
288
+static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
289
+{
290
+ /* We really just want to fail instead of triggering OOM killer
291
+ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
292
+ * which is used for lower order allocation requests.
293
+ *
294
+ * It has been observed that higher order allocation requests done by
295
+ * vmalloc with __GFP_NORETRY being set might fail due to not trying
296
+ * to reclaim memory from the page cache, thus we set
297
+ * __GFP_RETRY_MAYFAIL to avoid such situations.
298
+ */
299
+
300
+ const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
301
+ unsigned int flags = 0;
302
+ unsigned long align = 1;
146303 void *area;
147304
148
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
149
- area = kmalloc_node(size, GFP_USER | flags, numa_node);
305
+ if (size >= SIZE_MAX)
306
+ return NULL;
307
+
308
+ /* kmalloc()'ed memory can't be mmap()'ed */
309
+ if (mmapable) {
310
+ BUG_ON(!PAGE_ALIGNED(size));
311
+ align = SHMLBA;
312
+ flags = VM_USERMAP;
313
+ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
314
+ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
315
+ numa_node);
150316 if (area != NULL)
151317 return area;
152318 }
153319
154
- return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
155
- __builtin_return_address(0));
320
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
321
+ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
322
+ flags, numa_node, __builtin_return_address(0));
323
+}
324
+
325
+void *bpf_map_area_alloc(u64 size, int numa_node)
326
+{
327
+ return __bpf_map_area_alloc(size, numa_node, false);
328
+}
329
+
330
+void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
331
+{
332
+ return __bpf_map_area_alloc(size, numa_node, true);
156333 }
157334
158335 void bpf_map_area_free(void *area)
159336 {
160337 kvfree(area);
338
+}
339
+
340
+static u32 bpf_map_flags_retain_permanent(u32 flags)
341
+{
342
+ /* Some map creation flags are not tied to the map object but
343
+ * rather to the map fd instead, so they have no meaning upon
344
+ * map object inspection since multiple file descriptors with
345
+ * different (access) properties can exist here. Thus, given
346
+ * this has zero meaning for the map itself, lets clear these
347
+ * from here.
348
+ */
349
+ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
161350 }
162351
163352 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
....@@ -166,21 +355,8 @@
166355 map->key_size = attr->key_size;
167356 map->value_size = attr->value_size;
168357 map->max_entries = attr->max_entries;
169
- map->map_flags = attr->map_flags;
358
+ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
170359 map->numa_node = bpf_map_attr_numa_node(attr);
171
-}
172
-
173
-int bpf_map_precharge_memlock(u32 pages)
174
-{
175
- struct user_struct *user = get_current_user();
176
- unsigned long memlock_limit, cur;
177
-
178
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
179
- cur = atomic_long_read(&user->locked_vm);
180
- free_uid(user);
181
- if (cur + pages > memlock_limit)
182
- return -EPERM;
183
- return 0;
184360 }
185361
186362 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
....@@ -196,45 +372,62 @@
196372
197373 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
198374 {
199
- atomic_long_sub(pages, &user->locked_vm);
375
+ if (user)
376
+ atomic_long_sub(pages, &user->locked_vm);
200377 }
201378
202
-static int bpf_map_init_memlock(struct bpf_map *map)
379
+int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
203380 {
204
- struct user_struct *user = get_current_user();
381
+ u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
382
+ struct user_struct *user;
205383 int ret;
206384
207
- ret = bpf_charge_memlock(user, map->pages);
385
+ if (size >= U32_MAX - PAGE_SIZE)
386
+ return -E2BIG;
387
+
388
+ user = get_current_user();
389
+ ret = bpf_charge_memlock(user, pages);
208390 if (ret) {
209391 free_uid(user);
210392 return ret;
211393 }
212
- map->user = user;
213
- return ret;
394
+
395
+ mem->pages = pages;
396
+ mem->user = user;
397
+
398
+ return 0;
214399 }
215400
216
-static void bpf_map_release_memlock(struct bpf_map *map)
401
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
217402 {
218
- struct user_struct *user = map->user;
219
- bpf_uncharge_memlock(user, map->pages);
220
- free_uid(user);
403
+ bpf_uncharge_memlock(mem->user, mem->pages);
404
+ free_uid(mem->user);
405
+}
406
+
407
+void bpf_map_charge_move(struct bpf_map_memory *dst,
408
+ struct bpf_map_memory *src)
409
+{
410
+ *dst = *src;
411
+
412
+ /* Make sure src will not be used for the redundant uncharging. */
413
+ memset(src, 0, sizeof(struct bpf_map_memory));
221414 }
222415
223416 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
224417 {
225418 int ret;
226419
227
- ret = bpf_charge_memlock(map->user, pages);
420
+ ret = bpf_charge_memlock(map->memory.user, pages);
228421 if (ret)
229422 return ret;
230
- map->pages += pages;
423
+ map->memory.pages += pages;
231424 return ret;
232425 }
233426
234427 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
235428 {
236
- bpf_uncharge_memlock(map->user, pages);
237
- map->pages -= pages;
429
+ bpf_uncharge_memlock(map->memory.user, pages);
430
+ map->memory.pages -= pages;
238431 }
239432
240433 static int bpf_map_alloc_id(struct bpf_map *map)
....@@ -285,16 +478,18 @@
285478 static void bpf_map_free_deferred(struct work_struct *work)
286479 {
287480 struct bpf_map *map = container_of(work, struct bpf_map, work);
481
+ struct bpf_map_memory mem;
288482
289
- bpf_map_release_memlock(map);
483
+ bpf_map_charge_move(&mem, &map->memory);
290484 security_bpf_map_free(map);
291485 /* implementation dependent freeing */
292486 map->ops->map_free(map);
487
+ bpf_map_charge_finish(&mem);
293488 }
294489
295490 static void bpf_map_put_uref(struct bpf_map *map)
296491 {
297
- if (atomic_dec_and_test(&map->usercnt)) {
492
+ if (atomic64_dec_and_test(&map->usercnt)) {
298493 if (map->ops->map_release_uref)
299494 map->ops->map_release_uref(map);
300495 }
....@@ -305,7 +500,7 @@
305500 */
306501 static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
307502 {
308
- if (atomic_dec_and_test(&map->refcnt)) {
503
+ if (atomic64_dec_and_test(&map->refcnt)) {
309504 /* bpf_map_free_id() must be called first */
310505 bpf_map_free_id(map, do_idr_lock);
311506 btf_put(map->btf);
....@@ -337,18 +532,31 @@
337532 return 0;
338533 }
339534
535
+static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
536
+{
537
+ fmode_t mode = f.file->f_mode;
538
+
539
+ /* Our file permissions may have been overridden by global
540
+ * map permissions facing syscall side.
541
+ */
542
+ if (READ_ONCE(map->frozen))
543
+ mode &= ~FMODE_CAN_WRITE;
544
+ return mode;
545
+}
546
+
340547 #ifdef CONFIG_PROC_FS
341548 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
342549 {
343550 const struct bpf_map *map = filp->private_data;
344551 const struct bpf_array *array;
345
- u32 owner_prog_type = 0;
346
- u32 owner_jited = 0;
552
+ u32 type = 0, jited = 0;
347553
348554 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
349555 array = container_of(map, struct bpf_array, map);
350
- owner_prog_type = array->owner_prog_type;
351
- owner_jited = array->owner_jited;
556
+ spin_lock(&array->aux->owner.lock);
557
+ type = array->aux->owner.type;
558
+ jited = array->aux->owner.jited;
559
+ spin_unlock(&array->aux->owner.lock);
352560 }
353561
354562 seq_printf(m,
....@@ -358,20 +566,19 @@
358566 "max_entries:\t%u\n"
359567 "map_flags:\t%#x\n"
360568 "memlock:\t%llu\n"
361
- "map_id:\t%u\n",
569
+ "map_id:\t%u\n"
570
+ "frozen:\t%u\n",
362571 map->map_type,
363572 map->key_size,
364573 map->value_size,
365574 map->max_entries,
366575 map->map_flags,
367
- map->pages * 1ULL << PAGE_SHIFT,
368
- map->id);
369
-
370
- if (owner_prog_type) {
371
- seq_printf(m, "owner_prog_type:\t%u\n",
372
- owner_prog_type);
373
- seq_printf(m, "owner_jited:\t%u\n",
374
- owner_jited);
576
+ map->memory.pages * 1ULL << PAGE_SHIFT,
577
+ map->id,
578
+ READ_ONCE(map->frozen));
579
+ if (type) {
580
+ seq_printf(m, "owner_prog_type:\t%u\n", type);
581
+ seq_printf(m, "owner_jited:\t%u\n", jited);
375582 }
376583 }
377584 #endif
....@@ -394,6 +601,87 @@
394601 return -EINVAL;
395602 }
396603
604
+/* called for any extra memory-mapped regions (except initial) */
605
+static void bpf_map_mmap_open(struct vm_area_struct *vma)
606
+{
607
+ struct bpf_map *map = vma->vm_file->private_data;
608
+
609
+ if (vma->vm_flags & VM_MAYWRITE)
610
+ bpf_map_write_active_inc(map);
611
+}
612
+
613
+/* called for all unmapped memory region (including initial) */
614
+static void bpf_map_mmap_close(struct vm_area_struct *vma)
615
+{
616
+ struct bpf_map *map = vma->vm_file->private_data;
617
+
618
+ if (vma->vm_flags & VM_MAYWRITE)
619
+ bpf_map_write_active_dec(map);
620
+}
621
+
622
+static const struct vm_operations_struct bpf_map_default_vmops = {
623
+ .open = bpf_map_mmap_open,
624
+ .close = bpf_map_mmap_close,
625
+};
626
+
627
+static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
628
+{
629
+ struct bpf_map *map = filp->private_data;
630
+ int err;
631
+
632
+ if (!map->ops->map_mmap || map_value_has_spin_lock(map))
633
+ return -ENOTSUPP;
634
+
635
+ if (!(vma->vm_flags & VM_SHARED))
636
+ return -EINVAL;
637
+
638
+ mutex_lock(&map->freeze_mutex);
639
+
640
+ if (vma->vm_flags & VM_WRITE) {
641
+ if (map->frozen) {
642
+ err = -EPERM;
643
+ goto out;
644
+ }
645
+ /* map is meant to be read-only, so do not allow mapping as
646
+ * writable, because it's possible to leak a writable page
647
+ * reference and allows user-space to still modify it after
648
+ * freezing, while verifier will assume contents do not change
649
+ */
650
+ if (map->map_flags & BPF_F_RDONLY_PROG) {
651
+ err = -EACCES;
652
+ goto out;
653
+ }
654
+ }
655
+
656
+ /* set default open/close callbacks */
657
+ vma->vm_ops = &bpf_map_default_vmops;
658
+ vma->vm_private_data = map;
659
+ vma->vm_flags &= ~VM_MAYEXEC;
660
+ if (!(vma->vm_flags & VM_WRITE))
661
+ /* disallow re-mapping with PROT_WRITE */
662
+ vma->vm_flags &= ~VM_MAYWRITE;
663
+
664
+ err = map->ops->map_mmap(map, vma);
665
+ if (err)
666
+ goto out;
667
+
668
+ if (vma->vm_flags & VM_MAYWRITE)
669
+ bpf_map_write_active_inc(map);
670
+out:
671
+ mutex_unlock(&map->freeze_mutex);
672
+ return err;
673
+}
674
+
675
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
676
+{
677
+ struct bpf_map *map = filp->private_data;
678
+
679
+ if (map->ops->map_poll)
680
+ return map->ops->map_poll(map, filp, pts);
681
+
682
+ return EPOLLERR;
683
+}
684
+
397685 const struct file_operations bpf_map_fops = {
398686 #ifdef CONFIG_PROC_FS
399687 .show_fdinfo = bpf_map_show_fdinfo,
....@@ -401,6 +689,8 @@
401689 .release = bpf_map_release,
402690 .read = bpf_dummy_read,
403691 .write = bpf_dummy_write,
692
+ .mmap = bpf_map_mmap,
693
+ .poll = bpf_map_poll,
404694 };
405695
406696 int bpf_map_new_fd(struct bpf_map *map, int flags)
....@@ -434,62 +724,92 @@
434724 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
435725 sizeof(attr->CMD##_LAST_FIELD)) != NULL
436726
437
-/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
438
- * Return 0 on success and < 0 on error.
727
+/* dst and src must have at least "size" number of bytes.
728
+ * Return strlen on success and < 0 on error.
439729 */
440
-static int bpf_obj_name_cpy(char *dst, const char *src)
730
+int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
441731 {
442
- const char *end = src + BPF_OBJ_NAME_LEN;
732
+ const char *end = src + size;
733
+ const char *orig_src = src;
443734
444
- memset(dst, 0, BPF_OBJ_NAME_LEN);
445
-
446
- /* Copy all isalnum() and '_' char */
735
+ memset(dst, 0, size);
736
+ /* Copy all isalnum(), '_' and '.' chars. */
447737 while (src < end && *src) {
448
- if (!isalnum(*src) && *src != '_')
738
+ if (!isalnum(*src) &&
739
+ *src != '_' && *src != '.')
449740 return -EINVAL;
450741 *dst++ = *src++;
451742 }
452743
453
- /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
744
+ /* No '\0' found in "size" number of bytes */
454745 if (src == end)
455746 return -EINVAL;
456747
457
- return 0;
748
+ return src - orig_src;
458749 }
459750
460751 int map_check_no_btf(const struct bpf_map *map,
752
+ const struct btf *btf,
461753 const struct btf_type *key_type,
462754 const struct btf_type *value_type)
463755 {
464756 return -ENOTSUPP;
465757 }
466758
467
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
759
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
468760 u32 btf_key_id, u32 btf_value_id)
469761 {
470762 const struct btf_type *key_type, *value_type;
471763 u32 key_size, value_size;
472764 int ret = 0;
473765
474
- key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
475
- if (!key_type || key_size != map->key_size)
476
- return -EINVAL;
766
+ /* Some maps allow key to be unspecified. */
767
+ if (btf_key_id) {
768
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
769
+ if (!key_type || key_size != map->key_size)
770
+ return -EINVAL;
771
+ } else {
772
+ key_type = btf_type_by_id(btf, 0);
773
+ if (!map->ops->map_check_btf)
774
+ return -EINVAL;
775
+ }
477776
478777 value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
479778 if (!value_type || value_size != map->value_size)
480779 return -EINVAL;
481780
781
+ map->spin_lock_off = btf_find_spin_lock(btf, value_type);
782
+
783
+ if (map_value_has_spin_lock(map)) {
784
+ if (map->map_flags & BPF_F_RDONLY_PROG)
785
+ return -EACCES;
786
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
787
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
788
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
789
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
790
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
791
+ return -ENOTSUPP;
792
+ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
793
+ map->value_size) {
794
+ WARN_ONCE(1,
795
+ "verifier bug spin_lock_off %d value_size %d\n",
796
+ map->spin_lock_off, map->value_size);
797
+ return -EFAULT;
798
+ }
799
+ }
800
+
482801 if (map->ops->map_check_btf)
483
- ret = map->ops->map_check_btf(map, key_type, value_type);
802
+ ret = map->ops->map_check_btf(map, btf, key_type, value_type);
484803
485804 return ret;
486805 }
487806
488
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
807
+#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
489808 /* called via syscall */
490809 static int map_create(union bpf_attr *attr)
491810 {
492811 int numa_node = bpf_map_attr_numa_node(attr);
812
+ struct bpf_map_memory mem;
493813 struct bpf_map *map;
494814 int f_flags;
495815 int err;
....@@ -497,6 +817,14 @@
497817 err = CHECK_ATTR(BPF_MAP_CREATE);
498818 if (err)
499819 return -EINVAL;
820
+
821
+ if (attr->btf_vmlinux_value_type_id) {
822
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
823
+ attr->btf_key_type_id || attr->btf_value_type_id)
824
+ return -EINVAL;
825
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
826
+ return -EINVAL;
827
+ }
500828
501829 f_flags = bpf_get_file_flag(attr->map_flags);
502830 if (f_flags < 0)
....@@ -512,50 +840,53 @@
512840 if (IS_ERR(map))
513841 return PTR_ERR(map);
514842
515
- err = bpf_obj_name_cpy(map->name, attr->map_name);
516
- if (err)
517
- goto free_map_nouncharge;
843
+ err = bpf_obj_name_cpy(map->name, attr->map_name,
844
+ sizeof(attr->map_name));
845
+ if (err < 0)
846
+ goto free_map;
518847
519
- atomic_set(&map->refcnt, 1);
520
- atomic_set(&map->usercnt, 1);
848
+ atomic64_set(&map->refcnt, 1);
849
+ atomic64_set(&map->usercnt, 1);
850
+ mutex_init(&map->freeze_mutex);
521851
522
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
852
+ map->spin_lock_off = -EINVAL;
853
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
854
+ /* Even the map's value is a kernel's struct,
855
+ * the bpf_prog.o must have BTF to begin with
856
+ * to figure out the corresponding kernel's
857
+ * counter part. Thus, attr->btf_fd has
858
+ * to be valid also.
859
+ */
860
+ attr->btf_vmlinux_value_type_id) {
523861 struct btf *btf;
524
-
525
- if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
526
- err = -EINVAL;
527
- goto free_map_nouncharge;
528
- }
529862
530863 btf = btf_get_by_fd(attr->btf_fd);
531864 if (IS_ERR(btf)) {
532865 err = PTR_ERR(btf);
533
- goto free_map_nouncharge;
866
+ goto free_map;
534867 }
535
-
536
- err = map_check_btf(map, btf, attr->btf_key_type_id,
537
- attr->btf_value_type_id);
538
- if (err) {
539
- btf_put(btf);
540
- goto free_map_nouncharge;
541
- }
542
-
543868 map->btf = btf;
869
+
870
+ if (attr->btf_value_type_id) {
871
+ err = map_check_btf(map, btf, attr->btf_key_type_id,
872
+ attr->btf_value_type_id);
873
+ if (err)
874
+ goto free_map;
875
+ }
876
+
544877 map->btf_key_type_id = attr->btf_key_type_id;
545878 map->btf_value_type_id = attr->btf_value_type_id;
879
+ map->btf_vmlinux_value_type_id =
880
+ attr->btf_vmlinux_value_type_id;
546881 }
547882
548883 err = security_bpf_map_alloc(map);
549884 if (err)
550
- goto free_map_nouncharge;
551
-
552
- err = bpf_map_init_memlock(map);
553
- if (err)
554
- goto free_map_sec;
885
+ goto free_map;
555886
556887 err = bpf_map_alloc_id(map);
557888 if (err)
558
- goto free_map;
889
+ goto free_map_sec;
559890
560891 err = bpf_map_new_fd(map, f_flags);
561892 if (err < 0) {
....@@ -571,13 +902,13 @@
571902
572903 return err;
573904
574
-free_map:
575
- bpf_map_release_memlock(map);
576905 free_map_sec:
577906 security_bpf_map_free(map);
578
-free_map_nouncharge:
907
+free_map:
579908 btf_put(map->btf);
909
+ bpf_map_charge_move(&mem, &map->memory);
580910 map->ops->map_free(map);
911
+ bpf_map_charge_finish(&mem);
581912 return err;
582913 }
583914
....@@ -596,20 +927,33 @@
596927 return f.file->private_data;
597928 }
598929
599
-/* prog's and map's refcnt limit */
600
-#define BPF_MAX_REFCNT 32768
601
-
602
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
930
+void bpf_map_inc(struct bpf_map *map)
603931 {
604
- if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
605
- atomic_dec(&map->refcnt);
606
- return ERR_PTR(-EBUSY);
607
- }
608
- if (uref)
609
- atomic_inc(&map->usercnt);
610
- return map;
932
+ atomic64_inc(&map->refcnt);
611933 }
612934 EXPORT_SYMBOL_GPL(bpf_map_inc);
935
+
936
+void bpf_map_inc_with_uref(struct bpf_map *map)
937
+{
938
+ atomic64_inc(&map->refcnt);
939
+ atomic64_inc(&map->usercnt);
940
+}
941
+EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
942
+
943
+struct bpf_map *bpf_map_get(u32 ufd)
944
+{
945
+ struct fd f = fdget(ufd);
946
+ struct bpf_map *map;
947
+
948
+ map = __bpf_map_get(f);
949
+ if (IS_ERR(map))
950
+ return map;
951
+
952
+ bpf_map_inc(map);
953
+ fdput(f);
954
+
955
+ return map;
956
+}
613957
614958 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
615959 {
....@@ -620,41 +964,54 @@
620964 if (IS_ERR(map))
621965 return map;
622966
623
- map = bpf_map_inc(map, true);
967
+ bpf_map_inc_with_uref(map);
624968 fdput(f);
625969
626970 return map;
627971 }
628972
629973 /* map_idr_lock should have been held */
630
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
631
- bool uref)
974
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
632975 {
633976 int refold;
634977
635
- refold = atomic_fetch_add_unless(&map->refcnt, 1, 0);
636
-
637
- if (refold >= BPF_MAX_REFCNT) {
638
- __bpf_map_put(map, false);
639
- return ERR_PTR(-EBUSY);
640
- }
641
-
978
+ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
642979 if (!refold)
643980 return ERR_PTR(-ENOENT);
644
-
645981 if (uref)
646
- atomic_inc(&map->usercnt);
982
+ atomic64_inc(&map->usercnt);
647983
648984 return map;
649985 }
986
+
987
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
988
+{
989
+ spin_lock_bh(&map_idr_lock);
990
+ map = __bpf_map_inc_not_zero(map, false);
991
+ spin_unlock_bh(&map_idr_lock);
992
+
993
+ return map;
994
+}
995
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
650996
651997 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
652998 {
653999 return -ENOTSUPP;
6541000 }
6551001
1002
+static void *__bpf_copy_key(void __user *ukey, u64 key_size)
1003
+{
1004
+ if (key_size)
1005
+ return memdup_user(ukey, key_size);
1006
+
1007
+ if (ukey)
1008
+ return ERR_PTR(-EINVAL);
1009
+
1010
+ return NULL;
1011
+}
1012
+
6561013 /* last field in 'union bpf_attr' used by this command */
657
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
1014
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
6581015
6591016 static int map_lookup_elem(union bpf_attr *attr)
6601017 {
....@@ -662,7 +1019,7 @@
6621019 void __user *uvalue = u64_to_user_ptr(attr->value);
6631020 int ufd = attr->map_fd;
6641021 struct bpf_map *map;
665
- void *key, *value, *ptr;
1022
+ void *key, *value;
6661023 u32 value_size;
6671024 struct fd f;
6681025 int err;
....@@ -670,71 +1027,38 @@
6701027 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
6711028 return -EINVAL;
6721029
1030
+ if (attr->flags & ~BPF_F_LOCK)
1031
+ return -EINVAL;
1032
+
6731033 f = fdget(ufd);
6741034 map = __bpf_map_get(f);
6751035 if (IS_ERR(map))
6761036 return PTR_ERR(map);
677
-
678
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
1037
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
6791038 err = -EPERM;
6801039 goto err_put;
6811040 }
6821041
683
- key = memdup_user(ukey, map->key_size);
1042
+ if ((attr->flags & BPF_F_LOCK) &&
1043
+ !map_value_has_spin_lock(map)) {
1044
+ err = -EINVAL;
1045
+ goto err_put;
1046
+ }
1047
+
1048
+ key = __bpf_copy_key(ukey, map->key_size);
6841049 if (IS_ERR(key)) {
6851050 err = PTR_ERR(key);
6861051 goto err_put;
6871052 }
6881053
689
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
690
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
691
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
692
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
693
- else if (IS_FD_MAP(map))
694
- value_size = sizeof(u32);
695
- else
696
- value_size = map->value_size;
1054
+ value_size = bpf_map_value_size(map);
6971055
6981056 err = -ENOMEM;
6991057 value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
7001058 if (!value)
7011059 goto free_key;
7021060
703
- if (bpf_map_is_dev_bound(map)) {
704
- err = bpf_map_offload_lookup_elem(map, key, value);
705
- goto done;
706
- }
707
-
708
- preempt_disable();
709
- this_cpu_inc(bpf_prog_active);
710
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
711
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
712
- err = bpf_percpu_hash_copy(map, key, value);
713
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
714
- err = bpf_percpu_array_copy(map, key, value);
715
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
716
- err = bpf_stackmap_copy(map, key, value);
717
- } else if (IS_FD_ARRAY(map)) {
718
- err = bpf_fd_array_map_lookup_elem(map, key, value);
719
- } else if (IS_FD_HASH(map)) {
720
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
721
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
722
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
723
- } else {
724
- rcu_read_lock();
725
- if (map->ops->map_lookup_elem_sys_only)
726
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
727
- else
728
- ptr = map->ops->map_lookup_elem(map, key);
729
- if (ptr)
730
- memcpy(value, ptr, value_size);
731
- rcu_read_unlock();
732
- err = ptr ? 0 : -ENOENT;
733
- }
734
- this_cpu_dec(bpf_prog_active);
735
- preempt_enable();
736
-
737
-done:
1061
+ err = bpf_map_copy_value(map, key, value, attr->flags);
7381062 if (err)
7391063 goto free_value;
7401064
....@@ -753,16 +1077,6 @@
7531077 return err;
7541078 }
7551079
756
-static void maybe_wait_bpf_programs(struct bpf_map *map)
757
-{
758
- /* Wait for any running BPF programs to complete so that
759
- * userspace, when we return to it, knows that all programs
760
- * that could be running use the new map value.
761
- */
762
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
763
- map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
764
- synchronize_rcu();
765
-}
7661080
7671081 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
7681082
....@@ -784,13 +1098,19 @@
7841098 map = __bpf_map_get(f);
7851099 if (IS_ERR(map))
7861100 return PTR_ERR(map);
787
-
788
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
1101
+ bpf_map_write_active_inc(map);
1102
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
7891103 err = -EPERM;
7901104 goto err_put;
7911105 }
7921106
793
- key = memdup_user(ukey, map->key_size);
1107
+ if ((attr->flags & BPF_F_LOCK) &&
1108
+ !map_value_has_spin_lock(map)) {
1109
+ err = -EINVAL;
1110
+ goto err_put;
1111
+ }
1112
+
1113
+ key = __bpf_copy_key(ukey, map->key_size);
7941114 if (IS_ERR(key)) {
7951115 err = PTR_ERR(key);
7961116 goto err_put;
....@@ -798,7 +1118,8 @@
7981118
7991119 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
8001120 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
801
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
1121
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
1122
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
8021123 value_size = round_up(map->value_size, 8) * num_possible_cpus();
8031124 else
8041125 value_size = map->value_size;
....@@ -812,55 +1133,14 @@
8121133 if (copy_from_user(value, uvalue, value_size) != 0)
8131134 goto free_value;
8141135
815
- /* Need to create a kthread, thus must support schedule */
816
- if (bpf_map_is_dev_bound(map)) {
817
- err = bpf_map_offload_update_elem(map, key, value, attr->flags);
818
- goto out;
819
- } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
820
- map->map_type == BPF_MAP_TYPE_SOCKHASH ||
821
- map->map_type == BPF_MAP_TYPE_SOCKMAP) {
822
- err = map->ops->map_update_elem(map, key, value, attr->flags);
823
- goto out;
824
- }
1136
+ err = bpf_map_update_value(map, f, key, value, attr->flags);
8251137
826
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
827
- * inside bpf map update or delete otherwise deadlocks are possible
828
- */
829
- preempt_disable();
830
- __this_cpu_inc(bpf_prog_active);
831
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
832
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
833
- err = bpf_percpu_hash_update(map, key, value, attr->flags);
834
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
835
- err = bpf_percpu_array_update(map, key, value, attr->flags);
836
- } else if (IS_FD_ARRAY(map)) {
837
- rcu_read_lock();
838
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
839
- attr->flags);
840
- rcu_read_unlock();
841
- } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
842
- rcu_read_lock();
843
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
844
- attr->flags);
845
- rcu_read_unlock();
846
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
847
- /* rcu_read_lock() is not needed */
848
- err = bpf_fd_reuseport_array_update_elem(map, key, value,
849
- attr->flags);
850
- } else {
851
- rcu_read_lock();
852
- err = map->ops->map_update_elem(map, key, value, attr->flags);
853
- rcu_read_unlock();
854
- }
855
- __this_cpu_dec(bpf_prog_active);
856
- preempt_enable();
857
- maybe_wait_bpf_programs(map);
858
-out:
8591138 free_value:
8601139 kfree(value);
8611140 free_key:
8621141 kfree(key);
8631142 err_put:
1143
+ bpf_map_write_active_dec(map);
8641144 fdput(f);
8651145 return err;
8661146 }
....@@ -883,13 +1163,13 @@
8831163 map = __bpf_map_get(f);
8841164 if (IS_ERR(map))
8851165 return PTR_ERR(map);
886
-
887
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
1166
+ bpf_map_write_active_inc(map);
1167
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
8881168 err = -EPERM;
8891169 goto err_put;
8901170 }
8911171
892
- key = memdup_user(ukey, map->key_size);
1172
+ key = __bpf_copy_key(ukey, map->key_size);
8931173 if (IS_ERR(key)) {
8941174 err = PTR_ERR(key);
8951175 goto err_put;
....@@ -898,19 +1178,23 @@
8981178 if (bpf_map_is_dev_bound(map)) {
8991179 err = bpf_map_offload_delete_elem(map, key);
9001180 goto out;
1181
+ } else if (IS_FD_PROG_ARRAY(map) ||
1182
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1183
+ /* These maps require sleepable context */
1184
+ err = map->ops->map_delete_elem(map, key);
1185
+ goto out;
9011186 }
9021187
903
- preempt_disable();
904
- __this_cpu_inc(bpf_prog_active);
1188
+ bpf_disable_instrumentation();
9051189 rcu_read_lock();
9061190 err = map->ops->map_delete_elem(map, key);
9071191 rcu_read_unlock();
908
- __this_cpu_dec(bpf_prog_active);
909
- preempt_enable();
1192
+ bpf_enable_instrumentation();
9101193 maybe_wait_bpf_programs(map);
9111194 out:
9121195 kfree(key);
9131196 err_put:
1197
+ bpf_map_write_active_dec(map);
9141198 fdput(f);
9151199 return err;
9161200 }
....@@ -935,14 +1219,13 @@
9351219 map = __bpf_map_get(f);
9361220 if (IS_ERR(map))
9371221 return PTR_ERR(map);
938
-
939
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
1222
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
9401223 err = -EPERM;
9411224 goto err_put;
9421225 }
9431226
9441227 if (ukey) {
945
- key = memdup_user(ukey, map->key_size);
1228
+ key = __bpf_copy_key(ukey, map->key_size);
9461229 if (IS_ERR(key)) {
9471230 err = PTR_ERR(key);
9481231 goto err_put;
....@@ -983,13 +1266,340 @@
9831266 return err;
9841267 }
9851268
1269
+int generic_map_delete_batch(struct bpf_map *map,
1270
+ const union bpf_attr *attr,
1271
+ union bpf_attr __user *uattr)
1272
+{
1273
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
1274
+ u32 cp, max_count;
1275
+ int err = 0;
1276
+ void *key;
1277
+
1278
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
1279
+ return -EINVAL;
1280
+
1281
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1282
+ !map_value_has_spin_lock(map)) {
1283
+ return -EINVAL;
1284
+ }
1285
+
1286
+ max_count = attr->batch.count;
1287
+ if (!max_count)
1288
+ return 0;
1289
+
1290
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1291
+ if (!key)
1292
+ return -ENOMEM;
1293
+
1294
+ for (cp = 0; cp < max_count; cp++) {
1295
+ err = -EFAULT;
1296
+ if (copy_from_user(key, keys + cp * map->key_size,
1297
+ map->key_size))
1298
+ break;
1299
+
1300
+ if (bpf_map_is_dev_bound(map)) {
1301
+ err = bpf_map_offload_delete_elem(map, key);
1302
+ break;
1303
+ }
1304
+
1305
+ bpf_disable_instrumentation();
1306
+ rcu_read_lock();
1307
+ err = map->ops->map_delete_elem(map, key);
1308
+ rcu_read_unlock();
1309
+ bpf_enable_instrumentation();
1310
+ maybe_wait_bpf_programs(map);
1311
+ if (err)
1312
+ break;
1313
+ cond_resched();
1314
+ }
1315
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1316
+ err = -EFAULT;
1317
+
1318
+ kfree(key);
1319
+ return err;
1320
+}
1321
+
1322
+int generic_map_update_batch(struct bpf_map *map,
1323
+ const union bpf_attr *attr,
1324
+ union bpf_attr __user *uattr)
1325
+{
1326
+ void __user *values = u64_to_user_ptr(attr->batch.values);
1327
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
1328
+ u32 value_size, cp, max_count;
1329
+ int ufd = attr->batch.map_fd;
1330
+ void *key, *value;
1331
+ struct fd f;
1332
+ int err = 0;
1333
+
1334
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
1335
+ return -EINVAL;
1336
+
1337
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1338
+ !map_value_has_spin_lock(map)) {
1339
+ return -EINVAL;
1340
+ }
1341
+
1342
+ value_size = bpf_map_value_size(map);
1343
+
1344
+ max_count = attr->batch.count;
1345
+ if (!max_count)
1346
+ return 0;
1347
+
1348
+ key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1349
+ if (!key)
1350
+ return -ENOMEM;
1351
+
1352
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1353
+ if (!value) {
1354
+ kfree(key);
1355
+ return -ENOMEM;
1356
+ }
1357
+
1358
+ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
1359
+ for (cp = 0; cp < max_count; cp++) {
1360
+ err = -EFAULT;
1361
+ if (copy_from_user(key, keys + cp * map->key_size,
1362
+ map->key_size) ||
1363
+ copy_from_user(value, values + cp * value_size, value_size))
1364
+ break;
1365
+
1366
+ err = bpf_map_update_value(map, f, key, value,
1367
+ attr->batch.elem_flags);
1368
+
1369
+ if (err)
1370
+ break;
1371
+ cond_resched();
1372
+ }
1373
+
1374
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
1375
+ err = -EFAULT;
1376
+
1377
+ kfree(value);
1378
+ kfree(key);
1379
+ fdput(f);
1380
+ return err;
1381
+}
1382
+
1383
+#define MAP_LOOKUP_RETRIES 3
1384
+
1385
+int generic_map_lookup_batch(struct bpf_map *map,
1386
+ const union bpf_attr *attr,
1387
+ union bpf_attr __user *uattr)
1388
+{
1389
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1390
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1391
+ void __user *values = u64_to_user_ptr(attr->batch.values);
1392
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
1393
+ void *buf, *buf_prevkey, *prev_key, *key, *value;
1394
+ int err, retry = MAP_LOOKUP_RETRIES;
1395
+ u32 value_size, cp, max_count;
1396
+
1397
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
1398
+ return -EINVAL;
1399
+
1400
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1401
+ !map_value_has_spin_lock(map))
1402
+ return -EINVAL;
1403
+
1404
+ value_size = bpf_map_value_size(map);
1405
+
1406
+ max_count = attr->batch.count;
1407
+ if (!max_count)
1408
+ return 0;
1409
+
1410
+ if (put_user(0, &uattr->batch.count))
1411
+ return -EFAULT;
1412
+
1413
+ buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
1414
+ if (!buf_prevkey)
1415
+ return -ENOMEM;
1416
+
1417
+ buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
1418
+ if (!buf) {
1419
+ kfree(buf_prevkey);
1420
+ return -ENOMEM;
1421
+ }
1422
+
1423
+ err = -EFAULT;
1424
+ prev_key = NULL;
1425
+ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
1426
+ goto free_buf;
1427
+ key = buf;
1428
+ value = key + map->key_size;
1429
+ if (ubatch)
1430
+ prev_key = buf_prevkey;
1431
+
1432
+ for (cp = 0; cp < max_count;) {
1433
+ rcu_read_lock();
1434
+ err = map->ops->map_get_next_key(map, prev_key, key);
1435
+ rcu_read_unlock();
1436
+ if (err)
1437
+ break;
1438
+ err = bpf_map_copy_value(map, key, value,
1439
+ attr->batch.elem_flags);
1440
+
1441
+ if (err == -ENOENT) {
1442
+ if (retry) {
1443
+ retry--;
1444
+ continue;
1445
+ }
1446
+ err = -EINTR;
1447
+ break;
1448
+ }
1449
+
1450
+ if (err)
1451
+ goto free_buf;
1452
+
1453
+ if (copy_to_user(keys + cp * map->key_size, key,
1454
+ map->key_size)) {
1455
+ err = -EFAULT;
1456
+ goto free_buf;
1457
+ }
1458
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
1459
+ err = -EFAULT;
1460
+ goto free_buf;
1461
+ }
1462
+
1463
+ if (!prev_key)
1464
+ prev_key = buf_prevkey;
1465
+
1466
+ swap(prev_key, key);
1467
+ retry = MAP_LOOKUP_RETRIES;
1468
+ cp++;
1469
+ cond_resched();
1470
+ }
1471
+
1472
+ if (err == -EFAULT)
1473
+ goto free_buf;
1474
+
1475
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
1476
+ (cp && copy_to_user(uobatch, prev_key, map->key_size))))
1477
+ err = -EFAULT;
1478
+
1479
+free_buf:
1480
+ kfree(buf_prevkey);
1481
+ kfree(buf);
1482
+ return err;
1483
+}
1484
+
1485
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
1486
+
1487
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
1488
+{
1489
+ void __user *ukey = u64_to_user_ptr(attr->key);
1490
+ void __user *uvalue = u64_to_user_ptr(attr->value);
1491
+ int ufd = attr->map_fd;
1492
+ struct bpf_map *map;
1493
+ void *key, *value;
1494
+ u32 value_size;
1495
+ struct fd f;
1496
+ int err;
1497
+
1498
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
1499
+ return -EINVAL;
1500
+
1501
+ f = fdget(ufd);
1502
+ map = __bpf_map_get(f);
1503
+ if (IS_ERR(map))
1504
+ return PTR_ERR(map);
1505
+ bpf_map_write_active_inc(map);
1506
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
1507
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1508
+ err = -EPERM;
1509
+ goto err_put;
1510
+ }
1511
+
1512
+ key = __bpf_copy_key(ukey, map->key_size);
1513
+ if (IS_ERR(key)) {
1514
+ err = PTR_ERR(key);
1515
+ goto err_put;
1516
+ }
1517
+
1518
+ value_size = map->value_size;
1519
+
1520
+ err = -ENOMEM;
1521
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
1522
+ if (!value)
1523
+ goto free_key;
1524
+
1525
+ if (map->map_type == BPF_MAP_TYPE_QUEUE ||
1526
+ map->map_type == BPF_MAP_TYPE_STACK) {
1527
+ err = map->ops->map_pop_elem(map, value);
1528
+ } else {
1529
+ err = -ENOTSUPP;
1530
+ }
1531
+
1532
+ if (err)
1533
+ goto free_value;
1534
+
1535
+ if (copy_to_user(uvalue, value, value_size) != 0) {
1536
+ err = -EFAULT;
1537
+ goto free_value;
1538
+ }
1539
+
1540
+ err = 0;
1541
+
1542
+free_value:
1543
+ kfree(value);
1544
+free_key:
1545
+ kfree(key);
1546
+err_put:
1547
+ bpf_map_write_active_dec(map);
1548
+ fdput(f);
1549
+ return err;
1550
+}
1551
+
1552
+#define BPF_MAP_FREEZE_LAST_FIELD map_fd
1553
+
1554
+static int map_freeze(const union bpf_attr *attr)
1555
+{
1556
+ int err = 0, ufd = attr->map_fd;
1557
+ struct bpf_map *map;
1558
+ struct fd f;
1559
+
1560
+ if (CHECK_ATTR(BPF_MAP_FREEZE))
1561
+ return -EINVAL;
1562
+
1563
+ f = fdget(ufd);
1564
+ map = __bpf_map_get(f);
1565
+ if (IS_ERR(map))
1566
+ return PTR_ERR(map);
1567
+
1568
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1569
+ fdput(f);
1570
+ return -ENOTSUPP;
1571
+ }
1572
+
1573
+ mutex_lock(&map->freeze_mutex);
1574
+ if (bpf_map_write_active(map)) {
1575
+ err = -EBUSY;
1576
+ goto err_put;
1577
+ }
1578
+ if (READ_ONCE(map->frozen)) {
1579
+ err = -EBUSY;
1580
+ goto err_put;
1581
+ }
1582
+ if (!bpf_capable()) {
1583
+ err = -EPERM;
1584
+ goto err_put;
1585
+ }
1586
+
1587
+ WRITE_ONCE(map->frozen, true);
1588
+err_put:
1589
+ mutex_unlock(&map->freeze_mutex);
1590
+ fdput(f);
1591
+ return err;
1592
+}
1593
+
9861594 static const struct bpf_prog_ops * const bpf_prog_types[] = {
987
-#define BPF_PROG_TYPE(_id, _name) \
1595
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
9881596 [_id] = & _name ## _prog_ops,
9891597 #define BPF_MAP_TYPE(_id, _ops)
1598
+#define BPF_LINK_TYPE(_id, _name)
9901599 #include <linux/bpf_types.h>
9911600 #undef BPF_PROG_TYPE
9921601 #undef BPF_MAP_TYPE
1602
+#undef BPF_LINK_TYPE
9931603 };
9941604
9951605 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
....@@ -1011,18 +1621,34 @@
10111621 return 0;
10121622 }
10131623
1014
-/* drop refcnt on maps used by eBPF program and free auxilary data */
1015
-static void free_used_maps(struct bpf_prog_aux *aux)
1624
+enum bpf_audit {
1625
+ BPF_AUDIT_LOAD,
1626
+ BPF_AUDIT_UNLOAD,
1627
+ BPF_AUDIT_MAX,
1628
+};
1629
+
1630
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
1631
+ [BPF_AUDIT_LOAD] = "LOAD",
1632
+ [BPF_AUDIT_UNLOAD] = "UNLOAD",
1633
+};
1634
+
1635
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
10161636 {
1017
- int i;
1637
+ struct audit_context *ctx = NULL;
1638
+ struct audit_buffer *ab;
10181639
1019
- if (aux->cgroup_storage)
1020
- bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
1021
-
1022
- for (i = 0; i < aux->used_map_cnt; i++)
1023
- bpf_map_put(aux->used_maps[i]);
1024
-
1025
- kfree(aux->used_maps);
1640
+ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
1641
+ return;
1642
+ if (audit_enabled == AUDIT_OFF)
1643
+ return;
1644
+ if (op == BPF_AUDIT_LOAD)
1645
+ ctx = audit_context();
1646
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
1647
+ if (unlikely(!ab))
1648
+ return;
1649
+ audit_log_format(ab, "prog-id=%u op=%s",
1650
+ prog->aux->id, bpf_audit_str[op]);
1651
+ audit_log_end(ab);
10261652 }
10271653
10281654 int __bpf_prog_charge(struct user_struct *user, u32 pages)
....@@ -1117,20 +1743,37 @@
11171743 {
11181744 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
11191745
1120
- free_used_maps(aux);
1746
+ kvfree(aux->func_info);
1747
+ kfree(aux->func_info_aux);
11211748 bpf_prog_uncharge_memlock(aux->prog);
11221749 security_bpf_prog_free(aux);
11231750 bpf_prog_free(aux->prog);
11241751 }
11251752
1753
+static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
1754
+{
1755
+ bpf_prog_kallsyms_del_all(prog);
1756
+ btf_put(prog->aux->btf);
1757
+ bpf_prog_free_linfo(prog);
1758
+
1759
+ if (deferred) {
1760
+ if (prog->aux->sleepable)
1761
+ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
1762
+ else
1763
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
1764
+ } else {
1765
+ __bpf_prog_put_rcu(&prog->aux->rcu);
1766
+ }
1767
+}
1768
+
11261769 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
11271770 {
1128
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
1771
+ if (atomic64_dec_and_test(&prog->aux->refcnt)) {
1772
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
1773
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
11291774 /* bpf_prog_free_id() must be called first */
11301775 bpf_prog_free_id(prog, do_idr_lock);
1131
- bpf_prog_kallsyms_del_all(prog);
1132
-
1133
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
1776
+ __bpf_prog_put_noref(prog, true);
11341777 }
11351778 }
11361779
....@@ -1148,24 +1791,54 @@
11481791 return 0;
11491792 }
11501793
1794
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
1795
+ struct bpf_prog_stats *stats)
1796
+{
1797
+ u64 nsecs = 0, cnt = 0;
1798
+ int cpu;
1799
+
1800
+ for_each_possible_cpu(cpu) {
1801
+ const struct bpf_prog_stats *st;
1802
+ unsigned int start;
1803
+ u64 tnsecs, tcnt;
1804
+
1805
+ st = per_cpu_ptr(prog->aux->stats, cpu);
1806
+ do {
1807
+ start = u64_stats_fetch_begin_irq(&st->syncp);
1808
+ tnsecs = st->nsecs;
1809
+ tcnt = st->cnt;
1810
+ } while (u64_stats_fetch_retry_irq(&st->syncp, start));
1811
+ nsecs += tnsecs;
1812
+ cnt += tcnt;
1813
+ }
1814
+ stats->nsecs = nsecs;
1815
+ stats->cnt = cnt;
1816
+}
1817
+
11511818 #ifdef CONFIG_PROC_FS
11521819 static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
11531820 {
11541821 const struct bpf_prog *prog = filp->private_data;
11551822 char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
1823
+ struct bpf_prog_stats stats;
11561824
1825
+ bpf_prog_get_stats(prog, &stats);
11571826 bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
11581827 seq_printf(m,
11591828 "prog_type:\t%u\n"
11601829 "prog_jited:\t%u\n"
11611830 "prog_tag:\t%s\n"
11621831 "memlock:\t%llu\n"
1163
- "prog_id:\t%u\n",
1832
+ "prog_id:\t%u\n"
1833
+ "run_time_ns:\t%llu\n"
1834
+ "run_cnt:\t%llu\n",
11641835 prog->type,
11651836 prog->jited,
11661837 prog_tag,
11671838 prog->pages * 1ULL << PAGE_SHIFT,
1168
- prog->aux->id);
1839
+ prog->aux->id,
1840
+ stats.nsecs,
1841
+ stats.cnt);
11691842 }
11701843 #endif
11711844
....@@ -1202,13 +1875,9 @@
12021875 return f.file->private_data;
12031876 }
12041877
1205
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
1878
+void bpf_prog_add(struct bpf_prog *prog, int i)
12061879 {
1207
- if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
1208
- atomic_sub(i, &prog->aux->refcnt);
1209
- return ERR_PTR(-EBUSY);
1210
- }
1211
- return prog;
1880
+ atomic64_add(i, &prog->aux->refcnt);
12121881 }
12131882 EXPORT_SYMBOL_GPL(bpf_prog_add);
12141883
....@@ -1219,13 +1888,13 @@
12191888 * path holds a reference to the program, thus atomic_sub() can
12201889 * be safely used in such cases!
12211890 */
1222
- WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
1891
+ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
12231892 }
12241893 EXPORT_SYMBOL_GPL(bpf_prog_sub);
12251894
1226
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
1895
+void bpf_prog_inc(struct bpf_prog *prog)
12271896 {
1228
- return bpf_prog_add(prog, 1);
1897
+ atomic64_inc(&prog->aux->refcnt);
12291898 }
12301899 EXPORT_SYMBOL_GPL(bpf_prog_inc);
12311900
....@@ -1234,12 +1903,7 @@
12341903 {
12351904 int refold;
12361905
1237
- refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0);
1238
-
1239
- if (refold >= BPF_MAX_REFCNT) {
1240
- __bpf_prog_put(prog, false);
1241
- return ERR_PTR(-EBUSY);
1242
- }
1906
+ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
12431907
12441908 if (!refold)
12451909 return ERR_PTR(-ENOENT);
....@@ -1277,7 +1941,7 @@
12771941 goto out;
12781942 }
12791943
1280
- prog = bpf_prog_inc(prog);
1944
+ bpf_prog_inc(prog);
12811945 out:
12821946 fdput(f);
12831947 return prog;
....@@ -1322,13 +1986,34 @@
13221986 }
13231987
13241988 static int
1325
-bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
1326
- enum bpf_attach_type expected_attach_type)
1989
+bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
1990
+ enum bpf_attach_type expected_attach_type,
1991
+ u32 btf_id, u32 prog_fd)
13271992 {
1993
+ if (btf_id) {
1994
+ if (btf_id > BTF_MAX_TYPE)
1995
+ return -EINVAL;
1996
+
1997
+ switch (prog_type) {
1998
+ case BPF_PROG_TYPE_TRACING:
1999
+ case BPF_PROG_TYPE_LSM:
2000
+ case BPF_PROG_TYPE_STRUCT_OPS:
2001
+ case BPF_PROG_TYPE_EXT:
2002
+ break;
2003
+ default:
2004
+ return -EINVAL;
2005
+ }
2006
+ }
2007
+
2008
+ if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING &&
2009
+ prog_type != BPF_PROG_TYPE_EXT)
2010
+ return -EINVAL;
2011
+
13282012 switch (prog_type) {
13292013 case BPF_PROG_TYPE_CGROUP_SOCK:
13302014 switch (expected_attach_type) {
13312015 case BPF_CGROUP_INET_SOCK_CREATE:
2016
+ case BPF_CGROUP_INET_SOCK_RELEASE:
13322017 case BPF_CGROUP_INET4_POST_BIND:
13332018 case BPF_CGROUP_INET6_POST_BIND:
13342019 return 0;
....@@ -1341,6 +2026,10 @@
13412026 case BPF_CGROUP_INET6_BIND:
13422027 case BPF_CGROUP_INET4_CONNECT:
13432028 case BPF_CGROUP_INET6_CONNECT:
2029
+ case BPF_CGROUP_INET4_GETPEERNAME:
2030
+ case BPF_CGROUP_INET6_GETPEERNAME:
2031
+ case BPF_CGROUP_INET4_GETSOCKNAME:
2032
+ case BPF_CGROUP_INET6_GETSOCKNAME:
13442033 case BPF_CGROUP_UDP4_SENDMSG:
13452034 case BPF_CGROUP_UDP6_SENDMSG:
13462035 case BPF_CGROUP_UDP4_RECVMSG:
....@@ -1349,15 +2038,88 @@
13492038 default:
13502039 return -EINVAL;
13512040 }
2041
+ case BPF_PROG_TYPE_CGROUP_SKB:
2042
+ switch (expected_attach_type) {
2043
+ case BPF_CGROUP_INET_INGRESS:
2044
+ case BPF_CGROUP_INET_EGRESS:
2045
+ return 0;
2046
+ default:
2047
+ return -EINVAL;
2048
+ }
2049
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2050
+ switch (expected_attach_type) {
2051
+ case BPF_CGROUP_SETSOCKOPT:
2052
+ case BPF_CGROUP_GETSOCKOPT:
2053
+ return 0;
2054
+ default:
2055
+ return -EINVAL;
2056
+ }
2057
+ case BPF_PROG_TYPE_SK_LOOKUP:
2058
+ if (expected_attach_type == BPF_SK_LOOKUP)
2059
+ return 0;
2060
+ return -EINVAL;
2061
+ case BPF_PROG_TYPE_EXT:
2062
+ if (expected_attach_type)
2063
+ return -EINVAL;
2064
+ fallthrough;
13522065 default:
13532066 return 0;
13542067 }
13552068 }
13562069
1357
-/* last field in 'union bpf_attr' used by this command */
1358
-#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type
2070
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2071
+{
2072
+ switch (prog_type) {
2073
+ case BPF_PROG_TYPE_SCHED_CLS:
2074
+ case BPF_PROG_TYPE_SCHED_ACT:
2075
+ case BPF_PROG_TYPE_XDP:
2076
+ case BPF_PROG_TYPE_LWT_IN:
2077
+ case BPF_PROG_TYPE_LWT_OUT:
2078
+ case BPF_PROG_TYPE_LWT_XMIT:
2079
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2080
+ case BPF_PROG_TYPE_SK_SKB:
2081
+ case BPF_PROG_TYPE_SK_MSG:
2082
+ case BPF_PROG_TYPE_LIRC_MODE2:
2083
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
2084
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
2085
+ case BPF_PROG_TYPE_CGROUP_SOCK:
2086
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2087
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2088
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
2089
+ case BPF_PROG_TYPE_SOCK_OPS:
2090
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
2091
+ return true;
2092
+ case BPF_PROG_TYPE_CGROUP_SKB:
2093
+ /* always unpriv */
2094
+ case BPF_PROG_TYPE_SK_REUSEPORT:
2095
+ /* equivalent to SOCKET_FILTER. need CAP_BPF only */
2096
+ default:
2097
+ return false;
2098
+ }
2099
+}
13592100
1360
-static int bpf_prog_load(union bpf_attr *attr)
2101
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2102
+{
2103
+ switch (prog_type) {
2104
+ case BPF_PROG_TYPE_KPROBE:
2105
+ case BPF_PROG_TYPE_TRACEPOINT:
2106
+ case BPF_PROG_TYPE_PERF_EVENT:
2107
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
2108
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2109
+ case BPF_PROG_TYPE_TRACING:
2110
+ case BPF_PROG_TYPE_LSM:
2111
+ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
2112
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
2113
+ return true;
2114
+ default:
2115
+ return false;
2116
+ }
2117
+}
2118
+
2119
+/* last field in 'union bpf_attr' used by this command */
2120
+#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
2121
+
2122
+static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
13612123 {
13622124 enum bpf_prog_type type = attr->prog_type;
13632125 struct bpf_prog *prog;
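
is_net_admin_prog_type() and is_perfmon_prog_type() split the old all-or-nothing CAP_SYS_ADMIN model into per-type capability classes. A hedged summary of the resulting load-time gate, mirroring the checks added to bpf_prog_load() in the next hunks (bpf_capable() and perfmon_capable() are assumed to mean CAP_BPF-or-CAP_SYS_ADMIN and CAP_PERFMON-or-CAP_SYS_ADMIN respectively, as those helpers are defined elsewhere in the kernel; the function name is invented):

/* Illustrative only: which capabilities a loader needs per program type,
 * as implied by the checks in bpf_prog_load() below.
 */
static bool sketch_may_load(enum bpf_prog_type type)
{
	/* socket filters and cg-skb programs stay loadable without CAP_BPF */
	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
	    type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_capable())
		return false;
	/* networking program types additionally need CAP_NET_ADMIN */
	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) &&
	    !capable(CAP_SYS_ADMIN))
		return false;
	/* tracing-flavoured program types additionally need CAP_PERFMON */
	if (is_perfmon_prog_type(type) && !perfmon_capable())
		return false;
	return true;
}
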
....@@ -1368,12 +2130,16 @@
13682130 if (CHECK_ATTR(BPF_PROG_LOAD))
13692131 return -EINVAL;
13702132
1371
- if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
2133
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
2134
+ BPF_F_ANY_ALIGNMENT |
2135
+ BPF_F_TEST_STATE_FREQ |
2136
+ BPF_F_SLEEPABLE |
2137
+ BPF_F_TEST_RND_HI32))
13722138 return -EINVAL;
13732139
13742140 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
13752141 (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
1376
- !capable(CAP_SYS_ADMIN))
2142
+ !bpf_capable())
13772143 return -EPERM;
13782144
13792145 /* copy eBPF program license from user space */
....@@ -1385,20 +2151,23 @@
13852151 /* eBPF programs must be GPL compatible to use GPL-ed functions */
13862152 is_gpl = license_is_gpl_compatible(license);
13872153
1388
- if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
2154
+ if (attr->insn_cnt == 0 ||
2155
+ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
13892156 return -E2BIG;
1390
-
1391
- if (type == BPF_PROG_TYPE_KPROBE &&
1392
- attr->kern_version != LINUX_VERSION_CODE)
1393
- return -EINVAL;
1394
-
13952157 if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
13962158 type != BPF_PROG_TYPE_CGROUP_SKB &&
1397
- !capable(CAP_SYS_ADMIN))
2159
+ !bpf_capable())
2160
+ return -EPERM;
2161
+
2162
+ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
2163
+ return -EPERM;
2164
+ if (is_perfmon_prog_type(type) && !perfmon_capable())
13982165 return -EPERM;
13992166
14002167 bpf_prog_load_fixup_attach_type(attr);
1401
- if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
2168
+ if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
2169
+ attr->attach_btf_id,
2170
+ attr->attach_prog_fd))
14022171 return -EINVAL;
14032172
14042173 /* plain bpf_prog allocation */
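
For reference, a hedged userspace sketch of a BPF_PROG_LOAD call exercising the attributes validated above: a sleepable fentry program whose attach target is named via attach_btf_id. Field names follow the uapi union bpf_attr referenced by this patch; the instruction buffer, license string, BTF id and helper name are placeholders:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* target_btf_id: BTF id of the traced kernel function (resolved elsewhere). */
static int sketch_load_sleepable_fentry(const struct bpf_insn *insns,
					unsigned int insn_cnt, __u32 target_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_TRACING;
	attr.expected_attach_type = BPF_TRACE_FENTRY;
	attr.attach_btf_id = target_btf_id;	/* checked against BTF_MAX_TYPE above */
	attr.prog_flags = BPF_F_SLEEPABLE;	/* ends up in prog->aux->sleepable */
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (unsigned long)"GPL";

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
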
....@@ -1407,8 +2176,20 @@
14072176 return -ENOMEM;
14082177
14092178 prog->expected_attach_type = attr->expected_attach_type;
2179
+ prog->aux->attach_btf_id = attr->attach_btf_id;
2180
+ if (attr->attach_prog_fd) {
2181
+ struct bpf_prog *dst_prog;
2182
+
2183
+ dst_prog = bpf_prog_get(attr->attach_prog_fd);
2184
+ if (IS_ERR(dst_prog)) {
2185
+ err = PTR_ERR(dst_prog);
2186
+ goto free_prog_nouncharge;
2187
+ }
2188
+ prog->aux->dst_prog = dst_prog;
2189
+ }
14102190
14112191 prog->aux->offload_requested = !!attr->prog_ifindex;
2192
+ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
14122193
14132194 err = security_bpf_prog_alloc(prog->aux);
14142195 if (err)
....@@ -1428,7 +2209,7 @@
14282209 prog->orig_prog = NULL;
14292210 prog->jited = 0;
14302211
1431
- atomic_set(&prog->aux->refcnt, 1);
2212
+ atomic64_set(&prog->aux->refcnt, 1);
14322213 prog->gpl_compatible = is_gpl ? 1 : 0;
14332214
14342215 if (bpf_prog_is_dev_bound(prog->aux)) {
....@@ -1442,13 +2223,14 @@
14422223 if (err < 0)
14432224 goto free_prog;
14442225
1445
- prog->aux->load_time = ktime_get_boot_ns();
1446
- err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
1447
- if (err)
2226
+ prog->aux->load_time = ktime_get_boottime_ns();
2227
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
2228
+ sizeof(attr->prog_name));
2229
+ if (err < 0)
14482230 goto free_prog;
14492231
14502232 /* run eBPF verifier */
1451
- err = bpf_check(&prog, attr);
2233
+ err = bpf_check(&prog, attr, uattr);
14522234 if (err < 0)
14532235 goto free_used_maps;
14542236
....@@ -1475,6 +2257,8 @@
14752257 * be using bpf_prog_put() given the program is exposed.
14762258 */
14772259 bpf_prog_kallsyms_add(prog);
2260
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
2261
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
14782262
14792263 err = bpf_prog_new_fd(prog);
14802264 if (err < 0)
....@@ -1482,8 +2266,12 @@
14822266 return err;
14832267
14842268 free_used_maps:
1485
- bpf_prog_kallsyms_del_subprogs(prog);
1486
- free_used_maps(prog->aux);
2269
+ /* In case we have subprogs, we need to wait for a grace
2270
+ * period before we can tear down JIT memory since symbols
2271
+ * are already exposed under kallsyms.
2272
+ */
2273
+ __bpf_prog_put_noref(prog, prog->aux->func_cnt);
2274
+ return err;
14872275 free_prog:
14882276 bpf_prog_uncharge_memlock(prog);
14892277 free_prog_sec:
....@@ -1513,78 +2301,610 @@
15132301 attr->file_flags);
15142302 }
15152303
1516
-struct bpf_raw_tracepoint {
1517
- struct bpf_raw_event_map *btp;
1518
- struct bpf_prog *prog;
1519
-};
1520
-
1521
-static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
2304
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
2305
+ const struct bpf_link_ops *ops, struct bpf_prog *prog)
15222306 {
1523
- struct bpf_raw_tracepoint *raw_tp = filp->private_data;
2307
+ atomic64_set(&link->refcnt, 1);
2308
+ link->type = type;
2309
+ link->id = 0;
2310
+ link->ops = ops;
2311
+ link->prog = prog;
2312
+}
15242313
1525
- if (raw_tp->prog) {
1526
- bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
1527
- bpf_prog_put(raw_tp->prog);
2314
+static void bpf_link_free_id(int id)
2315
+{
2316
+ if (!id)
2317
+ return;
2318
+
2319
+ spin_lock_bh(&link_idr_lock);
2320
+ idr_remove(&link_idr, id);
2321
+ spin_unlock_bh(&link_idr_lock);
2322
+}
2323
+
2324
+/* Clean up bpf_link and corresponding anon_inode file and FD. After
2325
+ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
2326
+ * anon_inode's release() call. This helper marks bpf_link as
2327
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
2328
+ * is not decremented, it's the responsibility of the calling code that failed
2329
+ * to complete bpf_link initialization.
2330
+ */
2331
+void bpf_link_cleanup(struct bpf_link_primer *primer)
2332
+{
2333
+ primer->link->prog = NULL;
2334
+ bpf_link_free_id(primer->id);
2335
+ fput(primer->file);
2336
+ put_unused_fd(primer->fd);
2337
+}
2338
+
2339
+void bpf_link_inc(struct bpf_link *link)
2340
+{
2341
+ atomic64_inc(&link->refcnt);
2342
+}
2343
+
2344
+/* bpf_link_free is guaranteed to be called from process context */
2345
+static void bpf_link_free(struct bpf_link *link)
2346
+{
2347
+ bpf_link_free_id(link->id);
2348
+ if (link->prog) {
2349
+ /* detach BPF program, clean up used resources */
2350
+ link->ops->release(link);
2351
+ bpf_prog_put(link->prog);
15282352 }
1529
- kfree(raw_tp);
2353
+ /* free bpf_link and its containing memory */
2354
+ link->ops->dealloc(link);
2355
+}
2356
+
2357
+static void bpf_link_put_deferred(struct work_struct *work)
2358
+{
2359
+ struct bpf_link *link = container_of(work, struct bpf_link, work);
2360
+
2361
+ bpf_link_free(link);
2362
+}
2363
+
2364
+/* bpf_link_put can be called from atomic context, but ensures that resources
2365
+ * are freed from process context
2366
+ */
2367
+void bpf_link_put(struct bpf_link *link)
2368
+{
2369
+ if (!atomic64_dec_and_test(&link->refcnt))
2370
+ return;
2371
+
2372
+ if (in_atomic()) {
2373
+ INIT_WORK(&link->work, bpf_link_put_deferred);
2374
+ schedule_work(&link->work);
2375
+ } else {
2376
+ bpf_link_free(link);
2377
+ }
2378
+}
2379
+
2380
+static int bpf_link_release(struct inode *inode, struct file *filp)
2381
+{
2382
+ struct bpf_link *link = filp->private_data;
2383
+
2384
+ bpf_link_put(link);
15302385 return 0;
15312386 }
15322387
1533
-static const struct file_operations bpf_raw_tp_fops = {
1534
- .release = bpf_raw_tracepoint_release,
2388
+#ifdef CONFIG_PROC_FS
2389
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
2390
+#define BPF_MAP_TYPE(_id, _ops)
2391
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
2392
+static const char *bpf_link_type_strs[] = {
2393
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
2394
+#include <linux/bpf_types.h>
2395
+};
2396
+#undef BPF_PROG_TYPE
2397
+#undef BPF_MAP_TYPE
2398
+#undef BPF_LINK_TYPE
2399
+
2400
+static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
2401
+{
2402
+ const struct bpf_link *link = filp->private_data;
2403
+ const struct bpf_prog *prog = link->prog;
2404
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
2405
+
2406
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
2407
+ seq_printf(m,
2408
+ "link_type:\t%s\n"
2409
+ "link_id:\t%u\n"
2410
+ "prog_tag:\t%s\n"
2411
+ "prog_id:\t%u\n",
2412
+ bpf_link_type_strs[link->type],
2413
+ link->id,
2414
+ prog_tag,
2415
+ prog->aux->id);
2416
+ if (link->ops->show_fdinfo)
2417
+ link->ops->show_fdinfo(link, m);
2418
+}
2419
+#endif
2420
+
2421
+static const struct file_operations bpf_link_fops = {
2422
+#ifdef CONFIG_PROC_FS
2423
+ .show_fdinfo = bpf_link_show_fdinfo,
2424
+#endif
2425
+ .release = bpf_link_release,
15352426 .read = bpf_dummy_read,
15362427 .write = bpf_dummy_write,
2428
+};
2429
+
2430
+static int bpf_link_alloc_id(struct bpf_link *link)
2431
+{
2432
+ int id;
2433
+
2434
+ idr_preload(GFP_KERNEL);
2435
+ spin_lock_bh(&link_idr_lock);
2436
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
2437
+ spin_unlock_bh(&link_idr_lock);
2438
+ idr_preload_end();
2439
+
2440
+ return id;
2441
+}
2442
+
2443
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
2444
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
2445
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
2446
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
2447
+ * pre-allocated resources are to be freed with bpf_link_cleanup() call. All the
2448
+ * transient state is passed around in struct bpf_link_primer.
2449
+ * This is the preferred way to create and initialize bpf_link, especially when
2450
+ * there are complicated and expensive operations in between creating bpf_link
2451
+ * itself and attaching it to a BPF hook. By using bpf_link_prime() and
2452
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
2453
+ * expensive (and potentially failing) rollback operations in the rare case
2454
+ * that file, FD, or ID can't be allocated.
2455
+ */
2456
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
2457
+{
2458
+ struct file *file;
2459
+ int fd, id;
2460
+
2461
+ fd = get_unused_fd_flags(O_CLOEXEC);
2462
+ if (fd < 0)
2463
+ return fd;
2464
+
2465
+
2466
+ id = bpf_link_alloc_id(link);
2467
+ if (id < 0) {
2468
+ put_unused_fd(fd);
2469
+ return id;
2470
+ }
2471
+
2472
+ file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
2473
+ if (IS_ERR(file)) {
2474
+ bpf_link_free_id(id);
2475
+ put_unused_fd(fd);
2476
+ return PTR_ERR(file);
2477
+ }
2478
+
2479
+ primer->link = link;
2480
+ primer->file = file;
2481
+ primer->fd = fd;
2482
+ primer->id = id;
2483
+ return 0;
2484
+}
2485
+
2486
+int bpf_link_settle(struct bpf_link_primer *primer)
2487
+{
2488
+ /* make bpf_link fetchable by ID */
2489
+ spin_lock_bh(&link_idr_lock);
2490
+ primer->link->id = primer->id;
2491
+ spin_unlock_bh(&link_idr_lock);
2492
+ /* make bpf_link fetchable by FD */
2493
+ fd_install(primer->fd, primer->file);
2494
+ /* pass through installed FD */
2495
+ return primer->fd;
2496
+}
2497
+
2498
+int bpf_link_new_fd(struct bpf_link *link)
2499
+{
2500
+ return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
2501
+}
2502
+
2503
+struct bpf_link *bpf_link_get_from_fd(u32 ufd)
2504
+{
2505
+ struct fd f = fdget(ufd);
2506
+ struct bpf_link *link;
2507
+
2508
+ if (!f.file)
2509
+ return ERR_PTR(-EBADF);
2510
+ if (f.file->f_op != &bpf_link_fops) {
2511
+ fdput(f);
2512
+ return ERR_PTR(-EINVAL);
2513
+ }
2514
+
2515
+ link = f.file->private_data;
2516
+ bpf_link_inc(link);
2517
+ fdput(f);
2518
+
2519
+ return link;
2520
+}
2521
+
2522
+struct bpf_tracing_link {
2523
+ struct bpf_link link;
2524
+ enum bpf_attach_type attach_type;
2525
+ struct bpf_trampoline *trampoline;
2526
+ struct bpf_prog *tgt_prog;
2527
+};
2528
+
2529
+static void bpf_tracing_link_release(struct bpf_link *link)
2530
+{
2531
+ struct bpf_tracing_link *tr_link =
2532
+ container_of(link, struct bpf_tracing_link, link);
2533
+
2534
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
2535
+ tr_link->trampoline));
2536
+
2537
+ bpf_trampoline_put(tr_link->trampoline);
2538
+
2539
+ /* tgt_prog is NULL if target is a kernel function */
2540
+ if (tr_link->tgt_prog)
2541
+ bpf_prog_put(tr_link->tgt_prog);
2542
+}
2543
+
2544
+static void bpf_tracing_link_dealloc(struct bpf_link *link)
2545
+{
2546
+ struct bpf_tracing_link *tr_link =
2547
+ container_of(link, struct bpf_tracing_link, link);
2548
+
2549
+ kfree(tr_link);
2550
+}
2551
+
2552
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
2553
+ struct seq_file *seq)
2554
+{
2555
+ struct bpf_tracing_link *tr_link =
2556
+ container_of(link, struct bpf_tracing_link, link);
2557
+
2558
+ seq_printf(seq,
2559
+ "attach_type:\t%d\n",
2560
+ tr_link->attach_type);
2561
+}
2562
+
2563
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
2564
+ struct bpf_link_info *info)
2565
+{
2566
+ struct bpf_tracing_link *tr_link =
2567
+ container_of(link, struct bpf_tracing_link, link);
2568
+
2569
+ info->tracing.attach_type = tr_link->attach_type;
2570
+
2571
+ return 0;
2572
+}
2573
+
2574
+static const struct bpf_link_ops bpf_tracing_link_lops = {
2575
+ .release = bpf_tracing_link_release,
2576
+ .dealloc = bpf_tracing_link_dealloc,
2577
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
2578
+ .fill_link_info = bpf_tracing_link_fill_link_info,
2579
+};
2580
+
2581
+static int bpf_tracing_prog_attach(struct bpf_prog *prog,
2582
+ int tgt_prog_fd,
2583
+ u32 btf_id)
2584
+{
2585
+ struct bpf_link_primer link_primer;
2586
+ struct bpf_prog *tgt_prog = NULL;
2587
+ struct bpf_trampoline *tr = NULL;
2588
+ struct bpf_tracing_link *link;
2589
+ u64 key = 0;
2590
+ int err;
2591
+
2592
+ switch (prog->type) {
2593
+ case BPF_PROG_TYPE_TRACING:
2594
+ if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
2595
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
2596
+ prog->expected_attach_type != BPF_MODIFY_RETURN) {
2597
+ err = -EINVAL;
2598
+ goto out_put_prog;
2599
+ }
2600
+ break;
2601
+ case BPF_PROG_TYPE_EXT:
2602
+ if (prog->expected_attach_type != 0) {
2603
+ err = -EINVAL;
2604
+ goto out_put_prog;
2605
+ }
2606
+ break;
2607
+ case BPF_PROG_TYPE_LSM:
2608
+ if (prog->expected_attach_type != BPF_LSM_MAC) {
2609
+ err = -EINVAL;
2610
+ goto out_put_prog;
2611
+ }
2612
+ break;
2613
+ default:
2614
+ err = -EINVAL;
2615
+ goto out_put_prog;
2616
+ }
2617
+
2618
+ if (!!tgt_prog_fd != !!btf_id) {
2619
+ err = -EINVAL;
2620
+ goto out_put_prog;
2621
+ }
2622
+
2623
+ if (tgt_prog_fd) {
2624
+ /* For now we only allow new targets for BPF_PROG_TYPE_EXT */
2625
+ if (prog->type != BPF_PROG_TYPE_EXT) {
2626
+ err = -EINVAL;
2627
+ goto out_put_prog;
2628
+ }
2629
+
2630
+ tgt_prog = bpf_prog_get(tgt_prog_fd);
2631
+ if (IS_ERR(tgt_prog)) {
2632
+ err = PTR_ERR(tgt_prog);
2633
+ tgt_prog = NULL;
2634
+ goto out_put_prog;
2635
+ }
2636
+
2637
+ key = bpf_trampoline_compute_key(tgt_prog, btf_id);
2638
+ }
2639
+
2640
+ link = kzalloc(sizeof(*link), GFP_USER);
2641
+ if (!link) {
2642
+ err = -ENOMEM;
2643
+ goto out_put_prog;
2644
+ }
2645
+ bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
2646
+ &bpf_tracing_link_lops, prog);
2647
+ link->attach_type = prog->expected_attach_type;
2648
+
2649
+ mutex_lock(&prog->aux->dst_mutex);
2650
+
2651
+ /* There are a few possible cases here:
2652
+ *
2653
+ * - if prog->aux->dst_trampoline is set, the program was just loaded
2654
+ * and not yet attached to anything, so we can use the values stored
2655
+ * in prog->aux
2656
+ *
2657
+ * - if prog->aux->dst_trampoline is NULL, the program has already been
2658
+ * attached to a target and its initial target was cleared (below)
2659
+ *
2660
+ * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
2661
+ * target_btf_id using the link_create API.
2662
+ *
2663
+ * - if tgt_prog == NULL, this function was called using the old
2664
+ * raw_tracepoint_open API, and we need a target from prog->aux
2665
+ *
2666
+ * The combination of no saved target in prog->aux, and no target
2667
+ * specified on load is illegal, and we reject that here.
2668
+ */
2669
+ if (!prog->aux->dst_trampoline && !tgt_prog) {
2670
+ err = -ENOENT;
2671
+ goto out_unlock;
2672
+ }
2673
+
2674
+ if (!prog->aux->dst_trampoline ||
2675
+ (key && key != prog->aux->dst_trampoline->key)) {
2676
+ /* If there is no saved target, or the specified target is
2677
+ * different from the destination specified at load time, we
2678
+ * need a new trampoline and a check for compatibility
2679
+ */
2680
+ struct bpf_attach_target_info tgt_info = {};
2681
+
2682
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
2683
+ &tgt_info);
2684
+ if (err)
2685
+ goto out_unlock;
2686
+
2687
+ tr = bpf_trampoline_get(key, &tgt_info);
2688
+ if (!tr) {
2689
+ err = -ENOMEM;
2690
+ goto out_unlock;
2691
+ }
2692
+ } else {
2693
+ /* The caller didn't specify a target, or the target was the
2694
+ * same as the destination supplied during program load. This
2695
+ * means we can reuse the trampoline and reference from program
2696
+ * load time, and there is no need to allocate a new one. This
2697
+ * can only happen once for any program, as the saved values in
2698
+ * prog->aux are cleared below.
2699
+ */
2700
+ tr = prog->aux->dst_trampoline;
2701
+ tgt_prog = prog->aux->dst_prog;
2702
+ }
2703
+
2704
+ err = bpf_link_prime(&link->link, &link_primer);
2705
+ if (err)
2706
+ goto out_unlock;
2707
+
2708
+ err = bpf_trampoline_link_prog(prog, tr);
2709
+ if (err) {
2710
+ bpf_link_cleanup(&link_primer);
2711
+ link = NULL;
2712
+ goto out_unlock;
2713
+ }
2714
+
2715
+ link->tgt_prog = tgt_prog;
2716
+ link->trampoline = tr;
2717
+
2718
+ /* Always clear the trampoline and target prog from prog->aux to make
2719
+ * sure the original attach destination is not kept alive after a
2720
+ * program is (re-)attached to another target.
2721
+ */
2722
+ if (prog->aux->dst_prog &&
2723
+ (tgt_prog_fd || tr != prog->aux->dst_trampoline))
2724
+ /* got extra prog ref from syscall, or attaching to different prog */
2725
+ bpf_prog_put(prog->aux->dst_prog);
2726
+ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
2727
+ /* we allocated a new trampoline, so free the old one */
2728
+ bpf_trampoline_put(prog->aux->dst_trampoline);
2729
+
2730
+ prog->aux->dst_prog = NULL;
2731
+ prog->aux->dst_trampoline = NULL;
2732
+ mutex_unlock(&prog->aux->dst_mutex);
2733
+
2734
+ return bpf_link_settle(&link_primer);
2735
+out_unlock:
2736
+ if (tr && tr != prog->aux->dst_trampoline)
2737
+ bpf_trampoline_put(tr);
2738
+ mutex_unlock(&prog->aux->dst_mutex);
2739
+ kfree(link);
2740
+out_put_prog:
2741
+ if (tgt_prog_fd && tgt_prog)
2742
+ bpf_prog_put(tgt_prog);
2743
+ return err;
2744
+}
2745
+
2746
+struct bpf_raw_tp_link {
2747
+ struct bpf_link link;
2748
+ struct bpf_raw_event_map *btp;
2749
+};
2750
+
2751
+static void bpf_raw_tp_link_release(struct bpf_link *link)
2752
+{
2753
+ struct bpf_raw_tp_link *raw_tp =
2754
+ container_of(link, struct bpf_raw_tp_link, link);
2755
+
2756
+ bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
2757
+ bpf_put_raw_tracepoint(raw_tp->btp);
2758
+}
2759
+
2760
+static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
2761
+{
2762
+ struct bpf_raw_tp_link *raw_tp =
2763
+ container_of(link, struct bpf_raw_tp_link, link);
2764
+
2765
+ kfree(raw_tp);
2766
+}
2767
+
2768
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
2769
+ struct seq_file *seq)
2770
+{
2771
+ struct bpf_raw_tp_link *raw_tp_link =
2772
+ container_of(link, struct bpf_raw_tp_link, link);
2773
+
2774
+ seq_printf(seq,
2775
+ "tp_name:\t%s\n",
2776
+ raw_tp_link->btp->tp->name);
2777
+}
2778
+
2779
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
2780
+ struct bpf_link_info *info)
2781
+{
2782
+ struct bpf_raw_tp_link *raw_tp_link =
2783
+ container_of(link, struct bpf_raw_tp_link, link);
2784
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
2785
+ const char *tp_name = raw_tp_link->btp->tp->name;
2786
+ u32 ulen = info->raw_tracepoint.tp_name_len;
2787
+ size_t tp_len = strlen(tp_name);
2788
+
2789
+ if (!ulen ^ !ubuf)
2790
+ return -EINVAL;
2791
+
2792
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
2793
+
2794
+ if (!ubuf)
2795
+ return 0;
2796
+
2797
+ if (ulen >= tp_len + 1) {
2798
+ if (copy_to_user(ubuf, tp_name, tp_len + 1))
2799
+ return -EFAULT;
2800
+ } else {
2801
+ char zero = '\0';
2802
+
2803
+ if (copy_to_user(ubuf, tp_name, ulen - 1))
2804
+ return -EFAULT;
2805
+ if (put_user(zero, ubuf + ulen - 1))
2806
+ return -EFAULT;
2807
+ return -ENOSPC;
2808
+ }
2809
+
2810
+ return 0;
2811
+}
2812
+
2813
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
2814
+ .release = bpf_raw_tp_link_release,
2815
+ .dealloc = bpf_raw_tp_link_dealloc,
2816
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
2817
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
15372818 };
15382819
15392820 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
15402821
15412822 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
15422823 {
1543
- struct bpf_raw_tracepoint *raw_tp;
2824
+ struct bpf_link_primer link_primer;
2825
+ struct bpf_raw_tp_link *link;
15442826 struct bpf_raw_event_map *btp;
15452827 struct bpf_prog *prog;
1546
- char tp_name[128];
1547
- int tp_fd, err;
2828
+ const char *tp_name;
2829
+ char buf[128];
2830
+ int err;
15482831
1549
- if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
1550
- sizeof(tp_name) - 1) < 0)
1551
- return -EFAULT;
1552
- tp_name[sizeof(tp_name) - 1] = 0;
2832
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
2833
+ return -EINVAL;
15532834
1554
- btp = bpf_find_raw_tracepoint(tp_name);
1555
- if (!btp)
1556
- return -ENOENT;
2835
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
2836
+ if (IS_ERR(prog))
2837
+ return PTR_ERR(prog);
15572838
1558
- raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
1559
- if (!raw_tp)
1560
- return -ENOMEM;
1561
- raw_tp->btp = btp;
1562
-
1563
- prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
1564
- BPF_PROG_TYPE_RAW_TRACEPOINT);
1565
- if (IS_ERR(prog)) {
1566
- err = PTR_ERR(prog);
1567
- goto out_free_tp;
1568
- }
1569
-
1570
- err = bpf_probe_register(raw_tp->btp, prog);
1571
- if (err)
2839
+ switch (prog->type) {
2840
+ case BPF_PROG_TYPE_TRACING:
2841
+ case BPF_PROG_TYPE_EXT:
2842
+ case BPF_PROG_TYPE_LSM:
2843
+ if (attr->raw_tracepoint.name) {
2844
+ /* The attach point for this category of programs
2845
+ * should be specified via btf_id during program load.
2846
+ */
2847
+ err = -EINVAL;
2848
+ goto out_put_prog;
2849
+ }
2850
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
2851
+ prog->expected_attach_type == BPF_TRACE_RAW_TP) {
2852
+ tp_name = prog->aux->attach_func_name;
2853
+ break;
2854
+ }
2855
+ err = bpf_tracing_prog_attach(prog, 0, 0);
2856
+ if (err >= 0)
2857
+ return err;
15722858 goto out_put_prog;
1573
-
1574
- raw_tp->prog = prog;
1575
- tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp,
1576
- O_CLOEXEC);
1577
- if (tp_fd < 0) {
1578
- bpf_probe_unregister(raw_tp->btp, prog);
1579
- err = tp_fd;
2859
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
2860
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2861
+ if (strncpy_from_user(buf,
2862
+ u64_to_user_ptr(attr->raw_tracepoint.name),
2863
+ sizeof(buf) - 1) < 0) {
2864
+ err = -EFAULT;
2865
+ goto out_put_prog;
2866
+ }
2867
+ buf[sizeof(buf) - 1] = 0;
2868
+ tp_name = buf;
2869
+ break;
2870
+ default:
2871
+ err = -EINVAL;
15802872 goto out_put_prog;
15812873 }
1582
- return tp_fd;
15832874
2875
+ btp = bpf_get_raw_tracepoint(tp_name);
2876
+ if (!btp) {
2877
+ err = -ENOENT;
2878
+ goto out_put_prog;
2879
+ }
2880
+
2881
+ link = kzalloc(sizeof(*link), GFP_USER);
2882
+ if (!link) {
2883
+ err = -ENOMEM;
2884
+ goto out_put_btp;
2885
+ }
2886
+ bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
2887
+ &bpf_raw_tp_link_lops, prog);
2888
+ link->btp = btp;
2889
+
2890
+ err = bpf_link_prime(&link->link, &link_primer);
2891
+ if (err) {
2892
+ kfree(link);
2893
+ goto out_put_btp;
2894
+ }
2895
+
2896
+ err = bpf_probe_register(link->btp, prog);
2897
+ if (err) {
2898
+ bpf_link_cleanup(&link_primer);
2899
+ goto out_put_btp;
2900
+ }
2901
+
2902
+ return bpf_link_settle(&link_primer);
2903
+
2904
+out_put_btp:
2905
+ bpf_put_raw_tracepoint(btp);
15842906 out_put_prog:
15852907 bpf_prog_put(prog);
1586
-out_free_tp:
1587
- kfree(raw_tp);
15882908 return err;
15892909 }
15902910
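
The primer API above is the pattern every link type is expected to follow, as the raw tracepoint conversion in this hunk shows. A hedged kernel-side sketch of a hypothetical link type; bpf_foo_link, attach_foo_hook() and the release/dealloc callbacks are invented for illustration, and only the bpf_link_* calls come from this patch:

struct bpf_foo_link {
	struct bpf_link link;
};

static int attach_foo_hook(struct bpf_foo_link *link);	/* hypothetical, may fail */

static void bpf_foo_link_release(struct bpf_link *link)
{
	/* detach link->prog from the hypothetical hook */
}

static void bpf_foo_link_dealloc(struct bpf_link *link)
{
	kfree(container_of(link, struct bpf_foo_link, link));
}

static const struct bpf_link_ops bpf_foo_link_lops = {
	.release = bpf_foo_link_release,
	.dealloc = bpf_foo_link_dealloc,
};

static int bpf_foo_link_attach(struct bpf_prog *prog)
{
	struct bpf_link_primer link_primer;
	struct bpf_foo_link *link;
	int err;

	link = kzalloc(sizeof(*link), GFP_USER);
	if (!link)
		return -ENOMEM;
	bpf_link_init(&link->link, BPF_LINK_TYPE_UNSPEC, &bpf_foo_link_lops, prog);

	err = bpf_link_prime(&link->link, &link_primer);	/* reserve FD, ID, file */
	if (err) {
		kfree(link);
		return err;
	}

	err = attach_foo_hook(link);
	if (err) {
		bpf_link_cleanup(&link_primer);	/* drops FD/ID/file, leaves prog refcnt alone */
		return err;
	}

	return bpf_link_settle(&link_primer);	/* install and return the FD */
}
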
....@@ -1594,16 +2914,81 @@
15942914 switch (prog->type) {
15952915 case BPF_PROG_TYPE_CGROUP_SOCK:
15962916 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2917
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2918
+ case BPF_PROG_TYPE_SK_LOOKUP:
15972919 return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
2920
+ case BPF_PROG_TYPE_CGROUP_SKB:
2921
+ if (!capable(CAP_NET_ADMIN))
2922
+ /* cg-skb progs can be loaded by unpriv user.
2923
+ * check permissions at attach time.
2924
+ */
2925
+ return -EPERM;
2926
+ return prog->enforce_expected_attach_type &&
2927
+ prog->expected_attach_type != attach_type ?
2928
+ -EINVAL : 0;
15982929 default:
15992930 return 0;
16002931 }
16012932 }
16022933
1603
-#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
2934
+static enum bpf_prog_type
2935
+attach_type_to_prog_type(enum bpf_attach_type attach_type)
2936
+{
2937
+ switch (attach_type) {
2938
+ case BPF_CGROUP_INET_INGRESS:
2939
+ case BPF_CGROUP_INET_EGRESS:
2940
+ return BPF_PROG_TYPE_CGROUP_SKB;
2941
+ case BPF_CGROUP_INET_SOCK_CREATE:
2942
+ case BPF_CGROUP_INET_SOCK_RELEASE:
2943
+ case BPF_CGROUP_INET4_POST_BIND:
2944
+ case BPF_CGROUP_INET6_POST_BIND:
2945
+ return BPF_PROG_TYPE_CGROUP_SOCK;
2946
+ case BPF_CGROUP_INET4_BIND:
2947
+ case BPF_CGROUP_INET6_BIND:
2948
+ case BPF_CGROUP_INET4_CONNECT:
2949
+ case BPF_CGROUP_INET6_CONNECT:
2950
+ case BPF_CGROUP_INET4_GETPEERNAME:
2951
+ case BPF_CGROUP_INET6_GETPEERNAME:
2952
+ case BPF_CGROUP_INET4_GETSOCKNAME:
2953
+ case BPF_CGROUP_INET6_GETSOCKNAME:
2954
+ case BPF_CGROUP_UDP4_SENDMSG:
2955
+ case BPF_CGROUP_UDP6_SENDMSG:
2956
+ case BPF_CGROUP_UDP4_RECVMSG:
2957
+ case BPF_CGROUP_UDP6_RECVMSG:
2958
+ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
2959
+ case BPF_CGROUP_SOCK_OPS:
2960
+ return BPF_PROG_TYPE_SOCK_OPS;
2961
+ case BPF_CGROUP_DEVICE:
2962
+ return BPF_PROG_TYPE_CGROUP_DEVICE;
2963
+ case BPF_SK_MSG_VERDICT:
2964
+ return BPF_PROG_TYPE_SK_MSG;
2965
+ case BPF_SK_SKB_STREAM_PARSER:
2966
+ case BPF_SK_SKB_STREAM_VERDICT:
2967
+ return BPF_PROG_TYPE_SK_SKB;
2968
+ case BPF_LIRC_MODE2:
2969
+ return BPF_PROG_TYPE_LIRC_MODE2;
2970
+ case BPF_FLOW_DISSECTOR:
2971
+ return BPF_PROG_TYPE_FLOW_DISSECTOR;
2972
+ case BPF_CGROUP_SYSCTL:
2973
+ return BPF_PROG_TYPE_CGROUP_SYSCTL;
2974
+ case BPF_CGROUP_GETSOCKOPT:
2975
+ case BPF_CGROUP_SETSOCKOPT:
2976
+ return BPF_PROG_TYPE_CGROUP_SOCKOPT;
2977
+ case BPF_TRACE_ITER:
2978
+ return BPF_PROG_TYPE_TRACING;
2979
+ case BPF_SK_LOOKUP:
2980
+ return BPF_PROG_TYPE_SK_LOOKUP;
2981
+ case BPF_XDP:
2982
+ return BPF_PROG_TYPE_XDP;
2983
+ default:
2984
+ return BPF_PROG_TYPE_UNSPEC;
2985
+ }
2986
+}
2987
+
2988
+#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
16042989
16052990 #define BPF_F_ATTACH_MASK \
1606
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
2991
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
16072992
16082993 static int bpf_prog_attach(const union bpf_attr *attr)
16092994 {
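
attach_type_to_prog_type() replaces the two hand-rolled switches that bpf_prog_attach() and bpf_prog_detach() used to carry, and the attach mask now accepts BPF_F_REPLACE together with replace_bpf_fd. A hedged userspace sketch of an atomic program replacement on a cgroup hook (helper name and error handling are illustrative):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Swap old_prog_fd for new_prog_fd on the cgroup egress hook in a single
 * BPF_PROG_ATTACH call instead of a detach/attach pair.
 */
static int sketch_replace_cgroup_egress(int cgroup_fd, int new_prog_fd, int old_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = new_prog_fd;
	attr.attach_type = BPF_CGROUP_INET_EGRESS;	/* maps to BPF_PROG_TYPE_CGROUP_SKB */
	attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
	attr.replace_bpf_fd = old_prog_fd;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}
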
....@@ -1611,54 +2996,15 @@
16112996 struct bpf_prog *prog;
16122997 int ret;
16132998
1614
- if (!capable(CAP_NET_ADMIN))
1615
- return -EPERM;
1616
-
16172999 if (CHECK_ATTR(BPF_PROG_ATTACH))
16183000 return -EINVAL;
16193001
16203002 if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
16213003 return -EINVAL;
16223004
1623
- switch (attr->attach_type) {
1624
- case BPF_CGROUP_INET_INGRESS:
1625
- case BPF_CGROUP_INET_EGRESS:
1626
- ptype = BPF_PROG_TYPE_CGROUP_SKB;
1627
- break;
1628
- case BPF_CGROUP_INET_SOCK_CREATE:
1629
- case BPF_CGROUP_INET4_POST_BIND:
1630
- case BPF_CGROUP_INET6_POST_BIND:
1631
- ptype = BPF_PROG_TYPE_CGROUP_SOCK;
1632
- break;
1633
- case BPF_CGROUP_INET4_BIND:
1634
- case BPF_CGROUP_INET6_BIND:
1635
- case BPF_CGROUP_INET4_CONNECT:
1636
- case BPF_CGROUP_INET6_CONNECT:
1637
- case BPF_CGROUP_UDP4_SENDMSG:
1638
- case BPF_CGROUP_UDP6_SENDMSG:
1639
- case BPF_CGROUP_UDP4_RECVMSG:
1640
- case BPF_CGROUP_UDP6_RECVMSG:
1641
- ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1642
- break;
1643
- case BPF_CGROUP_SOCK_OPS:
1644
- ptype = BPF_PROG_TYPE_SOCK_OPS;
1645
- break;
1646
- case BPF_CGROUP_DEVICE:
1647
- ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
1648
- break;
1649
- case BPF_SK_MSG_VERDICT:
1650
- ptype = BPF_PROG_TYPE_SK_MSG;
1651
- break;
1652
- case BPF_SK_SKB_STREAM_PARSER:
1653
- case BPF_SK_SKB_STREAM_VERDICT:
1654
- ptype = BPF_PROG_TYPE_SK_SKB;
1655
- break;
1656
- case BPF_LIRC_MODE2:
1657
- ptype = BPF_PROG_TYPE_LIRC_MODE2;
1658
- break;
1659
- default:
3005
+ ptype = attach_type_to_prog_type(attr->attach_type);
3006
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
16603007 return -EINVAL;
1661
- }
16623008
16633009 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
16643010 if (IS_ERR(prog))
....@@ -1672,13 +3018,25 @@
16723018 switch (ptype) {
16733019 case BPF_PROG_TYPE_SK_SKB:
16743020 case BPF_PROG_TYPE_SK_MSG:
1675
- ret = sockmap_get_from_fd(attr, ptype, prog);
3021
+ ret = sock_map_get_from_fd(attr, prog);
16763022 break;
16773023 case BPF_PROG_TYPE_LIRC_MODE2:
16783024 ret = lirc_prog_attach(attr, prog);
16793025 break;
1680
- default:
3026
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
3027
+ ret = netns_bpf_prog_attach(attr, prog);
3028
+ break;
3029
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
3030
+ case BPF_PROG_TYPE_CGROUP_SKB:
3031
+ case BPF_PROG_TYPE_CGROUP_SOCK:
3032
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3033
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3034
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
3035
+ case BPF_PROG_TYPE_SOCK_OPS:
16813036 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
3037
+ break;
3038
+ default:
3039
+ ret = -EINVAL;
16823040 }
16833041
16843042 if (ret)
....@@ -1692,50 +3050,30 @@
16923050 {
16933051 enum bpf_prog_type ptype;
16943052
1695
- if (!capable(CAP_NET_ADMIN))
1696
- return -EPERM;
1697
-
16983053 if (CHECK_ATTR(BPF_PROG_DETACH))
16993054 return -EINVAL;
17003055
1701
- switch (attr->attach_type) {
1702
- case BPF_CGROUP_INET_INGRESS:
1703
- case BPF_CGROUP_INET_EGRESS:
1704
- ptype = BPF_PROG_TYPE_CGROUP_SKB;
1705
- break;
1706
- case BPF_CGROUP_INET_SOCK_CREATE:
1707
- case BPF_CGROUP_INET4_POST_BIND:
1708
- case BPF_CGROUP_INET6_POST_BIND:
1709
- ptype = BPF_PROG_TYPE_CGROUP_SOCK;
1710
- break;
1711
- case BPF_CGROUP_INET4_BIND:
1712
- case BPF_CGROUP_INET6_BIND:
1713
- case BPF_CGROUP_INET4_CONNECT:
1714
- case BPF_CGROUP_INET6_CONNECT:
1715
- case BPF_CGROUP_UDP4_SENDMSG:
1716
- case BPF_CGROUP_UDP6_SENDMSG:
1717
- case BPF_CGROUP_UDP4_RECVMSG:
1718
- case BPF_CGROUP_UDP6_RECVMSG:
1719
- ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
1720
- break;
1721
- case BPF_CGROUP_SOCK_OPS:
1722
- ptype = BPF_PROG_TYPE_SOCK_OPS;
1723
- break;
1724
- case BPF_CGROUP_DEVICE:
1725
- ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
1726
- break;
1727
- case BPF_SK_MSG_VERDICT:
1728
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
1729
- case BPF_SK_SKB_STREAM_PARSER:
1730
- case BPF_SK_SKB_STREAM_VERDICT:
1731
- return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
1732
- case BPF_LIRC_MODE2:
3056
+ ptype = attach_type_to_prog_type(attr->attach_type);
3057
+
3058
+ switch (ptype) {
3059
+ case BPF_PROG_TYPE_SK_MSG:
3060
+ case BPF_PROG_TYPE_SK_SKB:
3061
+ return sock_map_prog_detach(attr, ptype);
3062
+ case BPF_PROG_TYPE_LIRC_MODE2:
17333063 return lirc_prog_detach(attr);
3064
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
3065
+ return netns_bpf_prog_detach(attr, ptype);
3066
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
3067
+ case BPF_PROG_TYPE_CGROUP_SKB:
3068
+ case BPF_PROG_TYPE_CGROUP_SOCK:
3069
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3070
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3071
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
3072
+ case BPF_PROG_TYPE_SOCK_OPS:
3073
+ return cgroup_bpf_prog_detach(attr, ptype);
17343074 default:
17353075 return -EINVAL;
17363076 }
1737
-
1738
- return cgroup_bpf_prog_detach(attr, ptype);
17393077 }
17403078
17413079 #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
....@@ -1754,29 +3092,38 @@
17543092 case BPF_CGROUP_INET_INGRESS:
17553093 case BPF_CGROUP_INET_EGRESS:
17563094 case BPF_CGROUP_INET_SOCK_CREATE:
3095
+ case BPF_CGROUP_INET_SOCK_RELEASE:
17573096 case BPF_CGROUP_INET4_BIND:
17583097 case BPF_CGROUP_INET6_BIND:
17593098 case BPF_CGROUP_INET4_POST_BIND:
17603099 case BPF_CGROUP_INET6_POST_BIND:
17613100 case BPF_CGROUP_INET4_CONNECT:
17623101 case BPF_CGROUP_INET6_CONNECT:
3102
+ case BPF_CGROUP_INET4_GETPEERNAME:
3103
+ case BPF_CGROUP_INET6_GETPEERNAME:
3104
+ case BPF_CGROUP_INET4_GETSOCKNAME:
3105
+ case BPF_CGROUP_INET6_GETSOCKNAME:
17633106 case BPF_CGROUP_UDP4_SENDMSG:
17643107 case BPF_CGROUP_UDP6_SENDMSG:
17653108 case BPF_CGROUP_UDP4_RECVMSG:
17663109 case BPF_CGROUP_UDP6_RECVMSG:
17673110 case BPF_CGROUP_SOCK_OPS:
17683111 case BPF_CGROUP_DEVICE:
1769
- break;
3112
+ case BPF_CGROUP_SYSCTL:
3113
+ case BPF_CGROUP_GETSOCKOPT:
3114
+ case BPF_CGROUP_SETSOCKOPT:
3115
+ return cgroup_bpf_prog_query(attr, uattr);
17703116 case BPF_LIRC_MODE2:
17713117 return lirc_prog_query(attr, uattr);
3118
+ case BPF_FLOW_DISSECTOR:
3119
+ case BPF_SK_LOOKUP:
3120
+ return netns_bpf_prog_query(attr, uattr);
17723121 default:
17733122 return -EINVAL;
17743123 }
1775
-
1776
- return cgroup_bpf_prog_query(attr, uattr);
17773124 }
17783125
1779
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
3126
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
17803127
17813128 static int bpf_prog_test_run(const union bpf_attr *attr,
17823129 union bpf_attr __user *uattr)
....@@ -1784,9 +3131,15 @@
17843131 struct bpf_prog *prog;
17853132 int ret = -ENOTSUPP;
17863133
1787
- if (!capable(CAP_SYS_ADMIN))
1788
- return -EPERM;
17893134 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
3135
+ return -EINVAL;
3136
+
3137
+ if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
3138
+ (!attr->test.ctx_size_in && attr->test.ctx_in))
3139
+ return -EINVAL;
3140
+
3141
+ if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
3142
+ (!attr->test.ctx_size_out && attr->test.ctx_out))
17903143 return -EINVAL;
17913144
17923145 prog = bpf_prog_get(attr->test.prog_fd);
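
BPF_PROG_TEST_RUN gains an optional context buffer, and the checks above insist that ctx_in/ctx_size_in (and ctx_out/ctx_size_out) are either both set or both zero. A hedged userspace sketch; the context layout depends on the program type and the buffer here is only a placeholder:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sketch_test_run_with_ctx(int prog_fd, void *ctx, __u32 ctx_size)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd = prog_fd;
	/* pointer and size must be paired, or the kernel returns -EINVAL */
	attr.test.ctx_in = (unsigned long)ctx;
	attr.test.ctx_size_in = ctx_size;
	attr.test.ctx_out = (unsigned long)ctx;
	attr.test.ctx_size_out = ctx_size;

	return syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
}
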
....@@ -1828,7 +3181,62 @@
18283181 return err;
18293182 }
18303183
3184
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
3185
+{
3186
+ struct bpf_map *map;
3187
+
3188
+ spin_lock_bh(&map_idr_lock);
3189
+again:
3190
+ map = idr_get_next(&map_idr, id);
3191
+ if (map) {
3192
+ map = __bpf_map_inc_not_zero(map, false);
3193
+ if (IS_ERR(map)) {
3194
+ (*id)++;
3195
+ goto again;
3196
+ }
3197
+ }
3198
+ spin_unlock_bh(&map_idr_lock);
3199
+
3200
+ return map;
3201
+}
3202
+
3203
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
3204
+{
3205
+ struct bpf_prog *prog;
3206
+
3207
+ spin_lock_bh(&prog_idr_lock);
3208
+again:
3209
+ prog = idr_get_next(&prog_idr, id);
3210
+ if (prog) {
3211
+ prog = bpf_prog_inc_not_zero(prog);
3212
+ if (IS_ERR(prog)) {
3213
+ (*id)++;
3214
+ goto again;
3215
+ }
3216
+ }
3217
+ spin_unlock_bh(&prog_idr_lock);
3218
+
3219
+ return prog;
3220
+}
3221
+
18313222 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
3223
+
3224
+struct bpf_prog *bpf_prog_by_id(u32 id)
3225
+{
3226
+ struct bpf_prog *prog;
3227
+
3228
+ if (!id)
3229
+ return ERR_PTR(-ENOENT);
3230
+
3231
+ spin_lock_bh(&prog_idr_lock);
3232
+ prog = idr_find(&prog_idr, id);
3233
+ if (prog)
3234
+ prog = bpf_prog_inc_not_zero(prog);
3235
+ else
3236
+ prog = ERR_PTR(-ENOENT);
3237
+ spin_unlock_bh(&prog_idr_lock);
3238
+ return prog;
3239
+}
18323240
18333241 static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
18343242 {
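
bpf_prog_by_id() factors the ID-to-program lookup out of BPF_PROG_GET_FD_BY_ID, and the *_get_curr_or_next() helpers serve iteration. A hedged userspace sketch of walking loaded programs by ID, assuming the standard BPF_PROG_GET_NEXT_ID command (not part of this hunk) and CAP_SYS_ADMIN:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static void sketch_walk_prog_ids(void)
{
	union bpf_attr attr;
	__u32 id = 0;
	int fd;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.start_id = id;
		if (syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
			break;		/* -ENOENT once the last ID is reached */
		id = attr.next_id;

		memset(&attr, 0, sizeof(attr));
		attr.prog_id = id;
		fd = syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
		if (fd < 0)
			continue;	/* the program may have been unloaded meanwhile */
		/* ... inspect the program through the fd ... */
		close(fd);
	}
}
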
....@@ -1842,14 +3250,7 @@
18423250 if (!capable(CAP_SYS_ADMIN))
18433251 return -EPERM;
18443252
1845
- spin_lock_bh(&prog_idr_lock);
1846
- prog = idr_find(&prog_idr, id);
1847
- if (prog)
1848
- prog = bpf_prog_inc_not_zero(prog);
1849
- else
1850
- prog = ERR_PTR(-ENOENT);
1851
- spin_unlock_bh(&prog_idr_lock);
1852
-
3253
+ prog = bpf_prog_by_id(id);
18533254 if (IS_ERR(prog))
18543255 return PTR_ERR(prog);
18553256
....@@ -1883,7 +3284,7 @@
18833284 spin_lock_bh(&map_idr_lock);
18843285 map = idr_find(&map_idr, id);
18853286 if (map)
1886
- map = bpf_map_inc_not_zero(map, true);
3287
+ map = __bpf_map_inc_not_zero(map, true);
18873288 else
18883289 map = ERR_PTR(-ENOENT);
18893290 spin_unlock_bh(&map_idr_lock);
....@@ -1899,14 +3300,31 @@
18993300 }
19003301
19013302 static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
1902
- unsigned long addr)
3303
+ unsigned long addr, u32 *off,
3304
+ u32 *type)
19033305 {
3306
+ const struct bpf_map *map;
19043307 int i;
19053308
1906
- for (i = 0; i < prog->aux->used_map_cnt; i++)
1907
- if (prog->aux->used_maps[i] == (void *)addr)
1908
- return prog->aux->used_maps[i];
1909
- return NULL;
3309
+ mutex_lock(&prog->aux->used_maps_mutex);
3310
+ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
3311
+ map = prog->aux->used_maps[i];
3312
+ if (map == (void *)addr) {
3313
+ *type = BPF_PSEUDO_MAP_FD;
3314
+ goto out;
3315
+ }
3316
+ if (!map->ops->map_direct_value_meta)
3317
+ continue;
3318
+ if (!map->ops->map_direct_value_meta(map, addr, off)) {
3319
+ *type = BPF_PSEUDO_MAP_VALUE;
3320
+ goto out;
3321
+ }
3322
+ }
3323
+ map = NULL;
3324
+
3325
+out:
3326
+ mutex_unlock(&prog->aux->used_maps_mutex);
3327
+ return map;
19103328 }
19113329
19123330 static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
....@@ -1914,7 +3332,9 @@
19143332 {
19153333 const struct bpf_map *map;
19163334 struct bpf_insn *insns;
3335
+ u32 off, type;
19173336 u64 imm;
3337
+ u8 code;
19183338 int i;
19193339
19203340 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
....@@ -1923,41 +3343,71 @@
19233343 return insns;
19243344
19253345 for (i = 0; i < prog->len; i++) {
1926
- if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
3346
+ code = insns[i].code;
3347
+
3348
+ if (code == (BPF_JMP | BPF_TAIL_CALL)) {
19273349 insns[i].code = BPF_JMP | BPF_CALL;
19283350 insns[i].imm = BPF_FUNC_tail_call;
19293351 /* fall-through */
19303352 }
1931
- if (insns[i].code == (BPF_JMP | BPF_CALL) ||
1932
- insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
1933
- if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
3353
+ if (code == (BPF_JMP | BPF_CALL) ||
3354
+ code == (BPF_JMP | BPF_CALL_ARGS)) {
3355
+ if (code == (BPF_JMP | BPF_CALL_ARGS))
19343356 insns[i].code = BPF_JMP | BPF_CALL;
19353357 if (!bpf_dump_raw_ok(f_cred))
19363358 insns[i].imm = 0;
19373359 continue;
19383360 }
1939
-
1940
- if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
1941
- continue;
1942
-
1943
- imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
1944
- map = bpf_map_from_imm(prog, imm);
1945
- if (map) {
1946
- insns[i].src_reg = BPF_PSEUDO_MAP_FD;
1947
- insns[i].imm = map->id;
1948
- insns[i + 1].imm = 0;
3361
+ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
3362
+ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
19493363 continue;
19503364 }
19513365
1952
- if (!bpf_dump_raw_ok(f_cred) &&
1953
- imm == (unsigned long)prog->aux) {
1954
- insns[i].imm = 0;
1955
- insns[i + 1].imm = 0;
3366
+ if (code != (BPF_LD | BPF_IMM | BPF_DW))
3367
+ continue;
3368
+
3369
+ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
3370
+ map = bpf_map_from_imm(prog, imm, &off, &type);
3371
+ if (map) {
3372
+ insns[i].src_reg = type;
3373
+ insns[i].imm = map->id;
3374
+ insns[i + 1].imm = off;
19563375 continue;
19573376 }
19583377 }
19593378
19603379 return insns;
3380
+}
3381
+
3382
+static int set_info_rec_size(struct bpf_prog_info *info)
3383
+{
3384
+ /*
3385
+ * Ensure info.*_rec_size is the same as kernel expected size
3386
+ *
3387
+ * or
3388
+ *
3389
+ * Only allow zero *_rec_size if both _rec_size and _cnt are
3390
+ * zero. In this case, the kernel will set the expected
3391
+ * _rec_size back to the info.
3392
+ */
3393
+
3394
+ if ((info->nr_func_info || info->func_info_rec_size) &&
3395
+ info->func_info_rec_size != sizeof(struct bpf_func_info))
3396
+ return -EINVAL;
3397
+
3398
+ if ((info->nr_line_info || info->line_info_rec_size) &&
3399
+ info->line_info_rec_size != sizeof(struct bpf_line_info))
3400
+ return -EINVAL;
3401
+
3402
+ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
3403
+ info->jited_line_info_rec_size != sizeof(__u64))
3404
+ return -EINVAL;
3405
+
3406
+ info->func_info_rec_size = sizeof(struct bpf_func_info);
3407
+ info->line_info_rec_size = sizeof(struct bpf_line_info);
3408
+ info->jited_line_info_rec_size = sizeof(__u64);
3409
+
3410
+ return 0;
19613411 }
19623412
19633413 static int bpf_prog_get_info_by_fd(struct file *file,
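
set_info_rec_size() pins the *_rec_size fields to the kernel's record sizes, so callers either pass the matching size or leave both the size and the count at zero and read the values back. A hedged userspace sketch of the usual two-call pattern for func_info (helper name and error handling are illustrative):

#include <linux/bpf.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sketch_fetch_func_info(int prog_fd)
{
	struct bpf_prog_info info;
	union bpf_attr attr;
	void *buf;
	int err;

	/* first call: sizes and counts left at zero, the kernel fills them in */
	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (unsigned long)&info;
	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
	if (err || !info.nr_func_info)
		return err;

	buf = calloc(info.nr_func_info, info.func_info_rec_size);
	if (!buf)
		return -1;

	/* second call: nr_func_info and func_info_rec_size kept as reported */
	info.func_info = (unsigned long)buf;
	err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
	/* ... consume buf on success ... */
	free(buf);
	return err;
}
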
....@@ -1968,6 +3418,7 @@
19683418 struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
19693419 struct bpf_prog_info info;
19703420 u32 info_len = attr->info.info_len;
3421
+ struct bpf_prog_stats stats;
19713422 char __user *uinsns;
19723423 u32 ulen;
19733424 int err;
....@@ -1991,6 +3442,7 @@
19913442 memcpy(info.tag, prog->tag, sizeof(prog->tag));
19923443 memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
19933444
3445
+ mutex_lock(&prog->aux->used_maps_mutex);
19943446 ulen = info.nr_map_ids;
19953447 info.nr_map_ids = prog->aux->used_map_cnt;
19963448 ulen = min_t(u32, info.nr_map_ids, ulen);
....@@ -2000,15 +3452,29 @@
20003452
20013453 for (i = 0; i < ulen; i++)
20023454 if (put_user(prog->aux->used_maps[i]->id,
2003
- &user_map_ids[i]))
3455
+ &user_map_ids[i])) {
3456
+ mutex_unlock(&prog->aux->used_maps_mutex);
20043457 return -EFAULT;
3458
+ }
20053459 }
3460
+ mutex_unlock(&prog->aux->used_maps_mutex);
20063461
2007
- if (!capable(CAP_SYS_ADMIN)) {
3462
+ err = set_info_rec_size(&info);
3463
+ if (err)
3464
+ return err;
3465
+
3466
+ bpf_prog_get_stats(prog, &stats);
3467
+ info.run_time_ns = stats.nsecs;
3468
+ info.run_cnt = stats.cnt;
3469
+
3470
+ if (!bpf_capable()) {
20083471 info.jited_prog_len = 0;
20093472 info.xlated_prog_len = 0;
20103473 info.nr_jited_ksyms = 0;
20113474 info.nr_jited_func_lens = 0;
3475
+ info.nr_func_info = 0;
3476
+ info.nr_line_info = 0;
3477
+ info.nr_jited_line_info = 0;
20123478 goto done;
20133479 }
20143480
....@@ -2089,11 +3555,11 @@
20893555 }
20903556
20913557 ulen = info.nr_jited_ksyms;
2092
- info.nr_jited_ksyms = prog->aux->func_cnt;
2093
- if (info.nr_jited_ksyms && ulen) {
3558
+ info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
3559
+ if (ulen) {
20943560 if (bpf_dump_raw_ok(file->f_cred)) {
3561
+ unsigned long ksym_addr;
20953562 u64 __user *user_ksyms;
2096
- ulong ksym_addr;
20973563 u32 i;
20983564
20993565 /* copy the address of the kernel symbol
....@@ -2101,10 +3567,17 @@
21013567 */
21023568 ulen = min_t(u32, info.nr_jited_ksyms, ulen);
21033569 user_ksyms = u64_to_user_ptr(info.jited_ksyms);
2104
- for (i = 0; i < ulen; i++) {
2105
- ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
2106
- ksym_addr &= PAGE_MASK;
2107
- if (put_user((u64) ksym_addr, &user_ksyms[i]))
3570
+ if (prog->aux->func_cnt) {
3571
+ for (i = 0; i < ulen; i++) {
3572
+ ksym_addr = (unsigned long)
3573
+ prog->aux->func[i]->bpf_func;
3574
+ if (put_user((u64) ksym_addr,
3575
+ &user_ksyms[i]))
3576
+ return -EFAULT;
3577
+ }
3578
+ } else {
3579
+ ksym_addr = (unsigned long) prog->bpf_func;
3580
+ if (put_user((u64) ksym_addr, &user_ksyms[0]))
21083581 return -EFAULT;
21093582 }
21103583 } else {
....@@ -2113,8 +3586,8 @@
21133586 }
21143587
21153588 ulen = info.nr_jited_func_lens;
2116
- info.nr_jited_func_lens = prog->aux->func_cnt;
2117
- if (info.nr_jited_func_lens && ulen) {
3589
+ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
3590
+ if (ulen) {
21183591 if (bpf_dump_raw_ok(file->f_cred)) {
21193592 u32 __user *user_lens;
21203593 u32 func_len, i;
....@@ -2122,13 +3595,91 @@
21223595 /* copy the JITed image lengths for each function */
21233596 ulen = min_t(u32, info.nr_jited_func_lens, ulen);
21243597 user_lens = u64_to_user_ptr(info.jited_func_lens);
2125
- for (i = 0; i < ulen; i++) {
2126
- func_len = prog->aux->func[i]->jited_len;
2127
- if (put_user(func_len, &user_lens[i]))
3598
+ if (prog->aux->func_cnt) {
3599
+ for (i = 0; i < ulen; i++) {
3600
+ func_len =
3601
+ prog->aux->func[i]->jited_len;
3602
+ if (put_user(func_len, &user_lens[i]))
3603
+ return -EFAULT;
3604
+ }
3605
+ } else {
3606
+ func_len = prog->jited_len;
3607
+ if (put_user(func_len, &user_lens[0]))
21283608 return -EFAULT;
21293609 }
21303610 } else {
21313611 info.jited_func_lens = 0;
3612
+ }
3613
+ }
3614
+
3615
+ if (prog->aux->btf)
3616
+ info.btf_id = btf_id(prog->aux->btf);
3617
+
3618
+ ulen = info.nr_func_info;
3619
+ info.nr_func_info = prog->aux->func_info_cnt;
3620
+ if (info.nr_func_info && ulen) {
3621
+ char __user *user_finfo;
3622
+
3623
+ user_finfo = u64_to_user_ptr(info.func_info);
3624
+ ulen = min_t(u32, info.nr_func_info, ulen);
3625
+ if (copy_to_user(user_finfo, prog->aux->func_info,
3626
+ info.func_info_rec_size * ulen))
3627
+ return -EFAULT;
3628
+ }
3629
+
3630
+ ulen = info.nr_line_info;
3631
+ info.nr_line_info = prog->aux->nr_linfo;
3632
+ if (info.nr_line_info && ulen) {
3633
+ __u8 __user *user_linfo;
3634
+
3635
+ user_linfo = u64_to_user_ptr(info.line_info);
3636
+ ulen = min_t(u32, info.nr_line_info, ulen);
3637
+ if (copy_to_user(user_linfo, prog->aux->linfo,
3638
+ info.line_info_rec_size * ulen))
3639
+ return -EFAULT;
3640
+ }
3641
+
3642
+ ulen = info.nr_jited_line_info;
3643
+ if (prog->aux->jited_linfo)
3644
+ info.nr_jited_line_info = prog->aux->nr_linfo;
3645
+ else
3646
+ info.nr_jited_line_info = 0;
3647
+ if (info.nr_jited_line_info && ulen) {
3648
+ if (bpf_dump_raw_ok(file->f_cred)) {
3649
+ __u64 __user *user_linfo;
3650
+ u32 i;
3651
+
3652
+ user_linfo = u64_to_user_ptr(info.jited_line_info);
3653
+ ulen = min_t(u32, info.nr_jited_line_info, ulen);
3654
+ for (i = 0; i < ulen; i++) {
3655
+ if (put_user((__u64)(long)prog->aux->jited_linfo[i],
3656
+ &user_linfo[i]))
3657
+ return -EFAULT;
3658
+ }
3659
+ } else {
3660
+ info.jited_line_info = 0;
3661
+ }
3662
+ }
3663
+
3664
+ ulen = info.nr_prog_tags;
3665
+ info.nr_prog_tags = prog->aux->func_cnt ? : 1;
3666
+ if (ulen) {
3667
+ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
3668
+ u32 i;
3669
+
3670
+ user_prog_tags = u64_to_user_ptr(info.prog_tags);
3671
+ ulen = min_t(u32, info.nr_prog_tags, ulen);
3672
+ if (prog->aux->func_cnt) {
3673
+ for (i = 0; i < ulen; i++) {
3674
+ if (copy_to_user(user_prog_tags[i],
3675
+ prog->aux->func[i]->tag,
3676
+ BPF_TAG_SIZE))
3677
+ return -EFAULT;
3678
+ }
3679
+ } else {
3680
+ if (copy_to_user(user_prog_tags[0],
3681
+ prog->tag, BPF_TAG_SIZE))
3682
+ return -EFAULT;
21323683 }
21333684 }
21343685
....@@ -2169,6 +3720,7 @@
21693720 info.btf_key_type_id = map->btf_key_type_id;
21703721 info.btf_value_type_id = map->btf_value_type_id;
21713722 }
3723
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
21723724
21733725 if (bpf_map_is_dev_bound(map)) {
21743726 err = bpf_map_offload_info_fill(&info, map);
....@@ -2199,6 +3751,43 @@
21993751 return btf_get_info_by_fd(btf, attr, uattr);
22003752 }
22013753
3754
+static int bpf_link_get_info_by_fd(struct file *file,
3755
+ struct bpf_link *link,
3756
+ const union bpf_attr *attr,
3757
+ union bpf_attr __user *uattr)
3758
+{
3759
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
3760
+ struct bpf_link_info info;
3761
+ u32 info_len = attr->info.info_len;
3762
+ int err;
3763
+
3764
+ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
3765
+ if (err)
3766
+ return err;
3767
+ info_len = min_t(u32, sizeof(info), info_len);
3768
+
3769
+ memset(&info, 0, sizeof(info));
3770
+ if (copy_from_user(&info, uinfo, info_len))
3771
+ return -EFAULT;
3772
+
3773
+ info.type = link->type;
3774
+ info.id = link->id;
3775
+ info.prog_id = link->prog->aux->id;
3776
+
3777
+ if (link->ops->fill_link_info) {
3778
+ err = link->ops->fill_link_info(link, &info);
3779
+ if (err)
3780
+ return err;
3781
+ }
3782
+
3783
+ if (copy_to_user(uinfo, &info, info_len) ||
3784
+ put_user(info_len, &uattr->info.info_len))
3785
+ return -EFAULT;
3786
+
3787
+ return 0;
3788
+}
3789
+
3790
+
22023791 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
22033792
22043793 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
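
bpf_link_get_info_by_fd() lets BPF_OBJ_GET_INFO_BY_FD work on link FDs as well, reporting the link type, id and owning program id plus whatever fill_link_info adds. A hedged userspace sketch (helper name invented):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sketch_get_link_info(int link_fd, struct bpf_link_info *info)
{
	union bpf_attr attr;

	memset(info, 0, sizeof(*info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = link_fd;	/* same command as for prog/map/btf fds */
	attr.info.info_len = sizeof(*info);
	attr.info.info = (unsigned long)info;

	/* on success info->type, info->id and info->prog_id are populated */
	return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
}
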
....@@ -2223,6 +3812,9 @@
22233812 uattr);
22243813 else if (f.file->f_op == &btf_fops)
22253814 err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
3815
+ else if (f.file->f_op == &bpf_link_fops)
3816
+ err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
3817
+ attr, uattr);
22263818 else
22273819 err = -EINVAL;
22283820
....@@ -2237,7 +3829,7 @@
22373829 if (CHECK_ATTR(BPF_BTF_LOAD))
22383830 return -EINVAL;
22393831
2240
- if (!capable(CAP_SYS_ADMIN))
3832
+ if (!bpf_capable())
22413833 return -EPERM;
22423834
22433835 return btf_new_fd(attr);
....@@ -2325,7 +3917,9 @@
23253917 if (attr->task_fd_query.flags != 0)
23263918 return -EINVAL;
23273919
3920
+ rcu_read_lock();
23283921 task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
3922
+ rcu_read_unlock();
23293923 if (!task)
23303924 return -ENOENT;
23313925
....@@ -2347,15 +3941,21 @@
23473941 if (err)
23483942 goto out;
23493943
2350
- if (file->f_op == &bpf_raw_tp_fops) {
2351
- struct bpf_raw_tracepoint *raw_tp = file->private_data;
2352
- struct bpf_raw_event_map *btp = raw_tp->btp;
3944
+ if (file->f_op == &bpf_link_fops) {
3945
+ struct bpf_link *link = file->private_data;
23533946
2354
- err = bpf_task_fd_query_copy(attr, uattr,
2355
- raw_tp->prog->aux->id,
2356
- BPF_FD_TYPE_RAW_TRACEPOINT,
2357
- btp->tp->name, 0, 0);
2358
- goto put_file;
3947
+ if (link->ops == &bpf_raw_tp_link_lops) {
3948
+ struct bpf_raw_tp_link *raw_tp =
3949
+ container_of(link, struct bpf_raw_tp_link, link);
3950
+ struct bpf_raw_event_map *btp = raw_tp->btp;
3951
+
3952
+ err = bpf_task_fd_query_copy(attr, uattr,
3953
+ raw_tp->link.prog->aux->id,
3954
+ BPF_FD_TYPE_RAW_TRACEPOINT,
3955
+ btp->tp->name, 0, 0);
3956
+ goto put_file;
3957
+ }
3958
+ goto out_not_supp;
23593959 }
23603960
23613961 event = perf_get_event(file);
....@@ -2375,6 +3975,7 @@
23753975 goto put_file;
23763976 }
23773977
3978
+out_not_supp:
23783979 err = -ENOTSUPP;
23793980 put_file:
23803981 fput(file);
....@@ -2382,12 +3983,411 @@
23823983 return err;
23833984 }
23843985
3986
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
3987
+
3988
+#define BPF_DO_BATCH(fn) \
3989
+ do { \
3990
+ if (!fn) { \
3991
+ err = -ENOTSUPP; \
3992
+ goto err_put; \
3993
+ } \
3994
+ err = fn(map, attr, uattr); \
3995
+ } while (0)
3996
+
3997
+static int bpf_map_do_batch(const union bpf_attr *attr,
3998
+ union bpf_attr __user *uattr,
3999
+ int cmd)
4000
+{
4001
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
4002
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
4003
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
4004
+ struct bpf_map *map;
4005
+ int err, ufd;
4006
+ struct fd f;
4007
+
4008
+ if (CHECK_ATTR(BPF_MAP_BATCH))
4009
+ return -EINVAL;
4010
+
4011
+ ufd = attr->batch.map_fd;
4012
+ f = fdget(ufd);
4013
+ map = __bpf_map_get(f);
4014
+ if (IS_ERR(map))
4015
+ return PTR_ERR(map);
4016
+ if (has_write)
4017
+ bpf_map_write_active_inc(map);
4018
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
4019
+ err = -EPERM;
4020
+ goto err_put;
4021
+ }
4022
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
4023
+ err = -EPERM;
4024
+ goto err_put;
4025
+ }
4026
+
4027
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
4028
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
4029
+ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
4030
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
4031
+ else if (cmd == BPF_MAP_UPDATE_BATCH)
4032
+ BPF_DO_BATCH(map->ops->map_update_batch);
4033
+ else
4034
+ BPF_DO_BATCH(map->ops->map_delete_batch);
4035
+err_put:
4036
+ if (has_write)
4037
+ bpf_map_write_active_dec(map);
4038
+ fdput(f);
4039
+ return err;
4040
+}
4041
+
4042
+static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
4043
+{
4044
+ if (attr->link_create.attach_type != prog->expected_attach_type)
4045
+ return -EINVAL;
4046
+
4047
+ if (prog->expected_attach_type == BPF_TRACE_ITER)
4048
+ return bpf_iter_link_attach(attr, prog);
4049
+ else if (prog->type == BPF_PROG_TYPE_EXT)
4050
+ return bpf_tracing_prog_attach(prog,
4051
+ attr->link_create.target_fd,
4052
+ attr->link_create.target_btf_id);
4053
+ return -EINVAL;
4054
+}
4055
+
4056
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
4057
+static int link_create(union bpf_attr *attr)
4058
+{
4059
+ enum bpf_prog_type ptype;
4060
+ struct bpf_prog *prog;
4061
+ int ret;
4062
+
4063
+ if (CHECK_ATTR(BPF_LINK_CREATE))
4064
+ return -EINVAL;
4065
+
4066
+ prog = bpf_prog_get(attr->link_create.prog_fd);
4067
+ if (IS_ERR(prog))
4068
+ return PTR_ERR(prog);
4069
+
4070
+ ret = bpf_prog_attach_check_attach_type(prog,
4071
+ attr->link_create.attach_type);
4072
+ if (ret)
4073
+ goto out;
4074
+
4075
+ if (prog->type == BPF_PROG_TYPE_EXT) {
4076
+ ret = tracing_bpf_link_attach(attr, prog);
4077
+ goto out;
4078
+ }
4079
+
4080
+ ptype = attach_type_to_prog_type(attr->link_create.attach_type);
4081
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
4082
+ ret = -EINVAL;
4083
+ goto out;
4084
+ }
4085
+
4086
+ switch (ptype) {
4087
+ case BPF_PROG_TYPE_CGROUP_SKB:
4088
+ case BPF_PROG_TYPE_CGROUP_SOCK:
4089
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4090
+ case BPF_PROG_TYPE_SOCK_OPS:
4091
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
4092
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
4093
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4094
+ ret = cgroup_bpf_link_attach(attr, prog);
4095
+ break;
4096
+ case BPF_PROG_TYPE_TRACING:
4097
+ ret = tracing_bpf_link_attach(attr, prog);
4098
+ break;
4099
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
4100
+ case BPF_PROG_TYPE_SK_LOOKUP:
4101
+ ret = netns_bpf_link_create(attr, prog);
4102
+ break;
4103
+#ifdef CONFIG_NET
4104
+ case BPF_PROG_TYPE_XDP:
4105
+ ret = bpf_xdp_link_attach(attr, prog);
4106
+ break;
4107
+#endif
4108
+ default:
4109
+ ret = -EINVAL;
4110
+ }
4111
+
4112
+out:
4113
+ if (ret < 0)
4114
+ bpf_prog_put(prog);
4115
+ return ret;
4116
+}
4117
+
4118
+#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
4119
+
4120
+static int link_update(union bpf_attr *attr)
4121
+{
4122
+ struct bpf_prog *old_prog = NULL, *new_prog;
4123
+ struct bpf_link *link;
4124
+ u32 flags;
4125
+ int ret;
4126
+
4127
+ if (CHECK_ATTR(BPF_LINK_UPDATE))
4128
+ return -EINVAL;
4129
+
4130
+ flags = attr->link_update.flags;
4131
+ if (flags & ~BPF_F_REPLACE)
4132
+ return -EINVAL;
4133
+
4134
+ link = bpf_link_get_from_fd(attr->link_update.link_fd);
4135
+ if (IS_ERR(link))
4136
+ return PTR_ERR(link);
4137
+
4138
+ new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
4139
+ if (IS_ERR(new_prog)) {
4140
+ ret = PTR_ERR(new_prog);
4141
+ goto out_put_link;
4142
+ }
4143
+
4144
+ if (flags & BPF_F_REPLACE) {
4145
+ old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
4146
+ if (IS_ERR(old_prog)) {
4147
+ ret = PTR_ERR(old_prog);
4148
+ old_prog = NULL;
4149
+ goto out_put_progs;
4150
+ }
4151
+ } else if (attr->link_update.old_prog_fd) {
4152
+ ret = -EINVAL;
4153
+ goto out_put_progs;
4154
+ }
4155
+
4156
+ if (link->ops->update_prog)
4157
+ ret = link->ops->update_prog(link, new_prog, old_prog);
4158
+ else
4159
+ ret = -EINVAL;
4160
+
4161
+out_put_progs:
4162
+ if (old_prog)
4163
+ bpf_prog_put(old_prog);
4164
+ if (ret)
4165
+ bpf_prog_put(new_prog);
4166
+out_put_link:
4167
+ bpf_link_put(link);
4168
+ return ret;
4169
+}
4170
+
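+/* BPF_LINK_DETACH: force-detach a link from its hook without destroying the
+ * link object; links that do not implement ->detach return -EOPNOTSUPP.
+ */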
4171
+#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
4172
+
4173
+static int link_detach(union bpf_attr *attr)
4174
+{
4175
+ struct bpf_link *link;
4176
+ int ret;
4177
+
4178
+ if (CHECK_ATTR(BPF_LINK_DETACH))
4179
+ return -EINVAL;
4180
+
4181
+ link = bpf_link_get_from_fd(attr->link_detach.link_fd);
4182
+ if (IS_ERR(link))
4183
+ return PTR_ERR(link);
4184
+
4185
+ if (link->ops->detach)
4186
+ ret = link->ops->detach(link);
4187
+ else
4188
+ ret = -EOPNOTSUPP;
4189
+
4190
+ bpf_link_put(link);
4191
+ return ret;
4192
+}
4193
+
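+/* Take a reference only if the link is still alive (refcnt != 0). */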
4194
+static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
4195
+{
4196
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
4197
+}
4198
+
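+/* Look up a link by ID under link_idr_lock. An entry whose ID is still 0 is
+ * not fully published yet and is reported as -EAGAIN rather than -ENOENT.
+ */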
4199
+struct bpf_link *bpf_link_by_id(u32 id)
4200
+{
4201
+ struct bpf_link *link;
4202
+
4203
+ if (!id)
4204
+ return ERR_PTR(-ENOENT);
4205
+
4206
+ spin_lock_bh(&link_idr_lock);
4207
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
4208
+ link = idr_find(&link_idr, id);
4209
+ if (link) {
4210
+ if (link->id)
4211
+ link = bpf_link_inc_not_zero(link);
4212
+ else
4213
+ link = ERR_PTR(-EAGAIN);
4214
+ } else {
4215
+ link = ERR_PTR(-ENOENT);
4216
+ }
4217
+ spin_unlock_bh(&link_idr_lock);
4218
+ return link;
4219
+}
4220
+
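+/* BPF_LINK_GET_FD_BY_ID: CAP_SYS_ADMIN only; open a new fd referring to the
+ * link with the given ID.
+ */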
4221
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
4222
+
4223
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
4224
+{
4225
+ struct bpf_link *link;
4226
+ u32 id = attr->link_id;
4227
+ int fd;
4228
+
4229
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
4230
+ return -EINVAL;
4231
+
4232
+ if (!capable(CAP_SYS_ADMIN))
4233
+ return -EPERM;
4234
+
4235
+ link = bpf_link_by_id(id);
4236
+ if (IS_ERR(link))
4237
+ return PTR_ERR(link);
4238
+
4239
+ fd = bpf_link_new_fd(link);
4240
+ if (fd < 0)
4241
+ bpf_link_put(link);
4242
+
4243
+ return fd;
4244
+}
4245
+
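+/* BPF_ENABLE_STATS(BPF_STATS_RUN_TIME) returns an anonymous fd; run-time
+ * statistics collection (bpf_stats_enabled_key) stays enabled until the last
+ * such fd is released. bpf_stats_enabled_mutex serializes the key updates.
+ */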
4246
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
4247
+
4248
+static int bpf_stats_release(struct inode *inode, struct file *file)
4249
+{
4250
+ mutex_lock(&bpf_stats_enabled_mutex);
4251
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
4252
+ mutex_unlock(&bpf_stats_enabled_mutex);
4253
+ return 0;
4254
+}
4255
+
4256
+static const struct file_operations bpf_stats_fops = {
4257
+ .release = bpf_stats_release,
4258
+};
4259
+
4260
+static int bpf_enable_runtime_stats(void)
4261
+{
4262
+ int fd;
4263
+
4264
+ mutex_lock(&bpf_stats_enabled_mutex);
4265
+
4266
+ /* Set a very high limit to avoid overflow */
4267
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
4268
+ mutex_unlock(&bpf_stats_enabled_mutex);
4269
+ return -EBUSY;
4270
+ }
4271
+
4272
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
4273
+ if (fd >= 0)
4274
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
4275
+
4276
+ mutex_unlock(&bpf_stats_enabled_mutex);
4277
+ return fd;
4278
+}
4279
+
4280
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
4281
+
4282
+static int bpf_enable_stats(union bpf_attr *attr)
4283
+{
4284
+
4285
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
4286
+ return -EINVAL;
4287
+
4288
+ if (!capable(CAP_SYS_ADMIN))
4289
+ return -EPERM;
4290
+
4291
+ switch (attr->enable_stats.type) {
4292
+ case BPF_STATS_RUN_TIME:
4293
+ return bpf_enable_runtime_stats();
4294
+ default:
4295
+ break;
4296
+ }
4297
+ return -EINVAL;
4298
+}
4299
+
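+/* BPF_ITER_CREATE: given a bpf_iter link fd, create a new iterator fd that
+ * user space reads to run the iterator program. No flags are accepted.
+ */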
4300
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
4301
+
4302
+static int bpf_iter_create(union bpf_attr *attr)
4303
+{
4304
+ struct bpf_link *link;
4305
+ int err;
4306
+
4307
+ if (CHECK_ATTR(BPF_ITER_CREATE))
4308
+ return -EINVAL;
4309
+
4310
+ if (attr->iter_create.flags)
4311
+ return -EINVAL;
4312
+
4313
+ link = bpf_link_get_from_fd(attr->iter_create.link_fd);
4314
+ if (IS_ERR(link))
4315
+ return PTR_ERR(link);
4316
+
4317
+ err = bpf_iter_new_fd(link);
4318
+ bpf_link_put(link);
4319
+
4320
+ return err;
4321
+}
4322
+
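+/* BPF_PROG_BIND_MAP: record an additional map in the program's used_maps[]
+ * array so the map stays alive for the lifetime of the program even if the
+ * program text never references it. Binding the same map twice is a no-op.
+ */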
4323
+#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
4324
+
4325
+static int bpf_prog_bind_map(union bpf_attr *attr)
4326
+{
4327
+ struct bpf_prog *prog;
4328
+ struct bpf_map *map;
4329
+ struct bpf_map **used_maps_old, **used_maps_new;
4330
+ int i, ret = 0;
4331
+
4332
+ if (CHECK_ATTR(BPF_PROG_BIND_MAP))
4333
+ return -EINVAL;
4334
+
4335
+ if (attr->prog_bind_map.flags)
4336
+ return -EINVAL;
4337
+
4338
+ prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
4339
+ if (IS_ERR(prog))
4340
+ return PTR_ERR(prog);
4341
+
4342
+ map = bpf_map_get(attr->prog_bind_map.map_fd);
4343
+ if (IS_ERR(map)) {
4344
+ ret = PTR_ERR(map);
4345
+ goto out_prog_put;
4346
+ }
4347
+
4348
+ mutex_lock(&prog->aux->used_maps_mutex);
4349
+
4350
+ used_maps_old = prog->aux->used_maps;
4351
+
4352
+ for (i = 0; i < prog->aux->used_map_cnt; i++)
4353
+ if (used_maps_old[i] == map) {
4354
+ bpf_map_put(map);
4355
+ goto out_unlock;
4356
+ }
4357
+
4358
+ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
4359
+ sizeof(used_maps_new[0]),
4360
+ GFP_KERNEL);
4361
+ if (!used_maps_new) {
4362
+ ret = -ENOMEM;
4363
+ goto out_unlock;
4364
+ }
4365
+
4366
+ memcpy(used_maps_new, used_maps_old,
4367
+ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
4368
+ used_maps_new[prog->aux->used_map_cnt] = map;
4369
+
4370
+ prog->aux->used_map_cnt++;
4371
+ prog->aux->used_maps = used_maps_new;
4372
+
4373
+ kfree(used_maps_old);
4374
+
4375
+out_unlock:
4376
+ mutex_unlock(&prog->aux->used_maps_mutex);
4377
+
4378
+ if (ret)
4379
+ bpf_map_put(map);
4380
+out_prog_put:
4381
+ bpf_prog_put(prog);
4382
+ return ret;
4383
+}
4384
+
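+/* Main bpf(2) entry point: validate and copy in the (possibly shorter or
+ * longer than expected) bpf_attr, give the LSM a chance to reject the
+ * command, then dispatch on cmd.
+ */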
23854385 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
23864386 {
23874387 union bpf_attr attr;
23884388 int err;
23894389
2390
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
4390
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
23914391 return -EPERM;
23924392
23934393 err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
....@@ -2399,6 +4399,8 @@
23994399 memset(&attr, 0, sizeof(attr));
24004400 if (copy_from_user(&attr, uattr, size) != 0)
24014401 return -EFAULT;
4402
+
4403
+ trace_android_vh_check_bpf_syscall(cmd, &attr, size);
24024404
24034405 err = security_bpf(cmd, &attr, size);
24044406 if (err < 0)
....@@ -2420,8 +4422,11 @@
24204422 case BPF_MAP_GET_NEXT_KEY:
24214423 err = map_get_next_key(&attr);
24224424 break;
4425
+ case BPF_MAP_FREEZE:
4426
+ err = map_freeze(&attr);
4427
+ break;
24234428 case BPF_PROG_LOAD:
2424
- err = bpf_prog_load(&attr);
4429
+ err = bpf_prog_load(&attr, uattr);
24254430 break;
24264431 case BPF_OBJ_PIN:
24274432 err = bpf_obj_pin(&attr);
....@@ -2449,6 +4454,10 @@
24494454 err = bpf_obj_get_next_id(&attr, uattr,
24504455 &map_idr, &map_idr_lock);
24514456 break;
4457
+ case BPF_BTF_GET_NEXT_ID:
4458
+ err = bpf_obj_get_next_id(&attr, uattr,
4459
+ &btf_idr, &btf_idr_lock);
4460
+ break;
24524461 case BPF_PROG_GET_FD_BY_ID:
24534462 err = bpf_prog_get_fd_by_id(&attr);
24544463 break;
....@@ -2470,6 +4479,47 @@
24704479 case BPF_TASK_FD_QUERY:
24714480 err = bpf_task_fd_query(&attr, uattr);
24724481 break;
4482
+ case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
4483
+ err = map_lookup_and_delete_elem(&attr);
4484
+ break;
4485
+ case BPF_MAP_LOOKUP_BATCH:
4486
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
4487
+ break;
4488
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
4489
+ err = bpf_map_do_batch(&attr, uattr,
4490
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
4491
+ break;
4492
+ case BPF_MAP_UPDATE_BATCH:
4493
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
4494
+ break;
4495
+ case BPF_MAP_DELETE_BATCH:
4496
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
4497
+ break;
4498
+ case BPF_LINK_CREATE:
4499
+ err = link_create(&attr);
4500
+ break;
4501
+ case BPF_LINK_UPDATE:
4502
+ err = link_update(&attr);
4503
+ break;
4504
+ case BPF_LINK_GET_FD_BY_ID:
4505
+ err = bpf_link_get_fd_by_id(&attr);
4506
+ break;
4507
+ case BPF_LINK_GET_NEXT_ID:
4508
+ err = bpf_obj_get_next_id(&attr, uattr,
4509
+ &link_idr, &link_idr_lock);
4510
+ break;
4511
+ case BPF_ENABLE_STATS:
4512
+ err = bpf_enable_stats(&attr);
4513
+ break;
4514
+ case BPF_ITER_CREATE:
4515
+ err = bpf_iter_create(&attr);
4516
+ break;
4517
+ case BPF_LINK_DETACH:
4518
+ err = link_detach(&attr);
4519
+ break;
4520
+ case BPF_PROG_BIND_MAP:
4521
+ err = bpf_prog_bind_map(&attr);
4522
+ break;
24734523 default:
24744524 err = -EINVAL;
24754525 break;