2024-05-10 748e4f3d702def1a4bff191e0cf93b6a05340f01
kernel/kernel/bpf/stackmap.c
@@ -1,17 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
  */
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
 #include <linux/irq_work.h>
+#include <linux/btf_ids.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK \
@@ -36,16 +35,18 @@
 /* irq_work to run up_read() for build_id lookup in nmi context */
 struct stack_map_irq_work {
 	struct irq_work irq_work;
-	struct rw_semaphore *sem;
+	struct mm_struct *mm;
 };
 
 static void do_up_read(struct irq_work *entry)
 {
 	struct stack_map_irq_work *work;
 
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+		return;
+
 	work = container_of(entry, struct stack_map_irq_work, irq_work);
-	up_read_non_owner(work->sem);
-	work->sem = NULL;
+	mmap_read_unlock_non_owner(work->mm);
 }
 
 static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
@@ -90,10 +91,11 @@
 {
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
+	struct bpf_map_memory mem;
 	u64 cost, n_buckets;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!bpf_capable())
 		return ERR_PTR(-EPERM);
 
 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
@@ -119,40 +121,37 @@
 		return ERR_PTR(-E2BIG);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
+	err = bpf_map_charge_init(&mem, cost + attr->max_entries *
+				  (sizeof(struct stack_map_bucket) + (u64)value_size));
+	if (err)
+		return ERR_PTR(err);
 
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
-	if (!smap)
+	if (!smap) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
-
-	err = -E2BIG;
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_smap;
+	}
 
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(smap->map.pages);
-	if (err)
-		goto free_smap;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
-		goto free_smap;
+		goto free_charge;
 
 	err = prealloc_elems_and_freelist(smap);
 	if (err)
 		goto put_buffers;
 
+	bpf_map_charge_move(&smap->map.memory, &mem);
+
 	return &smap->map;
 
 put_buffers:
 	put_callchain_buffers();
-free_smap:
+free_charge:
+	bpf_map_charge_finish(&mem);
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
@@ -217,11 +216,13 @@
 
 	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
 
-	for (i = 0; i < ehdr->e_phnum; ++i)
-		if (phdr[i].p_type == PT_NOTE)
-			return stack_map_parse_build_id(page_addr, build_id,
-					page_addr + phdr[i].p_offset,
-					phdr[i].p_filesz);
+	for (i = 0; i < ehdr->e_phnum; ++i) {
+		if (phdr[i].p_type == PT_NOTE &&
+		    !stack_map_parse_build_id(page_addr, build_id,
+					      page_addr + phdr[i].p_offset,
+					      phdr[i].p_filesz))
+			return 0;
+	}
 	return -EINVAL;
 }
 
@@ -240,11 +241,13 @@
 
 	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
 
-	for (i = 0; i < ehdr->e_phnum; ++i)
-		if (phdr[i].p_type == PT_NOTE)
-			return stack_map_parse_build_id(page_addr, build_id,
-					page_addr + phdr[i].p_offset,
-					phdr[i].p_filesz);
+	for (i = 0; i < ehdr->e_phnum; ++i) {
+		if (phdr[i].p_type == PT_NOTE &&
+		    !stack_map_parse_build_id(page_addr, build_id,
+					      page_addr + phdr[i].p_offset,
+					      phdr[i].p_filesz))
+			return 0;
+	}
 	return -EINVAL;
 }
 
@@ -296,10 +299,19 @@
 	struct stack_map_irq_work *work = NULL;
 
 	if (irqs_disabled()) {
-		work = this_cpu_ptr(&up_read_work);
-		if (work->irq_work.flags & IRQ_WORK_BUSY)
-			/* cannot queue more up_read, fallback */
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			work = this_cpu_ptr(&up_read_work);
+			if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
+				/* cannot queue more up_read, fallback */
+				irq_work_busy = true;
+			}
+		} else {
+			/*
+			 * PREEMPT_RT does not allow to trylock mmap sem in
+			 * interrupt disabled context. Force the fallback code.
+			 */
 			irq_work_busy = true;
+		}
 	}
 
 	/*
@@ -313,7 +325,7 @@
 	 * with build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
-	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+	    !mmap_read_trylock_non_owner(current->mm)) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -338,58 +350,68 @@
 	}
 
 	if (!work) {
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock_non_owner(current->mm);
 	} else {
-		work->sem = &current->mm->mmap_sem;
+		work->mm = current->mm;
 		irq_work_queue(&work->irq_work);
-		/*
-		 * The irq_work will release the mmap_sem with
-		 * up_read_non_owner(). The rwsem_release() is called
-		 * here to release the lock from lockdep's perspective.
-		 */
-		rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
 	}
 }
 
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-	   u64, flags)
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
+{
+#ifdef CONFIG_STACKTRACE
+	struct perf_callchain_entry *entry;
+	int rctx;
+
+	entry = get_callchain_entry(&rctx);
+
+	if (!entry)
+		return NULL;
+
+	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+					 max_depth, 0);
+
+	/* stack_trace_save_tsk() works on unsigned long array, while
+	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+	 * necessary to fix this mismatch.
+	 */
+	if (__BITS_PER_LONG != 64) {
+		unsigned long *from = (unsigned long *) entry->ip;
+		u64 *to = entry->ip;
+		int i;
+
+		/* copy data from the end to avoid using extra buffer */
+		for (i = entry->nr - 1; i >= 0; i--)
+			to[i] = (u64)(from[i]);
+	}
+
+	put_callchain_entry(rctx);
+
+	return entry;
+#else /* CONFIG_STACKTRACE */
+	return NULL;
+#endif
+}
+
+static long __bpf_get_stackid(struct bpf_map *map,
+			      struct perf_callchain_entry *trace, u64 flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / stack_map_data_size(map);
-	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
-	bool kernel = !user;
 	u64 *ips;
 	bool hash_matches;
 
-	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-		return -EINVAL;
-
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
-
-	if (unlikely(!trace))
-		/* couldn't fetch the stack trace */
-		return -EFAULT;
-
-	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
-	 */
-	trace_nr = trace->nr - init_nr;
-
-	if (trace_nr <= skip)
+	if (trace->nr <= skip)
 		/* skipping more than usable stack trace */
 		return -EFAULT;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_len = trace_nr * sizeof(u64);
-	ips = trace->ip + skip + init_nr;
+	ips = trace->ip + skip;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
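
The descending loop added in get_callchain_entry_for_task() widens the saved addresses in place: on 32-bit kernels stack_trace_save_tsk() deposits unsigned longs into the u64-sized entry->ip slots, and walking from the last element keeps the conversion from overwriting data it has not read yet. A standalone sketch of the same trick, for illustration only (not taken from this patch; plain hosted C):

#include <stdint.h>
#include <stdio.h>

/* Widen an array of unsigned long to uint64_t inside the same buffer.
 * Walking from the last element keeps the conversion safe when
 * sizeof(unsigned long) < sizeof(uint64_t): destination slot i only
 * overlaps source words 2*i and 2*i + 1, which have already been read
 * by the time slot i is written.
 */
static void widen_in_place(void *buf, int nr)
{
	unsigned long *from = buf;
	uint64_t *to = buf;
	int i;

	for (i = nr - 1; i >= 0; i--)
		to[i] = (uint64_t)from[i];
}

int main(void)
{
	union { unsigned long ul[4]; uint64_t u64[4]; } b = { .ul = { 1, 2, 3, 4 } };
	int i;

	widen_in_place(&b, 4);
	for (i = 0; i < 4; i++)
		printf("%llu\n", (unsigned long long)b.u64[i]);
	return 0;
}

On 64-bit builds the __BITS_PER_LONG test in the patch makes the branch dead code, since the two layouts already coincide.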
@@ -442,6 +464,33 @@
 	return id;
 }
 
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+	   u64, flags)
+{
+	u32 max_depth = map->value_size / stack_map_data_size(map);
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	max_depth += skip;
+	if (max_depth > sysctl_perf_event_max_stack)
+		max_depth = sysctl_perf_event_max_stack;
+
+	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+				   false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
 	.func = bpf_get_stackid,
 	.gpl_only = true,
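
For context, the wrapper above is the program-facing entry point; a typical caller pairs it with a BPF_MAP_TYPE_STACK_TRACE map. The following is a hedged sketch in the spirit of samples/bpf, not code from this patch: the attach point, map sizing, and section names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Stack trace map: each value is an array of up to 127 instruction pointers. */
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 1024);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, 127 * sizeof(__u64));
} stack_traces SEC(".maps");

SEC("kprobe/finish_task_switch")
int count_sched(void *ctx)
{
	/* Skip the first two frames and reuse a matching bucket when the
	 * hash compares equal (BPF_F_FAST_STACK_CMP).
	 */
	long id = bpf_get_stackid(ctx, &stack_traces, 2 | BPF_F_FAST_STACK_CMP);

	if (id < 0)
		return 0;
	/* ... aggregate or log the stack id here ... */
	return 0;
}

char _license[] SEC("license") = "GPL";

User space later resolves each id back to instruction pointers with a map lookup; see the reader sketch after the stack_map_lookup_elem hunk below.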
@@ -451,10 +500,80 @@
 	.arg3_type = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
-	   u64, flags)
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
 {
-	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	__u64 nr_kernel = 0;
+
+	while (nr_kernel < trace->nr) {
+		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+			break;
+		nr_kernel++;
+	}
+	return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_map *, map, u64, flags)
+{
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	__u64 nr_kernel;
+	int ret;
+
+	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return bpf_get_stackid((unsigned long)(ctx->regs),
+				       (unsigned long) map, flags, 0, 0);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		return -EFAULT;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		ret = __bpf_get_stackid(map, trace, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			return -EFAULT;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		ret = __bpf_get_stackid(map, trace, flags);
+	}
+	return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+	.func = bpf_get_stackid_pe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+			    struct perf_callchain_entry *trace_in,
+			    void *buf, u32 size, u64 flags)
+{
+	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
@@ -474,24 +593,33 @@
 	if (unlikely(size % elem_size))
 		goto clear;
 
+	/* cannot get valid user stack for task without user_mode regs */
+	if (task && user && !user_mode(regs))
+		goto err_fault;
+
 	num_elem = size / elem_size;
-	if (sysctl_perf_event_max_stack < num_elem)
-		init_nr = 0;
+	max_depth = num_elem + skip;
+	if (sysctl_perf_event_max_stack < max_depth)
+		max_depth = sysctl_perf_event_max_stack;
+
+	if (trace_in)
+		trace = trace_in;
+	else if (kernel && task)
+		trace = get_callchain_entry_for_task(task, max_depth);
 	else
-		init_nr = sysctl_perf_event_max_stack - num_elem;
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
-	trace_nr = trace->nr - init_nr;
-	if (trace_nr < skip)
+	if (trace->nr < skip)
 		goto err_fault;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
 	copy_len = trace_nr * elem_size;
-	ips = trace->ip + skip + init_nr;
+
+	ips = trace->ip + skip;
 	if (user && user_build_id)
 		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
 	else
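
To make the reworked bookkeeping concrete, here is a worked example with made-up numbers (not taken from the patch); it assumes the default sysctl_perf_event_max_stack of 127:

/* bpf_get_stack(ctx, buf, 80, 4), kernel stack, no BPF_F_USER_BUILD_ID:
 *
 *   elem_size = sizeof(u64) = 8              num_elem  = 80 / 8 = 10
 *   skip      = flags & BPF_F_SKIP_FIELD_MASK = 4
 *   max_depth = num_elem + skip = 14          (below 127, so no clamping)
 *
 * If the fetched callchain has trace->nr = 12 frames, then
 * trace_nr = 12 - 4 = 8, copy_len = 8 * 8 = 64 bytes, and the copy starts
 * at ips = trace->ip + 4, so the caller sees frames 4..11 of the trace.
 */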
@@ -508,8 +636,107 @@
 	return err;
 }
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+}
+
 const struct bpf_func_proto bpf_get_stack_proto = {
 	.func = bpf_get_stack,
+	.gpl_only = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+	   u32, size, u64, flags)
+{
+	struct pt_regs *regs;
+	long res = -EINVAL;
+
+	if (!try_get_task_stack(task))
+		return -EFAULT;
+
+	regs = task_pt_regs(task);
+	if (regs)
+		res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+	put_task_stack(task);
+
+	return res;
+}
+
+BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)
+
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+	.func = bpf_get_task_stack,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id = &bpf_get_task_stack_btf_ids[0],
+	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
+	struct perf_event *event = ctx->event;
+	struct perf_callchain_entry *trace;
+	bool kernel, user;
+	int err = -EINVAL;
+	__u64 nr_kernel;
+
+	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+
+	user = flags & BPF_F_USER_STACK;
+	kernel = !user;
+
+	err = -EFAULT;
+	trace = ctx->data->callchain;
+	if (unlikely(!trace))
+		goto clear;
+
+	nr_kernel = count_kernel_ip(trace);
+
+	if (kernel) {
+		__u64 nr = trace->nr;
+
+		trace->nr = nr_kernel;
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+
+		/* restore nr */
+		trace->nr = nr;
+	} else { /* user */
+		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+		skip += nr_kernel;
+		if (skip > BPF_F_SKIP_FIELD_MASK)
+			goto clear;
+
+		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+	}
+	return err;
+
+clear:
+	memset(buf, 0, size);
+	return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+	.func = bpf_get_stack_pe,
 	.gpl_only = true,
 	.ret_type = RET_INTEGER,
 	.arg1_type = ARG_PTR_TO_CTX,
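
The new bpf_get_task_stack() helper is aimed at programs that walk tasks other than current, e.g. a task iterator. Below is a minimal sketch patterned on the selftests (assumes a BTF-generated vmlinux.h and libbpf's bpf_helpers.h; buffer size and output handling are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define MAX_DEPTH 32

SEC("iter/task")
int dump_task_stack(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	__u64 entries[MAX_DEPTH];
	long nr;

	if (!task)
		return 0;

	/* Kernel stack of an arbitrary task; on success nr is the number
	 * of bytes written into entries[], negative on error.
	 */
	nr = bpf_get_task_stack(task, entries, sizeof(entries), 0);
	if (nr < 0)
		return 0;

	bpf_seq_write(seq, entries, sizeof(entries));
	return 0;
}

char _license[] SEC("license") = "GPL";

User space would read the output by pinning the iterator link in bpffs and reading the resulting file.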
@@ -521,7 +748,7 @@
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	return NULL;
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 /* Called from syscall */
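
Stack ids produced by bpf_get_stackid() are resolved from user space with the map lookup syscall; the in-program lookup path above is not supported. A hypothetical reader using libbpf's wrapper, assuming the map was created with a value_size of 127 u64 entries:

#include <stdio.h>
#include <bpf/bpf.h>	/* libbpf syscall wrappers */

#define MAX_DEPTH 127

/* Print one stack stored in a BPF_MAP_TYPE_STACK_TRACE map.  Assumes
 * stack_map_fd refers to a map whose value_size is
 * MAX_DEPTH * sizeof(__u64); unused trailing slots read back as zero.
 */
static void print_stack(int stack_map_fd, __u32 stack_id)
{
	__u64 ips[MAX_DEPTH];
	int i;

	if (bpf_map_lookup_elem(stack_map_fd, &stack_id, ips)) {
		perror("bpf_map_lookup_elem");
		return;
	}
	for (i = 0; i < MAX_DEPTH && ips[i]; i++)
		printf("  %#llx\n", (unsigned long long)ips[i]);
}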
@@ -607,16 +834,15 @@
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 
-	/* wait for bpf programs to complete before freeing stack map */
-	synchronize_rcu();
-
 	bpf_map_area_free(smap->elems);
 	pcpu_freelist_destroy(&smap->freelist);
 	bpf_map_area_free(smap);
 	put_callchain_buffers();
 }
 
-const struct bpf_map_ops stack_map_ops = {
+static int stack_trace_map_btf_id;
+const struct bpf_map_ops stack_trace_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
 	.map_get_next_key = stack_map_get_next_key,
@@ -624,6 +850,8 @@
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_stack_map",
+	.map_btf_id = &stack_trace_map_btf_id,
 };
 
 static int __init stack_map_init(void)