| .. | .. | 
|---|
 | 1 | +// SPDX-License-Identifier: GPL-2.0-only  | 
|---|
| 1 | 2 |  /* Copyright (c) 2016 Facebook | 
|---|
| 2 |  | - *  | 
|---|
| 3 |  | - * This program is free software; you can redistribute it and/or  | 
|---|
| 4 |  | - * modify it under the terms of version 2 of the GNU General Public  | 
|---|
| 5 |  | - * License as published by the Free Software Foundation.  | 
|---|
| 6 | 3 |   */ | 
|---|
| 7 | 4 |  #include <linux/bpf.h> | 
|---|
| 8 | 5 |  #include <linux/jhash.h> | 
|---|
| 9 | 6 |  #include <linux/filter.h> | 
|---|
 | 7 | +#include <linux/kernel.h>  | 
|---|
| 10 | 8 |  #include <linux/stacktrace.h> | 
|---|
| 11 | 9 |  #include <linux/perf_event.h> | 
|---|
| 12 | 10 |  #include <linux/elf.h> | 
|---|
| 13 | 11 |  #include <linux/pagemap.h> | 
|---|
| 14 | 12 |  #include <linux/irq_work.h> | 
|---|
 | 13 | +#include <linux/btf_ids.h>  | 
|---|
| 15 | 14 |  #include "percpu_freelist.h" | 
|---|
| 16 | 15 |   | 
|---|
| 17 | 16 |  #define STACK_CREATE_FLAG_MASK					\ | 
|---|
| .. | .. | 
|---|
/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
	struct irq_work irq_work;
	struct mm_struct *mm;	/* mm whose mmap lock must be released */
};

/*
 * Deferred release of an mm's mmap read lock. The lock was taken with
 * mmap_read_trylock_non_owner() in a context that cannot release it
 * directly (IRQs disabled), so the unlock is handed off to this irq_work.
 */
static void do_up_read(struct irq_work *entry)
{
	struct stack_map_irq_work *work;

	/* On PREEMPT_RT the trylock path is never entered with IRQs
	 * disabled (the caller forces the fallback instead), so this
	 * irq_work must never be queued there.
	 */
	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct stack_map_irq_work, irq_work);
	/* non_owner: the lock was acquired by a different context */
	mmap_read_unlock_non_owner(work->mm);
}
|---|
| 50 | 51 |   | 
|---|
| 51 | 52 |  static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); | 
|---|
| .. | .. | 
|---|
| 90 | 91 |  { | 
|---|
| 91 | 92 |  	u32 value_size = attr->value_size; | 
|---|
| 92 | 93 |  	struct bpf_stack_map *smap; | 
|---|
 | 94 | +	struct bpf_map_memory mem;  | 
|---|
| 93 | 95 |  	u64 cost, n_buckets; | 
|---|
| 94 | 96 |  	int err; | 
|---|
| 95 | 97 |   | 
|---|
| 96 |  | -	if (!capable(CAP_SYS_ADMIN))  | 
|---|
 | 98 | +	if (!bpf_capable())  | 
|---|
| 97 | 99 |  		return ERR_PTR(-EPERM); | 
|---|
| 98 | 100 |   | 
|---|
| 99 | 101 |  	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) | 
|---|
| .. | .. | 
|---|
| 119 | 121 |  		return ERR_PTR(-E2BIG); | 
|---|
| 120 | 122 |   | 
|---|
| 121 | 123 |  	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); | 
|---|
| 122 |  | -	if (cost >= U32_MAX - PAGE_SIZE)  | 
|---|
| 123 |  | -		return ERR_PTR(-E2BIG);  | 
|---|
 | 124 | +	err = bpf_map_charge_init(&mem, cost + attr->max_entries *  | 
|---|
 | 125 | +			   (sizeof(struct stack_map_bucket) + (u64)value_size));  | 
|---|
 | 126 | +	if (err)  | 
|---|
 | 127 | +		return ERR_PTR(err);  | 
|---|
| 124 | 128 |   | 
|---|
| 125 | 129 |  	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); | 
|---|
| 126 |  | -	if (!smap)  | 
|---|
 | 130 | +	if (!smap) {  | 
|---|
 | 131 | +		bpf_map_charge_finish(&mem);  | 
|---|
| 127 | 132 |  		return ERR_PTR(-ENOMEM); | 
|---|
| 128 |  | -  | 
|---|
| 129 |  | -	err = -E2BIG;  | 
|---|
| 130 |  | -	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));  | 
|---|
| 131 |  | -	if (cost >= U32_MAX - PAGE_SIZE)  | 
|---|
| 132 |  | -		goto free_smap;  | 
|---|
 | 133 | +	}  | 
|---|
| 133 | 134 |   | 
|---|
| 134 | 135 |  	bpf_map_init_from_attr(&smap->map, attr); | 
|---|
| 135 | 136 |  	smap->map.value_size = value_size; | 
|---|
| 136 | 137 |  	smap->n_buckets = n_buckets; | 
|---|
| 137 |  | -	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;  | 
|---|
| 138 |  | -  | 
|---|
| 139 |  | -	err = bpf_map_precharge_memlock(smap->map.pages);  | 
|---|
| 140 |  | -	if (err)  | 
|---|
| 141 |  | -		goto free_smap;  | 
|---|
| 142 | 138 |   | 
|---|
| 143 | 139 |  	err = get_callchain_buffers(sysctl_perf_event_max_stack); | 
|---|
| 144 | 140 |  	if (err) | 
|---|
| 145 |  | -		goto free_smap;  | 
|---|
 | 141 | +		goto free_charge;  | 
|---|
| 146 | 142 |   | 
|---|
| 147 | 143 |  	err = prealloc_elems_and_freelist(smap); | 
|---|
| 148 | 144 |  	if (err) | 
|---|
| 149 | 145 |  		goto put_buffers; | 
|---|
| 150 | 146 |   | 
|---|
 | 147 | +	bpf_map_charge_move(&smap->map.memory, &mem);  | 
|---|
 | 148 | +  | 
|---|
| 151 | 149 |  	return &smap->map; | 
|---|
| 152 | 150 |   | 
|---|
| 153 | 151 |  put_buffers: | 
|---|
| 154 | 152 |  	put_callchain_buffers(); | 
|---|
| 155 |  | -free_smap:  | 
|---|
 | 153 | +free_charge:  | 
|---|
 | 154 | +	bpf_map_charge_finish(&mem);  | 
|---|
| 156 | 155 |  	bpf_map_area_free(smap); | 
|---|
| 157 | 156 |  	return ERR_PTR(err); | 
|---|
| 158 | 157 |  } | 
|---|
| .. | .. | 
|---|
| 217 | 216 |   | 
|---|
| 218 | 217 |  	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); | 
|---|
| 219 | 218 |   | 
|---|
| 220 |  | -	for (i = 0; i < ehdr->e_phnum; ++i)  | 
|---|
| 221 |  | -		if (phdr[i].p_type == PT_NOTE)  | 
|---|
| 222 |  | -			return stack_map_parse_build_id(page_addr, build_id,  | 
|---|
| 223 |  | -					page_addr + phdr[i].p_offset,  | 
|---|
| 224 |  | -					phdr[i].p_filesz);  | 
|---|
 | 219 | +	for (i = 0; i < ehdr->e_phnum; ++i) {  | 
|---|
 | 220 | +		if (phdr[i].p_type == PT_NOTE &&  | 
|---|
 | 221 | +		    !stack_map_parse_build_id(page_addr, build_id,  | 
|---|
 | 222 | +					      page_addr + phdr[i].p_offset,  | 
|---|
 | 223 | +					      phdr[i].p_filesz))  | 
|---|
 | 224 | +			return 0;  | 
|---|
 | 225 | +	}  | 
|---|
| 225 | 226 |  	return -EINVAL; | 
|---|
| 226 | 227 |  } | 
|---|
| 227 | 228 |   | 
|---|
| .. | .. | 
|---|
| 240 | 241 |   | 
|---|
| 241 | 242 |  	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); | 
|---|
| 242 | 243 |   | 
|---|
| 243 |  | -	for (i = 0; i < ehdr->e_phnum; ++i)  | 
|---|
| 244 |  | -		if (phdr[i].p_type == PT_NOTE)  | 
|---|
| 245 |  | -			return stack_map_parse_build_id(page_addr, build_id,  | 
|---|
| 246 |  | -					page_addr + phdr[i].p_offset,  | 
|---|
| 247 |  | -					phdr[i].p_filesz);  | 
|---|
 | 244 | +	for (i = 0; i < ehdr->e_phnum; ++i) {  | 
|---|
 | 245 | +		if (phdr[i].p_type == PT_NOTE &&  | 
|---|
 | 246 | +		    !stack_map_parse_build_id(page_addr, build_id,  | 
|---|
 | 247 | +					      page_addr + phdr[i].p_offset,  | 
|---|
 | 248 | +					      phdr[i].p_filesz))  | 
|---|
 | 249 | +			return 0;  | 
|---|
 | 250 | +	}  | 
|---|
| 248 | 251 |  	return -EINVAL; | 
|---|
| 249 | 252 |  } | 
|---|
| 250 | 253 |   | 
|---|
| .. | .. | 
|---|
| 296 | 299 |  	struct stack_map_irq_work *work = NULL; | 
|---|
| 297 | 300 |   | 
|---|
| 298 | 301 |  	if (irqs_disabled()) { | 
|---|
| 299 |  | -		work = this_cpu_ptr(&up_read_work);  | 
|---|
| 300 |  | -		if (work->irq_work.flags & IRQ_WORK_BUSY)  | 
|---|
| 301 |  | -			/* cannot queue more up_read, fallback */  | 
|---|
 | 302 | +		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {  | 
|---|
 | 303 | +			work = this_cpu_ptr(&up_read_work);  | 
|---|
 | 304 | +			if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {  | 
|---|
 | 305 | +				/* cannot queue more up_read, fallback */  | 
|---|
 | 306 | +				irq_work_busy = true;  | 
|---|
 | 307 | +			}  | 
|---|
 | 308 | +		} else {  | 
|---|
 | 309 | +			/*  | 
|---|
 | 310 | +			 * PREEMPT_RT does not allow to trylock mmap sem in  | 
|---|
 | 311 | +			 * interrupt disabled context. Force the fallback code.  | 
|---|
 | 312 | +			 */  | 
|---|
| 302 | 313 |  			irq_work_busy = true; | 
|---|
 | 314 | +		}  | 
|---|
| 303 | 315 |  	} | 
|---|
| 304 | 316 |   | 
|---|
| 305 | 317 |  	/* | 
|---|
| .. | .. | 
|---|
| 313 | 325 |  	 * with build_id. | 
|---|
| 314 | 326 |  	 */ | 
|---|
| 315 | 327 |  	if (!user || !current || !current->mm || irq_work_busy || | 
|---|
| 316 |  | -	    down_read_trylock(¤t->mm->mmap_sem) == 0) {  | 
|---|
 | 328 | +	    !mmap_read_trylock_non_owner(current->mm)) {  | 
|---|
| 317 | 329 |  		/* cannot access current->mm, fall back to ips */ | 
|---|
| 318 | 330 |  		for (i = 0; i < trace_nr; i++) { | 
|---|
| 319 | 331 |  			id_offs[i].status = BPF_STACK_BUILD_ID_IP; | 
|---|
| .. | .. | 
|---|
| 338 | 350 |  	} | 
|---|
| 339 | 351 |   | 
|---|
| 340 | 352 |  	if (!work) { | 
|---|
| 341 |  | -		up_read(¤t->mm->mmap_sem);  | 
|---|
 | 353 | +		mmap_read_unlock_non_owner(current->mm);  | 
|---|
| 342 | 354 |  	} else { | 
|---|
| 343 |  | -		work->sem = ¤t->mm->mmap_sem;  | 
|---|
 | 355 | +		work->mm = current->mm;  | 
|---|
| 344 | 356 |  		irq_work_queue(&work->irq_work); | 
|---|
| 345 |  | -		/*  | 
|---|
| 346 |  | -		 * The irq_work will release the mmap_sem with  | 
|---|
| 347 |  | -		 * up_read_non_owner(). The rwsem_release() is called  | 
|---|
| 348 |  | -		 * here to release the lock from lockdep's perspective.  | 
|---|
| 349 |  | -		 */  | 
|---|
| 350 |  | -		rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_);  | 
|---|
| 351 | 357 |  	} | 
|---|
| 352 | 358 |  } | 
|---|
| 353 | 359 |   | 
|---|
| 354 |  | -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,  | 
|---|
| 355 |  | -	   u64, flags)  | 
|---|
/*
 * Grab a per-cpu callchain entry and fill it with a kernel stack trace of
 * @task, at most @max_depth frames deep. Returns NULL when no callchain
 * entry is available or when CONFIG_STACKTRACE is not built in.
 *
 * NOTE(review): the entry is returned after put_callchain_entry() releases
 * the recursion slot — presumably safe because the caller consumes it
 * before this context can recurse; confirm against callers.
 */
static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);

	if (!entry)
		return NULL;

	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
					 max_depth, 0);

	/* stack_trace_save_tsk() works on unsigned long array, while
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
	 * necessary to fix this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using extra buffer */
		for (i = entry->nr - 1; i >= 0; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}
|---|
 | 396 | +  | 
|---|
 | 397 | +static long __bpf_get_stackid(struct bpf_map *map,  | 
|---|
 | 398 | +			      struct perf_callchain_entry *trace, u64 flags)  | 
|---|
| 356 | 399 |  { | 
|---|
| 357 | 400 |  	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); | 
|---|
| 358 |  | -	struct perf_callchain_entry *trace;  | 
|---|
| 359 | 401 |  	struct stack_map_bucket *bucket, *new_bucket, *old_bucket; | 
|---|
| 360 |  | -	u32 max_depth = map->value_size / stack_map_data_size(map);  | 
|---|
| 361 |  | -	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */  | 
|---|
| 362 |  | -	u32 init_nr = sysctl_perf_event_max_stack - max_depth;  | 
|---|
| 363 | 402 |  	u32 skip = flags & BPF_F_SKIP_FIELD_MASK; | 
|---|
| 364 | 403 |  	u32 hash, id, trace_nr, trace_len; | 
|---|
| 365 | 404 |  	bool user = flags & BPF_F_USER_STACK; | 
|---|
| 366 |  | -	bool kernel = !user;  | 
|---|
| 367 | 405 |  	u64 *ips; | 
|---|
| 368 | 406 |  	bool hash_matches; | 
|---|
| 369 | 407 |   | 
|---|
| 370 |  | -	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |  | 
|---|
| 371 |  | -			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))  | 
|---|
| 372 |  | -		return -EINVAL;  | 
|---|
| 373 |  | -  | 
|---|
| 374 |  | -	trace = get_perf_callchain(regs, init_nr, kernel, user,  | 
|---|
| 375 |  | -				   sysctl_perf_event_max_stack, false, false);  | 
|---|
| 376 |  | -  | 
|---|
| 377 |  | -	if (unlikely(!trace))  | 
|---|
| 378 |  | -		/* couldn't fetch the stack trace */  | 
|---|
| 379 |  | -		return -EFAULT;  | 
|---|
| 380 |  | -  | 
|---|
| 381 |  | -	/* get_perf_callchain() guarantees that trace->nr >= init_nr  | 
|---|
| 382 |  | -	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth  | 
|---|
| 383 |  | -	 */  | 
|---|
| 384 |  | -	trace_nr = trace->nr - init_nr;  | 
|---|
| 385 |  | -  | 
|---|
| 386 |  | -	if (trace_nr <= skip)  | 
|---|
 | 408 | +	if (trace->nr <= skip)  | 
|---|
| 387 | 409 |  		/* skipping more than usable stack trace */ | 
|---|
| 388 | 410 |  		return -EFAULT; | 
|---|
| 389 | 411 |   | 
|---|
| 390 |  | -	trace_nr -= skip;  | 
|---|
 | 412 | +	trace_nr = trace->nr - skip;  | 
|---|
| 391 | 413 |  	trace_len = trace_nr * sizeof(u64); | 
|---|
| 392 |  | -	ips = trace->ip + skip + init_nr;  | 
|---|
 | 414 | +	ips = trace->ip + skip;  | 
|---|
| 393 | 415 |  	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); | 
|---|
| 394 | 416 |  	id = hash & (smap->n_buckets - 1); | 
|---|
| 395 | 417 |  	bucket = READ_ONCE(smap->buckets[id]); | 
|---|
| .. | .. | 
|---|
| 442 | 464 |  	return id; | 
|---|
| 443 | 465 |  } | 
|---|
| 444 | 466 |   | 
|---|
/*
 * bpf_get_stackid() helper: capture the current stack trace from @regs and
 * store (or look up) it in stack map @map. Returns the bucket id on
 * success, -EINVAL on bad flags, -EFAULT if the trace cannot be fetched.
 */
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	/* frames that fit into one map value */
	u32 max_depth = map->value_size / stack_map_data_size(map);
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	/* Collect enough frames to cover the skipped prefix plus what fits
	 * into the map value, capped by the perf sysctl limit.
	 */
	max_depth += skip;
	if (max_depth > sysctl_perf_event_max_stack)
		max_depth = sysctl_perf_event_max_stack;

	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
				   false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}
|---|
 | 493 | +  | 
|---|
| 445 | 494 |  const struct bpf_func_proto bpf_get_stackid_proto = { | 
|---|
| 446 | 495 |  	.func		= bpf_get_stackid, | 
|---|
| 447 | 496 |  	.gpl_only	= true, | 
|---|
| .. | .. | 
|---|
| 451 | 500 |  	.arg3_type	= ARG_ANYTHING, | 
|---|
| 452 | 501 |  }; | 
|---|
| 453 | 502 |   | 
|---|
| 454 |  | -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,  | 
|---|
| 455 |  | -	   u64, flags)  | 
|---|
 | 503 | +static __u64 count_kernel_ip(struct perf_callchain_entry *trace)  | 
|---|
| 456 | 504 |  { | 
|---|
| 457 |  | -	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;  | 
|---|
 | 505 | +	__u64 nr_kernel = 0;  | 
|---|
 | 506 | +  | 
|---|
 | 507 | +	while (nr_kernel < trace->nr) {  | 
|---|
 | 508 | +		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)  | 
|---|
 | 509 | +			break;  | 
|---|
 | 510 | +		nr_kernel++;  | 
|---|
 | 511 | +	}  | 
|---|
 | 512 | +	return nr_kernel;  | 
|---|
 | 513 | +}  | 
|---|
 | 514 | +  | 
|---|
/*
 * bpf_get_stackid() variant for perf_event programs: reuse the callchain
 * already collected in the sample when available, instead of walking the
 * stack again from @ctx->regs.
 */
BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	/* the sample callchain holds kernel frames first, then user frames */
	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		/* temporarily truncate the callchain to its kernel part */
		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		/* fold the kernel frames into the skip count so only the
		 * user portion is hashed/stored
		 */
		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}
|---|
 | 562 | +  | 
|---|
/* Helper proto for bpf_get_stackid_pe (perf_event program context) */
const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};
|---|
 | 571 | +  | 
|---|
 | 572 | +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,  | 
|---|
 | 573 | +			    struct perf_callchain_entry *trace_in,  | 
|---|
 | 574 | +			    void *buf, u32 size, u64 flags)  | 
|---|
 | 575 | +{  | 
|---|
 | 576 | +	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;  | 
|---|
| 458 | 577 |  	bool user_build_id = flags & BPF_F_USER_BUILD_ID; | 
|---|
| 459 | 578 |  	u32 skip = flags & BPF_F_SKIP_FIELD_MASK; | 
|---|
| 460 | 579 |  	bool user = flags & BPF_F_USER_STACK; | 
|---|
| .. | .. | 
|---|
| 474 | 593 |  	if (unlikely(size % elem_size)) | 
|---|
| 475 | 594 |  		goto clear; | 
|---|
| 476 | 595 |   | 
|---|
 | 596 | +	/* cannot get valid user stack for task without user_mode regs */  | 
|---|
 | 597 | +	if (task && user && !user_mode(regs))  | 
|---|
 | 598 | +		goto err_fault;  | 
|---|
 | 599 | +  | 
|---|
| 477 | 600 |  	num_elem = size / elem_size; | 
|---|
| 478 |  | -	if (sysctl_perf_event_max_stack < num_elem)  | 
|---|
| 479 |  | -		init_nr = 0;  | 
|---|
 | 601 | +	max_depth = num_elem + skip;  | 
|---|
 | 602 | +	if (sysctl_perf_event_max_stack < max_depth)  | 
|---|
 | 603 | +		max_depth = sysctl_perf_event_max_stack;  | 
|---|
 | 604 | +  | 
|---|
 | 605 | +	if (trace_in)  | 
|---|
 | 606 | +		trace = trace_in;  | 
|---|
 | 607 | +	else if (kernel && task)  | 
|---|
 | 608 | +		trace = get_callchain_entry_for_task(task, max_depth);  | 
|---|
| 480 | 609 |  	else | 
|---|
| 481 |  | -		init_nr = sysctl_perf_event_max_stack - num_elem;  | 
|---|
| 482 |  | -	trace = get_perf_callchain(regs, init_nr, kernel, user,  | 
|---|
| 483 |  | -				   sysctl_perf_event_max_stack, false, false);  | 
|---|
 | 610 | +		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,  | 
|---|
 | 611 | +					   false, false);  | 
|---|
| 484 | 612 |  	if (unlikely(!trace)) | 
|---|
| 485 | 613 |  		goto err_fault; | 
|---|
| 486 | 614 |   | 
|---|
| 487 |  | -	trace_nr = trace->nr - init_nr;  | 
|---|
| 488 |  | -	if (trace_nr < skip)  | 
|---|
 | 615 | +	if (trace->nr < skip)  | 
|---|
| 489 | 616 |  		goto err_fault; | 
|---|
| 490 | 617 |   | 
|---|
| 491 |  | -	trace_nr -= skip;  | 
|---|
 | 618 | +	trace_nr = trace->nr - skip;  | 
|---|
| 492 | 619 |  	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; | 
|---|
| 493 | 620 |  	copy_len = trace_nr * elem_size; | 
|---|
| 494 |  | -	ips = trace->ip + skip + init_nr;  | 
|---|
 | 621 | +  | 
|---|
 | 622 | +	ips = trace->ip + skip;  | 
|---|
| 495 | 623 |  	if (user && user_build_id) | 
|---|
| 496 | 624 |  		stack_map_get_build_id_offset(buf, ips, trace_nr, user); | 
|---|
| 497 | 625 |  	else | 
|---|
| .. | .. | 
|---|
| 508 | 636 |  	return err; | 
|---|
| 509 | 637 |  } | 
|---|
| 510 | 638 |   | 
|---|
/*
 * bpf_get_stack() helper: copy the current stack trace (from @regs) into
 * @buf. Thin wrapper over __bpf_get_stack() with no target task and no
 * pre-collected callchain.
 */
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}
|---|
 | 644 | +  | 
|---|
/* Helper proto for bpf_get_stack */
const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
|---|
 | 654 | +  | 
|---|
/*
 * bpf_get_task_stack() helper: copy a stack trace of an arbitrary @task
 * into @buf. Returns -EFAULT if the task's stack cannot be pinned,
 * -EINVAL if it has no pt_regs, otherwise __bpf_get_stack()'s result.
 */
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	struct pt_regs *regs;
	long res = -EINVAL;

	/* pin the task's stack so it cannot be freed while we walk it */
	if (!try_get_task_stack(task))
		return -EFAULT;

	regs = task_pt_regs(task);
	if (regs)
		res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
	put_task_stack(task);

	return res;
}
|---|
 | 671 | +  | 
|---|
/* BTF id of struct task_struct, used to type-check arg1 below */
BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)

/* Helper proto for bpf_get_task_stack; arg1 is a BTF-typed task pointer */
const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_get_task_stack_btf_ids[0],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};
|---|
 | 684 | +  | 
|---|
/*
 * bpf_get_stack() variant for perf_event programs: reuse the callchain
 * already collected in the sample when available, instead of walking the
 * stack again from @ctx->regs. On error paths reached after validation
 * starts, @buf is cleared so the verifier-visible memory is initialized.
 */
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	/* no early callchain in the sample: take the regular path */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	/* the sample callchain holds kernel frames first, then user frames */
	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		/* temporarily truncate the callchain to its kernel part */
		trace->nr = nr_kernel;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		/* fold the kernel frames into the skip count so only the
		 * user portion is copied out
		 */
		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;

}
|---|
 | 737 | +  | 
|---|
 | 738 | +const struct bpf_func_proto bpf_get_stack_proto_pe = {  | 
|---|
 | 739 | +	.func		= bpf_get_stack_pe,  | 
|---|
| 513 | 740 |  	.gpl_only	= true, | 
|---|
| 514 | 741 |  	.ret_type	= RET_INTEGER, | 
|---|
| 515 | 742 |  	.arg1_type	= ARG_PTR_TO_CTX, | 
|---|
| .. | .. | 
|---|
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	/* direct lookup from program context is not supported for stack
	 * maps; programs must use the bpf_get_stackid()/bpf_get_stack()
	 * helpers instead
	 */
	return ERR_PTR(-EOPNOTSUPP);
}
|---|
| 526 | 753 |   | 
|---|
| 527 | 754 |  /* Called from syscall */ | 
|---|
| .. | .. | 
|---|
| 607 | 834 |  { | 
|---|
| 608 | 835 |  	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); | 
|---|
| 609 | 836 |   | 
|---|
| 610 |  | -	/* wait for bpf programs to complete before freeing stack map */  | 
|---|
| 611 |  | -	synchronize_rcu();  | 
|---|
| 612 |  | -  | 
|---|
| 613 | 837 |  	bpf_map_area_free(smap->elems); | 
|---|
| 614 | 838 |  	pcpu_freelist_destroy(&smap->freelist); | 
|---|
| 615 | 839 |  	bpf_map_area_free(smap); | 
|---|
| 616 | 840 |  	put_callchain_buffers(); | 
|---|
| 617 | 841 |  } | 
|---|
| 618 | 842 |   | 
|---|
| 619 |  | -const struct bpf_map_ops stack_map_ops = {  | 
|---|
 | 843 | +static int stack_trace_map_btf_id;  | 
|---|
 | 844 | +const struct bpf_map_ops stack_trace_map_ops = {  | 
|---|
 | 845 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 620 | 846 |  	.map_alloc = stack_map_alloc, | 
|---|
| 621 | 847 |  	.map_free = stack_map_free, | 
|---|
| 622 | 848 |  	.map_get_next_key = stack_map_get_next_key, | 
|---|
| .. | .. | 
|---|
| 624 | 850 |  	.map_update_elem = stack_map_update_elem, | 
|---|
| 625 | 851 |  	.map_delete_elem = stack_map_delete_elem, | 
|---|
| 626 | 852 |  	.map_check_btf = map_check_no_btf, | 
|---|
 | 853 | +	.map_btf_name = "bpf_stack_map",  | 
|---|
 | 854 | +	.map_btf_id = &stack_trace_map_btf_id,  | 
|---|
| 627 | 855 |  }; | 
|---|
| 628 | 856 |   | 
|---|
| 629 | 857 |  static int __init stack_map_init(void) | 
|---|