```diff
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016,2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <linux/bpf.h>
 #include <linux/btf.h>
..
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>

 #include "map_in_map.h"

 #define ARRAY_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
+	 BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)

 static void bpf_array_free_percpu(struct bpf_array *array)
 {
..
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+	    !bpf_map_flags_access_ok(attr->map_flags) ||
 	    (percpu && numa_node != NUMA_NO_NODE))
+		return -EINVAL;
+
+	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+	    attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
+		return -EINVAL;
+
+	if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+	    attr->map_flags & BPF_F_PRESERVE_ELEMS)
 		return -EINVAL;

 	if (attr->value_size > KMALLOC_MAX_SIZE)
```
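Note on the hunk above: the widened ARRAY_CREATE_FLAG_MASK and the extra checks gate the new creation-time flags. BPF_F_MMAPABLE and BPF_F_INNER_MAP are accepted only for plain BPF_MAP_TYPE_ARRAY, and BPF_F_PRESERVE_ELEMS only for perf-event arrays. A minimal userspace sketch of creating such a map through the raw bpf(2) syscall (the helper name is illustrative, not part of any library, and assumes a uapi header that already defines BPF_F_MMAPABLE):

```c
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Illustrative only: create a plain array map with BPF_F_MMAPABLE set,
 * which array_map_alloc_check() above accepts for BPF_MAP_TYPE_ARRAY.
 */
static int create_mmapable_array(__u32 value_size, __u32 max_entries)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = 4;		/* array keys are always a u32 index */
	attr.value_size  = value_size;
	attr.max_entries = max_entries;
	attr.map_flags   = BPF_F_MMAPABLE;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
```

The returned fd can later be handed to mmap(); see the note after array_map_mmap() further down.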
```diff
..
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
+	bool bypass_spec_v1 = bpf_bypass_spec_v1();
 	u64 cost, array_size, mask64;
+	struct bpf_map_memory mem;
 	struct bpf_array *array;

 	elem_size = round_up(attr->value_size, 8);
..
 	mask64 -= 1;

 	index_mask = mask64;
-	if (unpriv) {
+	if (!bypass_spec_v1) {
 		/* round up array size to nearest power of 2,
 		 * since cpu will speculate within index_mask limits
 		 */
..
 	}

 	array_size = sizeof(*array);
-	if (percpu)
+	if (percpu) {
 		array_size += (u64) max_entries * sizeof(void *);
-	else
-		array_size += (u64) max_entries * elem_size;
+	} else {
+		/* rely on vmalloc() to return page-aligned memory and
+		 * ensure array->value is exactly page-aligned
+		 */
+		if (attr->map_flags & BPF_F_MMAPABLE) {
+			array_size = PAGE_ALIGN(array_size);
+			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
+		} else {
+			array_size += (u64) max_entries * elem_size;
+		}
+	}

 	/* make sure there is no u32 overflow later in round_up() */
 	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	if (percpu) {
+	if (percpu)
 		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
-		if (cost >= U32_MAX - PAGE_SIZE)
-			return ERR_PTR(-ENOMEM);
-	}
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

-	ret = bpf_map_precharge_memlock(cost);
+	ret = bpf_map_charge_init(&mem, cost);
 	if (ret < 0)
 		return ERR_PTR(ret);

 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (attr->map_flags & BPF_F_MMAPABLE) {
+		void *data;
+
+		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
+		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
+		if (!data) {
+			bpf_map_charge_finish(&mem);
+			return ERR_PTR(-ENOMEM);
+		}
+		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
+			- offsetof(struct bpf_array, value);
+	} else {
+		array = bpf_map_area_alloc(array_size, numa_node);
+	}
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 	array->index_mask = index_mask;
-	array->map.unpriv_array = unpriv;
+	array->map.bypass_spec_v1 = bypass_spec_v1;

 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 	array->elem_size = elem_size;

 	if (percpu && bpf_array_alloc_percpu(array)) {
+		bpf_map_charge_finish(&array->map.memory);
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
```
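For BPF_F_MMAPABLE the allocation above lays the buffer out so that the element area starts exactly on a page boundary: vmalloc() returns page-aligned memory, and the array pointer is pulled back by offsetof(struct bpf_array, value), so array->value ends up at data + PAGE_ALIGN(sizeof(*array)). A small standalone sketch of that arithmetic (the stand-in struct and page size are ours, not the kernel's):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define PG_SIZE		4096UL
#define PG_ALIGN(x)	(((x) + PG_SIZE - 1) & ~(PG_SIZE - 1))

struct fake_bpf_array {			/* stand-in with the same shape */
	char header[192];		/* bpf_map + index_mask, elem_size, ... */
	char value[];			/* flexible per-element storage */
};

int main(void)
{
	uintptr_t data  = 0x7f0000000000UL;	/* pretend vmalloc() result, page-aligned */
	uintptr_t array = data + PG_ALIGN(sizeof(struct fake_bpf_array))
			       - offsetof(struct fake_bpf_array, value);

	/* The value area begins on a page boundary, which is what lets
	 * array_map_mmap() below expose only the elements to userspace
	 * without leaking the header.
	 */
	assert((array + offsetof(struct fake_bpf_array, value)) % PG_SIZE == 0);
	return 0;
}
```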
```diff
..
 	return array->value + array->elem_size * (index & array->index_mask);
 }

+static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
+				       u32 off)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+	if (map->max_entries != 1)
+		return -ENOTSUPP;
+	if (off >= map->value_size)
+		return -EINVAL;
+
+	*imm = (unsigned long)array->value;
+	return 0;
+}
+
+static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
+				       u32 *off)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u64 base = (unsigned long)array->value;
+	u64 range = array->elem_size;
+
+	if (map->max_entries != 1)
+		return -ENOTSUPP;
+	if (imm < base || imm >= base + range)
+		return -ENOENT;
+
+	*off = imm - base;
+	return 0;
+}
+
```
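The two helpers above implement direct value access for single-entry arrays, which is how global-data sections (.data/.bss/.rodata) are handled: the verifier can rewrite a load of such a map value into an immediate address, and map_direct_value_meta() translates an address back into an offset. A hedged BPF-side illustration of the kind of code that ends up relying on this (loader behaviour as commonly done by libbpf; names and program type are illustrative):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

/* An ordinary global like this is typically placed by the loader into a
 * one-element BPF_MAP_TYPE_ARRAY (.bss/.data), i.e. the max_entries == 1
 * case handled above.
 */
int pkt_count = 0;

SEC("xdp")
int count_packets(struct xdp_md *ctx)
{
	__sync_fetch_and_add(&pkt_count, 1);	/* becomes a direct map-value access */
	return XDP_PASS;
}
```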
```diff
 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
-static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_insn *insn = insn_buf;
..
 	const int map_ptr = BPF_REG_1;
 	const int index = BPF_REG_2;

+	if (map->map_flags & BPF_F_INNER_MAP)
+		return -EOPNOTSUPP;
+
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	if (map->unpriv_array) {
+	if (!map->bypass_spec_v1) {
 		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
 		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
 	} else {
..
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
+	char *val;

-	if (unlikely(map_flags > BPF_EXIST))
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;

..
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;

-	if (unlikely(map_flags == BPF_NOEXIST))
+	if (unlikely(map_flags & BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;

-	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	if (unlikely((map_flags & BPF_F_LOCK) &&
+		     !map_value_has_spin_lock(map)))
+		return -EINVAL;
+
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
-	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+	} else {
+		val = array->value +
+			array->elem_size * (index & array->index_mask);
+		if (map_flags & BPF_F_LOCK)
+			copy_map_value_locked(map, val, value, false);
+		else
+			copy_map_value(map, val, value);
+	}
 	return 0;
 }

```
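array_map_update_elem() now accepts BPF_F_LOCK, so a value that embeds struct bpf_spin_lock can be replaced without tearing readers that take the same lock. A hedged userspace sketch (the value layout and wrapper usage are illustrative; the map's BTF has to describe the spin-lock field for map_value_has_spin_lock() to pass):

```c
#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Illustrative value layout; the lock position is found through the map's
 * BTF, which is what the kernel-side check above relies on.
 */
struct counters {
	struct bpf_spin_lock lock;
	__u64 packets;
	__u64 bytes;
};

static int update_locked(int map_fd, __u32 key, struct counters *val)
{
	/* copy_map_value_locked() on the kernel side takes the embedded
	 * lock while copying everything except the lock itself.
	 */
	return bpf_map_update_elem(map_fd, &key, val, BPF_F_LOCK);
}
```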
```diff
..
 	return -EINVAL;
 }

+static void *array_map_vmalloc_addr(struct bpf_array *array)
+{
+	return (void *)round_down((unsigned long)array, PAGE_SIZE);
+}
+
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);

-	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
-	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding programs to complete
-	 * and free the array
-	 */
-	synchronize_rcu();
-
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);

-	bpf_map_area_free(array);
+	if (array->map.map_flags & BPF_F_MMAPABLE)
+		bpf_map_area_free(array_map_vmalloc_addr(array));
+	else
+		bpf_map_area_free(array);
 }

 static void array_map_seq_show_elem(struct bpf_map *map, void *key,
..
 		return;
 	}

-	seq_printf(m, "%u: ", *(u32 *)key);
+	if (map->btf_key_type_id)
+		seq_printf(m, "%u: ", *(u32 *)key);
 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
 	seq_puts(m, "\n");

 	rcu_read_unlock();
 }

+static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
+					   struct seq_file *m)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	void __percpu *pptr;
+	int cpu;
+
+	rcu_read_lock();
+
+	seq_printf(m, "%u: {\n", *(u32 *)key);
+	pptr = array->pptrs[index & array->index_mask];
+	for_each_possible_cpu(cpu) {
+		seq_printf(m, "\tcpu%d: ", cpu);
+		btf_type_seq_show(map->btf, map->btf_value_type_id,
+				  per_cpu_ptr(pptr, cpu), m);
+		seq_puts(m, "\n");
+	}
+	seq_puts(m, "}\n");
+
+	rcu_read_unlock();
+}
+
 static int array_map_check_btf(const struct bpf_map *map,
+			       const struct btf *btf,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)
 {
 	u32 int_data;
+
+	/* One exception for keyless BTF: .bss/.data/.rodata map */
+	if (btf_type_is_void(key_type)) {
+		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
+		    map->max_entries != 1)
+			return -EINVAL;
+
+		if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
+			return -EINVAL;
+
+		return 0;
+	}

 	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
 		return -EINVAL;
..
 	return 0;
 }

+static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
+
+	if (!(map->map_flags & BPF_F_MMAPABLE))
+		return -EINVAL;
+
+	if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
+	    PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
+				   vma->vm_pgoff + pgoff);
+}
```
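array_map_mmap() exposes only the page-aligned value area: offset 0 of the mapping is element 0, and the requested range may not exceed PAGE_ALIGN(max_entries * elem_size). A minimal userspace sketch, assuming map_fd was created with BPF_F_MMAPABLE and elem_size is value_size rounded up to 8 bytes:

```c
#include <stddef.h>
#include <sys/mman.h>

static void *map_array_values(int map_fd, size_t elem_size, size_t max_entries)
{
	void *p = mmap(NULL, elem_size * max_entries, PROT_READ | PROT_WRITE,
		       MAP_SHARED, map_fd, 0);

	if (p == MAP_FAILED)
		return NULL;
	return p;		/* element i lives at p + i * elem_size */
}
```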
 | 498 | +  | 
|---|
 | 499 | +static bool array_map_meta_equal(const struct bpf_map *meta0,  | 
|---|
 | 500 | +				 const struct bpf_map *meta1)  | 
|---|
 | 501 | +{  | 
|---|
 | 502 | +	if (!bpf_map_meta_equal(meta0, meta1))  | 
|---|
 | 503 | +		return false;  | 
|---|
 | 504 | +	return meta0->map_flags & BPF_F_INNER_MAP ? true :  | 
|---|
 | 505 | +	       meta0->max_entries == meta1->max_entries;  | 
|---|
 | 506 | +}  | 
|---|
 | 507 | +  | 
|---|
 | 508 | +struct bpf_iter_seq_array_map_info {  | 
|---|
 | 509 | +	struct bpf_map *map;  | 
|---|
 | 510 | +	void *percpu_value_buf;  | 
|---|
 | 511 | +	u32 index;  | 
|---|
 | 512 | +};  | 
|---|
 | 513 | +  | 
|---|
 | 514 | +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)  | 
|---|
 | 515 | +{  | 
|---|
 | 516 | +	struct bpf_iter_seq_array_map_info *info = seq->private;  | 
|---|
 | 517 | +	struct bpf_map *map = info->map;  | 
|---|
 | 518 | +	struct bpf_array *array;  | 
|---|
 | 519 | +	u32 index;  | 
|---|
 | 520 | +  | 
|---|
 | 521 | +	if (info->index >= map->max_entries)  | 
|---|
 | 522 | +		return NULL;  | 
|---|
 | 523 | +  | 
|---|
 | 524 | +	if (*pos == 0)  | 
|---|
 | 525 | +		++*pos;  | 
|---|
 | 526 | +	array = container_of(map, struct bpf_array, map);  | 
|---|
 | 527 | +	index = info->index & array->index_mask;  | 
|---|
 | 528 | +	if (info->percpu_value_buf)  | 
|---|
 | 529 | +	       return array->pptrs[index];  | 
|---|
 | 530 | +	return array->value + array->elem_size * index;  | 
|---|
 | 531 | +}  | 
|---|
 | 532 | +  | 
|---|
 | 533 | +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)  | 
|---|
 | 534 | +{  | 
|---|
 | 535 | +	struct bpf_iter_seq_array_map_info *info = seq->private;  | 
|---|
 | 536 | +	struct bpf_map *map = info->map;  | 
|---|
 | 537 | +	struct bpf_array *array;  | 
|---|
 | 538 | +	u32 index;  | 
|---|
 | 539 | +  | 
|---|
 | 540 | +	++*pos;  | 
|---|
 | 541 | +	++info->index;  | 
|---|
 | 542 | +	if (info->index >= map->max_entries)  | 
|---|
 | 543 | +		return NULL;  | 
|---|
 | 544 | +  | 
|---|
 | 545 | +	array = container_of(map, struct bpf_array, map);  | 
|---|
 | 546 | +	index = info->index & array->index_mask;  | 
|---|
 | 547 | +	if (info->percpu_value_buf)  | 
|---|
 | 548 | +	       return array->pptrs[index];  | 
|---|
 | 549 | +	return array->value + array->elem_size * index;  | 
|---|
 | 550 | +}  | 
|---|
 | 551 | +  | 
|---|
 | 552 | +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)  | 
|---|
 | 553 | +{  | 
|---|
 | 554 | +	struct bpf_iter_seq_array_map_info *info = seq->private;  | 
|---|
 | 555 | +	struct bpf_iter__bpf_map_elem ctx = {};  | 
|---|
 | 556 | +	struct bpf_map *map = info->map;  | 
|---|
 | 557 | +	struct bpf_iter_meta meta;  | 
|---|
 | 558 | +	struct bpf_prog *prog;  | 
|---|
 | 559 | +	int off = 0, cpu = 0;  | 
|---|
 | 560 | +	void __percpu **pptr;  | 
|---|
 | 561 | +	u32 size;  | 
|---|
 | 562 | +  | 
|---|
 | 563 | +	meta.seq = seq;  | 
|---|
 | 564 | +	prog = bpf_iter_get_info(&meta, v == NULL);  | 
|---|
 | 565 | +	if (!prog)  | 
|---|
 | 566 | +		return 0;  | 
|---|
 | 567 | +  | 
|---|
 | 568 | +	ctx.meta = &meta;  | 
|---|
 | 569 | +	ctx.map = info->map;  | 
|---|
 | 570 | +	if (v) {  | 
|---|
 | 571 | +		ctx.key = &info->index;  | 
|---|
 | 572 | +  | 
|---|
 | 573 | +		if (!info->percpu_value_buf) {  | 
|---|
 | 574 | +			ctx.value = v;  | 
|---|
 | 575 | +		} else {  | 
|---|
 | 576 | +			pptr = v;  | 
|---|
 | 577 | +			size = round_up(map->value_size, 8);  | 
|---|
 | 578 | +			for_each_possible_cpu(cpu) {  | 
|---|
 | 579 | +				bpf_long_memcpy(info->percpu_value_buf + off,  | 
|---|
 | 580 | +						per_cpu_ptr(pptr, cpu),  | 
|---|
 | 581 | +						size);  | 
|---|
 | 582 | +				off += size;  | 
|---|
 | 583 | +			}  | 
|---|
 | 584 | +			ctx.value = info->percpu_value_buf;  | 
|---|
 | 585 | +		}  | 
|---|
 | 586 | +	}  | 
|---|
 | 587 | +  | 
|---|
 | 588 | +	return bpf_iter_run_prog(prog, &ctx);  | 
|---|
 | 589 | +}  | 
|---|
 | 590 | +  | 
|---|
 | 591 | +static int bpf_array_map_seq_show(struct seq_file *seq, void *v)  | 
|---|
 | 592 | +{  | 
|---|
 | 593 | +	return __bpf_array_map_seq_show(seq, v);  | 
|---|
 | 594 | +}  | 
|---|
 | 595 | +  | 
|---|
 | 596 | +static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)  | 
|---|
 | 597 | +{  | 
|---|
 | 598 | +	if (!v)  | 
|---|
 | 599 | +		(void)__bpf_array_map_seq_show(seq, NULL);  | 
|---|
 | 600 | +}  | 
|---|
 | 601 | +  | 
|---|
 | 602 | +static int bpf_iter_init_array_map(void *priv_data,  | 
|---|
 | 603 | +				   struct bpf_iter_aux_info *aux)  | 
|---|
 | 604 | +{  | 
|---|
 | 605 | +	struct bpf_iter_seq_array_map_info *seq_info = priv_data;  | 
|---|
 | 606 | +	struct bpf_map *map = aux->map;  | 
|---|
 | 607 | +	void *value_buf;  | 
|---|
 | 608 | +	u32 buf_size;  | 
|---|
 | 609 | +  | 
|---|
 | 610 | +	if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  | 
|---|
 | 611 | +		buf_size = round_up(map->value_size, 8) * num_possible_cpus();  | 
|---|
 | 612 | +		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);  | 
|---|
 | 613 | +		if (!value_buf)  | 
|---|
 | 614 | +			return -ENOMEM;  | 
|---|
 | 615 | +  | 
|---|
 | 616 | +		seq_info->percpu_value_buf = value_buf;  | 
|---|
 | 617 | +	}  | 
|---|
 | 618 | +  | 
|---|
 | 619 | +	/* bpf_iter_attach_map() acquires a map uref, and the uref may be  | 
|---|
 | 620 | +	 * released before or in the middle of iterating map elements, so  | 
|---|
 | 621 | +	 * acquire an extra map uref for iterator.  | 
|---|
 | 622 | +	 */  | 
|---|
 | 623 | +	bpf_map_inc_with_uref(map);  | 
|---|
 | 624 | +	seq_info->map = map;  | 
|---|
 | 625 | +	return 0;  | 
|---|
 | 626 | +}  | 
|---|
 | 627 | +  | 
|---|
 | 628 | +static void bpf_iter_fini_array_map(void *priv_data)  | 
|---|
 | 629 | +{  | 
|---|
 | 630 | +	struct bpf_iter_seq_array_map_info *seq_info = priv_data;  | 
|---|
 | 631 | +  | 
|---|
 | 632 | +	bpf_map_put_with_uref(seq_info->map);  | 
|---|
 | 633 | +	kfree(seq_info->percpu_value_buf);  | 
|---|
 | 634 | +}  | 
|---|
 | 635 | +  | 
|---|
 | 636 | +static const struct seq_operations bpf_array_map_seq_ops = {  | 
|---|
 | 637 | +	.start	= bpf_array_map_seq_start,  | 
|---|
 | 638 | +	.next	= bpf_array_map_seq_next,  | 
|---|
 | 639 | +	.stop	= bpf_array_map_seq_stop,  | 
|---|
 | 640 | +	.show	= bpf_array_map_seq_show,  | 
|---|
 | 641 | +};  | 
|---|
 | 642 | +  | 
|---|
 | 643 | +static const struct bpf_iter_seq_info iter_seq_info = {  | 
|---|
 | 644 | +	.seq_ops		= &bpf_array_map_seq_ops,  | 
|---|
 | 645 | +	.init_seq_private	= bpf_iter_init_array_map,  | 
|---|
 | 646 | +	.fini_seq_private	= bpf_iter_fini_array_map,  | 
|---|
 | 647 | +	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),  | 
|---|
 | 648 | +};  | 
|---|
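With these seq_ops wired into iter_seq_info, array and per-CPU array maps become iterable through a bpf_iter link. A hedged sketch of an element-iterator program on the BPF side (the 8-byte value type is an assumption for illustration; the context struct comes from the kernel's BTF and BPF_SEQ_PRINTF from a recent libbpf bpf_tracing.h):

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/bpf_map_elem")
int dump_array(struct bpf_iter__bpf_map_elem *ctx)
{
	__u32 *key = ctx->key;
	__u64 *val = ctx->value;	/* assumes an 8-byte value */

	if (!key || !val)
		return 0;		/* final call after the last element */

	BPF_SEQ_PRINTF(ctx->meta->seq, "%u: %llu\n", *key, *val);
	return 0;
}
```

The final call with NULL key/value corresponds to bpf_array_map_seq_stop() invoking the show callback once more for post-processing.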
```diff
+
+static int array_map_btf_id;
 const struct bpf_map_ops array_map_ops = {
+	.map_meta_equal = array_map_meta_equal,
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
..
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
+	.map_direct_value_addr = array_map_direct_value_addr,
+	.map_direct_value_meta = array_map_direct_value_meta,
+	.map_mmap = array_map_mmap,
 	.map_seq_show_elem = array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
+	.map_lookup_batch = generic_map_lookup_batch,
+	.map_update_batch = generic_map_update_batch,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &array_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };

+static int percpu_array_map_btf_id;
 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
..
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
+	.map_seq_show_elem = percpu_array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &percpu_array_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
 };

 static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
+		return -EINVAL;
+	/* Program read-only/write-only not supported for special maps yet. */
+	if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
 		return -EINVAL;
 	return array_map_alloc_check(attr);
 }
..
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
-
-	synchronize_rcu();

 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
..

 static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	return NULL;
+	return ERR_PTR(-EOPNOTSUPP);
 }

 /* only called from syscall */
..
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);

-	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+	}
+
 	if (old_ptr)
 		map->ops->map_fd_put_ptr(old_ptr);
-
 	return 0;
 }

..
 	if (index >= array->map.max_entries)
 		return -E2BIG;

-	old_ptr = xchg(array->ptrs + index, NULL);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, NULL);
+		map->ops->map_poke_run(map, index, old_ptr, NULL);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, NULL);
+	}
+
 	if (old_ptr) {
 		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
..
 		fd_array_map_delete_elem(map, &i);
 }

+static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
+					 struct seq_file *m)
+{
+	void **elem, *ptr;
+	u32 prog_id;
+
+	rcu_read_lock();
+
+	elem = array_map_lookup_elem(map, key);
+	if (elem) {
+		ptr = READ_ONCE(*elem);
+		if (ptr) {
+			seq_printf(m, "%u: ", *(u32 *)key);
+			prog_id = prog_fd_array_sys_lookup_elem(ptr);
+			btf_type_seq_show(map->btf, map->btf_value_type_id,
+					  &prog_id, m);
+			seq_puts(m, "\n");
+		}
+	}
+
+	rcu_read_unlock();
+}
+
+struct prog_poke_elem {
+	struct list_head list;
+	struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+				     struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+	int ret = 0;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux)
+			goto out;
+	}
+
+	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+	if (!elem) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&elem->list);
+	/* We must track the program's aux info at this point in time
+	 * since the program pointer itself may not be stable yet, see
+	 * also comment in prog_array_map_poke_run().
+	 */
+	elem->aux = prog_aux;
+
+	list_add_tail(&elem->list, &aux->poke_progs);
+out:
+	mutex_unlock(&aux->poke_mutex);
+	return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+					struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux) {
+			list_del_init(&elem->list);
+			kfree(elem);
+			break;
+		}
+	}
+	mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+				    struct bpf_prog *old,
+				    struct bpf_prog *new)
+{
+	u8 *old_addr, *new_addr, *old_bypass_addr;
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		struct bpf_jit_poke_descriptor *poke;
+		int i, ret;
+
+		for (i = 0; i < elem->aux->size_poke_tab; i++) {
+			poke = &elem->aux->poke_tab[i];
+
+			/* Few things to be aware of:
+			 *
+			 * 1) We can only ever access aux in this context, but
+			 *    not aux->prog since it might not be stable yet and
+			 *    there could be danger of use after free otherwise.
+			 * 2) Initially when we start tracking aux, the program
+			 *    is not JITed yet and also does not have a kallsyms
+			 *    entry. We skip these as poke->tailcall_target_stable
+			 *    is not active yet. The JIT will do the final fixup
+			 *    before setting it stable. The various
+			 *    poke->tailcall_target_stable are successively
+			 *    activated, so tail call updates can arrive from here
+			 *    while JIT is still finishing its final fixup for
+			 *    non-activated poke entries.
+			 * 3) On program teardown, the program's kallsym entry gets
+			 *    removed out of RCU callback, but we can only untrack
+			 *    from sleepable context, therefore bpf_arch_text_poke()
+			 *    might not see that this is in BPF text section and
+			 *    bails out with -EINVAL. As these are unreachable since
+			 *    RCU grace period already passed, we simply skip them.
+			 * 4) Also programs reaching refcount of zero while patching
+			 *    is in progress is okay since we're protected under
+			 *    poke_mutex and untrack the programs before the JIT
+			 *    buffer is freed. When we're still in the middle of
+			 *    patching and suddenly kallsyms entry of the program
+			 *    gets evicted, we just skip the rest which is fine due
+			 *    to point 3).
+			 * 5) Any other error happening below from bpf_arch_text_poke()
+			 *    is a unexpected bug.
+			 */
+			if (!READ_ONCE(poke->tailcall_target_stable))
+				continue;
+			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+				continue;
+			if (poke->tail_call.map != map ||
+			    poke->tail_call.key != key)
+				continue;
+
+			old_bypass_addr = old ? NULL : poke->bypass_addr;
+			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+			if (new) {
+				ret = bpf_arch_text_poke(poke->tailcall_target,
+							 BPF_MOD_JUMP,
+							 old_addr, new_addr);
+				BUG_ON(ret < 0 && ret != -EINVAL);
+				if (!old) {
+					ret = bpf_arch_text_poke(poke->tailcall_bypass,
+								 BPF_MOD_JUMP,
+								 poke->bypass_addr,
+								 NULL);
+					BUG_ON(ret < 0 && ret != -EINVAL);
+				}
+			} else {
+				ret = bpf_arch_text_poke(poke->tailcall_bypass,
+							 BPF_MOD_JUMP,
+							 old_bypass_addr,
+							 poke->bypass_addr);
+				BUG_ON(ret < 0 && ret != -EINVAL);
+				/* let other CPUs finish the execution of program
+				 * so that it will not possible to expose them
+				 * to invalid nop, stack unwind, nop state
+				 */
+				if (!ret)
+					synchronize_rcu();
+				ret = bpf_arch_text_poke(poke->tailcall_target,
+							 BPF_MOD_JUMP,
+							 old_addr, NULL);
+				BUG_ON(ret < 0 && ret != -EINVAL);
+			}
+		}
+	}
+}
```
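The poke_track/untrack/run trio above lets the JIT patch tail-call sites into direct jumps whenever a prog-array slot changes, rather than always going through the generic bpf_tail_call() path. A hedged BPF-side sketch of the construct this optimizes (BTF-defined map syntax as commonly used with libbpf; names are illustrative):

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 4);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

SEC("xdp")
int dispatcher(struct xdp_md *ctx)
{
	/* When slot 0 holds a program and the poke descriptor is stable,
	 * prog_array_map_poke_run() rewrites this site into a direct jump.
	 */
	bpf_tail_call(ctx, &jmp_table, 0);
	return XDP_PASS;	/* fallthrough when slot 0 is empty */
}
```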
```diff
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+	struct bpf_map *map = container_of(work, struct bpf_array_aux,
+					   work)->map;
+	bpf_fd_array_map_clear(map);
+	bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+						 map)->aux;
+	bpf_map_inc(map);
+	schedule_work(&aux->work);
+}
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array_aux *aux;
+	struct bpf_map *map;
+
+	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+	if (!aux)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+	INIT_LIST_HEAD(&aux->poke_progs);
+	mutex_init(&aux->poke_mutex);
+	spin_lock_init(&aux->owner.lock);
+
+	map = array_map_alloc(attr);
+	if (IS_ERR(map)) {
+		kfree(aux);
+		return map;
+	}
+
+	container_of(map, struct bpf_array, map)->aux = aux;
+	aux->map = map;
+
+	return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		list_del_init(&elem->list);
+		kfree(elem);
+	}
+	kfree(aux);
+	fd_array_map_free(map);
+}
+
+/* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing static check alone in the verifier is not enough.
+ * Thus, prog_array_map cannot be used as an inner_map
+ * and map_meta_equal is not implemented.
+ */
+static int prog_array_map_btf_id;
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
-	.map_alloc = array_map_alloc,
-	.map_free = fd_array_map_free,
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_poke_track = prog_array_map_poke_track,
+	.map_poke_untrack = prog_array_map_poke_untrack,
+	.map_poke_run = prog_array_map_poke_run,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
-	.map_release_uref = bpf_fd_array_map_clear,
-	.map_check_btf = map_check_no_btf,
+	.map_release_uref = prog_array_map_clear,
+	.map_seq_show_elem = prog_array_map_seq_show_elem,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &prog_array_map_btf_id,
 };

 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
..
 	struct bpf_event_entry *ee;
 	int i;

+	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+		return;
+
 	rcu_read_lock();
 	for (i = 0; i < array->map.max_entries; i++) {
 		ee = READ_ONCE(array->ptrs[i]);
..
 	rcu_read_unlock();
 }

+static void perf_event_fd_array_map_free(struct bpf_map *map)
+{
+	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+		bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static int perf_event_array_map_btf_id;
 const struct bpf_map_ops perf_event_array_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
-	.map_free = fd_array_map_free,
+	.map_free = perf_event_fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
..
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
 	.map_release = perf_event_fd_array_release,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &perf_event_array_map_btf_id,
 };

 #ifdef CONFIG_CGROUPS
..
 	fd_array_map_free(map);
 }

+static int cgroup_array_map_btf_id;
 const struct bpf_map_ops cgroup_array_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
..
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &cgroup_array_map_btf_id,
 };
 #endif

..
 	return READ_ONCE(*inner_map);
 }

-static u32 array_of_map_gen_lookup(struct bpf_map *map,
+static int array_of_map_gen_lookup(struct bpf_map *map,
 				   struct bpf_insn *insn_buf)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
..

 	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
 	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
-	if (map->unpriv_array) {
+	if (!map->bypass_spec_v1) {
 		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
 		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
 	} else {
..
 	return insn - insn_buf;
 }

+static int array_of_maps_map_btf_id;
 const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
..
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_array",
+	.map_btf_id = &array_of_maps_map_btf_id,
 };
```
|---|