| .. | .. | 
|---|
 | 1 | +// SPDX-License-Identifier: GPL-2.0-only  | 
|---|
| 1 | 2 |  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 
|---|
| 2 | 3 |   * Copyright (c) 2016 Facebook | 
|---|
| 3 |  | - *  | 
|---|
| 4 |  | - * This program is free software; you can redistribute it and/or  | 
|---|
| 5 |  | - * modify it under the terms of version 2 of the GNU General Public  | 
|---|
| 6 |  | - * License as published by the Free Software Foundation.  | 
|---|
| 7 |  | - *  | 
|---|
| 8 |  | - * This program is distributed in the hope that it will be useful, but  | 
|---|
| 9 |  | - * WITHOUT ANY WARRANTY; without even the implied warranty of  | 
|---|
| 10 |  | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  | 
|---|
| 11 |  | - * General Public License for more details.  | 
|---|
| 12 | 4 |   */ | 
|---|
| 13 | 5 |  #include <linux/bpf.h> | 
|---|
| 14 | 6 |  #include <linux/btf.h> | 
|---|
| .. | .. | 
|---|
| 17 | 9 |  #include <linux/rculist_nulls.h> | 
|---|
| 18 | 10 |  #include <linux/random.h> | 
|---|
| 19 | 11 |  #include <uapi/linux/btf.h> | 
|---|
 | 12 | +#include <linux/rcupdate_trace.h>  | 
|---|
| 20 | 13 |  #include "percpu_freelist.h" | 
|---|
| 21 | 14 |  #include "bpf_lru_list.h" | 
|---|
| 22 | 15 |  #include "map_in_map.h" | 
|---|
| 23 | 16 |   | 
|---|
| 24 | 17 |  #define HTAB_CREATE_FLAG_MASK						\ | 
|---|
| 25 | 18 |  	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\ | 
|---|
| 26 |  | -	 BPF_F_RDONLY | BPF_F_WRONLY)  | 
|---|
 | 19 | +	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)  | 
|---|
| 27 | 20 |   | 
|---|
 | 21 | +#define BATCH_OPS(_name)			\  | 
|---|
 | 22 | +	.map_lookup_batch =			\  | 
|---|
 | 23 | +	_name##_map_lookup_batch,		\  | 
|---|
 | 24 | +	.map_lookup_and_delete_batch =		\  | 
|---|
 | 25 | +	_name##_map_lookup_and_delete_batch,	\  | 
|---|
 | 26 | +	.map_update_batch =			\  | 
|---|
 | 27 | +	generic_map_update_batch,		\  | 
|---|
 | 28 | +	.map_delete_batch =			\  | 
|---|
 | 29 | +	generic_map_delete_batch  | 
|---|
 | 30 | +  | 
|---|
 | 31 | +/*  | 
|---|
 | 32 | + * The bucket lock has two protection scopes:  | 
|---|
 | 33 | + *  | 
|---|
 | 34 | + * 1) Serializing concurrent operations from BPF programs on different  | 
|---|
 | 35 | + *    CPUs  | 
|---|
 | 36 | + *  | 
|---|
 | 37 | + * 2) Serializing concurrent operations from BPF programs and sys_bpf()  | 
|---|
 | 38 | + *  | 
|---|
 | 39 | + * BPF programs can execute in any context including perf, kprobes and  | 
|---|
 | 40 | + * tracing. As there are almost no limits where perf, kprobes and tracing  | 
|---|
 | 41 | + * can be invoked from, the lock operations need to be protected against  | 
|---|
 | 42 | + * deadlocks. Deadlocks can be caused by recursion and by an invocation in  | 
|---|
 | 43 | + * the lock held section when functions which acquire this lock are invoked  | 
|---|
 | 44 | + * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU  | 
|---|
 | 45 | + * variable bpf_prog_active, which prevents BPF programs attached to perf  | 
|---|
 | 46 | + * events, kprobes and tracing from being invoked before the prior invocation  | 
|---|
 | 47 | + * from one of these contexts has completed. sys_bpf() uses the same mechanism  | 
|---|
 | 48 | + * by pinning the task to the current CPU and incrementing the recursion  | 
|---|
 | 49 | + * protection across the map operation.  | 
|---|
 | 50 | + *  | 
|---|
 | 51 | + * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain  | 
|---|
 | 52 | + * operations like memory allocations (even with GFP_ATOMIC) from atomic  | 
|---|
 | 53 | + * contexts. This is required because even with GFP_ATOMIC the memory  | 
|---|
 | 54 | + * allocator calls into code paths which acquire locks with long held lock  | 
|---|
 | 55 | + * sections. To ensure deterministic behaviour, these locks are regular  | 
|---|
 | 56 | + * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only  | 
|---|
 | 57 | + * true atomic contexts on an RT kernel are the low level hardware  | 
|---|
 | 58 | + * handling, scheduling, low level interrupt handling, NMIs etc. None of  | 
|---|
 | 59 | + * these contexts should ever do memory allocations.  | 
|---|
 | 60 | + *  | 
|---|
 | 61 | + * As regular device interrupt handlers and soft interrupts are forced into  | 
|---|
 | 62 | + * thread context, the existing code which does  | 
|---|
 | 63 | + *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();  | 
|---|
 | 64 | + * just works.  | 
|---|
 | 65 | + *  | 
|---|
 | 66 | + * In theory the BPF locks could be converted to regular spinlocks as well,  | 
|---|
 | 67 | + * but the bucket locks and percpu_freelist locks can be taken from  | 
|---|
 | 68 | + * arbitrary contexts (perf, kprobes, tracepoints) which are required to be  | 
|---|
 | 69 | + * atomic contexts even on RT. These mechanisms require preallocated maps,  | 
|---|
 | 70 | + * so there is no need to invoke memory allocations within the lock held  | 
|---|
 | 71 | + * sections.  | 
|---|
 | 72 | + *  | 
|---|
 | 73 | + * BPF maps which need dynamic allocation are only used from (forced)  | 
|---|
 | 74 | + * thread context on RT and can therefore use regular spinlocks which in  | 
|---|
 | 75 | + * turn allows memory allocations to be invoked from the lock held section.  | 
|---|
 | 76 | + *  | 
|---|
 | 77 | + * On a non-RT kernel this distinction is neither possible nor required.  | 
|---|
 | 78 | + * spinlock maps to raw_spinlock and the extra code is optimized out by the  | 
|---|
 | 79 | + * compiler.  | 
|---|
 | 80 | + */  | 
|---|
| 28 | 81 |  struct bucket { | 
|---|
| 29 | 82 |  	struct hlist_nulls_head head; | 
|---|
| 30 |  | -	raw_spinlock_t lock;  | 
|---|
 | 83 | +	union {  | 
|---|
 | 84 | +		raw_spinlock_t raw_lock;  | 
|---|
 | 85 | +		spinlock_t     lock;  | 
|---|
 | 86 | +	};  | 
|---|
| 31 | 87 |  }; | 
|---|
| 32 | 88 |   | 
|---|
| 33 | 89 |  struct bpf_htab { | 
|---|
| .. | .. | 
|---|
| 54 | 110 |  			union { | 
|---|
| 55 | 111 |  				struct bpf_htab *htab; | 
|---|
| 56 | 112 |  				struct pcpu_freelist_node fnode; | 
|---|
 | 113 | +				struct htab_elem *batch_flink;  | 
|---|
| 57 | 114 |  			}; | 
|---|
| 58 | 115 |  		}; | 
|---|
| 59 | 116 |  	}; | 
|---|
| .. | .. | 
|---|
| 62 | 119 |  		struct bpf_lru_node lru_node; | 
|---|
| 63 | 120 |  	}; | 
|---|
| 64 | 121 |  	u32 hash; | 
|---|
| 65 |  | -	char key[0] __aligned(8);  | 
|---|
 | 122 | +	char key[] __aligned(8);  | 
|---|
| 66 | 123 |  }; | 
|---|
 | 124 | +  | 
|---|
 | 125 | +static inline bool htab_is_prealloc(const struct bpf_htab *htab)  | 
|---|
 | 126 | +{  | 
|---|
 | 127 | +	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);  | 
|---|
 | 128 | +}  | 
|---|
 | 129 | +  | 
|---|
 | 130 | +static inline bool htab_use_raw_lock(const struct bpf_htab *htab)  | 
|---|
 | 131 | +{  | 
|---|
 | 132 | +	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));  | 
|---|
 | 133 | +}  | 
|---|
 | 134 | +  | 
|---|
 | 135 | +static void htab_init_buckets(struct bpf_htab *htab)  | 
|---|
 | 136 | +{  | 
|---|
 | 137 | +	unsigned i;  | 
|---|
 | 138 | +  | 
|---|
 | 139 | +	for (i = 0; i < htab->n_buckets; i++) {  | 
|---|
 | 140 | +		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);  | 
|---|
 | 141 | +		if (htab_use_raw_lock(htab))  | 
|---|
 | 142 | +			raw_spin_lock_init(&htab->buckets[i].raw_lock);  | 
|---|
 | 143 | +		else  | 
|---|
 | 144 | +			spin_lock_init(&htab->buckets[i].lock);  | 
|---|
 | 145 | +	}  | 
|---|
 | 146 | +}  | 
|---|
 | 147 | +  | 
|---|
 | 148 | +static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab,  | 
|---|
 | 149 | +					     struct bucket *b)  | 
|---|
 | 150 | +{  | 
|---|
 | 151 | +	unsigned long flags;  | 
|---|
 | 152 | +  | 
|---|
 | 153 | +	if (htab_use_raw_lock(htab))  | 
|---|
 | 154 | +		raw_spin_lock_irqsave(&b->raw_lock, flags);  | 
|---|
 | 155 | +	else  | 
|---|
 | 156 | +		spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 157 | +	return flags;  | 
|---|
 | 158 | +}  | 
|---|
 | 159 | +  | 
|---|
 | 160 | +static inline void htab_unlock_bucket(const struct bpf_htab *htab,  | 
|---|
 | 161 | +				      struct bucket *b,  | 
|---|
 | 162 | +				      unsigned long flags)  | 
|---|
 | 163 | +{  | 
|---|
 | 164 | +	if (htab_use_raw_lock(htab))  | 
|---|
 | 165 | +		raw_spin_unlock_irqrestore(&b->raw_lock, flags);  | 
|---|
 | 166 | +	else  | 
|---|
 | 167 | +		spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 168 | +}  | 
|---|
| 67 | 169 |   | 
|---|
| 68 | 170 |  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); | 
|---|
| 69 | 171 |   | 
|---|
| .. | .. | 
|---|
| 77 | 179 |  { | 
|---|
| 78 | 180 |  	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || | 
|---|
| 79 | 181 |  		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; | 
|---|
| 80 |  | -}  | 
|---|
| 81 |  | -  | 
|---|
| 82 |  | -static bool htab_is_prealloc(const struct bpf_htab *htab)  | 
|---|
| 83 |  | -{  | 
|---|
| 84 |  | -	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);  | 
|---|
| 85 | 182 |  } | 
|---|
| 86 | 183 |   | 
|---|
| 87 | 184 |  static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, | 
|---|
| .. | .. | 
|---|
| 124 | 221 |  	bpf_map_area_free(htab->elems); | 
|---|
| 125 | 222 |  } | 
|---|
| 126 | 223 |   | 
|---|
 | 224 | +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock  | 
|---|
 | 225 | + * (bucket_lock). If both locks need to be acquired together, the lock  | 
|---|
 | 226 | + * order is always lru_lock -> bucket_lock and this only happens in  | 
|---|
 | 227 | + * bpf_lru_list.c logic. For example, certain code paths of  | 
|---|
 | 228 | + * bpf_lru_pop_free(), which is called by prealloc_lru_pop(),  | 
|---|
 | 229 | + * will acquire lru_lock first followed by acquiring bucket_lock.  | 
|---|
 | 230 | + *  | 
|---|
 | 231 | + * In hashtab.c, to avoid deadlock, lock acquisition of  | 
|---|
 | 232 | + * bucket_lock followed by lru_lock is not allowed. In such cases,  | 
|---|
 | 233 | + * bucket_lock needs to be released first before acquiring lru_lock.  | 
|---|
 | 234 | + */  | 
|---|
| 127 | 235 |  static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, | 
|---|
| 128 | 236 |  					  u32 hash) | 
|---|
| 129 | 237 |  { | 
|---|
| .. | .. | 
|---|
| 244 | 352 |  	 */ | 
|---|
| 245 | 353 |  	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); | 
|---|
| 246 | 354 |  	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); | 
|---|
 | 355 | +	bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);  | 
|---|
| 247 | 356 |  	int numa_node = bpf_map_attr_numa_node(attr); | 
|---|
| 248 | 357 |   | 
|---|
| 249 | 358 |  	BUILD_BUG_ON(offsetof(struct htab_elem, htab) != | 
|---|
| .. | .. | 
|---|
| 251 | 360 |  	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != | 
|---|
| 252 | 361 |  		     offsetof(struct htab_elem, hash_node.pprev)); | 
|---|
| 253 | 362 |   | 
|---|
| 254 |  | -	if (lru && !capable(CAP_SYS_ADMIN))  | 
|---|
 | 363 | +	if (lru && !bpf_capable())  | 
|---|
| 255 | 364 |  	/* LRU implementation is much more complicated than other | 
|---|
| 256 |  | -		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.  | 
|---|
 | 365 | +		 * maps.  Hence, limit to CAP_BPF.  | 
|---|
| 257 | 366 |  		 */ | 
|---|
| 258 | 367 |  		return -EPERM; | 
|---|
| 259 | 368 |   | 
|---|
| 260 |  | -	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)  | 
|---|
| 261 |  | -		/* reserved bits should not be used */  | 
|---|
 | 369 | +	if (zero_seed && !capable(CAP_SYS_ADMIN))  | 
|---|
 | 370 | +		/* Guard against local DoS, and discourage production use. */  | 
|---|
 | 371 | +		return -EPERM;  | 
|---|
 | 372 | +  | 
|---|
 | 373 | +	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||  | 
|---|
 | 374 | +	    !bpf_map_flags_access_ok(attr->map_flags))  | 
|---|
| 262 | 375 |  		return -EINVAL; | 
|---|
| 263 | 376 |   | 
|---|
| 264 | 377 |  	if (!lru && percpu_lru) | 
|---|
| .. | .. | 
|---|
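
The check above gates BPF_F_ZERO_SEED behind CAP_SYS_ADMIN, since a predictable hash seed invites local DoS. A minimal userspace sketch (not part of this patch) of requesting a deterministic seed through the raw bpf(2) syscall is shown below; the key/value sizes are arbitrary and the `sys_bpf` wrapper name is purely illustrative.

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper; older glibc has no bpf() stub */
static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = 4;
	attr.value_size  = 8;
	attr.max_entries = 1024;
	attr.map_flags   = BPF_F_ZERO_SEED;	/* htab->hashrnd will be 0 */

	fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0) {
		perror("BPF_MAP_CREATE");	/* EPERM without CAP_SYS_ADMIN */
		return 1;
	}
	printf("hash map fd %d created with a zero hash seed\n", fd);
	return 0;
}
```
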
| 309 | 422 |  	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); | 
|---|
| 310 | 423 |  	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); | 
|---|
| 311 | 424 |  	struct bpf_htab *htab; | 
|---|
| 312 |  | -	int err, i;  | 
|---|
| 313 | 425 |  	u64 cost; | 
|---|
 | 426 | +	int err;  | 
|---|
| 314 | 427 |   | 
|---|
| 315 | 428 |  	htab = kzalloc(sizeof(*htab), GFP_USER); | 
|---|
| 316 | 429 |  	if (!htab) | 
|---|
| .. | .. | 
|---|
| 355 | 468 |  	else | 
|---|
| 356 | 469 |  	       cost += (u64) htab->elem_size * num_possible_cpus(); | 
|---|
| 357 | 470 |   | 
|---|
| 358 |  | -	if (cost >= U32_MAX - PAGE_SIZE)  | 
|---|
| 359 |  | -		/* make sure page count doesn't overflow */  | 
|---|
| 360 |  | -		goto free_htab;  | 
|---|
| 361 |  | -  | 
|---|
| 362 |  | -	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;  | 
|---|
| 363 |  | -  | 
|---|
| 364 |  | -	/* if map size is larger than memlock limit, reject it early */  | 
|---|
| 365 |  | -	err = bpf_map_precharge_memlock(htab->map.pages);  | 
|---|
 | 471 | +	/* if map size is larger than memlock limit, reject it */  | 
|---|
 | 472 | +	err = bpf_map_charge_init(&htab->map.memory, cost);  | 
|---|
| 366 | 473 |  	if (err) | 
|---|
| 367 | 474 |  		goto free_htab; | 
|---|
| 368 | 475 |   | 
|---|
| .. | .. | 
|---|
| 371 | 478 |  					   sizeof(struct bucket), | 
|---|
| 372 | 479 |  					   htab->map.numa_node); | 
|---|
| 373 | 480 |  	if (!htab->buckets) | 
|---|
| 374 |  | -		goto free_htab;  | 
|---|
 | 481 | +		goto free_charge;  | 
|---|
| 375 | 482 |   | 
|---|
| 376 |  | -	htab->hashrnd = get_random_int();  | 
|---|
| 377 |  | -	for (i = 0; i < htab->n_buckets; i++) {  | 
|---|
| 378 |  | -		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);  | 
|---|
| 379 |  | -		raw_spin_lock_init(&htab->buckets[i].lock);  | 
|---|
| 380 |  | -	}  | 
|---|
 | 483 | +	if (htab->map.map_flags & BPF_F_ZERO_SEED)  | 
|---|
 | 484 | +		htab->hashrnd = 0;  | 
|---|
 | 485 | +	else  | 
|---|
 | 486 | +		htab->hashrnd = get_random_int();  | 
|---|
 | 487 | +  | 
|---|
 | 488 | +	htab_init_buckets(htab);  | 
|---|
| 381 | 489 |   | 
|---|
| 382 | 490 |  	if (prealloc) { | 
|---|
| 383 | 491 |  		err = prealloc_init(htab); | 
|---|
| .. | .. | 
|---|
| 400 | 508 |  	prealloc_destroy(htab); | 
|---|
| 401 | 509 |  free_buckets: | 
|---|
| 402 | 510 |  	bpf_map_area_free(htab->buckets); | 
|---|
 | 511 | +free_charge:  | 
|---|
 | 512 | +	bpf_map_charge_finish(&htab->map.memory);  | 
|---|
| 403 | 513 |  free_htab: | 
|---|
| 404 | 514 |  	kfree(htab); | 
|---|
| 405 | 515 |  	return ERR_PTR(err); | 
|---|
| .. | .. | 
|---|
| 468 | 578 |  	struct htab_elem *l; | 
|---|
| 469 | 579 |  	u32 hash, key_size; | 
|---|
| 470 | 580 |   | 
|---|
| 471 |  | -	/* Must be called with rcu_read_lock. */  | 
|---|
| 472 |  | -	WARN_ON_ONCE(!rcu_read_lock_held());  | 
|---|
 | 581 | +	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());  | 
|---|
| 473 | 582 |   | 
|---|
| 474 | 583 |  	key_size = map->key_size; | 
|---|
| 475 | 584 |   | 
|---|
| .. | .. | 
|---|
| 503 | 612 |   * bpf_prog | 
|---|
| 504 | 613 |   *   __htab_map_lookup_elem | 
|---|
| 505 | 614 |   */ | 
|---|
| 506 |  | -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  | 
|---|
 | 615 | +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  | 
|---|
| 507 | 616 |  { | 
|---|
| 508 | 617 |  	struct bpf_insn *insn = insn_buf; | 
|---|
| 509 | 618 |  	const int ret = BPF_REG_0; | 
|---|
| .. | .. | 
|---|
| 542 | 651 |  	return __htab_lru_map_lookup_elem(map, key, false); | 
|---|
| 543 | 652 |  } | 
|---|
| 544 | 653 |   | 
|---|
| 545 |  | -static u32 htab_lru_map_gen_lookup(struct bpf_map *map,  | 
|---|
 | 654 | +static int htab_lru_map_gen_lookup(struct bpf_map *map,  | 
|---|
| 546 | 655 |  				   struct bpf_insn *insn_buf) | 
|---|
| 547 | 656 |  { | 
|---|
| 548 | 657 |  	struct bpf_insn *insn = insn_buf; | 
|---|
| .. | .. | 
|---|
| 583 | 692 |  	b = __select_bucket(htab, tgt_l->hash); | 
|---|
| 584 | 693 |  	head = &b->head; | 
|---|
| 585 | 694 |   | 
|---|
| 586 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 695 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 587 | 696 |   | 
|---|
| 588 | 697 |  	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) | 
|---|
| 589 | 698 |  		if (l == tgt_l) { | 
|---|
| .. | .. | 
|---|
| 591 | 700 |  			break; | 
|---|
| 592 | 701 |  		} | 
|---|
| 593 | 702 |   | 
|---|
| 594 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 703 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 595 | 704 |   | 
|---|
| 596 | 705 |  	return l == tgt_l; | 
|---|
| 597 | 706 |  } | 
|---|
| .. | .. | 
|---|
| 712 | 821 |  	} | 
|---|
| 713 | 822 |  } | 
|---|
| 714 | 823 |   | 
|---|
 | 824 | +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,  | 
|---|
 | 825 | +			    void *value, bool onallcpus)  | 
|---|
 | 826 | +{  | 
|---|
 | 827 | +	/* When using prealloc and not setting the initial value on all cpus,  | 
|---|
 | 828 | +	 * zero-fill element values for other cpus (just as happens when  | 
|---|
 | 829 | +	 * not using prealloc). Otherwise, the bpf program has no way to ensure  | 
|---|
 | 830 | +	 * known initial values for cpus other than the current one  | 
|---|
 | 831 | +	 * (onallcpus is always false when coming from a bpf prog).  | 
|---|
 | 832 | +	 */  | 
|---|
 | 833 | +	if (htab_is_prealloc(htab) && !onallcpus) {  | 
|---|
 | 834 | +		u32 size = round_up(htab->map.value_size, 8);  | 
|---|
 | 835 | +		int current_cpu = raw_smp_processor_id();  | 
|---|
 | 836 | +		int cpu;  | 
|---|
 | 837 | +  | 
|---|
 | 838 | +		for_each_possible_cpu(cpu) {  | 
|---|
 | 839 | +			if (cpu == current_cpu)  | 
|---|
 | 840 | +				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,  | 
|---|
 | 841 | +						size);  | 
|---|
 | 842 | +			else  | 
|---|
 | 843 | +				memset(per_cpu_ptr(pptr, cpu), 0, size);  | 
|---|
 | 844 | +		}  | 
|---|
 | 845 | +	} else {  | 
|---|
 | 846 | +		pcpu_copy_value(htab, pptr, value, onallcpus);  | 
|---|
 | 847 | +	}  | 
|---|
 | 848 | +}  | 
|---|
 | 849 | +  | 
|---|
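
pcpu_init_value() above zero-fills the slots of the non-current CPUs, so readers see deterministic values for every CPU. A hedged userspace sketch of the resulting per-cpu layout follows (not part of this patch): each CPU's slot is value_size rounded up to 8 bytes, mirroring the round_up(size, 8) used in this file. It assumes an 8-byte counter value and uses libbpf's libbpf_num_possible_cpus(); the function name `print_percpu` is illustrative.

```c
#include <stdio.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Dump one entry of a BPF_MAP_TYPE_PERCPU_HASH map. */
int print_percpu(int map_fd, __u32 key, __u32 value_size)
{
	int ncpu = libbpf_num_possible_cpus();
	size_t slot, i;
	char *buf;

	if (ncpu < 0)
		return ncpu;

	slot = (value_size + 7) & ~(size_t)7;	/* round_up(value_size, 8) */
	buf = calloc(ncpu, slot);
	if (!buf)
		return -1;

	if (bpf_map_lookup_elem(map_fd, &key, buf)) {
		free(buf);
		return -1;
	}

	for (i = 0; i < (size_t)ncpu; i++)
		printf("cpu%zu: %llu\n", i,
		       *(unsigned long long *)(buf + i * slot));

	free(buf);
	return 0;
}
```
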
| 715 | 850 |  static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) | 
|---|
| 716 | 851 |  { | 
|---|
| 717 | 852 |  	return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && | 
|---|
| 718 | 853 |  	       BITS_PER_LONG == 64; | 
|---|
| 719 |  | -}  | 
|---|
| 720 |  | -  | 
|---|
| 721 |  | -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)  | 
|---|
| 722 |  | -{  | 
|---|
| 723 |  | -	u32 size = htab->map.value_size;  | 
|---|
| 724 |  | -  | 
|---|
| 725 |  | -	if (percpu || fd_htab_map_needs_adjust(htab))  | 
|---|
| 726 |  | -		size = round_up(size, 8);  | 
|---|
| 727 |  | -	return size;  | 
|---|
| 728 | 854 |  } | 
|---|
| 729 | 855 |   | 
|---|
| 730 | 856 |  static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | 
|---|
| .. | .. | 
|---|
| 732 | 858 |  					 bool percpu, bool onallcpus, | 
|---|
| 733 | 859 |  					 struct htab_elem *old_elem) | 
|---|
| 734 | 860 |  { | 
|---|
| 735 |  | -	u32 size = htab_size_value(htab, percpu);  | 
|---|
 | 861 | +	u32 size = htab->map.value_size;  | 
|---|
| 736 | 862 |  	bool prealloc = htab_is_prealloc(htab); | 
|---|
| 737 | 863 |  	struct htab_elem *l_new, **pl_new; | 
|---|
| 738 | 864 |  	void __percpu *pptr; | 
|---|
| .. | .. | 
|---|
| 771 | 897 |  			l_new = ERR_PTR(-ENOMEM); | 
|---|
| 772 | 898 |  			goto dec_count; | 
|---|
| 773 | 899 |  		} | 
|---|
 | 900 | +		check_and_init_map_lock(&htab->map,  | 
|---|
 | 901 | +					l_new->key + round_up(key_size, 8));  | 
|---|
| 774 | 902 |  	} | 
|---|
| 775 | 903 |   | 
|---|
| 776 | 904 |  	memcpy(l_new->key, key, key_size); | 
|---|
| 777 | 905 |  	if (percpu) { | 
|---|
 | 906 | +		size = round_up(size, 8);  | 
|---|
| 778 | 907 |  		if (prealloc) { | 
|---|
| 779 | 908 |  			pptr = htab_elem_get_ptr(l_new, key_size); | 
|---|
| 780 | 909 |  		} else { | 
|---|
| .. | .. | 
|---|
| 788 | 917 |  			} | 
|---|
| 789 | 918 |  		} | 
|---|
| 790 | 919 |   | 
|---|
| 791 |  | -		pcpu_copy_value(htab, pptr, value, onallcpus);  | 
|---|
 | 920 | +		pcpu_init_value(htab, pptr, value, onallcpus);  | 
|---|
| 792 | 921 |   | 
|---|
| 793 | 922 |  		if (!prealloc) | 
|---|
| 794 | 923 |  			htab_elem_set_ptr(l_new, key_size, pptr); | 
|---|
| 795 |  | -	} else {  | 
|---|
 | 924 | +	} else if (fd_htab_map_needs_adjust(htab)) {  | 
|---|
 | 925 | +		size = round_up(size, 8);  | 
|---|
| 796 | 926 |  		memcpy(l_new->key + round_up(key_size, 8), value, size); | 
|---|
 | 927 | +	} else {  | 
|---|
 | 928 | +		copy_map_value(&htab->map,  | 
|---|
 | 929 | +			       l_new->key + round_up(key_size, 8),  | 
|---|
 | 930 | +			       value);  | 
|---|
| 797 | 931 |  	} | 
|---|
| 798 | 932 |   | 
|---|
| 799 | 933 |  	l_new->hash = hash; | 
|---|
| .. | .. | 
|---|
| 806 | 940 |  static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, | 
|---|
| 807 | 941 |  		       u64 map_flags) | 
|---|
| 808 | 942 |  { | 
|---|
| 809 |  | -	if (l_old && map_flags == BPF_NOEXIST)  | 
|---|
 | 943 | +	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)  | 
|---|
| 810 | 944 |  		/* elem already exists */ | 
|---|
| 811 | 945 |  		return -EEXIST; | 
|---|
| 812 | 946 |   | 
|---|
| 813 |  | -	if (!l_old && map_flags == BPF_EXIST)  | 
|---|
 | 947 | +	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)  | 
|---|
| 814 | 948 |  		/* elem doesn't exist, cannot update it */ | 
|---|
| 815 | 949 |  		return -ENOENT; | 
|---|
| 816 | 950 |   | 
|---|
| .. | .. | 
|---|
| 829 | 963 |  	u32 key_size, hash; | 
|---|
| 830 | 964 |  	int ret; | 
|---|
| 831 | 965 |   | 
|---|
| 832 |  | -	if (unlikely(map_flags > BPF_EXIST))  | 
|---|
 | 966 | +	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))  | 
|---|
| 833 | 967 |  		/* unknown flags */ | 
|---|
| 834 | 968 |  		return -EINVAL; | 
|---|
| 835 | 969 |   | 
|---|
| 836 |  | -	WARN_ON_ONCE(!rcu_read_lock_held());  | 
|---|
 | 970 | +	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());  | 
|---|
| 837 | 971 |   | 
|---|
| 838 | 972 |  	key_size = map->key_size; | 
|---|
| 839 | 973 |   | 
|---|
| .. | .. | 
|---|
| 842 | 976 |  	b = __select_bucket(htab, hash); | 
|---|
| 843 | 977 |  	head = &b->head; | 
|---|
| 844 | 978 |   | 
|---|
| 845 |  | -	/* bpf_map_update_elem() can be called in_irq() */  | 
|---|
| 846 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 979 | +	if (unlikely(map_flags & BPF_F_LOCK)) {  | 
|---|
 | 980 | +		if (unlikely(!map_value_has_spin_lock(map)))  | 
|---|
 | 981 | +			return -EINVAL;  | 
|---|
 | 982 | +		/* find an element without taking the bucket lock */  | 
|---|
 | 983 | +		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,  | 
|---|
 | 984 | +					      htab->n_buckets);  | 
|---|
 | 985 | +		ret = check_flags(htab, l_old, map_flags);  | 
|---|
 | 986 | +		if (ret)  | 
|---|
 | 987 | +			return ret;  | 
|---|
 | 988 | +		if (l_old) {  | 
|---|
 | 989 | +			/* grab the element lock and update value in place */  | 
|---|
 | 990 | +			copy_map_value_locked(map,  | 
|---|
 | 991 | +					      l_old->key + round_up(key_size, 8),  | 
|---|
 | 992 | +					      value, false);  | 
|---|
 | 993 | +			return 0;  | 
|---|
 | 994 | +		}  | 
|---|
 | 995 | +		/* fall through, grab the bucket lock and lookup again.  | 
|---|
 | 996 | +		 * 99.9% chance that the element won't be found,  | 
|---|
 | 997 | +		 * but second lookup under lock has to be done.  | 
|---|
 | 998 | +		 */  | 
|---|
 | 999 | +	}  | 
|---|
 | 1000 | +  | 
|---|
 | 1001 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 847 | 1002 |   | 
|---|
| 848 | 1003 |  	l_old = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 849 | 1004 |   | 
|---|
| 850 | 1005 |  	ret = check_flags(htab, l_old, map_flags); | 
|---|
| 851 | 1006 |  	if (ret) | 
|---|
| 852 | 1007 |  		goto err; | 
|---|
 | 1008 | +  | 
|---|
 | 1009 | +	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {  | 
|---|
 | 1010 | +		/* first lookup without the bucket lock didn't find the element,  | 
|---|
 | 1011 | +		 * but second lookup with the bucket lock found it.  | 
|---|
 | 1012 | +		 * This case is highly unlikely, but has to be dealt with:  | 
|---|
 | 1013 | +		 * grab the element lock in addition to the bucket lock  | 
|---|
 | 1014 | +		 * and update element in place  | 
|---|
 | 1015 | +		 */  | 
|---|
 | 1016 | +		copy_map_value_locked(map,  | 
|---|
 | 1017 | +				      l_old->key + round_up(key_size, 8),  | 
|---|
 | 1018 | +				      value, false);  | 
|---|
 | 1019 | +		ret = 0;  | 
|---|
 | 1020 | +		goto err;  | 
|---|
 | 1021 | +	}  | 
|---|
| 853 | 1022 |   | 
|---|
| 854 | 1023 |  	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, | 
|---|
| 855 | 1024 |  				l_old); | 
|---|
| .. | .. | 
|---|
| 870 | 1039 |  	} | 
|---|
| 871 | 1040 |  	ret = 0; | 
|---|
| 872 | 1041 |  err: | 
|---|
| 873 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1042 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 874 | 1043 |  	return ret; | 
|---|
| 875 | 1044 |  } | 
|---|
| 876 | 1045 |   | 
|---|
| .. | .. | 
|---|
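
The BPF_F_LOCK path above updates the value in place under the element's bpf_spin_lock instead of replacing the element. A hedged userspace sketch using libbpf's wrappers follows (not part of this patch); it assumes the map's value type starts with a struct bpf_spin_lock and that the map was created with BTF so that map_value_has_spin_lock() holds. The struct and function names below are illustrative.

```c
#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

/* value layout assumed by this sketch: spin lock first, payload after;
 * the map's BTF must describe the bpf_spin_lock member for BPF_F_LOCK
 * to be accepted.
 */
struct locked_val {
	struct bpf_spin_lock lock;
	long counter;
};

int update_locked(int map_fd, __u32 key, long counter)
{
	struct locked_val val = { .counter = counter };

	/* update the payload in place under the element's spin lock; the
	 * bucket lock is only taken when the element does not exist yet
	 */
	if (bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK)) {
		fprintf(stderr, "update failed: %d\n", -errno);
		return -1;
	}

	/* read back a consistent snapshot, again under the element lock */
	if (bpf_map_lookup_elem_flags(map_fd, &key, &val, BPF_F_LOCK))
		return -1;

	return 0;
}
```
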
| 889 | 1058 |  		/* unknown flags */ | 
|---|
| 890 | 1059 |  		return -EINVAL; | 
|---|
| 891 | 1060 |   | 
|---|
| 892 |  | -	WARN_ON_ONCE(!rcu_read_lock_held());  | 
|---|
 | 1061 | +	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());  | 
|---|
| 893 | 1062 |   | 
|---|
| 894 | 1063 |  	key_size = map->key_size; | 
|---|
| 895 | 1064 |   | 
|---|
| .. | .. | 
|---|
| 908 | 1077 |  		return -ENOMEM; | 
|---|
| 909 | 1078 |  	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); | 
|---|
| 910 | 1079 |   | 
|---|
| 911 |  | -	/* bpf_map_update_elem() can be called in_irq() */  | 
|---|
| 912 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 1080 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 913 | 1081 |   | 
|---|
| 914 | 1082 |  	l_old = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 915 | 1083 |   | 
|---|
| .. | .. | 
|---|
| 928 | 1096 |  	ret = 0; | 
|---|
| 929 | 1097 |   | 
|---|
| 930 | 1098 |  err: | 
|---|
| 931 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1099 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 932 | 1100 |   | 
|---|
| 933 | 1101 |  	if (ret) | 
|---|
| 934 | 1102 |  		bpf_lru_push_free(&htab->lru, &l_new->lru_node); | 
|---|
| .. | .. | 
|---|
| 963 | 1131 |  	b = __select_bucket(htab, hash); | 
|---|
| 964 | 1132 |  	head = &b->head; | 
|---|
| 965 | 1133 |   | 
|---|
| 966 |  | -	/* bpf_map_update_elem() can be called in_irq() */  | 
|---|
| 967 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 1134 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 968 | 1135 |   | 
|---|
| 969 | 1136 |  	l_old = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 970 | 1137 |   | 
|---|
| .. | .. | 
|---|
| 987 | 1154 |  	} | 
|---|
| 988 | 1155 |  	ret = 0; | 
|---|
| 989 | 1156 |  err: | 
|---|
| 990 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1157 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 991 | 1158 |  	return ret; | 
|---|
| 992 | 1159 |  } | 
|---|
| 993 | 1160 |   | 
|---|
| .. | .. | 
|---|
| 1027 | 1194 |  			return -ENOMEM; | 
|---|
| 1028 | 1195 |  	} | 
|---|
| 1029 | 1196 |   | 
|---|
| 1030 |  | -	/* bpf_map_update_elem() can be called in_irq() */  | 
|---|
| 1031 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 1197 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 1032 | 1198 |   | 
|---|
| 1033 | 1199 |  	l_old = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 1034 | 1200 |   | 
|---|
| .. | .. | 
|---|
| 1043 | 1209 |  		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), | 
|---|
| 1044 | 1210 |  				value, onallcpus); | 
|---|
| 1045 | 1211 |  	} else { | 
|---|
| 1046 |  | -		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),  | 
|---|
 | 1212 | +		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),  | 
|---|
| 1047 | 1213 |  				value, onallcpus); | 
|---|
| 1048 | 1214 |  		hlist_nulls_add_head_rcu(&l_new->hash_node, head); | 
|---|
| 1049 | 1215 |  		l_new = NULL; | 
|---|
| 1050 | 1216 |  	} | 
|---|
| 1051 | 1217 |  	ret = 0; | 
|---|
| 1052 | 1218 |  err: | 
|---|
| 1053 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1219 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 1054 | 1220 |  	if (l_new) | 
|---|
| 1055 | 1221 |  		bpf_lru_push_free(&htab->lru, &l_new->lru_node); | 
|---|
| 1056 | 1222 |  	return ret; | 
|---|
| .. | .. | 
|---|
| 1080 | 1246 |  	u32 hash, key_size; | 
|---|
| 1081 | 1247 |  	int ret = -ENOENT; | 
|---|
| 1082 | 1248 |   | 
|---|
| 1083 |  | -	WARN_ON_ONCE(!rcu_read_lock_held());  | 
|---|
 | 1249 | +	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());  | 
|---|
| 1084 | 1250 |   | 
|---|
| 1085 | 1251 |  	key_size = map->key_size; | 
|---|
| 1086 | 1252 |   | 
|---|
| .. | .. | 
|---|
| 1088 | 1254 |  	b = __select_bucket(htab, hash); | 
|---|
| 1089 | 1255 |  	head = &b->head; | 
|---|
| 1090 | 1256 |   | 
|---|
| 1091 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 1257 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 1092 | 1258 |   | 
|---|
| 1093 | 1259 |  	l = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 1094 | 1260 |   | 
|---|
| .. | .. | 
|---|
| 1098 | 1264 |  		ret = 0; | 
|---|
| 1099 | 1265 |  	} | 
|---|
| 1100 | 1266 |   | 
|---|
| 1101 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1267 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 1102 | 1268 |  	return ret; | 
|---|
| 1103 | 1269 |  } | 
|---|
| 1104 | 1270 |   | 
|---|
| .. | .. | 
|---|
| 1112 | 1278 |  	u32 hash, key_size; | 
|---|
| 1113 | 1279 |  	int ret = -ENOENT; | 
|---|
| 1114 | 1280 |   | 
|---|
| 1115 |  | -	WARN_ON_ONCE(!rcu_read_lock_held());  | 
|---|
 | 1281 | +	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());  | 
|---|
| 1116 | 1282 |   | 
|---|
| 1117 | 1283 |  	key_size = map->key_size; | 
|---|
| 1118 | 1284 |   | 
|---|
| .. | .. | 
|---|
| 1120 | 1286 |  	b = __select_bucket(htab, hash); | 
|---|
| 1121 | 1287 |  	head = &b->head; | 
|---|
| 1122 | 1288 |   | 
|---|
| 1123 |  | -	raw_spin_lock_irqsave(&b->lock, flags);  | 
|---|
 | 1289 | +	flags = htab_lock_bucket(htab, b);  | 
|---|
| 1124 | 1290 |   | 
|---|
| 1125 | 1291 |  	l = lookup_elem_raw(head, hash, key, key_size); | 
|---|
| 1126 | 1292 |   | 
|---|
| .. | .. | 
|---|
| 1129 | 1295 |  		ret = 0; | 
|---|
| 1130 | 1296 |  	} | 
|---|
| 1131 | 1297 |   | 
|---|
| 1132 |  | -	raw_spin_unlock_irqrestore(&b->lock, flags);  | 
|---|
 | 1298 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
| 1133 | 1299 |  	if (l) | 
|---|
| 1134 | 1300 |  		bpf_lru_push_free(&htab->lru, &l->lru_node); | 
|---|
| 1135 | 1301 |  	return ret; | 
|---|
| .. | .. | 
|---|
| 1156 | 1322 |  { | 
|---|
| 1157 | 1323 |  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | 
|---|
| 1158 | 1324 |   | 
|---|
| 1159 |  | -	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,  | 
|---|
| 1160 |  | -	 * so the programs (can be more than one that used this map) were  | 
|---|
| 1161 |  | -	 * disconnected from events. Wait for outstanding critical sections in  | 
|---|
| 1162 |  | -	 * these programs to complete  | 
|---|
 | 1325 | +	/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.  | 
|---|
 | 1326 | +	 * bpf_free_used_maps() is called after bpf prog is no longer executing.  | 
|---|
 | 1327 | +	 * There is no need to synchronize_rcu() here to protect map elements.  | 
|---|
| 1163 | 1328 |  	 */ | 
|---|
| 1164 |  | -	synchronize_rcu();  | 
|---|
| 1165 | 1329 |   | 
|---|
| 1166 | 1330 |  	/* some of free_htab_elem() callbacks for elements of this map may | 
|---|
| 1167 | 1331 |  	 * not have executed. Wait for them. | 
|---|
| .. | .. | 
|---|
| 1198 | 1362 |  	rcu_read_unlock(); | 
|---|
| 1199 | 1363 |  } | 
|---|
| 1200 | 1364 |   | 
|---|
 | 1365 | +static int  | 
|---|
 | 1366 | +__htab_map_lookup_and_delete_batch(struct bpf_map *map,  | 
|---|
 | 1367 | +				   const union bpf_attr *attr,  | 
|---|
 | 1368 | +				   union bpf_attr __user *uattr,  | 
|---|
 | 1369 | +				   bool do_delete, bool is_lru_map,  | 
|---|
 | 1370 | +				   bool is_percpu)  | 
|---|
 | 1371 | +{  | 
|---|
 | 1372 | +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  | 
|---|
 | 1373 | +	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;  | 
|---|
 | 1374 | +	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;  | 
|---|
 | 1375 | +	void __user *uvalues = u64_to_user_ptr(attr->batch.values);  | 
|---|
 | 1376 | +	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);  | 
|---|
 | 1377 | +	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);  | 
|---|
 | 1378 | +	u32 batch, max_count, size, bucket_size;  | 
|---|
 | 1379 | +	struct htab_elem *node_to_free = NULL;  | 
|---|
 | 1380 | +	u64 elem_map_flags, map_flags;  | 
|---|
 | 1381 | +	struct hlist_nulls_head *head;  | 
|---|
 | 1382 | +	struct hlist_nulls_node *n;  | 
|---|
 | 1383 | +	unsigned long flags = 0;  | 
|---|
 | 1384 | +	bool locked = false;  | 
|---|
 | 1385 | +	struct htab_elem *l;  | 
|---|
 | 1386 | +	struct bucket *b;  | 
|---|
 | 1387 | +	int ret = 0;  | 
|---|
 | 1388 | +  | 
|---|
 | 1389 | +	elem_map_flags = attr->batch.elem_flags;  | 
|---|
 | 1390 | +	if ((elem_map_flags & ~BPF_F_LOCK) ||  | 
|---|
 | 1391 | +	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))  | 
|---|
 | 1392 | +		return -EINVAL;  | 
|---|
 | 1393 | +  | 
|---|
 | 1394 | +	map_flags = attr->batch.flags;  | 
|---|
 | 1395 | +	if (map_flags)  | 
|---|
 | 1396 | +		return -EINVAL;  | 
|---|
 | 1397 | +  | 
|---|
 | 1398 | +	max_count = attr->batch.count;  | 
|---|
 | 1399 | +	if (!max_count)  | 
|---|
 | 1400 | +		return 0;  | 
|---|
 | 1401 | +  | 
|---|
 | 1402 | +	if (put_user(0, &uattr->batch.count))  | 
|---|
 | 1403 | +		return -EFAULT;  | 
|---|
 | 1404 | +  | 
|---|
 | 1405 | +	batch = 0;  | 
|---|
 | 1406 | +	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))  | 
|---|
 | 1407 | +		return -EFAULT;  | 
|---|
 | 1408 | +  | 
|---|
 | 1409 | +	if (batch >= htab->n_buckets)  | 
|---|
 | 1410 | +		return -ENOENT;  | 
|---|
 | 1411 | +  | 
|---|
 | 1412 | +	key_size = htab->map.key_size;  | 
|---|
 | 1413 | +	roundup_key_size = round_up(htab->map.key_size, 8);  | 
|---|
 | 1414 | +	value_size = htab->map.value_size;  | 
|---|
 | 1415 | +	size = round_up(value_size, 8);  | 
|---|
 | 1416 | +	if (is_percpu)  | 
|---|
 | 1417 | +		value_size = size * num_possible_cpus();  | 
|---|
 | 1418 | +	total = 0;  | 
|---|
 | 1419 | +	/* while experimenting with hash tables with sizes ranging from 10 to  | 
|---|
 | 1420 | +	 * 1000, it was observed that a bucket can have up to 5 entries.  | 
|---|
 | 1421 | +	 */  | 
|---|
 | 1422 | +	bucket_size = 5;  | 
|---|
 | 1423 | +  | 
|---|
 | 1424 | +alloc:  | 
|---|
 | 1425 | +	/* We cannot do copy_from_user or copy_to_user inside  | 
|---|
 | 1426 | +	 * the rcu_read_lock. Allocate enough space here.  | 
|---|
 | 1427 | +	 */  | 
|---|
 | 1428 | +	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);  | 
|---|
 | 1429 | +	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);  | 
|---|
 | 1430 | +	if (!keys || !values) {  | 
|---|
 | 1431 | +		ret = -ENOMEM;  | 
|---|
 | 1432 | +		goto after_loop;  | 
|---|
 | 1433 | +	}  | 
|---|
 | 1434 | +  | 
|---|
 | 1435 | +again:  | 
|---|
 | 1436 | +	bpf_disable_instrumentation();  | 
|---|
 | 1437 | +	rcu_read_lock();  | 
|---|
 | 1438 | +again_nocopy:  | 
|---|
 | 1439 | +	dst_key = keys;  | 
|---|
 | 1440 | +	dst_val = values;  | 
|---|
 | 1441 | +	b = &htab->buckets[batch];  | 
|---|
 | 1442 | +	head = &b->head;  | 
|---|
 | 1443 | +	/* do not grab the lock unless we need it (bucket_cnt > 0). */  | 
|---|
 | 1444 | +	if (locked)  | 
|---|
 | 1445 | +		flags = htab_lock_bucket(htab, b);  | 
|---|
 | 1446 | +  | 
|---|
 | 1447 | +	bucket_cnt = 0;  | 
|---|
 | 1448 | +	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)  | 
|---|
 | 1449 | +		bucket_cnt++;  | 
|---|
 | 1450 | +  | 
|---|
 | 1451 | +	if (bucket_cnt && !locked) {  | 
|---|
 | 1452 | +		locked = true;  | 
|---|
 | 1453 | +		goto again_nocopy;  | 
|---|
 | 1454 | +	}  | 
|---|
 | 1455 | +  | 
|---|
 | 1456 | +	if (bucket_cnt > (max_count - total)) {  | 
|---|
 | 1457 | +		if (total == 0)  | 
|---|
 | 1458 | +			ret = -ENOSPC;  | 
|---|
 | 1459 | +		/* Note that since bucket_cnt > 0 here, it is implicit  | 
|---|
 | 1460 | +		 * that the lock was grabbed, so release it.  | 
|---|
 | 1461 | +		 */  | 
|---|
 | 1462 | +		htab_unlock_bucket(htab, b, flags);  | 
|---|
 | 1463 | +		rcu_read_unlock();  | 
|---|
 | 1464 | +		bpf_enable_instrumentation();  | 
|---|
 | 1465 | +		goto after_loop;  | 
|---|
 | 1466 | +	}  | 
|---|
 | 1467 | +  | 
|---|
 | 1468 | +	if (bucket_cnt > bucket_size) {  | 
|---|
 | 1469 | +		bucket_size = bucket_cnt;  | 
|---|
 | 1470 | +		/* Note that since bucket_cnt > 0 here, it is implicit  | 
|---|
 | 1471 | +		 * that the lock was grabbed, so release it.  | 
|---|
 | 1472 | +		 */  | 
|---|
 | 1473 | +		htab_unlock_bucket(htab, b, flags);  | 
|---|
 | 1474 | +		rcu_read_unlock();  | 
|---|
 | 1475 | +		bpf_enable_instrumentation();  | 
|---|
 | 1476 | +		kvfree(keys);  | 
|---|
 | 1477 | +		kvfree(values);  | 
|---|
 | 1478 | +		goto alloc;  | 
|---|
 | 1479 | +	}  | 
|---|
 | 1480 | +  | 
|---|
 | 1481 | +	/* Next block is only safe to run if you have grabbed the lock */  | 
|---|
 | 1482 | +	if (!locked)  | 
|---|
 | 1483 | +		goto next_batch;  | 
|---|
 | 1484 | +  | 
|---|
 | 1485 | +	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {  | 
|---|
 | 1486 | +		memcpy(dst_key, l->key, key_size);  | 
|---|
 | 1487 | +  | 
|---|
 | 1488 | +		if (is_percpu) {  | 
|---|
 | 1489 | +			int off = 0, cpu;  | 
|---|
 | 1490 | +			void __percpu *pptr;  | 
|---|
 | 1491 | +  | 
|---|
 | 1492 | +			pptr = htab_elem_get_ptr(l, map->key_size);  | 
|---|
 | 1493 | +			for_each_possible_cpu(cpu) {  | 
|---|
 | 1494 | +				bpf_long_memcpy(dst_val + off,  | 
|---|
 | 1495 | +						per_cpu_ptr(pptr, cpu), size);  | 
|---|
 | 1496 | +				off += size;  | 
|---|
 | 1497 | +			}  | 
|---|
 | 1498 | +		} else {  | 
|---|
 | 1499 | +			value = l->key + roundup_key_size;  | 
|---|
 | 1500 | +			if (elem_map_flags & BPF_F_LOCK)  | 
|---|
 | 1501 | +				copy_map_value_locked(map, dst_val, value,  | 
|---|
 | 1502 | +						      true);  | 
|---|
 | 1503 | +			else  | 
|---|
 | 1504 | +				copy_map_value(map, dst_val, value);  | 
|---|
 | 1505 | +			check_and_init_map_lock(map, dst_val);  | 
|---|
 | 1506 | +		}  | 
|---|
 | 1507 | +		if (do_delete) {  | 
|---|
 | 1508 | +			hlist_nulls_del_rcu(&l->hash_node);  | 
|---|
 | 1509 | +  | 
|---|
 | 1510 | +			/* bpf_lru_push_free() will acquire lru_lock, which  | 
|---|
 | 1511 | +			 * may cause deadlock. See comments in function  | 
|---|
 | 1512 | +			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()  | 
|---|
 | 1513 | +			 * after releasing the bucket lock.  | 
|---|
 | 1514 | +			 */  | 
|---|
 | 1515 | +			if (is_lru_map) {  | 
|---|
 | 1516 | +				l->batch_flink = node_to_free;  | 
|---|
 | 1517 | +				node_to_free = l;  | 
|---|
 | 1518 | +			} else {  | 
|---|
 | 1519 | +				free_htab_elem(htab, l);  | 
|---|
 | 1520 | +			}  | 
|---|
 | 1521 | +		}  | 
|---|
 | 1522 | +		dst_key += key_size;  | 
|---|
 | 1523 | +		dst_val += value_size;  | 
|---|
 | 1524 | +	}  | 
|---|
 | 1525 | +  | 
|---|
 | 1526 | +	htab_unlock_bucket(htab, b, flags);  | 
|---|
 | 1527 | +	locked = false;  | 
|---|
 | 1528 | +  | 
|---|
 | 1529 | +	while (node_to_free) {  | 
|---|
 | 1530 | +		l = node_to_free;  | 
|---|
 | 1531 | +		node_to_free = node_to_free->batch_flink;  | 
|---|
 | 1532 | +		bpf_lru_push_free(&htab->lru, &l->lru_node);  | 
|---|
 | 1533 | +	}  | 
|---|
 | 1534 | +  | 
|---|
 | 1535 | +next_batch:  | 
|---|
 | 1536 | +	/* If we are not copying data, we can go to the next bucket and avoid  | 
|---|
 | 1537 | +	 * dropping the RCU read lock.  | 
|---|
 | 1538 | +	 */  | 
|---|
 | 1539 | +	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {  | 
|---|
 | 1540 | +		batch++;  | 
|---|
 | 1541 | +		goto again_nocopy;  | 
|---|
 | 1542 | +	}  | 
|---|
 | 1543 | +  | 
|---|
 | 1544 | +	rcu_read_unlock();  | 
|---|
 | 1545 | +	bpf_enable_instrumentation();  | 
|---|
 | 1546 | +	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,  | 
|---|
 | 1547 | +	    key_size * bucket_cnt) ||  | 
|---|
 | 1548 | +	    copy_to_user(uvalues + total * value_size, values,  | 
|---|
 | 1549 | +	    value_size * bucket_cnt))) {  | 
|---|
 | 1550 | +		ret = -EFAULT;  | 
|---|
 | 1551 | +		goto after_loop;  | 
|---|
 | 1552 | +	}  | 
|---|
 | 1553 | +  | 
|---|
 | 1554 | +	total += bucket_cnt;  | 
|---|
 | 1555 | +	batch++;  | 
|---|
 | 1556 | +	if (batch >= htab->n_buckets) {  | 
|---|
 | 1557 | +		ret = -ENOENT;  | 
|---|
 | 1558 | +		goto after_loop;  | 
|---|
 | 1559 | +	}  | 
|---|
 | 1560 | +	goto again;  | 
|---|
 | 1561 | +  | 
|---|
 | 1562 | +after_loop:  | 
|---|
 | 1563 | +	if (ret == -EFAULT)  | 
|---|
 | 1564 | +		goto out;  | 
|---|
 | 1565 | +  | 
|---|
 | 1566 | +	/* copy # of entries and next batch */  | 
|---|
 | 1567 | +	ubatch = u64_to_user_ptr(attr->batch.out_batch);  | 
|---|
 | 1568 | +	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||  | 
|---|
 | 1569 | +	    put_user(total, &uattr->batch.count))  | 
|---|
 | 1570 | +		ret = -EFAULT;  | 
|---|
 | 1571 | +  | 
|---|
 | 1572 | +out:  | 
|---|
 | 1573 | +	kvfree(keys);  | 
|---|
 | 1574 | +	kvfree(values);  | 
|---|
 | 1575 | +	return ret;  | 
|---|
 | 1576 | +}  | 
|---|
 | 1577 | +  | 
|---|
 | 1578 | +static int  | 
|---|
 | 1579 | +htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,  | 
|---|
 | 1580 | +			     union bpf_attr __user *uattr)  | 
|---|
 | 1581 | +{  | 
|---|
 | 1582 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,  | 
|---|
 | 1583 | +						  false, true);  | 
|---|
 | 1584 | +}  | 
|---|
 | 1585 | +  | 
|---|
 | 1586 | +static int  | 
|---|
 | 1587 | +htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,  | 
|---|
 | 1588 | +					const union bpf_attr *attr,  | 
|---|
 | 1589 | +					union bpf_attr __user *uattr)  | 
|---|
 | 1590 | +{  | 
|---|
 | 1591 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,  | 
|---|
 | 1592 | +						  false, true);  | 
|---|
 | 1593 | +}  | 
|---|
 | 1594 | +  | 
|---|
 | 1595 | +static int  | 
|---|
 | 1596 | +htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,  | 
|---|
 | 1597 | +		      union bpf_attr __user *uattr)  | 
|---|
 | 1598 | +{  | 
|---|
 | 1599 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,  | 
|---|
 | 1600 | +						  false, false);  | 
|---|
 | 1601 | +}  | 
|---|
 | 1602 | +  | 
|---|
 | 1603 | +static int  | 
|---|
 | 1604 | +htab_map_lookup_and_delete_batch(struct bpf_map *map,  | 
|---|
 | 1605 | +				 const union bpf_attr *attr,  | 
|---|
 | 1606 | +				 union bpf_attr __user *uattr)  | 
|---|
 | 1607 | +{  | 
|---|
 | 1608 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,  | 
|---|
 | 1609 | +						  false, false);  | 
|---|
 | 1610 | +}  | 
|---|
 | 1611 | +  | 
|---|
 | 1612 | +static int  | 
|---|
 | 1613 | +htab_lru_percpu_map_lookup_batch(struct bpf_map *map,  | 
|---|
 | 1614 | +				 const union bpf_attr *attr,  | 
|---|
 | 1615 | +				 union bpf_attr __user *uattr)  | 
|---|
 | 1616 | +{  | 
|---|
 | 1617 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,  | 
|---|
 | 1618 | +						  true, true);  | 
|---|
 | 1619 | +}  | 
|---|
 | 1620 | +  | 
|---|
 | 1621 | +static int  | 
|---|
 | 1622 | +htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,  | 
|---|
 | 1623 | +					    const union bpf_attr *attr,  | 
|---|
 | 1624 | +					    union bpf_attr __user *uattr)  | 
|---|
 | 1625 | +{  | 
|---|
 | 1626 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,  | 
|---|
 | 1627 | +						  true, true);  | 
|---|
 | 1628 | +}  | 
|---|
 | 1629 | +  | 
|---|
 | 1630 | +static int  | 
|---|
 | 1631 | +htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,  | 
|---|
 | 1632 | +			  union bpf_attr __user *uattr)  | 
|---|
 | 1633 | +{  | 
|---|
 | 1634 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,  | 
|---|
 | 1635 | +						  true, false);  | 
|---|
 | 1636 | +}  | 
|---|
 | 1637 | +  | 
|---|
 | 1638 | +static int  | 
|---|
 | 1639 | +htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,  | 
|---|
 | 1640 | +				     const union bpf_attr *attr,  | 
|---|
 | 1641 | +				     union bpf_attr __user *uattr)  | 
|---|
 | 1642 | +{  | 
|---|
 | 1643 | +	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,  | 
|---|
 | 1644 | +						  true, false);  | 
|---|
 | 1645 | +}  | 
|---|
 | 1646 | +  | 
|---|
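
A hedged sketch of consuming the batch interface from userspace follows (not part of this patch), using libbpf's bpf_map_lookup_batch(): the out_batch cookie from one call is fed back as in_batch for the next, and ENOENT signals that all buckets have been walked. The 4-byte key / 8-byte value layout and the BATCH chunk size are assumptions for illustration.

```c
#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

#define BATCH 64	/* arbitrary chunk size for this sketch */

/* Walk a hash map with 4-byte keys and 8-byte values in chunks. */
int dump_map(int map_fd)
{
	__u32 batch, count;
	__u32 keys[BATCH];
	__u64 vals[BATCH];
	void *in = NULL;	/* NULL means: start from the first bucket */
	int err;

	do {
		count = BATCH;	/* in/out: capacity in, copied entries out */
		err = bpf_map_lookup_batch(map_fd, in, &batch,
					   keys, vals, &count, NULL);
		if (err && errno != ENOENT)
			return -errno;

		for (__u32 i = 0; i < count; i++)
			printf("key %u -> val %llu\n", keys[i],
			       (unsigned long long)vals[i]);

		in = &batch;	/* resume from where the last call stopped */
	} while (!err);		/* ENOENT: all buckets have been walked */

	return 0;
}
```
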
 | 1647 | +struct bpf_iter_seq_hash_map_info {  | 
|---|
 | 1648 | +	struct bpf_map *map;  | 
|---|
 | 1649 | +	struct bpf_htab *htab;  | 
|---|
 | 1650 | +	void *percpu_value_buf; // non-zero means percpu hash  | 
|---|
 | 1651 | +	u32 bucket_id;  | 
|---|
 | 1652 | +	u32 skip_elems;  | 
|---|
 | 1653 | +};  | 
|---|
 | 1654 | +  | 
|---|
 | 1655 | +static struct htab_elem *  | 
|---|
 | 1656 | +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,  | 
|---|
 | 1657 | +			   struct htab_elem *prev_elem)  | 
|---|
 | 1658 | +{  | 
|---|
 | 1659 | +	const struct bpf_htab *htab = info->htab;  | 
|---|
 | 1660 | +	u32 skip_elems = info->skip_elems;  | 
|---|
 | 1661 | +	u32 bucket_id = info->bucket_id;  | 
|---|
 | 1662 | +	struct hlist_nulls_head *head;  | 
|---|
 | 1663 | +	struct hlist_nulls_node *n;  | 
|---|
 | 1664 | +	struct htab_elem *elem;  | 
|---|
 | 1665 | +	struct bucket *b;  | 
|---|
 | 1666 | +	u32 i, count;  | 
|---|
 | 1667 | +  | 
|---|
 | 1668 | +	if (bucket_id >= htab->n_buckets)  | 
|---|
 | 1669 | +		return NULL;  | 
|---|
 | 1670 | +  | 
|---|
 | 1671 | +	/* try to find next elem in the same bucket */  | 
|---|
 | 1672 | +	if (prev_elem) {  | 
|---|
 | 1673 | +		/* no update/deletion on this bucket, prev_elem should still be valid  | 
|---|
 | 1674 | +		 * and we won't skip elements.  | 
|---|
 | 1675 | +		 */  | 
|---|
 | 1676 | +		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));  | 
|---|
 | 1677 | +		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);  | 
|---|
 | 1678 | +		if (elem)  | 
|---|
 | 1679 | +			return elem;  | 
|---|
 | 1680 | +  | 
|---|
 | 1681 | +		/* not found, unlock and go to the next bucket */  | 
|---|
 | 1682 | +		b = &htab->buckets[bucket_id++];  | 
|---|
 | 1683 | +		rcu_read_unlock();  | 
|---|
 | 1684 | +		skip_elems = 0;  | 
|---|
 | 1685 | +	}  | 
|---|
 | 1686 | +  | 
|---|
 | 1687 | +	for (i = bucket_id; i < htab->n_buckets; i++) {  | 
|---|
 | 1688 | +		b = &htab->buckets[i];  | 
|---|
 | 1689 | +		rcu_read_lock();  | 
|---|
 | 1690 | +  | 
|---|
 | 1691 | +		count = 0;  | 
|---|
 | 1692 | +		head = &b->head;  | 
|---|
 | 1693 | +		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {  | 
|---|
 | 1694 | +			if (count >= skip_elems) {  | 
|---|
 | 1695 | +				info->bucket_id = i;  | 
|---|
 | 1696 | +				info->skip_elems = count;  | 
|---|
 | 1697 | +				return elem;  | 
|---|
 | 1698 | +			}  | 
|---|
 | 1699 | +			count++;  | 
|---|
 | 1700 | +		}  | 
|---|
 | 1701 | +  | 
|---|
 | 1702 | +		rcu_read_unlock();  | 
|---|
 | 1703 | +		skip_elems = 0;  | 
|---|
 | 1704 | +	}  | 
|---|
 | 1705 | +  | 
|---|
 | 1706 | +	info->bucket_id = i;  | 
|---|
 | 1707 | +	info->skip_elems = 0;  | 
|---|
 | 1708 | +	return NULL;  | 
|---|
 | 1709 | +}  | 
|---|
 | 1710 | +  | 
|---|
 | 1711 | +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)  | 
|---|
 | 1712 | +{  | 
|---|
 | 1713 | +	struct bpf_iter_seq_hash_map_info *info = seq->private;  | 
|---|
 | 1714 | +	struct htab_elem *elem;  | 
|---|
 | 1715 | +  | 
|---|
 | 1716 | +	elem = bpf_hash_map_seq_find_next(info, NULL);  | 
|---|
 | 1717 | +	if (!elem)  | 
|---|
 | 1718 | +		return NULL;  | 
|---|
 | 1719 | +  | 
|---|
 | 1720 | +	if (*pos == 0)  | 
|---|
 | 1721 | +		++*pos;  | 
|---|
 | 1722 | +	return elem;  | 
|---|
 | 1723 | +}  | 
|---|
 | 1724 | +  | 
|---|
 | 1725 | +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)  | 
|---|
 | 1726 | +{  | 
|---|
 | 1727 | +	struct bpf_iter_seq_hash_map_info *info = seq->private;  | 
|---|
 | 1728 | +  | 
|---|
 | 1729 | +	++*pos;  | 
|---|
 | 1730 | +	++info->skip_elems;  | 
|---|
 | 1731 | +	return bpf_hash_map_seq_find_next(info, v);  | 
|---|
 | 1732 | +}  | 
|---|
 | 1733 | +  | 
|---|
 | 1734 | +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)  | 
|---|
 | 1735 | +{  | 
|---|
 | 1736 | +	struct bpf_iter_seq_hash_map_info *info = seq->private;  | 
|---|
 | 1737 | +	u32 roundup_key_size, roundup_value_size;  | 
|---|
 | 1738 | +	struct bpf_iter__bpf_map_elem ctx = {};  | 
|---|
 | 1739 | +	struct bpf_map *map = info->map;  | 
|---|
 | 1740 | +	struct bpf_iter_meta meta;  | 
|---|
 | 1741 | +	int ret = 0, off = 0, cpu;  | 
|---|
 | 1742 | +	struct bpf_prog *prog;  | 
|---|
 | 1743 | +	void __percpu *pptr;  | 
|---|
 | 1744 | +  | 
|---|
 | 1745 | +	meta.seq = seq;  | 
|---|
 | 1746 | +	prog = bpf_iter_get_info(&meta, elem == NULL);  | 
|---|
 | 1747 | +	if (prog) {  | 
|---|
 | 1748 | +		ctx.meta = &meta;  | 
|---|
 | 1749 | +		ctx.map = info->map;  | 
|---|
 | 1750 | +		if (elem) {  | 
|---|
 | 1751 | +			roundup_key_size = round_up(map->key_size, 8);  | 
|---|
 | 1752 | +			ctx.key = elem->key;  | 
|---|
 | 1753 | +			if (!info->percpu_value_buf) {  | 
|---|
 | 1754 | +				ctx.value = elem->key + roundup_key_size;  | 
|---|
 | 1755 | +			} else {  | 
|---|
 | 1756 | +				roundup_value_size = round_up(map->value_size, 8);  | 
|---|
 | 1757 | +				pptr = htab_elem_get_ptr(elem, map->key_size);  | 
|---|
 | 1758 | +				for_each_possible_cpu(cpu) {  | 
|---|
 | 1759 | +					bpf_long_memcpy(info->percpu_value_buf + off,  | 
|---|
 | 1760 | +							per_cpu_ptr(pptr, cpu),  | 
|---|
 | 1761 | +							roundup_value_size);  | 
|---|
 | 1762 | +					off += roundup_value_size;  | 
|---|
 | 1763 | +				}  | 
|---|
 | 1764 | +				ctx.value = info->percpu_value_buf;  | 
|---|
 | 1765 | +			}  | 
|---|
 | 1766 | +		}  | 
|---|
 | 1767 | +		ret = bpf_iter_run_prog(prog, &ctx);  | 
|---|
 | 1768 | +	}  | 
|---|
 | 1769 | +  | 
|---|
 | 1770 | +	return ret;  | 
|---|
 | 1771 | +}  | 
|---|
 | 1772 | +  | 
|---|
 | 1773 | +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)  | 
|---|
 | 1774 | +{  | 
|---|
 | 1775 | +	return __bpf_hash_map_seq_show(seq, v);  | 
|---|
 | 1776 | +}  | 
|---|
 | 1777 | +  | 
|---|
 | 1778 | +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)  | 
|---|
 | 1779 | +{  | 
|---|
 | 1780 | +	if (!v)  | 
|---|
 | 1781 | +		(void)__bpf_hash_map_seq_show(seq, NULL);  | 
|---|
 | 1782 | +	else  | 
|---|
 | 1783 | +		rcu_read_unlock();  | 
|---|
 | 1784 | +}  | 
|---|
 | 1785 | +  | 
|---|
 | 1786 | +static int bpf_iter_init_hash_map(void *priv_data,  | 
|---|
 | 1787 | +				  struct bpf_iter_aux_info *aux)  | 
|---|
 | 1788 | +{  | 
|---|
 | 1789 | +	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;  | 
|---|
 | 1790 | +	struct bpf_map *map = aux->map;  | 
|---|
 | 1791 | +	void *value_buf;  | 
|---|
 | 1792 | +	u32 buf_size;  | 
|---|
 | 1793 | +  | 
|---|
 | 1794 | +	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||  | 
|---|
 | 1795 | +	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {  | 
|---|
 | 1796 | +		buf_size = round_up(map->value_size, 8) * num_possible_cpus();  | 
|---|
 | 1797 | +		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);  | 
|---|
 | 1798 | +		if (!value_buf)  | 
|---|
 | 1799 | +			return -ENOMEM;  | 
|---|
 | 1800 | +  | 
|---|
 | 1801 | +		seq_info->percpu_value_buf = value_buf;  | 
|---|
 | 1802 | +	}  | 
|---|
 | 1803 | +  | 
|---|
 | 1804 | +	bpf_map_inc_with_uref(map);  | 
|---|
 | 1805 | +	seq_info->map = map;  | 
|---|
 | 1806 | +	seq_info->htab = container_of(map, struct bpf_htab, map);  | 
|---|
 | 1807 | +	return 0;  | 
|---|
 | 1808 | +}  | 
|---|
 | 1809 | +  | 
|---|
 | 1810 | +static void bpf_iter_fini_hash_map(void *priv_data)  | 
|---|
 | 1811 | +{  | 
|---|
 | 1812 | +	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;  | 
|---|
 | 1813 | +  | 
|---|
 | 1814 | +	bpf_map_put_with_uref(seq_info->map);  | 
|---|
 | 1815 | +	kfree(seq_info->percpu_value_buf);  | 
|---|
 | 1816 | +}  | 
|---|
 | 1817 | +  | 
|---|
 | 1818 | +static const struct seq_operations bpf_hash_map_seq_ops = {  | 
|---|
 | 1819 | +	.start	= bpf_hash_map_seq_start,  | 
|---|
 | 1820 | +	.next	= bpf_hash_map_seq_next,  | 
|---|
 | 1821 | +	.stop	= bpf_hash_map_seq_stop,  | 
|---|
 | 1822 | +	.show	= bpf_hash_map_seq_show,  | 
|---|
 | 1823 | +};  | 
|---|
 | 1824 | +  | 
|---|
 | 1825 | +static const struct bpf_iter_seq_info iter_seq_info = {  | 
|---|
 | 1826 | +	.seq_ops		= &bpf_hash_map_seq_ops,  | 
|---|
 | 1827 | +	.init_seq_private	= bpf_iter_init_hash_map,  | 
|---|
 | 1828 | +	.fini_seq_private	= bpf_iter_fini_hash_map,  | 
|---|
 | 1829 | +	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),  | 
|---|
 | 1830 | +};  | 
|---|
 | 1831 | +  | 
|---|
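
The iter_seq_info wiring above exposes hash maps to the BPF map-element iterator. A hedged BPF-side sketch is shown below (not part of this patch); it assumes a map with 4-byte keys and 8-byte values, that vmlinux.h (or equivalent BTF-generated headers) provides struct bpf_iter__bpf_map_elem, and that the program is attached from userspace via bpf_program__attach_iter() against the map fd.

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/bpf_map_elem")
int dump_htab(struct bpf_iter__bpf_map_elem *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	__u32 *key = ctx->key;
	__u64 *val = ctx->value;

	/* key/value are NULL on the final call (elem == NULL in
	 * __bpf_hash_map_seq_show() above)
	 */
	if (!key || !val)
		return 0;

	/* emit raw key and value bytes; a real tool would format them */
	bpf_seq_write(seq, key, sizeof(*key));
	bpf_seq_write(seq, val, sizeof(*val));
	return 0;
}
```
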
 | 1832 | +static int htab_map_btf_id;  | 
|---|
| 1201 | 1833 |  const struct bpf_map_ops htab_map_ops = { | 
|---|
 | 1834 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 1202 | 1835 |  	.map_alloc_check = htab_map_alloc_check, | 
|---|
| 1203 | 1836 |  	.map_alloc = htab_map_alloc, | 
|---|
| 1204 | 1837 |  	.map_free = htab_map_free, | 
|---|
| .. | .. | 
|---|
| 1208 | 1841 |  	.map_delete_elem = htab_map_delete_elem, | 
|---|
| 1209 | 1842 |  	.map_gen_lookup = htab_map_gen_lookup, | 
|---|
| 1210 | 1843 |  	.map_seq_show_elem = htab_map_seq_show_elem, | 
|---|
 | 1844 | +	BATCH_OPS(htab),  | 
|---|
 | 1845 | +	.map_btf_name = "bpf_htab",  | 
|---|
 | 1846 | +	.map_btf_id = &htab_map_btf_id,  | 
|---|
 | 1847 | +	.iter_seq_info = &iter_seq_info,  | 
|---|
| 1211 | 1848 |  }; | 
|---|
| 1212 | 1849 |   | 
|---|
 | 1850 | +static int htab_lru_map_btf_id;  | 
|---|
| 1213 | 1851 |  const struct bpf_map_ops htab_lru_map_ops = { | 
|---|
 | 1852 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 1214 | 1853 |  	.map_alloc_check = htab_map_alloc_check, | 
|---|
| 1215 | 1854 |  	.map_alloc = htab_map_alloc, | 
|---|
| 1216 | 1855 |  	.map_free = htab_map_free, | 
|---|
| .. | .. | 
|---|
| 1221 | 1860 |  	.map_delete_elem = htab_lru_map_delete_elem, | 
|---|
| 1222 | 1861 |  	.map_gen_lookup = htab_lru_map_gen_lookup, | 
|---|
| 1223 | 1862 |  	.map_seq_show_elem = htab_map_seq_show_elem, | 
|---|
 | 1863 | +	BATCH_OPS(htab_lru),  | 
|---|
 | 1864 | +	.map_btf_name = "bpf_htab",  | 
|---|
 | 1865 | +	.map_btf_id = &htab_lru_map_btf_id,  | 
|---|
 | 1866 | +	.iter_seq_info = &iter_seq_info,  | 
|---|
| 1224 | 1867 |  }; | 
|---|
| 1225 | 1868 |   | 
|---|
| 1226 | 1869 |  /* Called from eBPF program */ | 
|---|
| .. | .. | 
|---|
| 1296 | 1939 |  	return ret; | 
|---|
| 1297 | 1940 |  } | 
|---|
| 1298 | 1941 |   | 
|---|
 | 1942 | +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,  | 
|---|
 | 1943 | +					  struct seq_file *m)  | 
|---|
 | 1944 | +{  | 
|---|
 | 1945 | +	struct htab_elem *l;  | 
|---|
 | 1946 | +	void __percpu *pptr;  | 
|---|
 | 1947 | +	int cpu;  | 
|---|
 | 1948 | +  | 
|---|
 | 1949 | +	rcu_read_lock();  | 
|---|
 | 1950 | +  | 
|---|
 | 1951 | +	l = __htab_map_lookup_elem(map, key);  | 
|---|
 | 1952 | +	if (!l) {  | 
|---|
 | 1953 | +		rcu_read_unlock();  | 
|---|
 | 1954 | +		return;  | 
|---|
 | 1955 | +	}  | 
|---|
 | 1956 | +  | 
|---|
 | 1957 | +	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);  | 
|---|
 | 1958 | +	seq_puts(m, ": {\n");  | 
|---|
 | 1959 | +	pptr = htab_elem_get_ptr(l, map->key_size);  | 
|---|
 | 1960 | +	for_each_possible_cpu(cpu) {  | 
|---|
 | 1961 | +		seq_printf(m, "\tcpu%d: ", cpu);  | 
|---|
 | 1962 | +		btf_type_seq_show(map->btf, map->btf_value_type_id,  | 
|---|
 | 1963 | +				  per_cpu_ptr(pptr, cpu), m);  | 
|---|
 | 1964 | +		seq_puts(m, "\n");  | 
|---|
 | 1965 | +	}  | 
|---|
 | 1966 | +	seq_puts(m, "}\n");  | 
|---|
 | 1967 | +  | 
|---|
 | 1968 | +	rcu_read_unlock();  | 
|---|
 | 1969 | +}  | 
|---|
 | 1970 | +  | 
|---|
 | 1971 | +static int htab_percpu_map_btf_id;  | 
|---|
| 1299 | 1972 |  const struct bpf_map_ops htab_percpu_map_ops = { | 
|---|
 | 1973 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 1300 | 1974 |  	.map_alloc_check = htab_map_alloc_check, | 
|---|
| 1301 | 1975 |  	.map_alloc = htab_map_alloc, | 
|---|
| 1302 | 1976 |  	.map_free = htab_map_free, | 
|---|
| .. | .. | 
|---|
| 1304 | 1978 |  	.map_lookup_elem = htab_percpu_map_lookup_elem, | 
|---|
| 1305 | 1979 |  	.map_update_elem = htab_percpu_map_update_elem, | 
|---|
| 1306 | 1980 |  	.map_delete_elem = htab_map_delete_elem, | 
|---|
 | 1981 | +	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  | 
|---|
 | 1982 | +	BATCH_OPS(htab_percpu),  | 
|---|
 | 1983 | +	.map_btf_name = "bpf_htab",  | 
|---|
 | 1984 | +	.map_btf_id = &htab_percpu_map_btf_id,  | 
|---|
 | 1985 | +	.iter_seq_info = &iter_seq_info,  | 
|---|
| 1307 | 1986 |  }; | 
|---|
| 1308 | 1987 |   | 
|---|
 | 1988 | +static int htab_lru_percpu_map_btf_id;  | 
|---|
| 1309 | 1989 |  const struct bpf_map_ops htab_lru_percpu_map_ops = { | 
|---|
 | 1990 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 1310 | 1991 |  	.map_alloc_check = htab_map_alloc_check, | 
|---|
| 1311 | 1992 |  	.map_alloc = htab_map_alloc, | 
|---|
| 1312 | 1993 |  	.map_free = htab_map_free, | 
|---|
| .. | .. | 
|---|
| 1314 | 1995 |  	.map_lookup_elem = htab_lru_percpu_map_lookup_elem, | 
|---|
| 1315 | 1996 |  	.map_update_elem = htab_lru_percpu_map_update_elem, | 
|---|
| 1316 | 1997 |  	.map_delete_elem = htab_lru_map_delete_elem, | 
|---|
 | 1998 | +	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  | 
|---|
 | 1999 | +	BATCH_OPS(htab_lru_percpu),  | 
|---|
 | 2000 | +	.map_btf_name = "bpf_htab",  | 
|---|
 | 2001 | +	.map_btf_id = &htab_lru_percpu_map_btf_id,  | 
|---|
 | 2002 | +	.iter_seq_info = &iter_seq_info,  | 
|---|
| 1317 | 2003 |  }; | 
|---|
| 1318 | 2004 |   | 
|---|
| 1319 | 2005 |  static int fd_htab_map_alloc_check(union bpf_attr *attr) | 
|---|
| .. | .. | 
|---|
| 1412 | 2098 |  	return READ_ONCE(*inner_map); | 
|---|
| 1413 | 2099 |  } | 
|---|
| 1414 | 2100 |   | 
|---|
| 1415 |  | -static u32 htab_of_map_gen_lookup(struct bpf_map *map,  | 
|---|
 | 2101 | +static int htab_of_map_gen_lookup(struct bpf_map *map,  | 
|---|
| 1416 | 2102 |  				  struct bpf_insn *insn_buf) | 
|---|
| 1417 | 2103 |  { | 
|---|
| 1418 | 2104 |  	struct bpf_insn *insn = insn_buf; | 
|---|
| .. | .. | 
|---|
| 1436 | 2122 |  	fd_htab_map_free(map); | 
|---|
| 1437 | 2123 |  } | 
|---|
| 1438 | 2124 |   | 
|---|
 | 2125 | +static int htab_of_maps_map_btf_id;  | 
|---|
| 1439 | 2126 |  const struct bpf_map_ops htab_of_maps_map_ops = { | 
|---|
| 1440 | 2127 |  	.map_alloc_check = fd_htab_map_alloc_check, | 
|---|
| 1441 | 2128 |  	.map_alloc = htab_of_map_alloc, | 
|---|
| .. | .. | 
|---|
| 1448 | 2135 |  	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, | 
|---|
| 1449 | 2136 |  	.map_gen_lookup = htab_of_map_gen_lookup, | 
|---|
| 1450 | 2137 |  	.map_check_btf = map_check_no_btf, | 
|---|
 | 2138 | +	.map_btf_name = "bpf_htab",  | 
|---|
 | 2139 | +	.map_btf_id = &htab_of_maps_map_btf_id,  | 
|---|
| 1451 | 2140 |  }; | 
|---|