| .. | .. | 
|---|
 | 1 | +// SPDX-License-Identifier: GPL-2.0-only  | 
|---|
| 1 | 2 |  /* | 
|---|
| 2 | 3 |   * Longest prefix match list implementation | 
|---|
| 3 | 4 |   * | 
|---|
| 4 | 5 |   * Copyright (c) 2016,2017 Daniel Mack | 
|---|
| 5 | 6 |   * Copyright (c) 2016 David Herrmann | 
|---|
| 6 |  | - *  | 
|---|
| 7 |  | - * This file is subject to the terms and conditions of version 2 of the GNU  | 
|---|
| 8 |  | - * General Public License.  See the file COPYING in the main directory of the  | 
|---|
| 9 |  | - * Linux distribution for more details.  | 
|---|
| 10 | 7 |   */ | 
|---|
| 11 | 8 |   | 
|---|
| 12 | 9 |  #include <linux/bpf.h> | 
|---|
| .. | .. | 
|---|
| 28 | 25 |  	struct lpm_trie_node __rcu	*child[2]; | 
|---|
| 29 | 26 |  	u32				prefixlen; | 
|---|
| 30 | 27 |  	u32				flags; | 
|---|
| 31 |  | -	u8				data[0];  | 
|---|
 | 28 | +	u8				data[];  | 
|---|
| 32 | 29 |  }; | 
|---|
| 33 | 30 |   | 
|---|
| 34 | 31 |  struct lpm_trie { | 
|---|
| .. | .. | 
|---|
| 37 | 34 |  	size_t				n_entries; | 
|---|
| 38 | 35 |  	size_t				max_prefixlen; | 
|---|
| 39 | 36 |  	size_t				data_size; | 
|---|
| 40 |  | -	raw_spinlock_t			lock;  | 
|---|
 | 37 | +	spinlock_t			lock;  | 
|---|
| 41 | 38 |  }; | 
|---|
| 42 | 39 |   | 
|---|
| 43 | 40 |  /* This trie implements a longest prefix match algorithm that can be used to | 
|---|
| .. | .. | 
|---|
| 168 | 165 |  				   const struct lpm_trie_node *node, | 
|---|
| 169 | 166 |  				   const struct bpf_lpm_trie_key *key) | 
|---|
| 170 | 167 |  { | 
|---|
| 171 |  | -	size_t prefixlen = 0;  | 
|---|
| 172 |  | -	size_t i;  | 
|---|
 | 168 | +	u32 limit = min(node->prefixlen, key->prefixlen);  | 
|---|
 | 169 | +	u32 prefixlen = 0, i = 0;  | 
|---|
| 173 | 170 |   | 
|---|
| 174 |  | -	for (i = 0; i < trie->data_size; i++) {  | 
|---|
| 175 |  | -		size_t b;  | 
|---|
 | 171 | +	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));  | 
|---|
 | 172 | +	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));  | 
|---|
| 176 | 173 |   | 
|---|
| 177 |  | -		b = 8 - fls(node->data[i] ^ key->data[i]);  | 
|---|
| 178 |  | -		prefixlen += b;  | 
|---|
 | 174 | +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)  | 
|---|
| 179 | 175 |   | 
|---|
| 180 |  | -		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)  | 
|---|
| 181 |  | -			return min(node->prefixlen, key->prefixlen);  | 
|---|
 | 176 | +	/* data_size >= 16 has very small probability.  | 
|---|
 | 177 | +	 * We do not use a loop for optimal code generation.  | 
|---|
 | 178 | +	 */  | 
|---|
 | 179 | +	if (trie->data_size >= 8) {  | 
|---|
 | 180 | +		u64 diff = be64_to_cpu(*(__be64 *)node->data ^  | 
|---|
 | 181 | +				       *(__be64 *)key->data);  | 
|---|
| 182 | 182 |   | 
|---|
| 183 |  | -		if (b < 8)  | 
|---|
| 184 |  | -			break;  | 
|---|
 | 183 | +		prefixlen = 64 - fls64(diff);  | 
|---|
 | 184 | +		if (prefixlen >= limit)  | 
|---|
 | 185 | +			return limit;  | 
|---|
 | 186 | +		if (diff)  | 
|---|
 | 187 | +			return prefixlen;  | 
|---|
 | 188 | +		i = 8;  | 
|---|
 | 189 | +	}  | 
|---|
 | 190 | +#endif  | 
|---|
 | 191 | +  | 
|---|
 | 192 | +	while (trie->data_size >= i + 4) {  | 
|---|
 | 193 | +		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^  | 
|---|
 | 194 | +				       *(__be32 *)&key->data[i]);  | 
|---|
 | 195 | +  | 
|---|
 | 196 | +		prefixlen += 32 - fls(diff);  | 
|---|
 | 197 | +		if (prefixlen >= limit)  | 
|---|
 | 198 | +			return limit;  | 
|---|
 | 199 | +		if (diff)  | 
|---|
 | 200 | +			return prefixlen;  | 
|---|
 | 201 | +		i += 4;  | 
|---|
 | 202 | +	}  | 
|---|
 | 203 | +  | 
|---|
 | 204 | +	if (trie->data_size >= i + 2) {  | 
|---|
 | 205 | +		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^  | 
|---|
 | 206 | +				       *(__be16 *)&key->data[i]);  | 
|---|
 | 207 | +  | 
|---|
 | 208 | +		prefixlen += 16 - fls(diff);  | 
|---|
 | 209 | +		if (prefixlen >= limit)  | 
|---|
 | 210 | +			return limit;  | 
|---|
 | 211 | +		if (diff)  | 
|---|
 | 212 | +			return prefixlen;  | 
|---|
 | 213 | +		i += 2;  | 
|---|
 | 214 | +	}  | 
|---|
 | 215 | +  | 
|---|
 | 216 | +	if (trie->data_size >= i + 1) {  | 
|---|
 | 217 | +		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);  | 
|---|
 | 218 | +  | 
|---|
 | 219 | +		if (prefixlen >= limit)  | 
|---|
 | 220 | +			return limit;  | 
|---|
| 185 | 221 |  	} | 
|---|
| 186 | 222 |   | 
|---|
| 187 | 223 |  	return prefixlen; | 
|---|
| .. | .. | 
|---|
| 279 | 315 |  	if (key->prefixlen > trie->max_prefixlen) | 
|---|
| 280 | 316 |  		return -EINVAL; | 
|---|
| 281 | 317 |   | 
|---|
| 282 |  | -	raw_spin_lock_irqsave(&trie->lock, irq_flags);  | 
|---|
 | 318 | +	spin_lock_irqsave(&trie->lock, irq_flags);  | 
|---|
| 283 | 319 |   | 
|---|
| 284 | 320 |  	/* Allocate and fill a new node */ | 
|---|
| 285 | 321 |   | 
|---|
| .. | .. | 
|---|
| 386 | 422 |  		kfree(im_node); | 
|---|
| 387 | 423 |  	} | 
|---|
| 388 | 424 |   | 
|---|
| 389 |  | -	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);  | 
|---|
 | 425 | +	spin_unlock_irqrestore(&trie->lock, irq_flags);  | 
|---|
| 390 | 426 |   | 
|---|
| 391 | 427 |  	return ret; | 
|---|
| 392 | 428 |  } | 
|---|
| .. | .. | 
|---|
| 406 | 442 |  	if (key->prefixlen > trie->max_prefixlen) | 
|---|
| 407 | 443 |  		return -EINVAL; | 
|---|
| 408 | 444 |   | 
|---|
| 409 |  | -	raw_spin_lock_irqsave(&trie->lock, irq_flags);  | 
|---|
 | 445 | +	spin_lock_irqsave(&trie->lock, irq_flags);  | 
|---|
| 410 | 446 |   | 
|---|
| 411 | 447 |  	/* Walk the tree looking for an exact key/length match and keeping | 
|---|
| 412 | 448 |  	 * track of the path we traverse.  We will need to know the node | 
|---|
| .. | .. | 
|---|
| 482 | 518 |  	kfree_rcu(node, rcu); | 
|---|
| 483 | 519 |   | 
|---|
| 484 | 520 |  out: | 
|---|
| 485 |  | -	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);  | 
|---|
 | 521 | +	spin_unlock_irqrestore(&trie->lock, irq_flags);  | 
|---|
| 486 | 522 |   | 
|---|
| 487 | 523 |  	return ret; | 
|---|
| 488 | 524 |  } | 
|---|
| .. | .. | 
|---|
| 499 | 535 |  #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) | 
|---|
| 500 | 536 |   | 
|---|
| 501 | 537 |  #define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\ | 
|---|
| 502 |  | -				 BPF_F_RDONLY | BPF_F_WRONLY)  | 
|---|
 | 538 | +				 BPF_F_ACCESS_MASK)  | 
|---|
| 503 | 539 |   | 
|---|
| 504 | 540 |  static struct bpf_map *trie_alloc(union bpf_attr *attr) | 
|---|
| 505 | 541 |  { | 
|---|
| .. | .. | 
|---|
| 507 | 543 |  	u64 cost = sizeof(*trie), cost_per_node; | 
|---|
| 508 | 544 |  	int ret; | 
|---|
| 509 | 545 |   | 
|---|
| 510 |  | -	if (!capable(CAP_SYS_ADMIN))  | 
|---|
 | 546 | +	if (!bpf_capable())  | 
|---|
| 511 | 547 |  		return ERR_PTR(-EPERM); | 
|---|
| 512 | 548 |   | 
|---|
| 513 | 549 |  	/* check sanity of attributes */ | 
|---|
| 514 | 550 |  	if (attr->max_entries == 0 || | 
|---|
| 515 | 551 |  	    !(attr->map_flags & BPF_F_NO_PREALLOC) || | 
|---|
| 516 | 552 |  	    attr->map_flags & ~LPM_CREATE_FLAG_MASK || | 
|---|
 | 553 | +	    !bpf_map_flags_access_ok(attr->map_flags) ||  | 
|---|
| 517 | 554 |  	    attr->key_size < LPM_KEY_SIZE_MIN || | 
|---|
| 518 | 555 |  	    attr->key_size > LPM_KEY_SIZE_MAX || | 
|---|
| 519 | 556 |  	    attr->value_size < LPM_VAL_SIZE_MIN || | 
|---|
| .. | .. | 
|---|
| 533 | 570 |  	cost_per_node = sizeof(struct lpm_trie_node) + | 
|---|
| 534 | 571 |  			attr->value_size + trie->data_size; | 
|---|
| 535 | 572 |  	cost += (u64) attr->max_entries * cost_per_node; | 
|---|
| 536 |  | -	if (cost >= U32_MAX - PAGE_SIZE) {  | 
|---|
| 537 |  | -		ret = -E2BIG;  | 
|---|
| 538 |  | -		goto out_err;  | 
|---|
| 539 |  | -	}  | 
|---|
| 540 | 573 |   | 
|---|
| 541 |  | -	trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;  | 
|---|
| 542 |  | -  | 
|---|
| 543 |  | -	ret = bpf_map_precharge_memlock(trie->map.pages);  | 
|---|
 | 574 | +	ret = bpf_map_charge_init(&trie->map.memory, cost);  | 
|---|
| 544 | 575 |  	if (ret) | 
|---|
| 545 | 576 |  		goto out_err; | 
|---|
| 546 | 577 |   | 
|---|
| 547 |  | -	raw_spin_lock_init(&trie->lock);  | 
|---|
 | 578 | +	spin_lock_init(&trie->lock);  | 
|---|
| 548 | 579 |   | 
|---|
| 549 | 580 |  	return &trie->map; | 
|---|
| 550 | 581 |  out_err: | 
|---|
| .. | .. | 
|---|
| 557 | 588 |  	struct lpm_trie *trie = container_of(map, struct lpm_trie, map); | 
|---|
| 558 | 589 |  	struct lpm_trie_node __rcu **slot; | 
|---|
| 559 | 590 |  	struct lpm_trie_node *node; | 
|---|
| 560 |  | -  | 
|---|
| 561 |  | -	/* Wait for outstanding programs to complete  | 
|---|
| 562 |  | -	 * update/lookup/delete/get_next_key and free the trie.  | 
|---|
| 563 |  | -	 */  | 
|---|
| 564 |  | -	synchronize_rcu();  | 
|---|
| 565 | 591 |   | 
|---|
| 566 | 592 |  	/* Always start at the root and walk down to a node that has no | 
|---|
| 567 | 593 |  	 * children. Then free that node, nullify its reference in the parent | 
|---|
| .. | .. | 
|---|
| 695 | 721 |  } | 
|---|
| 696 | 722 |   | 
|---|
| 697 | 723 |  static int trie_check_btf(const struct bpf_map *map, | 
|---|
 | 724 | +			  const struct btf *btf,  | 
|---|
| 698 | 725 |  			  const struct btf_type *key_type, | 
|---|
| 699 | 726 |  			  const struct btf_type *value_type) | 
|---|
| 700 | 727 |  { | 
|---|
| .. | .. | 
|---|
| 703 | 730 |  	       -EINVAL : 0; | 
|---|
| 704 | 731 |  } | 
|---|
| 705 | 732 |   | 
|---|
 | 733 | +static int trie_map_btf_id;  | 
|---|
| 706 | 734 |  const struct bpf_map_ops trie_map_ops = { | 
|---|
 | 735 | +	.map_meta_equal = bpf_map_meta_equal,  | 
|---|
| 707 | 736 |  	.map_alloc = trie_alloc, | 
|---|
| 708 | 737 |  	.map_free = trie_free, | 
|---|
| 709 | 738 |  	.map_get_next_key = trie_get_next_key, | 
|---|
| .. | .. | 
|---|
| 711 | 740 |  	.map_update_elem = trie_update_elem, | 
|---|
| 712 | 741 |  	.map_delete_elem = trie_delete_elem, | 
|---|
| 713 | 742 |  	.map_check_btf = trie_check_btf, | 
|---|
 | 743 | +	.map_btf_name = "lpm_trie",  | 
|---|
 | 744 | +	.map_btf_id = &trie_map_btf_id,  | 
|---|
| 714 | 745 |  }; | 
|---|