.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Longest prefix match list implementation |
---|
3 | 4 | * |
---|
4 | 5 | * Copyright (c) 2016,2017 Daniel Mack |
---|
5 | 6 | * Copyright (c) 2016 David Herrmann |
---|
6 | | - * |
---|
7 | | - * This file is subject to the terms and conditions of version 2 of the GNU |
---|
8 | | - * General Public License. See the file COPYING in the main directory of the |
---|
9 | | - * Linux distribution for more details. |
---|
10 | 7 | */ |
---|
11 | 8 | |
---|
12 | 9 | #include <linux/bpf.h> |
---|
.. | .. |
---|
28 | 25 | struct lpm_trie_node __rcu *child[2]; |
---|
29 | 26 | u32 prefixlen; |
---|
30 | 27 | u32 flags; |
---|
31 | | - u8 data[0]; |
---|
| 28 | + u8 data[]; |
---|
32 | 29 | }; |
---|
33 | 30 | |
---|
34 | 31 | struct lpm_trie { |
---|
.. | .. |
---|
37 | 34 | size_t n_entries; |
---|
38 | 35 | size_t max_prefixlen; |
---|
39 | 36 | size_t data_size; |
---|
40 | | - raw_spinlock_t lock; |
---|
| 37 | + spinlock_t lock; |
---|
41 | 38 | }; |
---|
42 | 39 | |
---|
43 | 40 | /* This trie implements a longest prefix match algorithm that can be used to |
---|
.. | .. |
---|
168 | 165 | const struct lpm_trie_node *node, |
---|
169 | 166 | const struct bpf_lpm_trie_key *key) |
---|
170 | 167 | { |
---|
171 | | - size_t prefixlen = 0; |
---|
172 | | - size_t i; |
---|
| 168 | + u32 limit = min(node->prefixlen, key->prefixlen); |
---|
| 169 | + u32 prefixlen = 0, i = 0; |
---|
173 | 170 | |
---|
174 | | - for (i = 0; i < trie->data_size; i++) { |
---|
175 | | - size_t b; |
---|
| 171 | + BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); |
---|
| 172 | + BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); |
---|
176 | 173 | |
---|
177 | | - b = 8 - fls(node->data[i] ^ key->data[i]); |
---|
178 | | - prefixlen += b; |
---|
| 174 | +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) |
---|
179 | 175 | |
---|
180 | | - if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) |
---|
181 | | - return min(node->prefixlen, key->prefixlen); |
---|
| 176 | + /* data_size >= 16 has very small probability. |
---|
| 177 | + * We do not use a loop for optimal code generation. |
---|
| 178 | + */ |
---|
| 179 | + if (trie->data_size >= 8) { |
---|
| 180 | + u64 diff = be64_to_cpu(*(__be64 *)node->data ^ |
---|
| 181 | + *(__be64 *)key->data); |
---|
182 | 182 | |
---|
183 | | - if (b < 8) |
---|
184 | | - break; |
---|
| 183 | + prefixlen = 64 - fls64(diff); |
---|
| 184 | + if (prefixlen >= limit) |
---|
| 185 | + return limit; |
---|
| 186 | + if (diff) |
---|
| 187 | + return prefixlen; |
---|
| 188 | + i = 8; |
---|
| 189 | + } |
---|
| 190 | +#endif |
---|
| 191 | + |
---|
| 192 | + while (trie->data_size >= i + 4) { |
---|
| 193 | + u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^ |
---|
| 194 | + *(__be32 *)&key->data[i]); |
---|
| 195 | + |
---|
| 196 | + prefixlen += 32 - fls(diff); |
---|
| 197 | + if (prefixlen >= limit) |
---|
| 198 | + return limit; |
---|
| 199 | + if (diff) |
---|
| 200 | + return prefixlen; |
---|
| 201 | + i += 4; |
---|
| 202 | + } |
---|
| 203 | + |
---|
| 204 | + if (trie->data_size >= i + 2) { |
---|
| 205 | + u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ |
---|
| 206 | + *(__be16 *)&key->data[i]); |
---|
| 207 | + |
---|
| 208 | + prefixlen += 16 - fls(diff); |
---|
| 209 | + if (prefixlen >= limit) |
---|
| 210 | + return limit; |
---|
| 211 | + if (diff) |
---|
| 212 | + return prefixlen; |
---|
| 213 | + i += 2; |
---|
| 214 | + } |
---|
| 215 | + |
---|
| 216 | + if (trie->data_size >= i + 1) { |
---|
| 217 | + prefixlen += 8 - fls(node->data[i] ^ key->data[i]); |
---|
| 218 | + |
---|
| 219 | + if (prefixlen >= limit) |
---|
| 220 | + return limit; |
---|
185 | 221 | } |
---|
186 | 222 | |
---|
187 | 223 | return prefixlen; |
---|
.. | .. |
---|
279 | 315 | if (key->prefixlen > trie->max_prefixlen) |
---|
280 | 316 | return -EINVAL; |
---|
281 | 317 | |
---|
282 | | - raw_spin_lock_irqsave(&trie->lock, irq_flags); |
---|
| 318 | + spin_lock_irqsave(&trie->lock, irq_flags); |
---|
283 | 319 | |
---|
284 | 320 | /* Allocate and fill a new node */ |
---|
285 | 321 | |
---|
.. | .. |
---|
386 | 422 | kfree(im_node); |
---|
387 | 423 | } |
---|
388 | 424 | |
---|
389 | | - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); |
---|
| 425 | + spin_unlock_irqrestore(&trie->lock, irq_flags); |
---|
390 | 426 | |
---|
391 | 427 | return ret; |
---|
392 | 428 | } |
---|
.. | .. |
---|
406 | 442 | if (key->prefixlen > trie->max_prefixlen) |
---|
407 | 443 | return -EINVAL; |
---|
408 | 444 | |
---|
409 | | - raw_spin_lock_irqsave(&trie->lock, irq_flags); |
---|
| 445 | + spin_lock_irqsave(&trie->lock, irq_flags); |
---|
410 | 446 | |
---|
411 | 447 | /* Walk the tree looking for an exact key/length match and keeping |
---|
412 | 448 | * track of the path we traverse. We will need to know the node |
---|
.. | .. |
---|
482 | 518 | kfree_rcu(node, rcu); |
---|
483 | 519 | |
---|
484 | 520 | out: |
---|
485 | | - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); |
---|
| 521 | + spin_unlock_irqrestore(&trie->lock, irq_flags); |
---|
486 | 522 | |
---|
487 | 523 | return ret; |
---|
488 | 524 | } |
---|
.. | .. |
---|
499 | 535 | #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) |
---|
500 | 536 | |
---|
501 | 537 | #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ |
---|
502 | | - BPF_F_RDONLY | BPF_F_WRONLY) |
---|
| 538 | + BPF_F_ACCESS_MASK) |
---|
503 | 539 | |
---|
504 | 540 | static struct bpf_map *trie_alloc(union bpf_attr *attr) |
---|
505 | 541 | { |
---|
.. | .. |
---|
507 | 543 | u64 cost = sizeof(*trie), cost_per_node; |
---|
508 | 544 | int ret; |
---|
509 | 545 | |
---|
510 | | - if (!capable(CAP_SYS_ADMIN)) |
---|
| 546 | + if (!bpf_capable()) |
---|
511 | 547 | return ERR_PTR(-EPERM); |
---|
512 | 548 | |
---|
513 | 549 | /* check sanity of attributes */ |
---|
514 | 550 | if (attr->max_entries == 0 || |
---|
515 | 551 | !(attr->map_flags & BPF_F_NO_PREALLOC) || |
---|
516 | 552 | attr->map_flags & ~LPM_CREATE_FLAG_MASK || |
---|
| 553 | + !bpf_map_flags_access_ok(attr->map_flags) || |
---|
517 | 554 | attr->key_size < LPM_KEY_SIZE_MIN || |
---|
518 | 555 | attr->key_size > LPM_KEY_SIZE_MAX || |
---|
519 | 556 | attr->value_size < LPM_VAL_SIZE_MIN || |
---|
.. | .. |
---|
533 | 570 | cost_per_node = sizeof(struct lpm_trie_node) + |
---|
534 | 571 | attr->value_size + trie->data_size; |
---|
535 | 572 | cost += (u64) attr->max_entries * cost_per_node; |
---|
536 | | - if (cost >= U32_MAX - PAGE_SIZE) { |
---|
537 | | - ret = -E2BIG; |
---|
538 | | - goto out_err; |
---|
539 | | - } |
---|
540 | 573 | |
---|
541 | | - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; |
---|
542 | | - |
---|
543 | | - ret = bpf_map_precharge_memlock(trie->map.pages); |
---|
| 574 | + ret = bpf_map_charge_init(&trie->map.memory, cost); |
---|
544 | 575 | if (ret) |
---|
545 | 576 | goto out_err; |
---|
546 | 577 | |
---|
547 | | - raw_spin_lock_init(&trie->lock); |
---|
| 578 | + spin_lock_init(&trie->lock); |
---|
548 | 579 | |
---|
549 | 580 | return &trie->map; |
---|
550 | 581 | out_err: |
---|
.. | .. |
---|
557 | 588 | struct lpm_trie *trie = container_of(map, struct lpm_trie, map); |
---|
558 | 589 | struct lpm_trie_node __rcu **slot; |
---|
559 | 590 | struct lpm_trie_node *node; |
---|
560 | | - |
---|
561 | | - /* Wait for outstanding programs to complete |
---|
562 | | - * update/lookup/delete/get_next_key and free the trie. |
---|
563 | | - */ |
---|
564 | | - synchronize_rcu(); |
---|
565 | 591 | |
---|
566 | 592 | /* Always start at the root and walk down to a node that has no |
---|
567 | 593 | * children. Then free that node, nullify its reference in the parent |
---|
.. | .. |
---|
695 | 721 | } |
---|
696 | 722 | |
---|
697 | 723 | static int trie_check_btf(const struct bpf_map *map, |
---|
| 724 | + const struct btf *btf, |
---|
698 | 725 | const struct btf_type *key_type, |
---|
699 | 726 | const struct btf_type *value_type) |
---|
700 | 727 | { |
---|
.. | .. |
---|
703 | 730 | -EINVAL : 0; |
---|
704 | 731 | } |
---|
705 | 732 | |
---|
| 733 | +static int trie_map_btf_id; |
---|
706 | 734 | const struct bpf_map_ops trie_map_ops = { |
---|
| 735 | + .map_meta_equal = bpf_map_meta_equal, |
---|
707 | 736 | .map_alloc = trie_alloc, |
---|
708 | 737 | .map_free = trie_free, |
---|
709 | 738 | .map_get_next_key = trie_get_next_key, |
---|
.. | .. |
---|
711 | 740 | .map_update_elem = trie_update_elem, |
---|
712 | 741 | .map_delete_elem = trie_delete_elem, |
---|
713 | 742 | .map_check_btf = trie_check_btf, |
---|
| 743 | + .map_btf_name = "lpm_trie", |
---|
| 744 | + .map_btf_id = &trie_map_btf_id, |
---|
714 | 745 | }; |
---|