| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io |
|---|
| 2 | | - * |
|---|
| 3 | | - * This program is free software; you can redistribute it and/or |
|---|
| 4 | | - * modify it under the terms of version 2 of the GNU General Public |
|---|
| 5 | | - * License as published by the Free Software Foundation. |
|---|
| 6 | | - * |
|---|
| 7 | | - * This program is distributed in the hope that it will be useful, but |
|---|
| 8 | | - * WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 9 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|---|
| 10 | | - * General Public License for more details. |
|---|
| 11 | 3 | */ |
|---|
| 12 | 4 | |
|---|
| 13 | 5 | /* A devmap's primary use is as a backend map for the XDP BPF helper call |
|---|
| .. | .. |
|---|
| 25 | 17 | * datapath always has a valid copy. However, the datapath does a "flush" |
|---|
| 26 | 18 | * operation that pushes any pending packets in the driver outside the RCU |
|---|
| 27 | 19 | * critical section. Each bpf_dtab_netdev tracks these pending operations using |
|---|
| 28 | | - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed |
|---|
| 29 | | - * until all bits are cleared indicating outstanding flush operations have |
|---|
| 30 | | - * completed. |
|---|
| 20 | + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until |
|---|
| 21 | + * this list is empty, indicating outstanding flush operations have completed. |
|---|
| 31 | 22 | * |
|---|
| 32 | 23 | * BPF syscalls may race with BPF program calls on any of the update, delete |
|---|
| 33 | 24 | * or lookup operations. As noted above, the xchg() operation also keeps the |
|---|
| .. | .. |
|---|
| 46 | 37 | * notifier hook walks the map we know that new dev references can not be |
|---|
| 47 | 38 | * added by the user because core infrastructure ensures dev_get_by_index() |
|---|
| 48 | 39 | * calls will fail at this point. |
|---|
| 40 | + * |
|---|
| 41 | + * The devmap_hash type is a map type which interprets keys as ifindexes and |
|---|
| 42 | + * indexes these using a hashmap. This allows maps that use ifindex as key to be |
|---|
| 43 | + * densely packed instead of having holes in the lookup array for unused |
|---|
| 44 | + * ifindexes. The setup and packet enqueue/send code is shared between the two |
|---|
| 45 | + * types of devmap; only the lookup and insertion are different. |
|---|
| 49 | 46 | */ |
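For context (not part of this diff), here is a minimal sketch of how an XDP program might use a devmap as the backend for bpf_redirect_map(); the map name, size, and chosen key are illustrative assumptions:

```c
/* Illustrative sketch only -- not part of this file. A minimal XDP
 * program redirecting through a BPF_MAP_TYPE_DEVMAP_HASH keyed by
 * ifindex; map name, max_entries and the chosen key are assumptions.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));	/* 4-byte value: ifindex only */
	__uint(max_entries, 64);
} tx_devices SEC(".maps");

SEC("xdp")
int redirect_by_ifindex(struct xdp_md *ctx)
{
	__u32 target_ifindex = 3;	/* assumed egress device */

	/* Returns XDP_REDIRECT on a successful map lookup; flags == 0 */
	return bpf_redirect_map(&tx_devices, target_ifindex, 0);
}

char _license[] SEC("license") = "GPL";
```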
|---|
| 50 | 47 | #include <linux/bpf.h> |
|---|
| 51 | 48 | #include <net/xdp.h> |
|---|
| .. | .. |
|---|
| 55 | 52 | #define DEV_CREATE_FLAG_MASK \ |
|---|
| 56 | 53 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) |
|---|
| 57 | 54 | |
|---|
| 58 | | -#define DEV_MAP_BULK_SIZE 16 |
|---|
| 59 | | -struct xdp_bulk_queue { |
|---|
| 55 | +struct xdp_dev_bulk_queue { |
|---|
| 60 | 56 | struct xdp_frame *q[DEV_MAP_BULK_SIZE]; |
|---|
| 57 | + struct list_head flush_node; |
|---|
| 58 | + struct net_device *dev; |
|---|
| 61 | 59 | struct net_device *dev_rx; |
|---|
| 62 | 60 | unsigned int count; |
|---|
| 63 | 61 | }; |
|---|
| 64 | 62 | |
|---|
| 65 | 63 | struct bpf_dtab_netdev { |
|---|
| 66 | 64 | struct net_device *dev; /* must be first member, due to tracepoint */ |
|---|
| 65 | + struct hlist_node index_hlist; |
|---|
| 67 | 66 | struct bpf_dtab *dtab; |
|---|
| 68 | | - unsigned int bit; |
|---|
| 69 | | - struct xdp_bulk_queue __percpu *bulkq; |
|---|
| 67 | + struct bpf_prog *xdp_prog; |
|---|
| 70 | 68 | struct rcu_head rcu; |
|---|
| 69 | + unsigned int idx; |
|---|
| 70 | + struct bpf_devmap_val val; |
|---|
| 71 | 71 | }; |
|---|
| 72 | 72 | |
|---|
| 73 | 73 | struct bpf_dtab { |
|---|
| 74 | 74 | struct bpf_map map; |
|---|
| 75 | | - struct bpf_dtab_netdev **netdev_map; |
|---|
| 76 | | - unsigned long __percpu *flush_needed; |
|---|
| 75 | + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ |
|---|
| 77 | 76 | struct list_head list; |
|---|
| 77 | + |
|---|
| 78 | + /* these are only used for DEVMAP_HASH type maps */ |
|---|
| 79 | + struct hlist_head *dev_index_head; |
|---|
| 80 | + spinlock_t index_lock; |
|---|
| 81 | + unsigned int items; |
|---|
| 82 | + u32 n_buckets; |
|---|
| 78 | 83 | }; |
|---|
| 79 | 84 | |
|---|
| 85 | +static DEFINE_PER_CPU(struct list_head, dev_flush_list); |
|---|
| 80 | 86 | static DEFINE_SPINLOCK(dev_map_lock); |
|---|
| 81 | 87 | static LIST_HEAD(dev_map_list); |
|---|
| 82 | 88 | |
|---|
| 83 | | -static u64 dev_map_bitmap_size(const union bpf_attr *attr) |
|---|
| 89 | +static struct hlist_head *dev_map_create_hash(unsigned int entries, |
|---|
| 90 | + int numa_node) |
|---|
| 84 | 91 | { |
|---|
| 85 | | - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); |
|---|
| 92 | + int i; |
|---|
| 93 | + struct hlist_head *hash; |
|---|
| 94 | + |
|---|
| 95 | + hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); |
|---|
| 96 | + if (hash != NULL) |
|---|
| 97 | + for (i = 0; i < entries; i++) |
|---|
| 98 | + INIT_HLIST_HEAD(&hash[i]); |
|---|
| 99 | + |
|---|
| 100 | + return hash; |
|---|
| 101 | +} |
|---|
| 102 | + |
|---|
| 103 | +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, |
|---|
| 104 | + int idx) |
|---|
| 105 | +{ |
|---|
| 106 | + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; |
|---|
| 107 | +} |
|---|
| 108 | + |
|---|
| 109 | +static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) |
|---|
| 110 | +{ |
|---|
| 111 | + u32 valsize = attr->value_size; |
|---|
| 112 | + u64 cost = 0; |
|---|
| 113 | + int err; |
|---|
| 114 | + |
|---|
| 115 | + /* Check sanity of attributes. Two value sizes are supported: |
|---|
| 116 | + * 4 bytes: ifindex |
|---|
| 117 | + * 8 bytes: ifindex + prog fd |
|---|
| 118 | + */ |
|---|
| 119 | + if (attr->max_entries == 0 || attr->key_size != 4 || |
|---|
| 120 | + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && |
|---|
| 121 | + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || |
|---|
| 122 | + attr->map_flags & ~DEV_CREATE_FLAG_MASK) |
|---|
| 123 | + return -EINVAL; |
|---|
| 124 | + |
|---|
| 125 | + /* Lookup returns a pointer straight to dev->ifindex, so make sure the |
|---|
| 126 | + * verifier prevents writes from the BPF side |
|---|
| 127 | + */ |
|---|
| 128 | + attr->map_flags |= BPF_F_RDONLY_PROG; |
|---|
| 129 | + |
|---|
| 130 | + |
|---|
| 131 | + bpf_map_init_from_attr(&dtab->map, attr); |
|---|
| 132 | + |
|---|
| 133 | + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { |
|---|
| 134 | + dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); |
|---|
| 135 | + |
|---|
| 136 | + if (!dtab->n_buckets) /* Overflow check */ |
|---|
| 137 | + return -EINVAL; |
|---|
| 138 | + cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; |
|---|
| 139 | + } else { |
|---|
| 140 | + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); |
|---|
| 141 | + } |
|---|
| 142 | + |
|---|
| 143 | + /* if map size is larger than memlock limit, reject it */ |
|---|
| 144 | + err = bpf_map_charge_init(&dtab->map.memory, cost); |
|---|
| 145 | + if (err) |
|---|
| 146 | + return -EINVAL; |
|---|
| 147 | + |
|---|
| 148 | + if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { |
|---|
| 149 | + dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, |
|---|
| 150 | + dtab->map.numa_node); |
|---|
| 151 | + if (!dtab->dev_index_head) |
|---|
| 152 | + goto free_charge; |
|---|
| 153 | + |
|---|
| 154 | + spin_lock_init(&dtab->index_lock); |
|---|
| 155 | + } else { |
|---|
| 156 | + dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * |
|---|
| 157 | + sizeof(struct bpf_dtab_netdev *), |
|---|
| 158 | + dtab->map.numa_node); |
|---|
| 159 | + if (!dtab->netdev_map) |
|---|
| 160 | + goto free_charge; |
|---|
| 161 | + } |
|---|
| 162 | + |
|---|
| 163 | + return 0; |
|---|
| 164 | + |
|---|
| 165 | +free_charge: |
|---|
| 166 | + bpf_map_charge_finish(&dtab->map.memory); |
|---|
| 167 | + return -ENOMEM; |
|---|
| 86 | 168 | } |
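As a user-space counterpart (illustrative, assuming a libbpf version that provides bpf_map_create()), the two value layouts accepted by dev_map_init_map() could be exercised like this:

```c
/* Illustrative sketch only: create the two devmap flavours whose
 * attributes dev_map_init_map() validates. Assumes libbpf >= 0.7
 * for bpf_map_create(); map names are arbitrary.
 */
#include <bpf/bpf.h>
#include <linux/bpf.h>

int create_devmaps(void)
{
	/* 4-byte values: plain ifindex, no per-entry program */
	int arr_fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, "tx_ports",
				    sizeof(__u32), sizeof(__u32), 256, NULL);

	/* 8-byte values: struct bpf_devmap_val (ifindex + prog fd) */
	int hash_fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP_HASH, "tx_ports_hash",
				     sizeof(__u32),
				     sizeof(struct bpf_devmap_val), 256, NULL);

	return (arr_fd < 0 || hash_fd < 0) ? -1 : 0;
}
```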
|---|
| 87 | 169 | |
|---|
| 88 | 170 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) |
|---|
| 89 | 171 | { |
|---|
| 90 | 172 | struct bpf_dtab *dtab; |
|---|
| 91 | | - int err = -EINVAL; |
|---|
| 92 | | - u64 cost; |
|---|
| 173 | + int err; |
|---|
| 93 | 174 | |
|---|
| 94 | 175 | if (!capable(CAP_NET_ADMIN)) |
|---|
| 95 | 176 | return ERR_PTR(-EPERM); |
|---|
| 96 | | - |
|---|
| 97 | | - /* check sanity of attributes */ |
|---|
| 98 | | - if (attr->max_entries == 0 || attr->key_size != 4 || |
|---|
| 99 | | - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) |
|---|
| 100 | | - return ERR_PTR(-EINVAL); |
|---|
| 101 | 177 | |
|---|
| 102 | 178 | dtab = kzalloc(sizeof(*dtab), GFP_USER); |
|---|
| 103 | 179 | if (!dtab) |
|---|
| 104 | 180 | return ERR_PTR(-ENOMEM); |
|---|
| 105 | 181 | |
|---|
| 106 | | - bpf_map_init_from_attr(&dtab->map, attr); |
|---|
| 107 | | - |
|---|
| 108 | | - /* make sure page count doesn't overflow */ |
|---|
| 109 | | - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); |
|---|
| 110 | | - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); |
|---|
| 111 | | - if (cost >= U32_MAX - PAGE_SIZE) |
|---|
| 112 | | - goto free_dtab; |
|---|
| 113 | | - |
|---|
| 114 | | - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; |
|---|
| 115 | | - |
|---|
| 116 | | - /* if map size is larger than memlock limit, reject it early */ |
|---|
| 117 | | - err = bpf_map_precharge_memlock(dtab->map.pages); |
|---|
| 118 | | - if (err) |
|---|
| 119 | | - goto free_dtab; |
|---|
| 120 | | - |
|---|
| 121 | | - err = -ENOMEM; |
|---|
| 122 | | - |
|---|
| 123 | | - /* A per cpu bitfield with a bit per possible net device */ |
|---|
| 124 | | - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), |
|---|
| 125 | | - __alignof__(unsigned long), |
|---|
| 126 | | - GFP_KERNEL | __GFP_NOWARN); |
|---|
| 127 | | - if (!dtab->flush_needed) |
|---|
| 128 | | - goto free_dtab; |
|---|
| 129 | | - |
|---|
| 130 | | - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * |
|---|
| 131 | | - sizeof(struct bpf_dtab_netdev *), |
|---|
| 132 | | - dtab->map.numa_node); |
|---|
| 133 | | - if (!dtab->netdev_map) |
|---|
| 134 | | - goto free_dtab; |
|---|
| 182 | + err = dev_map_init_map(dtab, attr); |
|---|
| 183 | + if (err) { |
|---|
| 184 | + kfree(dtab); |
|---|
| 185 | + return ERR_PTR(err); |
|---|
| 186 | + } |
|---|
| 135 | 187 | |
|---|
| 136 | 188 | spin_lock(&dev_map_lock); |
|---|
| 137 | 189 | list_add_tail_rcu(&dtab->list, &dev_map_list); |
|---|
| 138 | 190 | spin_unlock(&dev_map_lock); |
|---|
| 139 | 191 | |
|---|
| 140 | 192 | return &dtab->map; |
|---|
| 141 | | -free_dtab: |
|---|
| 142 | | - free_percpu(dtab->flush_needed); |
|---|
| 143 | | - kfree(dtab); |
|---|
| 144 | | - return ERR_PTR(err); |
|---|
| 145 | 193 | } |
|---|
| 146 | 194 | |
|---|
| 147 | 195 | static void dev_map_free(struct bpf_map *map) |
|---|
| 148 | 196 | { |
|---|
| 149 | 197 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 150 | | - int i, cpu; |
|---|
| 198 | + int i; |
|---|
| 151 | 199 | |
|---|
| 152 | 200 | /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, |
|---|
| 153 | 201 | * so the programs (can be more than one that used this map) were |
|---|
| 154 | | - * disconnected from events. Wait for outstanding critical sections in |
|---|
| 155 | | - * these programs to complete. The rcu critical section only guarantees |
|---|
| 156 | | - * no further reads against netdev_map. It does __not__ ensure pending |
|---|
| 157 | | - * flush operations (if any) are complete. |
|---|
| 202 | + * disconnected from events. The following synchronize_rcu() guarantees |
|---|
| 203 | + * both rcu read critical sections and preempt-disable regions (NAPI |
|---|
| 204 | + * being the relevant context here) have completed, so we |
|---|
| 205 | + * are certain there will be no further reads against the netdev_map and |
|---|
| 206 | + * all flush operations are complete. Flush operations can only be done |
|---|
| 207 | + * from NAPI context for this reason. |
|---|
| 158 | 208 | */ |
|---|
| 159 | 209 | |
|---|
| 160 | 210 | spin_lock(&dev_map_lock); |
|---|
| .. | .. |
|---|
| 167 | 217 | /* Make sure prior __dev_map_entry_free() have completed. */ |
|---|
| 168 | 218 | rcu_barrier(); |
|---|
| 169 | 219 | |
|---|
| 170 | | - /* To ensure all pending flush operations have completed wait for flush |
|---|
| 171 | | - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. |
|---|
| 172 | | - * Because the above synchronize_rcu() ensures the map is disconnected |
|---|
| 173 | | - * from the program we can assume no new bits will be set. |
|---|
| 174 | | - */ |
|---|
| 175 | | - for_each_online_cpu(cpu) { |
|---|
| 176 | | - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); |
|---|
| 220 | + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { |
|---|
| 221 | + for (i = 0; i < dtab->n_buckets; i++) { |
|---|
| 222 | + struct bpf_dtab_netdev *dev; |
|---|
| 223 | + struct hlist_head *head; |
|---|
| 224 | + struct hlist_node *next; |
|---|
| 177 | 225 | |
|---|
| 178 | | - while (!bitmap_empty(bitmap, dtab->map.max_entries)) |
|---|
| 179 | | - cond_resched(); |
|---|
| 226 | + head = dev_map_index_hash(dtab, i); |
|---|
| 227 | + |
|---|
| 228 | + hlist_for_each_entry_safe(dev, next, head, index_hlist) { |
|---|
| 229 | + hlist_del_rcu(&dev->index_hlist); |
|---|
| 230 | + if (dev->xdp_prog) |
|---|
| 231 | + bpf_prog_put(dev->xdp_prog); |
|---|
| 232 | + dev_put(dev->dev); |
|---|
| 233 | + kfree(dev); |
|---|
| 234 | + } |
|---|
| 235 | + } |
|---|
| 236 | + |
|---|
| 237 | + bpf_map_area_free(dtab->dev_index_head); |
|---|
| 238 | + } else { |
|---|
| 239 | + for (i = 0; i < dtab->map.max_entries; i++) { |
|---|
| 240 | + struct bpf_dtab_netdev *dev; |
|---|
| 241 | + |
|---|
| 242 | + dev = dtab->netdev_map[i]; |
|---|
| 243 | + if (!dev) |
|---|
| 244 | + continue; |
|---|
| 245 | + |
|---|
| 246 | + if (dev->xdp_prog) |
|---|
| 247 | + bpf_prog_put(dev->xdp_prog); |
|---|
| 248 | + dev_put(dev->dev); |
|---|
| 249 | + kfree(dev); |
|---|
| 250 | + } |
|---|
| 251 | + |
|---|
| 252 | + bpf_map_area_free(dtab->netdev_map); |
|---|
| 180 | 253 | } |
|---|
| 181 | 254 | |
|---|
| 182 | | - for (i = 0; i < dtab->map.max_entries; i++) { |
|---|
| 183 | | - struct bpf_dtab_netdev *dev; |
|---|
| 184 | | - |
|---|
| 185 | | - dev = dtab->netdev_map[i]; |
|---|
| 186 | | - if (!dev) |
|---|
| 187 | | - continue; |
|---|
| 188 | | - |
|---|
| 189 | | - free_percpu(dev->bulkq); |
|---|
| 190 | | - dev_put(dev->dev); |
|---|
| 191 | | - kfree(dev); |
|---|
| 192 | | - } |
|---|
| 193 | | - |
|---|
| 194 | | - free_percpu(dtab->flush_needed); |
|---|
| 195 | | - bpf_map_area_free(dtab->netdev_map); |
|---|
| 196 | 255 | kfree(dtab); |
|---|
| 197 | 256 | } |
|---|
| 198 | 257 | |
|---|
| .. | .. |
|---|
| 213 | 272 | return 0; |
|---|
| 214 | 273 | } |
|---|
| 215 | 274 | |
|---|
| 216 | | -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) |
|---|
| 275 | +struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) |
|---|
| 217 | 276 | { |
|---|
| 218 | 277 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 219 | | - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); |
|---|
| 278 | + struct hlist_head *head = dev_map_index_hash(dtab, key); |
|---|
| 279 | + struct bpf_dtab_netdev *dev; |
|---|
| 220 | 280 | |
|---|
| 221 | | - __set_bit(bit, bitmap); |
|---|
| 281 | + hlist_for_each_entry_rcu(dev, head, index_hlist, |
|---|
| 282 | + lockdep_is_held(&dtab->index_lock)) |
|---|
| 283 | + if (dev->idx == key) |
|---|
| 284 | + return dev; |
|---|
| 285 | + |
|---|
| 286 | + return NULL; |
|---|
| 222 | 287 | } |
|---|
| 223 | 288 | |
|---|
| 224 | | -static int bq_xmit_all(struct bpf_dtab_netdev *obj, |
|---|
| 225 | | - struct xdp_bulk_queue *bq, u32 flags, |
|---|
| 226 | | - bool in_napi_ctx) |
|---|
| 289 | +static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, |
|---|
| 290 | + void *next_key) |
|---|
| 227 | 291 | { |
|---|
| 228 | | - struct net_device *dev = obj->dev; |
|---|
| 292 | + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 293 | + u32 idx, *next = next_key; |
|---|
| 294 | + struct bpf_dtab_netdev *dev, *next_dev; |
|---|
| 295 | + struct hlist_head *head; |
|---|
| 296 | + int i = 0; |
|---|
| 297 | + |
|---|
| 298 | + if (!key) |
|---|
| 299 | + goto find_first; |
|---|
| 300 | + |
|---|
| 301 | + idx = *(u32 *)key; |
|---|
| 302 | + |
|---|
| 303 | + dev = __dev_map_hash_lookup_elem(map, idx); |
|---|
| 304 | + if (!dev) |
|---|
| 305 | + goto find_first; |
|---|
| 306 | + |
|---|
| 307 | + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), |
|---|
| 308 | + struct bpf_dtab_netdev, index_hlist); |
|---|
| 309 | + |
|---|
| 310 | + if (next_dev) { |
|---|
| 311 | + *next = next_dev->idx; |
|---|
| 312 | + return 0; |
|---|
| 313 | + } |
|---|
| 314 | + |
|---|
| 315 | + i = idx & (dtab->n_buckets - 1); |
|---|
| 316 | + i++; |
|---|
| 317 | + |
|---|
| 318 | + find_first: |
|---|
| 319 | + for (; i < dtab->n_buckets; i++) { |
|---|
| 320 | + head = dev_map_index_hash(dtab, i); |
|---|
| 321 | + |
|---|
| 322 | + next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), |
|---|
| 323 | + struct bpf_dtab_netdev, |
|---|
| 324 | + index_hlist); |
|---|
| 325 | + if (next_dev) { |
|---|
| 326 | + *next = next_dev->idx; |
|---|
| 327 | + return 0; |
|---|
| 328 | + } |
|---|
| 329 | + } |
|---|
| 330 | + |
|---|
| 331 | + return -ENOENT; |
|---|
| 332 | +} |
|---|
| 333 | + |
|---|
| 334 | +bool dev_map_can_have_prog(struct bpf_map *map) |
|---|
| 335 | +{ |
|---|
| 336 | + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || |
|---|
| 337 | + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && |
|---|
| 338 | + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) |
|---|
| 339 | + return true; |
|---|
| 340 | + |
|---|
| 341 | + return false; |
|---|
| 342 | +} |
|---|
| 343 | + |
|---|
| 344 | +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) |
|---|
| 345 | +{ |
|---|
| 346 | + struct net_device *dev = bq->dev; |
|---|
| 229 | 347 | int sent = 0, drops = 0, err = 0; |
|---|
| 230 | 348 | int i; |
|---|
| 231 | 349 | |
|---|
| 232 | 350 | if (unlikely(!bq->count)) |
|---|
| 233 | | - return 0; |
|---|
| 351 | + return; |
|---|
| 234 | 352 | |
|---|
| 235 | 353 | for (i = 0; i < bq->count; i++) { |
|---|
| 236 | 354 | struct xdp_frame *xdpf = bq->q[i]; |
|---|
| .. | .. |
|---|
| 248 | 366 | out: |
|---|
| 249 | 367 | bq->count = 0; |
|---|
| 250 | 368 | |
|---|
| 251 | | - trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, |
|---|
| 252 | | - sent, drops, bq->dev_rx, dev, err); |
|---|
| 369 | + trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); |
|---|
| 253 | 370 | bq->dev_rx = NULL; |
|---|
| 254 | | - return 0; |
|---|
| 371 | + __list_del_clearprev(&bq->flush_node); |
|---|
| 372 | + return; |
|---|
| 255 | 373 | error: |
|---|
| 256 | 374 | /* If ndo_xdp_xmit fails with an errno, no frames have been |
|---|
| 257 | 375 | * xmit'ed and it's our responsibility to free them all. |
|---|
| .. | .. |
|---|
| 259 | 377 | for (i = 0; i < bq->count; i++) { |
|---|
| 260 | 378 | struct xdp_frame *xdpf = bq->q[i]; |
|---|
| 261 | 379 | |
|---|
| 262 | | - /* RX path under NAPI protection, can return frames faster */ |
|---|
| 263 | | - if (likely(in_napi_ctx)) |
|---|
| 264 | | - xdp_return_frame_rx_napi(xdpf); |
|---|
| 265 | | - else |
|---|
| 266 | | - xdp_return_frame(xdpf); |
|---|
| 380 | + xdp_return_frame_rx_napi(xdpf); |
|---|
| 267 | 381 | drops++; |
|---|
| 268 | 382 | } |
|---|
| 269 | 383 | goto out; |
|---|
| 270 | 384 | } |
|---|
| 271 | 385 | |
|---|
| 272 | | -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled |
|---|
| 386 | +/* __dev_flush is called from xdp_do_flush() which _must_ be signaled |
|---|
| 273 | 387 | * from the driver before returning from its napi->poll() routine. The poll() |
|---|
| 274 | 388 | * routine is called either from busy_poll context or net_rx_action signaled |
|---|
| 275 | 389 | * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the |
|---|
| 276 | | - * net device can be torn down. On devmap tear down we ensure the ctx bitmap |
|---|
| 277 | | - * is zeroed before completing to ensure all flush operations have completed. |
|---|
| 390 | + * net device can be torn down. On devmap tear down we ensure the flush list |
|---|
| 391 | + * is empty before completing to ensure all flush operations have completed. |
|---|
| 392 | + * When drivers update the bpf program they may need to ensure any flush ops |
|---|
| 393 | + * are also complete. Using synchronize_rcu or call_rcu will suffice for this |
|---|
| 394 | + * because both wait for napi context to exit. |
|---|
| 278 | 395 | */ |
|---|
| 279 | | -void __dev_map_flush(struct bpf_map *map) |
|---|
| 396 | +void __dev_flush(void) |
|---|
| 280 | 397 | { |
|---|
| 281 | | - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 282 | | - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); |
|---|
| 283 | | - u32 bit; |
|---|
| 398 | + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); |
|---|
| 399 | + struct xdp_dev_bulk_queue *bq, *tmp; |
|---|
| 284 | 400 | |
|---|
| 285 | | - rcu_read_lock(); |
|---|
| 286 | | - for_each_set_bit(bit, bitmap, map->max_entries) { |
|---|
| 287 | | - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); |
|---|
| 288 | | - struct xdp_bulk_queue *bq; |
|---|
| 289 | | - |
|---|
| 290 | | - /* This is possible if the dev entry is removed by user space |
|---|
| 291 | | - * between xdp redirect and flush op. |
|---|
| 292 | | - */ |
|---|
| 293 | | - if (unlikely(!dev)) |
|---|
| 294 | | - continue; |
|---|
| 295 | | - |
|---|
| 296 | | - bq = this_cpu_ptr(dev->bulkq); |
|---|
| 297 | | - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); |
|---|
| 298 | | - |
|---|
| 299 | | - __clear_bit(bit, bitmap); |
|---|
| 300 | | - } |
|---|
| 301 | | - rcu_read_unlock(); |
|---|
| 401 | + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) |
|---|
| 402 | + bq_xmit_all(bq, XDP_XMIT_FLUSH); |
|---|
| 302 | 403 | } |
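The contract described in the comment above is met on the driver side by flushing at the end of the NAPI poll routine. A hedged sketch follows; the function and RX-cleaning helper names are made up for illustration, only the xdp_do_flush() placement reflects the actual rule:

```c
/* Illustrative sketch only: a NAPI poll routine honouring the flush
 * contract above. example_poll()/example_clean_rx_ring() are made-up
 * names and not part of any real driver.
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done;

	/* RX processing may run XDP programs that call bpf_redirect_map() */
	work_done = example_clean_rx_ring(napi, budget);

	/* Drain the per-CPU dev_flush_list before leaving NAPI context */
	xdp_do_flush();

	if (work_done < budget)
		napi_complete_done(napi, work_done);

	return work_done;
}
```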
|---|
| 303 | 404 | |
|---|
| 304 | 405 | /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or |
|---|
| .. | .. |
|---|
| 320 | 421 | /* Runs under RCU-read-side, plus in softirq under NAPI protection. |
|---|
| 321 | 422 | * Thus, safe percpu variable access. |
|---|
| 322 | 423 | */ |
|---|
| 323 | | -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, |
|---|
| 324 | | - struct net_device *dev_rx) |
|---|
| 325 | | - |
|---|
| 424 | +static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, |
|---|
| 425 | + struct net_device *dev_rx) |
|---|
| 326 | 426 | { |
|---|
| 327 | | - struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); |
|---|
| 427 | + struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); |
|---|
| 428 | + struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); |
|---|
| 328 | 429 | |
|---|
| 329 | 430 | if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) |
|---|
| 330 | | - bq_xmit_all(obj, bq, 0, true); |
|---|
| 431 | + bq_xmit_all(bq, 0); |
|---|
| 331 | 432 | |
|---|
| 332 | 433 | /* Ingress dev_rx will be the same for all xdp_frame's in |
|---|
| 333 | 434 | * bulk_queue, because bq stored per-CPU and must be flushed |
|---|
| .. | .. |
|---|
| 337 | 438 | bq->dev_rx = dev_rx; |
|---|
| 338 | 439 | |
|---|
| 339 | 440 | bq->q[bq->count++] = xdpf; |
|---|
| 340 | | - return 0; |
|---|
| 441 | + |
|---|
| 442 | + if (!bq->flush_node.prev) |
|---|
| 443 | + list_add(&bq->flush_node, flush_list); |
|---|
| 341 | 444 | } |
|---|
| 342 | 445 | |
|---|
| 343 | | -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, |
|---|
| 344 | | - struct net_device *dev_rx) |
|---|
| 446 | +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, |
|---|
| 447 | + struct net_device *dev_rx) |
|---|
| 345 | 448 | { |
|---|
| 346 | | - struct net_device *dev = dst->dev; |
|---|
| 347 | 449 | struct xdp_frame *xdpf; |
|---|
| 348 | 450 | int err; |
|---|
| 349 | 451 | |
|---|
| .. | .. |
|---|
| 354 | 456 | if (unlikely(err)) |
|---|
| 355 | 457 | return err; |
|---|
| 356 | 458 | |
|---|
| 357 | | - xdpf = convert_to_xdp_frame(xdp); |
|---|
| 459 | + xdpf = xdp_convert_buff_to_frame(xdp); |
|---|
| 358 | 460 | if (unlikely(!xdpf)) |
|---|
| 359 | 461 | return -EOVERFLOW; |
|---|
| 360 | 462 | |
|---|
| 361 | | - return bq_enqueue(dst, xdpf, dev_rx); |
|---|
| 463 | + bq_enqueue(dev, xdpf, dev_rx); |
|---|
| 464 | + return 0; |
|---|
| 465 | +} |
|---|
| 466 | + |
|---|
| 467 | +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, |
|---|
| 468 | + struct xdp_buff *xdp, |
|---|
| 469 | + struct bpf_prog *xdp_prog) |
|---|
| 470 | +{ |
|---|
| 471 | + struct xdp_txq_info txq = { .dev = dev }; |
|---|
| 472 | + u32 act; |
|---|
| 473 | + |
|---|
| 474 | + xdp_set_data_meta_invalid(xdp); |
|---|
| 475 | + xdp->txq = &txq; |
|---|
| 476 | + |
|---|
| 477 | + act = bpf_prog_run_xdp(xdp_prog, xdp); |
|---|
| 478 | + switch (act) { |
|---|
| 479 | + case XDP_PASS: |
|---|
| 480 | + return xdp; |
|---|
| 481 | + case XDP_DROP: |
|---|
| 482 | + break; |
|---|
| 483 | + default: |
|---|
| 484 | + bpf_warn_invalid_xdp_action(act); |
|---|
| 485 | + fallthrough; |
|---|
| 486 | + case XDP_ABORTED: |
|---|
| 487 | + trace_xdp_exception(dev, xdp_prog, act); |
|---|
| 488 | + break; |
|---|
| 489 | + } |
|---|
| 490 | + |
|---|
| 491 | + xdp_return_buff(xdp); |
|---|
| 492 | + return NULL; |
|---|
| 493 | +} |
|---|
| 494 | + |
|---|
| 495 | +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, |
|---|
| 496 | + struct net_device *dev_rx) |
|---|
| 497 | +{ |
|---|
| 498 | + return __xdp_enqueue(dev, xdp, dev_rx); |
|---|
| 499 | +} |
|---|
| 500 | + |
|---|
| 501 | +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, |
|---|
| 502 | + struct net_device *dev_rx) |
|---|
| 503 | +{ |
|---|
| 504 | + struct net_device *dev = dst->dev; |
|---|
| 505 | + |
|---|
| 506 | + if (dst->xdp_prog) { |
|---|
| 507 | + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); |
|---|
| 508 | + if (!xdp) |
|---|
| 509 | + return 0; |
|---|
| 510 | + } |
|---|
| 511 | + return __xdp_enqueue(dev, xdp, dev_rx); |
|---|
| 362 | 512 | } |
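For reference (not part of this diff), the per-entry program that dev_map_run_prog() invokes is an ordinary XDP program loaded with expected_attach_type BPF_XDP_DEVMAP. A hedged BPF-side sketch; the section name follows a libbpf convention that varies between versions, and the filter logic is arbitrary:

```c
/* Illustrative sketch only: an egress-side program of the kind
 * dev_map_run_prog() executes. The section name ("xdp/devmap" or
 * "xdp_devmap" depending on libbpf version) and the filter logic
 * are assumptions for the example.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp/devmap")
int egress_filter(struct xdp_md *ctx)
{
	/* egress_ifindex is filled in from the devmap entry's target device */
	if (ctx->egress_ifindex == 0)
		return XDP_DROP;

	return XDP_PASS;	/* XDP_PASS here means "transmit the frame" */
}

char _license[] SEC("license") = "GPL";
```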
|---|
| 363 | 513 | |
|---|
| 364 | 514 | int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, |
|---|
| .. | .. |
|---|
| 378 | 528 | static void *dev_map_lookup_elem(struct bpf_map *map, void *key) |
|---|
| 379 | 529 | { |
|---|
| 380 | 530 | struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); |
|---|
| 381 | | - struct net_device *dev = obj ? obj->dev : NULL; |
|---|
| 382 | 531 | |
|---|
| 383 | | - return dev ? &dev->ifindex : NULL; |
|---|
| 532 | + return obj ? &obj->val : NULL; |
|---|
| 384 | 533 | } |
|---|
| 385 | 534 | |
|---|
| 386 | | -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) |
|---|
| 535 | +static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) |
|---|
| 387 | 536 | { |
|---|
| 388 | | - if (dev->dev->netdev_ops->ndo_xdp_xmit) { |
|---|
| 389 | | - struct xdp_bulk_queue *bq; |
|---|
| 390 | | - unsigned long *bitmap; |
|---|
| 391 | | - |
|---|
| 392 | | - int cpu; |
|---|
| 393 | | - |
|---|
| 394 | | - rcu_read_lock(); |
|---|
| 395 | | - for_each_online_cpu(cpu) { |
|---|
| 396 | | - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); |
|---|
| 397 | | - __clear_bit(dev->bit, bitmap); |
|---|
| 398 | | - |
|---|
| 399 | | - bq = per_cpu_ptr(dev->bulkq, cpu); |
|---|
| 400 | | - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); |
|---|
| 401 | | - } |
|---|
| 402 | | - rcu_read_unlock(); |
|---|
| 403 | | - } |
|---|
| 537 | + struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, |
|---|
| 538 | + *(u32 *)key); |
|---|
| 539 | + return obj ? &obj->val : NULL; |
|---|
| 404 | 540 | } |
|---|
| 405 | 541 | |
|---|
| 406 | 542 | static void __dev_map_entry_free(struct rcu_head *rcu) |
|---|
| .. | .. |
|---|
| 408 | 544 | struct bpf_dtab_netdev *dev; |
|---|
| 409 | 545 | |
|---|
| 410 | 546 | dev = container_of(rcu, struct bpf_dtab_netdev, rcu); |
|---|
| 411 | | - dev_map_flush_old(dev); |
|---|
| 412 | | - free_percpu(dev->bulkq); |
|---|
| 547 | + if (dev->xdp_prog) |
|---|
| 548 | + bpf_prog_put(dev->xdp_prog); |
|---|
| 413 | 549 | dev_put(dev->dev); |
|---|
| 414 | 550 | kfree(dev); |
|---|
| 415 | 551 | } |
|---|
| .. | .. |
|---|
| 424 | 560 | return -EINVAL; |
|---|
| 425 | 561 | |
|---|
| 426 | 562 | /* Use call_rcu() here to ensure any rcu critical sections have |
|---|
| 427 | | - * completed, but this does not guarantee a flush has happened |
|---|
| 428 | | - * yet. Because driver side rcu_read_lock/unlock only protects the |
|---|
| 429 | | - * running XDP program. However, for pending flush operations the |
|---|
| 430 | | - * dev and ctx are stored in another per cpu map. And additionally, |
|---|
| 431 | | - * the driver tear down ensures all soft irqs are complete before |
|---|
| 432 | | - * removing the net device in the case of dev_put equals zero. |
|---|
| 563 | + * completed, as well as any flush operations, because call_rcu |
|---|
| 564 | + * waits for the preempt-disable region (NAPI in this context) to |
|---|
| 565 | + * complete. Additionally, the driver tear down ensures all |
|---|
| 566 | + * soft irqs are complete before removing the net device once the |
|---|
| 567 | + * reference count dropped by dev_put() reaches zero. |
|---|
| 433 | 568 | */ |
|---|
| 434 | 569 | old_dev = xchg(&dtab->netdev_map[k], NULL); |
|---|
| 435 | 570 | if (old_dev) |
|---|
| .. | .. |
|---|
| 437 | 572 | return 0; |
|---|
| 438 | 573 | } |
|---|
| 439 | 574 | |
|---|
| 440 | | -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, |
|---|
| 441 | | - u64 map_flags) |
|---|
| 575 | +static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) |
|---|
| 442 | 576 | { |
|---|
| 443 | 577 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 444 | | - struct net *net = current->nsproxy->net_ns; |
|---|
| 445 | | - gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; |
|---|
| 578 | + struct bpf_dtab_netdev *old_dev; |
|---|
| 579 | + int k = *(u32 *)key; |
|---|
| 580 | + unsigned long flags; |
|---|
| 581 | + int ret = -ENOENT; |
|---|
| 582 | + |
|---|
| 583 | + spin_lock_irqsave(&dtab->index_lock, flags); |
|---|
| 584 | + |
|---|
| 585 | + old_dev = __dev_map_hash_lookup_elem(map, k); |
|---|
| 586 | + if (old_dev) { |
|---|
| 587 | + dtab->items--; |
|---|
| 588 | + hlist_del_init_rcu(&old_dev->index_hlist); |
|---|
| 589 | + call_rcu(&old_dev->rcu, __dev_map_entry_free); |
|---|
| 590 | + ret = 0; |
|---|
| 591 | + } |
|---|
| 592 | + spin_unlock_irqrestore(&dtab->index_lock, flags); |
|---|
| 593 | + |
|---|
| 594 | + return ret; |
|---|
| 595 | +} |
|---|
| 596 | + |
|---|
| 597 | +static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, |
|---|
| 598 | + struct bpf_dtab *dtab, |
|---|
| 599 | + struct bpf_devmap_val *val, |
|---|
| 600 | + unsigned int idx) |
|---|
| 601 | +{ |
|---|
| 602 | + struct bpf_prog *prog = NULL; |
|---|
| 603 | + struct bpf_dtab_netdev *dev; |
|---|
| 604 | + |
|---|
| 605 | + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, |
|---|
| 606 | + dtab->map.numa_node); |
|---|
| 607 | + if (!dev) |
|---|
| 608 | + return ERR_PTR(-ENOMEM); |
|---|
| 609 | + |
|---|
| 610 | + dev->dev = dev_get_by_index(net, val->ifindex); |
|---|
| 611 | + if (!dev->dev) |
|---|
| 612 | + goto err_out; |
|---|
| 613 | + |
|---|
| 614 | + if (val->bpf_prog.fd > 0) { |
|---|
| 615 | + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, |
|---|
| 616 | + BPF_PROG_TYPE_XDP, false); |
|---|
| 617 | + if (IS_ERR(prog)) |
|---|
| 618 | + goto err_put_dev; |
|---|
| 619 | + if (prog->expected_attach_type != BPF_XDP_DEVMAP) |
|---|
| 620 | + goto err_put_prog; |
|---|
| 621 | + } |
|---|
| 622 | + |
|---|
| 623 | + dev->idx = idx; |
|---|
| 624 | + dev->dtab = dtab; |
|---|
| 625 | + if (prog) { |
|---|
| 626 | + dev->xdp_prog = prog; |
|---|
| 627 | + dev->val.bpf_prog.id = prog->aux->id; |
|---|
| 628 | + } else { |
|---|
| 629 | + dev->xdp_prog = NULL; |
|---|
| 630 | + dev->val.bpf_prog.id = 0; |
|---|
| 631 | + } |
|---|
| 632 | + dev->val.ifindex = val->ifindex; |
|---|
| 633 | + |
|---|
| 634 | + return dev; |
|---|
| 635 | +err_put_prog: |
|---|
| 636 | + bpf_prog_put(prog); |
|---|
| 637 | +err_put_dev: |
|---|
| 638 | + dev_put(dev->dev); |
|---|
| 639 | +err_out: |
|---|
| 640 | + kfree(dev); |
|---|
| 641 | + return ERR_PTR(-EINVAL); |
|---|
| 642 | +} |
|---|
| 643 | + |
|---|
| 644 | +static int __dev_map_update_elem(struct net *net, struct bpf_map *map, |
|---|
| 645 | + void *key, void *value, u64 map_flags) |
|---|
| 646 | +{ |
|---|
| 647 | + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 446 | 648 | struct bpf_dtab_netdev *dev, *old_dev; |
|---|
| 649 | + struct bpf_devmap_val val = {}; |
|---|
| 447 | 650 | u32 i = *(u32 *)key; |
|---|
| 448 | | - u32 ifindex = *(u32 *)value; |
|---|
| 449 | 651 | |
|---|
| 450 | 652 | if (unlikely(map_flags > BPF_EXIST)) |
|---|
| 451 | 653 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 454 | 656 | if (unlikely(map_flags == BPF_NOEXIST)) |
|---|
| 455 | 657 | return -EEXIST; |
|---|
| 456 | 658 | |
|---|
| 457 | | - if (!ifindex) { |
|---|
| 659 | + /* already verified value_size <= sizeof val */ |
|---|
| 660 | + memcpy(&val, value, map->value_size); |
|---|
| 661 | + |
|---|
| 662 | + if (!val.ifindex) { |
|---|
| 458 | 663 | dev = NULL; |
|---|
| 459 | | - } else { |
|---|
| 460 | | - dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); |
|---|
| 461 | | - if (!dev) |
|---|
| 462 | | - return -ENOMEM; |
|---|
| 463 | | - |
|---|
| 464 | | - dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), |
|---|
| 465 | | - sizeof(void *), gfp); |
|---|
| 466 | | - if (!dev->bulkq) { |
|---|
| 467 | | - kfree(dev); |
|---|
| 468 | | - return -ENOMEM; |
|---|
| 469 | | - } |
|---|
| 470 | | - |
|---|
| 471 | | - dev->dev = dev_get_by_index(net, ifindex); |
|---|
| 472 | | - if (!dev->dev) { |
|---|
| 473 | | - free_percpu(dev->bulkq); |
|---|
| 474 | | - kfree(dev); |
|---|
| 664 | + /* can not specify fd if ifindex is 0 */ |
|---|
| 665 | + if (val.bpf_prog.fd > 0) |
|---|
| 475 | 666 | return -EINVAL; |
|---|
| 476 | | - } |
|---|
| 477 | | - |
|---|
| 478 | | - dev->bit = i; |
|---|
| 479 | | - dev->dtab = dtab; |
|---|
| 667 | + } else { |
|---|
| 668 | + dev = __dev_map_alloc_node(net, dtab, &val, i); |
|---|
| 669 | + if (IS_ERR(dev)) |
|---|
| 670 | + return PTR_ERR(dev); |
|---|
| 480 | 671 | } |
|---|
| 481 | 672 | |
|---|
| 482 | 673 | /* Use call_rcu() here to ensure rcu critical sections have completed |
|---|
| .. | .. |
|---|
| 490 | 681 | return 0; |
|---|
| 491 | 682 | } |
|---|
| 492 | 683 | |
|---|
| 684 | +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, |
|---|
| 685 | + u64 map_flags) |
|---|
| 686 | +{ |
|---|
| 687 | + return __dev_map_update_elem(current->nsproxy->net_ns, |
|---|
| 688 | + map, key, value, map_flags); |
|---|
| 689 | +} |
|---|
| 690 | + |
|---|
| 691 | +static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, |
|---|
| 692 | + void *key, void *value, u64 map_flags) |
|---|
| 693 | +{ |
|---|
| 694 | + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
|---|
| 695 | + struct bpf_dtab_netdev *dev, *old_dev; |
|---|
| 696 | + struct bpf_devmap_val val = {}; |
|---|
| 697 | + u32 idx = *(u32 *)key; |
|---|
| 698 | + unsigned long flags; |
|---|
| 699 | + int err = -EEXIST; |
|---|
| 700 | + |
|---|
| 701 | + /* already verified value_size <= sizeof val */ |
|---|
| 702 | + memcpy(&val, value, map->value_size); |
|---|
| 703 | + |
|---|
| 704 | + if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) |
|---|
| 705 | + return -EINVAL; |
|---|
| 706 | + |
|---|
| 707 | + spin_lock_irqsave(&dtab->index_lock, flags); |
|---|
| 708 | + |
|---|
| 709 | + old_dev = __dev_map_hash_lookup_elem(map, idx); |
|---|
| 710 | + if (old_dev && (map_flags & BPF_NOEXIST)) |
|---|
| 711 | + goto out_err; |
|---|
| 712 | + |
|---|
| 713 | + dev = __dev_map_alloc_node(net, dtab, &val, idx); |
|---|
| 714 | + if (IS_ERR(dev)) { |
|---|
| 715 | + err = PTR_ERR(dev); |
|---|
| 716 | + goto out_err; |
|---|
| 717 | + } |
|---|
| 718 | + |
|---|
| 719 | + if (old_dev) { |
|---|
| 720 | + hlist_del_rcu(&old_dev->index_hlist); |
|---|
| 721 | + } else { |
|---|
| 722 | + if (dtab->items >= dtab->map.max_entries) { |
|---|
| 723 | + spin_unlock_irqrestore(&dtab->index_lock, flags); |
|---|
| 724 | + call_rcu(&dev->rcu, __dev_map_entry_free); |
|---|
| 725 | + return -E2BIG; |
|---|
| 726 | + } |
|---|
| 727 | + dtab->items++; |
|---|
| 728 | + } |
|---|
| 729 | + |
|---|
| 730 | + hlist_add_head_rcu(&dev->index_hlist, |
|---|
| 731 | + dev_map_index_hash(dtab, idx)); |
|---|
| 732 | + spin_unlock_irqrestore(&dtab->index_lock, flags); |
|---|
| 733 | + |
|---|
| 734 | + if (old_dev) |
|---|
| 735 | + call_rcu(&old_dev->rcu, __dev_map_entry_free); |
|---|
| 736 | + |
|---|
| 737 | + return 0; |
|---|
| 738 | + |
|---|
| 739 | +out_err: |
|---|
| 740 | + spin_unlock_irqrestore(&dtab->index_lock, flags); |
|---|
| 741 | + return err; |
|---|
| 742 | +} |
|---|
| 743 | + |
|---|
| 744 | +static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, |
|---|
| 745 | + u64 map_flags) |
|---|
| 746 | +{ |
|---|
| 747 | + return __dev_map_hash_update_elem(current->nsproxy->net_ns, |
|---|
| 748 | + map, key, value, map_flags); |
|---|
| 749 | +} |
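From user space, a DEVMAP_HASH entry using the 8-byte value layout could be populated like this (illustrative sketch; helper and variable names are assumptions):

```c
/* Illustrative sketch only: add an entry carrying both an ifindex and
 * a per-entry devmap program fd. An fd <= 0 matches the "no egress
 * program" case checked by __dev_map_alloc_node() above.
 */
#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <net/if.h>

static int add_port(int map_fd, const char *ifname, int devmap_prog_fd)
{
	struct bpf_devmap_val val = {
		.ifindex = if_nametoindex(ifname),
		.bpf_prog.fd = devmap_prog_fd,
	};
	__u32 key = val.ifindex;	/* DEVMAP_HASH keys are ifindexes here */

	return bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
}
```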
|---|
| 750 | + |
|---|
| 751 | +static int dev_map_btf_id; |
|---|
| 493 | 752 | const struct bpf_map_ops dev_map_ops = { |
|---|
| 753 | + .map_meta_equal = bpf_map_meta_equal, |
|---|
| 494 | 754 | .map_alloc = dev_map_alloc, |
|---|
| 495 | 755 | .map_free = dev_map_free, |
|---|
| 496 | 756 | .map_get_next_key = dev_map_get_next_key, |
|---|
| .. | .. |
|---|
| 498 | 758 | .map_update_elem = dev_map_update_elem, |
|---|
| 499 | 759 | .map_delete_elem = dev_map_delete_elem, |
|---|
| 500 | 760 | .map_check_btf = map_check_no_btf, |
|---|
| 761 | + .map_btf_name = "bpf_dtab", |
|---|
| 762 | + .map_btf_id = &dev_map_btf_id, |
|---|
| 501 | 763 | }; |
|---|
| 764 | + |
|---|
| 765 | +static int dev_map_hash_map_btf_id; |
|---|
| 766 | +const struct bpf_map_ops dev_map_hash_ops = { |
|---|
| 767 | + .map_meta_equal = bpf_map_meta_equal, |
|---|
| 768 | + .map_alloc = dev_map_alloc, |
|---|
| 769 | + .map_free = dev_map_free, |
|---|
| 770 | + .map_get_next_key = dev_map_hash_get_next_key, |
|---|
| 771 | + .map_lookup_elem = dev_map_hash_lookup_elem, |
|---|
| 772 | + .map_update_elem = dev_map_hash_update_elem, |
|---|
| 773 | + .map_delete_elem = dev_map_hash_delete_elem, |
|---|
| 774 | + .map_check_btf = map_check_no_btf, |
|---|
| 775 | + .map_btf_name = "bpf_dtab", |
|---|
| 776 | + .map_btf_id = &dev_map_hash_map_btf_id, |
|---|
| 777 | +}; |
|---|
| 778 | + |
|---|
| 779 | +static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, |
|---|
| 780 | + struct net_device *netdev) |
|---|
| 781 | +{ |
|---|
| 782 | + unsigned long flags; |
|---|
| 783 | + u32 i; |
|---|
| 784 | + |
|---|
| 785 | + spin_lock_irqsave(&dtab->index_lock, flags); |
|---|
| 786 | + for (i = 0; i < dtab->n_buckets; i++) { |
|---|
| 787 | + struct bpf_dtab_netdev *dev; |
|---|
| 788 | + struct hlist_head *head; |
|---|
| 789 | + struct hlist_node *next; |
|---|
| 790 | + |
|---|
| 791 | + head = dev_map_index_hash(dtab, i); |
|---|
| 792 | + |
|---|
| 793 | + hlist_for_each_entry_safe(dev, next, head, index_hlist) { |
|---|
| 794 | + if (netdev != dev->dev) |
|---|
| 795 | + continue; |
|---|
| 796 | + |
|---|
| 797 | + dtab->items--; |
|---|
| 798 | + hlist_del_rcu(&dev->index_hlist); |
|---|
| 799 | + call_rcu(&dev->rcu, __dev_map_entry_free); |
|---|
| 800 | + } |
|---|
| 801 | + } |
|---|
| 802 | + spin_unlock_irqrestore(&dtab->index_lock, flags); |
|---|
| 803 | +} |
|---|
| 502 | 804 | |
|---|
| 503 | 805 | static int dev_map_notification(struct notifier_block *notifier, |
|---|
| 504 | 806 | ulong event, void *ptr) |
|---|
| 505 | 807 | { |
|---|
| 506 | 808 | struct net_device *netdev = netdev_notifier_info_to_dev(ptr); |
|---|
| 507 | 809 | struct bpf_dtab *dtab; |
|---|
| 508 | | - int i; |
|---|
| 810 | + int i, cpu; |
|---|
| 509 | 811 | |
|---|
| 510 | 812 | switch (event) { |
|---|
| 813 | + case NETDEV_REGISTER: |
|---|
| 814 | + if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) |
|---|
| 815 | + break; |
|---|
| 816 | + |
|---|
| 817 | + /* will be freed in free_netdev() */ |
|---|
| 818 | + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); |
|---|
| 819 | + if (!netdev->xdp_bulkq) |
|---|
| 820 | + return NOTIFY_BAD; |
|---|
| 821 | + |
|---|
| 822 | + for_each_possible_cpu(cpu) |
|---|
| 823 | + per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; |
|---|
| 824 | + break; |
|---|
| 511 | 825 | case NETDEV_UNREGISTER: |
|---|
| 512 | 826 | /* This rcu_read_lock/unlock pair is needed because |
|---|
| 513 | 827 | * dev_map_list is an RCU list AND to ensure a delete |
|---|
| .. | .. |
|---|
| 516 | 830 | */ |
|---|
| 517 | 831 | rcu_read_lock(); |
|---|
| 518 | 832 | list_for_each_entry_rcu(dtab, &dev_map_list, list) { |
|---|
| 833 | + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { |
|---|
| 834 | + dev_map_hash_remove_netdev(dtab, netdev); |
|---|
| 835 | + continue; |
|---|
| 836 | + } |
|---|
| 837 | + |
|---|
| 519 | 838 | for (i = 0; i < dtab->map.max_entries; i++) { |
|---|
| 520 | 839 | struct bpf_dtab_netdev *dev, *odev; |
|---|
| 521 | 840 | |
|---|
| .. | .. |
|---|
| 542 | 861 | |
|---|
| 543 | 862 | static int __init dev_map_init(void) |
|---|
| 544 | 863 | { |
|---|
| 864 | + int cpu; |
|---|
| 865 | + |
|---|
| 545 | 866 | /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ |
|---|
| 546 | 867 | BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != |
|---|
| 547 | 868 | offsetof(struct _bpf_dtab_netdev, dev)); |
|---|
| 548 | 869 | register_netdevice_notifier(&dev_map_notifier); |
|---|
| 870 | + |
|---|
| 871 | + for_each_possible_cpu(cpu) |
|---|
| 872 | + INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); |
|---|
| 549 | 873 | return 0; |
|---|
| 550 | 874 | } |
|---|
| 551 | 875 | |
|---|