+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 
 /* Devmaps primary use is as a backend map for XDP BPF helper call
...
  * datapath always has a valid copy. However, the datapath does a "flush"
  * operation that pushes any pending packets in the driver outside the RCU
  * critical section. Each bpf_dtab_netdev tracks these pending operations using
- * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
- * until all bits are cleared indicating outstanding flush operations have
- * completed.
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
  *
  * BPF syscalls may race with BPF program calls on any of the update, delete
  * or lookup operations. As noted above the xchg() operation also keep the
...
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <net/xdp.h>
...
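
Note: the comment above explains that devmap_hash keys are interpreted as ifindexes. For reference, a minimal sketch of how a BPF program might redirect into such a map (the map name, sizes, and the choice of ingress_ifindex as key are illustrative, not part of this patch):

	/* Illustrative only: a DEVMAP_HASH keyed by ifindex, driven from an
	 * XDP program via bpf_redirect_map().
	 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32));	/* ifindex only, no per-entry prog */
		__uint(max_entries, 64);
	} tx_devices SEC(".maps");

	SEC("xdp")
	int redirect_by_ifindex(struct xdp_md *ctx)
	{
		/* key the map by the packet's ingress ifindex */
		return bpf_redirect_map(&tx_devices, ctx->ingress_ifindex, 0);
	}

	char _license[] SEC("license") = "GPL";
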
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
-#define DEV_MAP_BULK_SIZE 16
-struct xdp_bulk_queue {
+struct xdp_dev_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct list_head flush_node;
+	struct net_device *dev;
 	struct net_device *dev_rx;
 	unsigned int count;
 };
 
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
+	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
-	unsigned int bit;
-	struct xdp_bulk_queue __percpu *bulkq;
+	struct bpf_prog *xdp_prog;
 	struct rcu_head rcu;
+	unsigned int idx;
+	struct bpf_devmap_val val;
 };
 
 struct bpf_dtab {
 	struct bpf_map map;
-	struct bpf_dtab_netdev **netdev_map;
-	unsigned long __percpu *flush_needed;
+	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
 	struct list_head list;
+
+	/* these are only used for DEVMAP_HASH type maps */
+	struct hlist_head *dev_index_head;
+	spinlock_t index_lock;
+	unsigned int items;
+	u32 n_buckets;
 };
 
+static DEFINE_PER_CPU(struct list_head, dev_flush_list);
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+					      int numa_node)
 {
-	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
+	int i;
+	struct hlist_head *hash;
+
+	hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
+	if (hash != NULL)
+		for (i = 0; i < entries; i++)
+			INIT_HLIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+						    int idx)
+{
+	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
+	u32 valsize = attr->value_size;
+	u64 cost = 0;
+	int err;
+
+	/* check sanity of attributes. 2 value sizes supported:
+	 * 4 bytes: ifindex
+	 * 8 bytes: ifindex + prog fd
+	 */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
+	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+		return -EINVAL;
+
+	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
+	 * verifier prevents writes from the BPF side
+	 */
+	attr->map_flags |= BPF_F_RDONLY_PROG;
+
+
+	bpf_map_init_from_attr(&dtab->map, attr);
+
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
+		if (!dtab->n_buckets) /* Overflow check */
+			return -EINVAL;
+		cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
+	} else {
+		cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+	}
+
+	/* if map size is larger than memlock limit, reject it */
+	err = bpf_map_charge_init(&dtab->map.memory, cost);
+	if (err)
+		return -EINVAL;
+
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+							   dtab->map.numa_node);
+		if (!dtab->dev_index_head)
+			goto free_charge;
+
+		spin_lock_init(&dtab->index_lock);
+	} else {
+		dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
+						      sizeof(struct bpf_dtab_netdev *),
+						      dtab->map.numa_node);
+		if (!dtab->netdev_map)
+			goto free_charge;
+	}
+
+	return 0;
+
+free_charge:
+	bpf_map_charge_finish(&dtab->map.memory);
+	return -ENOMEM;
 }
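
Note: dev_map_init_map() above accepts two value sizes (4-byte ifindex, or 8-byte struct bpf_devmap_val with a prog fd). A hedged userspace sketch of creating a DEVMAP_HASH with the 8-byte value via the raw bpf() syscall; max_entries is arbitrary and the helper name is a placeholder:

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int create_devmap_hash(void)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_DEVMAP_HASH;
		attr.key_size = 4;	/* keys are ifindexes */
		attr.value_size = 8;	/* struct bpf_devmap_val: ifindex + bpf_prog.fd */
		attr.max_entries = 64;	/* rounded up to a power of two for the buckets */

		return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	}
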
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
-	int err = -EINVAL;
-	u64 cost;
+	int err;
 
 	if (!capable(CAP_NET_ADMIN))
 		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
 
-	bpf_map_init_from_attr(&dtab->map, attr);
-
-	/* make sure page count doesn't overflow */
-	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
-	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_dtab;
-
-	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(dtab->map.pages);
-	if (err)
-		goto free_dtab;
-
-	err = -ENOMEM;
-
-	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
-						__alignof__(unsigned long),
-						GFP_KERNEL | __GFP_NOWARN);
-	if (!dtab->flush_needed)
-		goto free_dtab;
-
-	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
-					      sizeof(struct bpf_dtab_netdev *),
-					      dtab->map.numa_node);
-	if (!dtab->netdev_map)
-		goto free_dtab;
+	err = dev_map_init_map(dtab, attr);
+	if (err) {
+		kfree(dtab);
+		return ERR_PTR(err);
+	}
 
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
 	return &dtab->map;
-free_dtab:
-	free_percpu(dtab->flush_needed);
-	kfree(dtab);
-	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i, cpu;
+	int i;
 
 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the programs (can be more than one that used this map) were
-	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete. The rcu critical section only guarantees
-	 * no further reads against netdev_map. It does __not__ ensure pending
-	 * flush operations (if any) are complete.
+	 * disconnected from events. The following synchronize_rcu() guarantees
+	 * both rcu read critical sections complete and waits for
+	 * preempt-disable regions (NAPI being the relevant context here) so we
+	 * are certain there will be no further reads against the netdev_map and
+	 * all flush operations are complete. Flush operations can only be done
+	 * from NAPI context for this reason.
 	 */
 
 	spin_lock(&dev_map_lock);
...
 	/* Make sure prior __dev_map_entry_free() have completed. */
 	rcu_barrier();
 
-	/* To ensure all pending flush operations have completed wait for flush
-	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
-	 * Because the above synchronize_rcu() ensures the map is disconnected
-	 * from the program we can assume no new bits will be set.
-	 */
-	for_each_online_cpu(cpu) {
-		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		for (i = 0; i < dtab->n_buckets; i++) {
+			struct bpf_dtab_netdev *dev;
+			struct hlist_head *head;
+			struct hlist_node *next;
 
-		while (!bitmap_empty(bitmap, dtab->map.max_entries))
-			cond_resched();
+			head = dev_map_index_hash(dtab, i);
+
+			hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+				hlist_del_rcu(&dev->index_hlist);
+				if (dev->xdp_prog)
+					bpf_prog_put(dev->xdp_prog);
+				dev_put(dev->dev);
+				kfree(dev);
+			}
+		}
+
+		bpf_map_area_free(dtab->dev_index_head);
+	} else {
+		for (i = 0; i < dtab->map.max_entries; i++) {
+			struct bpf_dtab_netdev *dev;
+
+			dev = dtab->netdev_map[i];
+			if (!dev)
+				continue;
+
+			if (dev->xdp_prog)
+				bpf_prog_put(dev->xdp_prog);
+			dev_put(dev->dev);
+			kfree(dev);
+		}
+
+		bpf_map_area_free(dtab->netdev_map);
 	}
 
-	for (i = 0; i < dtab->map.max_entries; i++) {
-		struct bpf_dtab_netdev *dev;
-
-		dev = dtab->netdev_map[i];
-		if (!dev)
-			continue;
-
-		free_percpu(dev->bulkq);
-		dev_put(dev->dev);
-		kfree(dev);
-	}
-
-	free_percpu(dtab->flush_needed);
-	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
 }
 
...
 	return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+	struct hlist_head *head = dev_map_index_hash(dtab, key);
+	struct bpf_dtab_netdev *dev;
 
-	__set_bit(bit, bitmap);
+	hlist_for_each_entry_rcu(dev, head, index_hlist,
+				 lockdep_is_held(&dtab->index_lock))
+		if (dev->idx == key)
+			return dev;
+
+	return NULL;
 }
 
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-		       struct xdp_bulk_queue *bq, u32 flags,
-		       bool in_napi_ctx)
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+				     void *next_key)
 {
-	struct net_device *dev = obj->dev;
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 idx, *next = next_key;
+	struct bpf_dtab_netdev *dev, *next_dev;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first;
+
+	idx = *(u32 *)key;
+
+	dev = __dev_map_hash_lookup_elem(map, idx);
+	if (!dev)
+		goto find_first;
+
+	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+				    struct bpf_dtab_netdev, index_hlist);
+
+	if (next_dev) {
+		*next = next_dev->idx;
+		return 0;
+	}
+
+	i = idx & (dtab->n_buckets - 1);
+	i++;
+
+ find_first:
+	for (; i < dtab->n_buckets; i++) {
+		head = dev_map_index_hash(dtab, i);
+
+		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					    struct bpf_dtab_netdev,
+					    index_hlist);
+		if (next_dev) {
+			*next = next_dev->idx;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+bool dev_map_can_have_prog(struct bpf_map *map)
+{
+	if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
+	     map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
+	    map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
+		return true;
+
+	return false;
+}
+
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+{
+	struct net_device *dev = bq->dev;
 	int sent = 0, drops = 0, err = 0;
 	int i;
 
 	if (unlikely(!bq->count))
-		return 0;
+		return;
 
 	for (i = 0; i < bq->count; i++) {
 		struct xdp_frame *xdpf = bq->q[i];
...
 out:
 	bq->count = 0;
 
-	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
-			      sent, drops, bq->dev_rx, dev, err);
+	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
 	bq->dev_rx = NULL;
-	return 0;
+	__list_del_clearprev(&bq->flush_node);
+	return;
 error:
 	/* If ndo_xdp_xmit fails with an errno, no frames have been
 	 * xmit'ed and it's our responsibility to them free all.
...
 	for (i = 0; i < bq->count; i++) {
 		struct xdp_frame *xdpf = bq->q[i];
 
-		/* RX path under NAPI protection, can return frames faster */
-		if (likely(in_napi_ctx))
-			xdp_return_frame_rx_napi(xdpf);
-		else
-			xdp_return_frame(xdpf);
+		xdp_return_frame_rx_napi(xdpf);
 		drops++;
 	}
 	goto out;
 }
 
-/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
  * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the ctx bitmap
- * is zeroed before completing to ensure all flush operations have completed.
+ * net device can be torn down. On devmap tear down we ensure the flush list
+ * is empty before completing to ensure all flush operations have completed.
+ * When drivers update the bpf program they may need to ensure any flush ops
+ * are also complete. Using synchronize_rcu or call_rcu will suffice for this
+ * because both wait for napi context to exit.
  */
-void __dev_map_flush(struct bpf_map *map)
+void __dev_flush(void)
 {
-	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-	u32 bit;
+	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+	struct xdp_dev_bulk_queue *bq, *tmp;
 
-	rcu_read_lock();
-	for_each_set_bit(bit, bitmap, map->max_entries) {
-		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
-		struct xdp_bulk_queue *bq;
-
-		/* This is possible if the dev entry is removed by user space
-		 * between xdp redirect and flush op.
-		 */
-		if (unlikely(!dev))
-			continue;
-
-		bq = this_cpu_ptr(dev->bulkq);
-		bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
-
-		__clear_bit(bit, bitmap);
-	}
-	rcu_read_unlock();
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+		bq_xmit_all(bq, XDP_XMIT_FLUSH);
 }
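
Note: the comment above requires drivers to call xdp_do_flush() (which ends up in __dev_flush()) before leaving napi->poll(). A simplified, driver-agnostic sketch of that calling pattern; the RX handling and example_rx_frame_pending() are placeholders, not from any real driver:

	/* Sketch of a NAPI poll routine honouring the flush contract. */
	static int example_napi_poll(struct napi_struct *napi, int budget)
	{
		int work_done = 0;

		while (work_done < budget && example_rx_frame_pending(napi)) {
			/* ... build the xdp_buff and run the XDP program;
			 * XDP_REDIRECT eventually reaches bq_enqueue() via
			 * dev_map_enqueue() ...
			 */
			work_done++;
		}

		/* drain the per-cpu dev_flush_list before leaving NAPI context */
		xdp_do_flush();

		if (work_done < budget)
			napi_complete_done(napi, work_done);
		return work_done;
	}
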
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
...
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
-		      struct net_device *dev_rx)
-
+static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+		       struct net_device *dev_rx)
 {
-	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
+	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(obj, bq, 0, true);
+		bq_xmit_all(bq, 0);
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
...
 	bq->dev_rx = dev_rx;
 
 	bq->q[bq->count++] = xdpf;
-	return 0;
+
+	if (!bq->flush_node.prev)
+		list_add(&bq->flush_node, flush_list);
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
-		    struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+				struct net_device *dev_rx)
 {
-	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
 	int err;
 
...
 	if (unlikely(err))
 		return err;
 
-	xdpf = convert_to_xdp_frame(xdp);
+	xdpf = xdp_convert_buff_to_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	return bq_enqueue(dst, xdpf, dev_rx);
+	bq_enqueue(dev, xdpf, dev_rx);
+	return 0;
+}
+
+static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
+					 struct xdp_buff *xdp,
+					 struct bpf_prog *xdp_prog)
+{
+	struct xdp_txq_info txq = { .dev = dev };
+	u32 act;
+
+	xdp_set_data_meta_invalid(xdp);
+	xdp->txq = &txq;
+
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
+	switch (act) {
+	case XDP_PASS:
+		return xdp;
+	case XDP_DROP:
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(dev, xdp_prog, act);
+		break;
+	}
+
+	xdp_return_buff(xdp);
+	return NULL;
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	return __xdp_enqueue(dev, xdp, dev_rx);
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct net_device *dev = dst->dev;
+
+	if (dst->xdp_prog) {
+		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
+		if (!xdp)
+			return 0;
+	}
+	return __xdp_enqueue(dev, xdp, dev_rx);
 }
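
Note: dev_map_run_prog() above runs a second XDP program on the egress side of a devmap entry; such a program has to be loaded with expected_attach_type == BPF_XDP_DEVMAP. A hedged sketch of what one might look like; the exact SEC() name depends on the libbpf version in use (it only needs to select BPF_XDP_DEVMAP), and the runt-frame check is purely illustrative:

	#include <linux/bpf.h>
	#include <linux/if_ether.h>
	#include <bpf/bpf_helpers.h>

	/* Runs after the redirect decision, just before the frame is queued to
	 * the target device; dropping here never reaches ndo_xdp_xmit().
	 */
	SEC("xdp_devmap")
	int devmap_egress_filter(struct xdp_md *ctx)
	{
		void *data = (void *)(long)ctx->data;
		void *data_end = (void *)(long)ctx->data_end;

		if (data + ETH_HLEN > data_end)	/* drop runt frames */
			return XDP_DROP;

		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";
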
 
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
...
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = obj ? obj->dev : NULL;
 
-	return dev ? &dev->ifindex : NULL;
+	return obj ? &obj->val : NULL;
 }
 
-static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
 {
-	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
-		struct xdp_bulk_queue *bq;
-		unsigned long *bitmap;
-
-		int cpu;
-
-		rcu_read_lock();
-		for_each_online_cpu(cpu) {
-			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
-			__clear_bit(dev->bit, bitmap);
-
-			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
-		}
-		rcu_read_unlock();
-	}
+	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+								 *(u32 *)key);
+	return obj ? &obj->val : NULL;
 }
 
 static void __dev_map_entry_free(struct rcu_head *rcu)
...
 	struct bpf_dtab_netdev *dev;
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
-	dev_map_flush_old(dev);
-	free_percpu(dev->bulkq);
+	if (dev->xdp_prog)
+		bpf_prog_put(dev->xdp_prog);
 	dev_put(dev->dev);
 	kfree(dev);
 }
...
 		return -EINVAL;
 
 	/* Use call_rcu() here to ensure any rcu critical sections have
-	 * completed, but this does not guarantee a flush has happened
-	 * yet. Because driver side rcu_read_lock/unlock only protects the
-	 * running XDP program. However, for pending flush operations the
-	 * dev and ctx are stored in another per cpu map. And additionally,
-	 * the driver tear down ensures all soft irqs are complete before
-	 * removing the net device in the case of dev_put equals zero.
+	 * completed as well as any flush operations because call_rcu
+	 * will wait for preempt-disable region to complete, NAPI in this
+	 * context. And additionally, the driver tear down ensures all
+	 * soft irqs are complete before removing the net device in the
+	 * case of dev_put equals zero.
 	 */
 	old_dev = xchg(&dtab->netdev_map[k], NULL);
 	if (old_dev)
...
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct net *net = current->nsproxy->net_ns;
-	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+	unsigned long flags;
+	int ret = -ENOENT;
+
+	spin_lock_irqsave(&dtab->index_lock, flags);
+
+	old_dev = __dev_map_hash_lookup_elem(map, k);
+	if (old_dev) {
+		dtab->items--;
+		hlist_del_init_rcu(&old_dev->index_hlist);
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+	return ret;
+}
+
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+						    struct bpf_dtab *dtab,
+						    struct bpf_devmap_val *val,
+						    unsigned int idx)
+{
+	struct bpf_prog *prog = NULL;
+	struct bpf_dtab_netdev *dev;
+
+	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+			   dtab->map.numa_node);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->dev = dev_get_by_index(net, val->ifindex);
+	if (!dev->dev)
+		goto err_out;
+
+	if (val->bpf_prog.fd > 0) {
+		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+					     BPF_PROG_TYPE_XDP, false);
+		if (IS_ERR(prog))
+			goto err_put_dev;
+		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+			goto err_put_prog;
+	}
+
+	dev->idx = idx;
+	dev->dtab = dtab;
+	if (prog) {
+		dev->xdp_prog = prog;
+		dev->val.bpf_prog.id = prog->aux->id;
+	} else {
+		dev->xdp_prog = NULL;
+		dev->val.bpf_prog.id = 0;
+	}
+	dev->val.ifindex = val->ifindex;
+
+	return dev;
+err_put_prog:
+	bpf_prog_put(prog);
+err_put_dev:
+	dev_put(dev->dev);
+err_out:
+	kfree(dev);
+	return ERR_PTR(-EINVAL);
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				 void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
+	struct bpf_devmap_val val = {};
 	u32 i = *(u32 *)key;
-	u32 ifindex = *(u32 *)value;
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
...
 	if (unlikely(map_flags == BPF_NOEXIST))
 		return -EEXIST;
 
-	if (!ifindex) {
+	/* already verified value_size <= sizeof val */
+	memcpy(&val, value, map->value_size);
+
+	if (!val.ifindex) {
 		dev = NULL;
-	} else {
-		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
-		if (!dev)
-			return -ENOMEM;
-
-		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
-						sizeof(void *), gfp);
-		if (!dev->bulkq) {
-			kfree(dev);
-			return -ENOMEM;
-		}
-
-		dev->dev = dev_get_by_index(net, ifindex);
-		if (!dev->dev) {
-			free_percpu(dev->bulkq);
-			kfree(dev);
+		/* can not specify fd if ifindex is 0 */
+		if (val.bpf_prog.fd > 0)
 			return -EINVAL;
-		}
-
-		dev->bit = i;
-		dev->dtab = dtab;
+	} else {
+		dev = __dev_map_alloc_node(net, dtab, &val, i);
+		if (IS_ERR(dev))
+			return PTR_ERR(dev);
 	}
 
 	/* Use call_rcu() here to ensure rcu critical sections have completed
...
 	return 0;
 }
 
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	return __dev_map_update_elem(current->nsproxy->net_ns,
+				     map, key, value, map_flags);
+}
+
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				      void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev, *old_dev;
+	struct bpf_devmap_val val = {};
+	u32 idx = *(u32 *)key;
+	unsigned long flags;
+	int err = -EEXIST;
+
+	/* already verified value_size <= sizeof val */
+	memcpy(&val, value, map->value_size);
+
+	if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
+		return -EINVAL;
+
+	spin_lock_irqsave(&dtab->index_lock, flags);
+
+	old_dev = __dev_map_hash_lookup_elem(map, idx);
+	if (old_dev && (map_flags & BPF_NOEXIST))
+		goto out_err;
+
+	dev = __dev_map_alloc_node(net, dtab, &val, idx);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		goto out_err;
+	}
+
+	if (old_dev) {
+		hlist_del_rcu(&old_dev->index_hlist);
+	} else {
+		if (dtab->items >= dtab->map.max_entries) {
+			spin_unlock_irqrestore(&dtab->index_lock, flags);
+			call_rcu(&dev->rcu, __dev_map_entry_free);
+			return -E2BIG;
+		}
+		dtab->items++;
+	}
+
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_map_index_hash(dtab, idx));
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+	if (old_dev)
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+	return 0;
+
+out_err:
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+	return err;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 map_flags)
+{
+	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+					  map, key, value, map_flags);
+}
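
Note: for reference, a hedged userspace sketch of populating one entry through the hash update path above, keyed by ifindex and optionally attaching a per-entry program; map_fd, ifindex and prog_fd are placeholders supplied by the caller:

	#include <linux/bpf.h>
	#include <bpf/bpf.h>

	/* prog_fd <= 0 leaves the entry without a per-entry program. */
	static int add_devmap_hash_entry(int map_fd, __u32 ifindex, int prog_fd)
	{
		struct bpf_devmap_val val = {
			.ifindex = ifindex,
			.bpf_prog.fd = prog_fd,
		};

		return bpf_map_update_elem(map_fd, &ifindex, &val, BPF_ANY);
	}
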
+
+static int dev_map_btf_id;
 const struct bpf_map_ops dev_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
 	.map_get_next_key = dev_map_get_next_key,
...
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_dtab",
+	.map_btf_id = &dev_map_btf_id,
 };
+
+static int dev_map_hash_map_btf_id;
+const struct bpf_map_ops dev_map_hash_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_hash_get_next_key,
+	.map_lookup_elem = dev_map_hash_lookup_elem,
+	.map_update_elem = dev_map_hash_update_elem,
+	.map_delete_elem = dev_map_hash_delete_elem,
+	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_dtab",
+	.map_btf_id = &dev_map_hash_map_btf_id,
+};
+
+static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
+				       struct net_device *netdev)
+{
+	unsigned long flags;
+	u32 i;
+
+	spin_lock_irqsave(&dtab->index_lock, flags);
+	for (i = 0; i < dtab->n_buckets; i++) {
+		struct bpf_dtab_netdev *dev;
+		struct hlist_head *head;
+		struct hlist_node *next;
+
+		head = dev_map_index_hash(dtab, i);
+
+		hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+			if (netdev != dev->dev)
+				continue;
+
+			dtab->items--;
+			hlist_del_rcu(&dev->index_hlist);
+			call_rcu(&dev->rcu, __dev_map_entry_free);
+		}
+	}
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+}
 
 static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
 	struct bpf_dtab *dtab;
-	int i;
+	int i, cpu;
 
 	switch (event) {
+	case NETDEV_REGISTER:
+		if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+			break;
+
+		/* will be freed in free_netdev() */
+		netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
+		if (!netdev->xdp_bulkq)
+			return NOTIFY_BAD;
+
+		for_each_possible_cpu(cpu)
+			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because
 		 * dev_map_list is an RCU list AND to ensure a delete
...
 		 */
 		rcu_read_lock();
 		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+			if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+				dev_map_hash_remove_netdev(dtab, netdev);
+				continue;
+			}
+
 			for (i = 0; i < dtab->map.max_entries; i++) {
 				struct bpf_dtab_netdev *dev, *odev;
 
...
 
 static int __init dev_map_init(void)
 {
+	int cpu;
+
 	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
 	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
 		     offsetof(struct _bpf_dtab_netdev, dev));
 	register_netdevice_notifier(&dev_map_notifier);
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
 	return 0;
 }
 