@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* bpf/cpumap.c
  *
  * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
  */
 
 /* The 'cpumap' is primarily used as a backend map for XDP BPF helper
@@ -32,14 +32,19 @@
 
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
  * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
  * which queue in bpf_cpu_map_entry contains packets.
  */
 
 #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
 struct xdp_bulk_queue {
 	void *q[CPU_MAP_BULK_SIZE];
+	struct list_head flush_node;
+	struct bpf_cpu_map_entry *obj;
 	unsigned int count;
 };
 
@@ -47,48 +52,49 @@
 struct bpf_cpu_map_entry {
 	u32 cpu; /* kthread CPU and map index */
 	int map_id; /* Back reference to map */
-	u32 qsize; /* Queue size placeholder for map lookup */
 
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
 	struct xdp_bulk_queue __percpu *bulkq;
 
+	struct bpf_cpu_map *cmap;
+
 	/* Queue with potential multi-producers, and single-consumer kthread */
 	struct ptr_ring *queue;
 	struct task_struct *kthread;
-	struct work_struct kthread_stop_wq;
+
+	struct bpf_cpumap_val value;
+	struct bpf_prog *prog;
 
 	atomic_t refcnt; /* Control when this struct can be free'ed */
 	struct rcu_head rcu;
+
+	struct work_struct kthread_stop_wq;
 };
 
 struct bpf_cpu_map {
 	struct bpf_map map;
 	/* Below members specific for map type */
 	struct bpf_cpu_map_entry **cpu_map;
-	unsigned long __percpu *flush_needed;
 };
 
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
-			     struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
 
 static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 {
+	u32 value_size = attr->value_size;
	struct bpf_cpu_map *cmap;
 	int err = -ENOMEM;
 	u64 cost;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!bpf_capable())
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+	     value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+	    attr->map_flags & ~BPF_F_NUMA_NODE)
 		return ERR_PTR(-EINVAL);
 
 	cmap = kzalloc(sizeof(*cmap), GFP_USER);
@@ -105,34 +111,24 @@
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
-	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_cmap;
-	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	/* Notice returns -EPERM on if map size is larger than memlock limit */
-	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	ret = bpf_map_charge_init(&cmap->map.memory, cost);
 	if (ret) {
 		err = ret;
 		goto free_cmap;
 	}
-
-	/* A per cpu bitfield with a bit per possible CPU in map */
-	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
-	if (!cmap->flush_needed)
-		goto free_cmap;
 
 	/* Alloc array for possible remote "destination" CPUs */
 	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
 					   sizeof(struct bpf_cpu_map_entry *),
 					   cmap->map.numa_node);
 	if (!cmap->cpu_map)
-		goto free_percpu;
+		goto free_charge;
 
 	return &cmap->map;
-free_percpu:
-	free_percpu(cmap->flush_needed);
+free_charge:
+	bpf_map_charge_finish(&cmap->map.memory);
 free_cmap:
 	kfree(cmap);
 	return ERR_PTR(err);
@@ -159,40 +155,24 @@
 	kthread_stop(rcpu->kthread);
 }
 
-static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_frame *xdpf)
+static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf,
+					 struct sk_buff *skb)
 {
 	unsigned int hard_start_headroom;
 	unsigned int frame_size;
 	void *pkt_data_start;
-	struct sk_buff *skb;
 
 	/* Part of headroom was reserved to xdpf */
 	hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
 
-	/* build_skb need to place skb_shared_info after SKB end, and
-	 * also want to know the memory "truesize". Thus, need to
-	 * know the memory frame size backing xdp_buff.
-	 *
-	 * XDP was designed to have PAGE_SIZE frames, but this
-	 * assumption is not longer true with ixgbe and i40e. It
-	 * would be preferred to set frame_size to 2048 or 4096
-	 * depending on the driver.
-	 * frame_size = 2048;
-	 * frame_len = frame_size - sizeof(*xdp_frame);
-	 *
-	 * Instead, with info avail, skb_shared_info in placed after
-	 * packet len. This, unfortunately fakes the truesize.
-	 * Another disadvantage of this approach, the skb_shared_info
-	 * is not at a fixed memory location, with mixed length
-	 * packets, which is bad for cache-line hotness.
+	/* Memory size backing xdp_frame data already have reserved
+	 * room for build_skb to place skb_shared_info in tailroom.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
-		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	frame_size = xdpf->frame_sz;
 
 	pkt_data_start = xdpf->data - hard_start_headroom;
-	skb = build_skb(pkt_data_start, frame_size);
-	if (!skb)
+	skb = build_skb_around(skb, pkt_data_start, frame_size);
+	if (unlikely(!skb))
 		return NULL;
 
 	skb_reserve(skb, hard_start_headroom);
@@ -208,6 +188,9 @@
 	 * - HW RX hash (skb_set_hash)
 	 * - RX ring dev queue index (skb_record_rx_queue)
 	 */
+
+	/* Until page_pool get SKB return path, release DMA here */
+	xdp_release_frame(xdpf);
 
 	/* Allow SKB to reuse area used by xdp_frame */
 	xdp_scrub_frame(xdpf);
@@ -232,6 +215,8 @@
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		if (rcpu->prog)
+			bpf_prog_put(rcpu->prog);
 		/* The queue should be empty at this point */
 		__cpu_map_ring_cleanup(rcpu->queue);
 		ptr_ring_cleanup(rcpu->queue, NULL);
@@ -239,6 +224,77 @@
 		kfree(rcpu);
 	}
 }
+
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+				    void **frames, int n,
+				    struct xdp_cpumap_stats *stats)
+{
+	struct xdp_rxq_info rxq;
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	if (!rcpu->prog)
+		return n;
+
+	rcu_read_lock_bh();
+
+	xdp_set_return_frame_no_direct();
+	xdp.rxq = &rxq;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		rxq.dev = xdpf->dev_rx;
+		rxq.mem = xdpf->mem;
+		/* TODO: report queue_index to xdp_rxq_info */
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+
+		act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (err < 0) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				frames[nframes++] = xdpf;
+				stats->pass++;
+			}
+			break;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+					      rcpu->prog);
+			if (unlikely(err)) {
+				xdp_return_frame(xdpf);
+				stats->drop++;
+			} else {
+				stats->redirect++;
+			}
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			fallthrough;
+		case XDP_DROP:
+			xdp_return_frame(xdpf);
+			stats->drop++;
+			break;
+		}
+	}
+
+	if (stats->redirect)
+		xdp_do_flush_map();
+
+	xdp_clear_return_frame_no_direct();
+
+	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+
+	return nframes;
+}
+
+#define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
 {
@@ -252,8 +308,12 @@
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
-		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_frame *xdpf;
+		struct xdp_cpumap_stats stats = {}; /* zero stats */
+		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+		unsigned int drops = 0, sched = 0;
+		void *frames[CPUMAP_BATCH];
+		void *skbs[CPUMAP_BATCH];
+		int i, n, m, nframes;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -269,18 +329,42 @@
 			sched = cond_resched();
 		}
 
-		/* Process packets in rcpu->queue */
-		local_bh_disable();
 		/*
 		 * The bpf_cpu_map_entry is single consumer, with this
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
-			struct sk_buff *skb;
+		n = __ptr_ring_consume_batched(rcpu->queue, frames,
+					       CPUMAP_BATCH);
+		for (i = 0; i < n; i++) {
+			void *f = frames[i];
+			struct page *page = virt_to_page(f);
+
+			/* Bring struct page memory area to curr CPU. Read by
+			 * build_skb_around via page_is_pfmemalloc(), and when
+			 * freed written by page_frag_free call.
+			 */
+			prefetchw(page);
+		}
+
+		/* Support running another XDP prog on this CPU */
+		nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+		if (nframes) {
+			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+			if (unlikely(m == 0)) {
+				for (i = 0; i < nframes; i++)
+					skbs[i] = NULL; /* effect: xdp_return_frame */
+				drops += nframes;
+			}
+		}
+
+		local_bh_disable();
+		for (i = 0; i < nframes; i++) {
+			struct xdp_frame *xdpf = frames[i];
+			struct sk_buff *skb = skbs[i];
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdpf);
+			skb = cpu_map_build_skb(xdpf, skb);
 			if (!skb) {
 				xdp_return_frame(xdpf);
 				continue;
@@ -290,13 +374,9 @@
 			ret = netif_receive_skb_core(skb);
 			if (ret == NET_RX_DROP)
 				drops++;
-
-			/* Limit BH-disable period */
-			if (++processed == 8)
-				break;
 		}
 		/* Feedback loop via tracepoint */
-		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
 
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
@@ -306,12 +386,38 @@
 	return 0;
 }
 
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
-						       int map_id)
+bool cpu_map_prog_allowed(struct bpf_map *map)
 {
+	return map->map_type == BPF_MAP_TYPE_CPUMAP &&
+	       map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
+}
+
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+{
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	rcpu->value.bpf_prog.id = prog->aux->id;
+	rcpu->prog = prog;
+
+	return 0;
+}
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
+{
+	int numa, err, i, fd = value->bpf_prog.fd;
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
-	int numa, err;
+	struct xdp_bulk_queue *bq;
 
 	/* Have map->numa_node, but choose node of redirect target CPU */
 	numa = cpu_to_node(cpu);
@@ -326,24 +432,32 @@
 	if (!rcpu->bulkq)
 		goto free_rcu;
 
+	for_each_possible_cpu(i) {
+		bq = per_cpu_ptr(rcpu->bulkq, i);
+		bq->obj = rcpu;
+	}
+
 	/* Alloc queue */
 	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
 	if (!rcpu->queue)
 		goto free_bulkq;
 
-	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
 	if (err)
 		goto free_queue;
 
 	rcpu->cpu = cpu;
 	rcpu->map_id = map_id;
-	rcpu->qsize = qsize;
+	rcpu->value.qsize = value->qsize;
+
+	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+		goto free_ptr_ring;
 
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
					       "cpumap/%d/map:%d", cpu, map_id);
 	if (IS_ERR(rcpu->kthread))
-		goto free_ptr_ring;
+		goto free_prog;
 
 	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
 	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
@@ -354,6 +468,9 @@
 
 	return rcpu;
 
+free_prog:
+	if (rcpu->prog)
+		bpf_prog_put(rcpu->prog);
 free_ptr_ring:
 	ptr_ring_cleanup(rcpu->queue, NULL);
 free_queue:
@@ -368,22 +485,14 @@
 static void __cpu_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_cpu_map_entry *rcpu;
-	int cpu;
 
 	/* This cpu_map_entry have been disconnected from map and one
-	 * RCU graze-period have elapsed. Thus, XDP cannot queue any
+	 * RCU grace-period have elapsed. Thus, XDP cannot queue any
 	 * new packets and cannot change/set flush_needed that can
 	 * find this entry.
 	 */
 	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
 
-	/* Flush remaining packets in percpu bulkq */
-	for_each_online_cpu(cpu) {
-		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
-
-		/* No concurrent bq_enqueue can run at this point */
-		bq_flush_to_queue(rcpu, bq, false);
-	}
 	free_percpu(rcpu->bulkq);
 	/* Cannot kthread_stop() here, last put free rcpu resources */
 	put_cpu_map_entry(rcpu);
@@ -405,7 +514,7 @@
  * percpu bulkq to queue. Due to caller map_delete_elem() disable
  * preemption, cannot call kthread_stop() to make sure queue is empty.
  * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * cpu_map_kthread_stop, which waits for an RCU grace period before
  * stopping kthread, emptying the queue.
  */
 static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
@@ -438,12 +547,12 @@
			       u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpumap_val cpumap_value = {};
 	struct bpf_cpu_map_entry *rcpu;
-
 	/* Array index key correspond to CPU number */
 	u32 key_cpu = *(u32 *)key;
-	/* Value is the queue size */
-	u32 qsize = *(u32 *)value;
+
+	memcpy(&cpumap_value, value, map->value_size);
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
@@ -451,20 +560,21 @@
 		return -E2BIG;
 	if (unlikely(map_flags == BPF_NOEXIST))
 		return -EEXIST;
-	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+	if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
 		return -EOVERFLOW;
 
 	/* Make sure CPU is a valid possible cpu */
 	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
 		return -ENODEV;
 
-	if (qsize == 0) {
+	if (cpumap_value.qsize == 0) {
 		rcpu = NULL; /* Same as deleting */
 	} else {
 		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
-		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
 		if (!rcpu)
 			return -ENOMEM;
+		rcpu->cmap = cmap;
 	}
 	rcu_read_lock();
 	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -475,7 +585,6 @@
 static void cpu_map_free(struct bpf_map *map)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-	int cpu;
 	u32 i;
 
 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
@@ -490,18 +599,6 @@
 	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
-	/* To ensure all pending flush operations have completed wait for flush
-	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
-	 * Because the above synchronize_rcu() ensures the map is disconnected
-	 * from the program we can assume no new bits will be set.
-	 */
-	for_each_online_cpu(cpu) {
-		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
-
-		while (!bitmap_empty(bitmap, cmap->map.max_entries))
-			cond_resched();
-	}
-
 	/* For cpu_map the remote CPUs can still be using the entries
 	 * (struct bpf_cpu_map_entry).
 	 */
@@ -512,10 +609,9 @@
 		if (!rcpu)
 			continue;
 
-		/* bq flush and cleanup happens after RCU graze-period */
+		/* bq flush and cleanup happens after RCU grace-period */
 		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
 	}
-	free_percpu(cmap->flush_needed);
 	bpf_map_area_free(cmap->cpu_map);
 	kfree(cmap);
 }
@@ -537,7 +633,7 @@
 	struct bpf_cpu_map_entry *rcpu =
 		__cpu_map_lookup_elem(map, *(u32 *)key);
 
-	return rcpu ? &rcpu->qsize : NULL;
+	return rcpu ? &rcpu->value : NULL;
 }
 
 static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -557,7 +653,9 @@
 	return 0;
 }
 
+static int cpu_map_btf_id;
 const struct bpf_map_ops cpu_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = cpu_map_alloc,
 	.map_free = cpu_map_free,
 	.map_delete_elem = cpu_map_delete_elem,
@@ -565,4 +663,6 @@
 	.map_lookup_elem = cpu_map_lookup_elem,
 	.map_get_next_key = cpu_map_get_next_key,
 	.map_check_btf = map_check_no_btf,
+	.map_btf_name = "bpf_cpu_map",
+	.map_btf_id = &cpu_map_btf_id,
 };
569 | 669 | |
---|
570 | | -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, |
---|
571 | | - struct xdp_bulk_queue *bq, bool in_napi_ctx) |
---|
| 670 | +static void bq_flush_to_queue(struct xdp_bulk_queue *bq) |
---|
572 | 671 | { |
---|
| 672 | + struct bpf_cpu_map_entry *rcpu = bq->obj; |
---|
573 | 673 | unsigned int processed = 0, drops = 0; |
---|
574 | 674 | const int to_cpu = rcpu->cpu; |
---|
575 | 675 | struct ptr_ring *q; |
---|
576 | 676 | int i; |
---|
577 | 677 | |
---|
578 | 678 | if (unlikely(!bq->count)) |
---|
579 | | - return 0; |
---|
| 679 | + return; |
---|
580 | 680 | |
---|
581 | 681 | q = rcpu->queue; |
---|
582 | 682 | spin_lock(&q->producer_lock); |
---|
@@ -588,30 +688,29 @@
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			if (likely(in_napi_ctx))
-				xdp_return_frame_rx_napi(xdpf);
-			else
-				xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
 	bq->count = 0;
 	spin_unlock(&q->producer_lock);
 
+	__list_del_clearprev(&bq->flush_node);
+
 	/* Feedback loop via tracepoints */
 	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
-	return 0;
 }
 
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
+static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
+	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
 	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
-		bq_flush_to_queue(rcpu, bq, true);
+		bq_flush_to_queue(bq);
 
 	/* Notice, xdp_buff/page MUST be queued here, long enough for
 	 * driver to code invoking us to finished, due to driver
@@ -623,7 +722,9 @@
 	 * operation, when completing napi->poll call.
 	 */
 	bq->q[bq->count++] = xdpf;
-	return 0;
+
+	if (!bq->flush_node.prev)
+		list_add(&bq->flush_node, flush_list);
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
@@ -631,7 +732,7 @@
 {
 	struct xdp_frame *xdpf;
 
-	xdpf = convert_to_xdp_frame(xdp);
+	xdpf = xdp_convert_buff_to_frame(xdp);
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
@@ -642,41 +743,26 @@
 	return 0;
 }
 
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+void __cpu_map_flush(void)
 {
-	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
+	struct xdp_bulk_queue *bq, *tmp;
 
-	__set_bit(bit, bitmap);
-}
-
-void __cpu_map_flush(struct bpf_map *map)
-{
-	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-	u32 bit;
-
-	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
-	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
-	 * bitmap indicate which percpu bulkq have packets.
-	 */
-	for_each_set_bit(bit, bitmap, map->max_entries) {
-		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
-		struct xdp_bulk_queue *bq;
-
-		/* This is possible if entry is removed by user space
-		 * between xdp redirect and flush op.
-		 */
-		if (unlikely(!rcpu))
-			continue;
-
-		__clear_bit(bit, bitmap);
-
-		/* Flush all frames in bulkq to real queue */
-		bq = this_cpu_ptr(rcpu->bulkq);
-		bq_flush_to_queue(rcpu, bq, true);
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		bq_flush_to_queue(bq);
 
 		/* If already running, costs spin_lock_irqsave + smb_mb */
-		wake_up_process(rcpu->kthread);
+		wake_up_process(bq->obj->kthread);
 	}
 }
+
+static int __init cpu_map_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
+	return 0;
+}
+
+subsys_initcall(cpu_map_init);
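
The extended struct bpf_cpumap_val above is what userspace fills in when it wants the cpumap kthread to run a second XDP program on the remote CPU. Below is a minimal userspace sketch, assuming libbpf is available; the function name cpumap_redirect_setup, the descriptors map_fd/prog_fd (obtained elsewhere from map creation and program load) and the qsize of 192 are all illustrative, not part of the patch.

#include <linux/bpf.h>	/* struct bpf_cpumap_val (UAPI) */
#include <bpf/bpf.h>	/* bpf_map_update_elem() from libbpf */

/* map_fd: a BPF_MAP_TYPE_CPUMAP created with
 *         value_size == sizeof(struct bpf_cpumap_val) (hypothetical fd).
 * prog_fd: an XDP program loaded with
 *          expected_attach_type == BPF_XDP_CPUMAP (hypothetical fd).
 */
static int cpumap_redirect_setup(int map_fd, int prog_fd, __u32 cpu)
{
	struct bpf_cpumap_val val = {
		.qsize = 192,		/* ptr_ring size on the remote CPU */
		.bpf_prog.fd = prog_fd,	/* run by the cpumap kthread; 0 = none */
	};

	/* The key is the destination CPU number, matching cpu_map_update_elem() */
	return bpf_map_update_elem(map_fd, &cpu, &val, 0);
}

A value containing only qsize keeps the old behaviour; cpu_map_alloc() accepts both layouts via the two offsetofend() checks.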