| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* bpf/cpumap.c |
|---|
| 2 | 3 | * |
|---|
| 3 | 4 | * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. |
|---|
| 4 | | - * Released under terms in GPL version 2. See COPYING. |
|---|
| 5 | 5 | */ |
|---|
| 6 | 6 | |
|---|
| 7 | 7 | /* The 'cpumap' is primarily used as a backend map for XDP BPF helper |
|---|
| .. | .. |
|---|
| 32 | 32 | |
|---|
| 33 | 33 | /* General idea: XDP packets getting XDP redirected to another CPU, |
|---|
| 34 | 34 | * will maximum be stored/queued for one driver ->poll() call. It is |
|---|
| 35 | | - * guaranteed that setting flush bit and flush operation happen on |
|---|
| 35 | + * guaranteed that queueing the frame and the flush operation happen on |
|---|
| 36 | 36 | * same CPU. Thus, cpu_map_flush operation can deduce via this_cpu_ptr() |
|---|
| 37 | 37 | * which queue in bpf_cpu_map_entry contains packets. |
|---|
| 38 | 38 | */ |
|---|
| 39 | 39 | |
|---|
| 40 | 40 | #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ |
|---|
| 41 | +struct bpf_cpu_map_entry; |
|---|
| 42 | +struct bpf_cpu_map; |
|---|
| 43 | + |
|---|
| 41 | 44 | struct xdp_bulk_queue { |
|---|
| 42 | 45 | void *q[CPU_MAP_BULK_SIZE]; |
|---|
| 46 | + struct list_head flush_node; |
|---|
| 47 | + struct bpf_cpu_map_entry *obj; |
|---|
| 43 | 48 | unsigned int count; |
|---|
| 44 | 49 | }; |
|---|
| 45 | 50 | |
|---|
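
For context, the queueing described above is driven from the XDP side: a program returns a redirect verdict into a `BPF_MAP_TYPE_CPUMAP` entry, and the frames land in the per-CPU `xdp_bulk_queue` shown here. Below is a minimal, illustrative BPF-C sketch (not part of this patch); the map name, `max_entries` and the fixed target CPU are assumptions, and it needs a UAPI `linux/bpf.h` that defines `struct bpf_cpumap_val`.

```c
// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: feeds frames into the cpumap machinery in this file. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, struct bpf_cpumap_val);
} cpu_map SEC(".maps");

SEC("xdp")
int xdp_redirect_to_cpu(struct xdp_md *ctx)
{
	__u32 target_cpu = 0;	/* real users derive this from a hash or RSS */

	/* Frames redirected here are queued in the per-CPU xdp_bulk_queue
	 * and pushed to the target CPU's ptr_ring when the driver flushes
	 * at the end of its NAPI poll.
	 */
	return bpf_redirect_map(&cpu_map, target_cpu, 0);
}

char _license[] SEC("license") = "GPL";
```
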
| .. | .. |
|---|
| 47 | 52 | struct bpf_cpu_map_entry { |
|---|
| 48 | 53 | u32 cpu; /* kthread CPU and map index */ |
|---|
| 49 | 54 | int map_id; /* Back reference to map */ |
|---|
| 50 | | - u32 qsize; /* Queue size placeholder for map lookup */ |
|---|
| 51 | 55 | |
|---|
| 52 | 56 | /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ |
|---|
| 53 | 57 | struct xdp_bulk_queue __percpu *bulkq; |
|---|
| 54 | 58 | |
|---|
| 59 | + struct bpf_cpu_map *cmap; |
|---|
| 60 | + |
|---|
| 55 | 61 | /* Queue with potential multi-producers, and single-consumer kthread */ |
|---|
| 56 | 62 | struct ptr_ring *queue; |
|---|
| 57 | 63 | struct task_struct *kthread; |
|---|
| 58 | | - struct work_struct kthread_stop_wq; |
|---|
| 64 | + |
|---|
| 65 | + struct bpf_cpumap_val value; |
|---|
| 66 | + struct bpf_prog *prog; |
|---|
| 59 | 67 | |
|---|
| 60 | 68 | atomic_t refcnt; /* Control when this struct can be free'ed */ |
|---|
| 61 | 69 | struct rcu_head rcu; |
|---|
| 70 | + |
|---|
| 71 | + struct work_struct kthread_stop_wq; |
|---|
| 62 | 72 | }; |
|---|
| 63 | 73 | |
|---|
| 64 | 74 | struct bpf_cpu_map { |
|---|
| 65 | 75 | struct bpf_map map; |
|---|
| 66 | 76 | /* Below members specific for map type */ |
|---|
| 67 | 77 | struct bpf_cpu_map_entry **cpu_map; |
|---|
| 68 | | - unsigned long __percpu *flush_needed; |
|---|
| 69 | 78 | }; |
|---|
| 70 | 79 | |
|---|
| 71 | | -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, |
|---|
| 72 | | - struct xdp_bulk_queue *bq, bool in_napi_ctx); |
|---|
| 73 | | - |
|---|
| 74 | | -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) |
|---|
| 75 | | -{ |
|---|
| 76 | | - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); |
|---|
| 77 | | -} |
|---|
| 80 | +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); |
|---|
| 78 | 81 | |
|---|
| 79 | 82 | static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) |
|---|
| 80 | 83 | { |
|---|
| 84 | + u32 value_size = attr->value_size; |
|---|
| 81 | 85 | struct bpf_cpu_map *cmap; |
|---|
| 82 | 86 | int err = -ENOMEM; |
|---|
| 83 | 87 | u64 cost; |
|---|
| 84 | 88 | int ret; |
|---|
| 85 | 89 | |
|---|
| 86 | | - if (!capable(CAP_SYS_ADMIN)) |
|---|
| 90 | + if (!bpf_capable()) |
|---|
| 87 | 91 | return ERR_PTR(-EPERM); |
|---|
| 88 | 92 | |
|---|
| 89 | 93 | /* check sanity of attributes */ |
|---|
| 90 | 94 | if (attr->max_entries == 0 || attr->key_size != 4 || |
|---|
| 91 | | - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) |
|---|
| 95 | + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && |
|---|
| 96 | + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || |
|---|
| 97 | + attr->map_flags & ~BPF_F_NUMA_NODE) |
|---|
| 92 | 98 | return ERR_PTR(-EINVAL); |
|---|
| 93 | 99 | |
|---|
| 94 | 100 | cmap = kzalloc(sizeof(*cmap), GFP_USER); |
|---|
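
The two `offsetofend()` comparisons above mean a cpumap can be created with either the legacy 4-byte value (just the queue size) or the full `struct bpf_cpumap_val`. The following is a hedged user-space sketch of both update paths; the two map fds and the CPU index are assumed to exist, and error handling is omitted.

```c
#include <linux/bpf.h>
#include <bpf/bpf.h>

static void update_cpumap_entries(int legacy_map_fd, int extended_map_fd)
{
	__u32 cpu = 2;
	__u32 qsize = 192;		/* legacy layout: value is just the qsize */
	struct bpf_cpumap_val val = {
		.qsize = 192,
		.bpf_prog.fd = -1,	/* extended layout, no program attached */
	};

	/* Map created with value_size == 4 */
	bpf_map_update_elem(legacy_map_fd, &cpu, &qsize, 0);

	/* Map created with value_size == sizeof(struct bpf_cpumap_val) */
	bpf_map_update_elem(extended_map_fd, &cpu, &val, 0);
}
```
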
| .. | .. |
|---|
| 105 | 111 | |
|---|
| 106 | 112 | /* make sure page count doesn't overflow */ |
|---|
| 107 | 113 | cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); |
|---|
| 108 | | - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); |
|---|
| 109 | | - if (cost >= U32_MAX - PAGE_SIZE) |
|---|
| 110 | | - goto free_cmap; |
|---|
| 111 | | - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; |
|---|
| 112 | 114 | |
|---|
| 113 | 115 | /* Notice: returns -EPERM if map size is larger than memlock limit */ |
|---|
| 114 | | - ret = bpf_map_precharge_memlock(cmap->map.pages); |
|---|
| 116 | + ret = bpf_map_charge_init(&cmap->map.memory, cost); |
|---|
| 115 | 117 | if (ret) { |
|---|
| 116 | 118 | err = ret; |
|---|
| 117 | 119 | goto free_cmap; |
|---|
| 118 | 120 | } |
|---|
| 119 | | - |
|---|
| 120 | | - /* A per cpu bitfield with a bit per possible CPU in map */ |
|---|
| 121 | | - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), |
|---|
| 122 | | - __alignof__(unsigned long)); |
|---|
| 123 | | - if (!cmap->flush_needed) |
|---|
| 124 | | - goto free_cmap; |
|---|
| 125 | 121 | |
|---|
| 126 | 122 | /* Alloc array for possible remote "destination" CPUs */ |
|---|
| 127 | 123 | cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * |
|---|
| 128 | 124 | sizeof(struct bpf_cpu_map_entry *), |
|---|
| 129 | 125 | cmap->map.numa_node); |
|---|
| 130 | 126 | if (!cmap->cpu_map) |
|---|
| 131 | | - goto free_percpu; |
|---|
| 127 | + goto free_charge; |
|---|
| 132 | 128 | |
|---|
| 133 | 129 | return &cmap->map; |
|---|
| 134 | | -free_percpu: |
|---|
| 135 | | - free_percpu(cmap->flush_needed); |
|---|
| 130 | +free_charge: |
|---|
| 131 | + bpf_map_charge_finish(&cmap->map.memory); |
|---|
| 136 | 132 | free_cmap: |
|---|
| 137 | 133 | kfree(cmap); |
|---|
| 138 | 134 | return ERR_PTR(err); |
|---|
| .. | .. |
|---|
| 159 | 155 | kthread_stop(rcpu->kthread); |
|---|
| 160 | 156 | } |
|---|
| 161 | 157 | |
|---|
| 162 | | -static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, |
|---|
| 163 | | - struct xdp_frame *xdpf) |
|---|
| 158 | +static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, |
|---|
| 159 | + struct sk_buff *skb) |
|---|
| 164 | 160 | { |
|---|
| 165 | 161 | unsigned int hard_start_headroom; |
|---|
| 166 | 162 | unsigned int frame_size; |
|---|
| 167 | 163 | void *pkt_data_start; |
|---|
| 168 | | - struct sk_buff *skb; |
|---|
| 169 | 164 | |
|---|
| 170 | 165 | /* Part of headroom was reserved to xdpf */ |
|---|
| 171 | 166 | hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; |
|---|
| 172 | 167 | |
|---|
| 173 | | - /* build_skb need to place skb_shared_info after SKB end, and |
|---|
| 174 | | - * also want to know the memory "truesize". Thus, need to |
|---|
| 175 | | - * know the memory frame size backing xdp_buff. |
|---|
| 176 | | - * |
|---|
| 177 | | - * XDP was designed to have PAGE_SIZE frames, but this |
|---|
| 178 | | - * assumption is not longer true with ixgbe and i40e. It |
|---|
| 179 | | - * would be preferred to set frame_size to 2048 or 4096 |
|---|
| 180 | | - * depending on the driver. |
|---|
| 181 | | - * frame_size = 2048; |
|---|
| 182 | | - * frame_len = frame_size - sizeof(*xdp_frame); |
|---|
| 183 | | - * |
|---|
| 184 | | - * Instead, with info avail, skb_shared_info in placed after |
|---|
| 185 | | - * packet len. This, unfortunately fakes the truesize. |
|---|
| 186 | | - * Another disadvantage of this approach, the skb_shared_info |
|---|
| 187 | | - * is not at a fixed memory location, with mixed length |
|---|
| 188 | | - * packets, which is bad for cache-line hotness. |
|---|
| 168 | + /* Memory size backing xdp_frame data already have reserved |
|---|
| 169 | + * room for build_skb to place skb_shared_info in tailroom. |
|---|
| 189 | 170 | */ |
|---|
| 190 | | - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + |
|---|
| 191 | | - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
|---|
| 171 | + frame_size = xdpf->frame_sz; |
|---|
| 192 | 172 | |
|---|
| 193 | 173 | pkt_data_start = xdpf->data - hard_start_headroom; |
|---|
| 194 | | - skb = build_skb(pkt_data_start, frame_size); |
|---|
| 195 | | - if (!skb) |
|---|
| 174 | + skb = build_skb_around(skb, pkt_data_start, frame_size); |
|---|
| 175 | + if (unlikely(!skb)) |
|---|
| 196 | 176 | return NULL; |
|---|
| 197 | 177 | |
|---|
| 198 | 178 | skb_reserve(skb, hard_start_headroom); |
|---|
| .. | .. |
|---|
| 208 | 188 | * - HW RX hash (skb_set_hash) |
|---|
| 209 | 189 | * - RX ring dev queue index (skb_record_rx_queue) |
|---|
| 210 | 190 | */ |
|---|
| 191 | + |
|---|
| 192 | + /* Until page_pool get SKB return path, release DMA here */ |
|---|
| 193 | + xdp_release_frame(xdpf); |
|---|
| 211 | 194 | |
|---|
| 212 | 195 | /* Allow SKB to reuse area used by xdp_frame */ |
|---|
| 213 | 196 | xdp_scrub_frame(xdpf); |
|---|
| .. | .. |
|---|
| 232 | 215 | static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) |
|---|
| 233 | 216 | { |
|---|
| 234 | 217 | if (atomic_dec_and_test(&rcpu->refcnt)) { |
|---|
| 218 | + if (rcpu->prog) |
|---|
| 219 | + bpf_prog_put(rcpu->prog); |
|---|
| 235 | 220 | /* The queue should be empty at this point */ |
|---|
| 236 | 221 | __cpu_map_ring_cleanup(rcpu->queue); |
|---|
| 237 | 222 | ptr_ring_cleanup(rcpu->queue, NULL); |
|---|
| .. | .. |
|---|
| 239 | 224 | kfree(rcpu); |
|---|
| 240 | 225 | } |
|---|
| 241 | 226 | } |
|---|
| 227 | + |
|---|
| 228 | +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, |
|---|
| 229 | + void **frames, int n, |
|---|
| 230 | + struct xdp_cpumap_stats *stats) |
|---|
| 231 | +{ |
|---|
| 232 | + struct xdp_rxq_info rxq; |
|---|
| 233 | + struct xdp_buff xdp; |
|---|
| 234 | + int i, nframes = 0; |
|---|
| 235 | + |
|---|
| 236 | + if (!rcpu->prog) |
|---|
| 237 | + return n; |
|---|
| 238 | + |
|---|
| 239 | + rcu_read_lock_bh(); |
|---|
| 240 | + |
|---|
| 241 | + xdp_set_return_frame_no_direct(); |
|---|
| 242 | + xdp.rxq = &rxq; |
|---|
| 243 | + |
|---|
| 244 | + for (i = 0; i < n; i++) { |
|---|
| 245 | + struct xdp_frame *xdpf = frames[i]; |
|---|
| 246 | + u32 act; |
|---|
| 247 | + int err; |
|---|
| 248 | + |
|---|
| 249 | + rxq.dev = xdpf->dev_rx; |
|---|
| 250 | + rxq.mem = xdpf->mem; |
|---|
| 251 | + /* TODO: report queue_index to xdp_rxq_info */ |
|---|
| 252 | + |
|---|
| 253 | + xdp_convert_frame_to_buff(xdpf, &xdp); |
|---|
| 254 | + |
|---|
| 255 | + act = bpf_prog_run_xdp(rcpu->prog, &xdp); |
|---|
| 256 | + switch (act) { |
|---|
| 257 | + case XDP_PASS: |
|---|
| 258 | + err = xdp_update_frame_from_buff(&xdp, xdpf); |
|---|
| 259 | + if (err < 0) { |
|---|
| 260 | + xdp_return_frame(xdpf); |
|---|
| 261 | + stats->drop++; |
|---|
| 262 | + } else { |
|---|
| 263 | + frames[nframes++] = xdpf; |
|---|
| 264 | + stats->pass++; |
|---|
| 265 | + } |
|---|
| 266 | + break; |
|---|
| 267 | + case XDP_REDIRECT: |
|---|
| 268 | + err = xdp_do_redirect(xdpf->dev_rx, &xdp, |
|---|
| 269 | + rcpu->prog); |
|---|
| 270 | + if (unlikely(err)) { |
|---|
| 271 | + xdp_return_frame(xdpf); |
|---|
| 272 | + stats->drop++; |
|---|
| 273 | + } else { |
|---|
| 274 | + stats->redirect++; |
|---|
| 275 | + } |
|---|
| 276 | + break; |
|---|
| 277 | + default: |
|---|
| 278 | + bpf_warn_invalid_xdp_action(act); |
|---|
| 279 | + fallthrough; |
|---|
| 280 | + case XDP_DROP: |
|---|
| 281 | + xdp_return_frame(xdpf); |
|---|
| 282 | + stats->drop++; |
|---|
| 283 | + break; |
|---|
| 284 | + } |
|---|
| 285 | + } |
|---|
| 286 | + |
|---|
| 287 | + if (stats->redirect) |
|---|
| 288 | + xdp_do_flush_map(); |
|---|
| 289 | + |
|---|
| 290 | + xdp_clear_return_frame_no_direct(); |
|---|
| 291 | + |
|---|
| 292 | + rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ |
|---|
| 293 | + |
|---|
| 294 | + return nframes; |
|---|
| 295 | +} |
|---|
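
To make the batch runner above concrete, here is a hedged sketch of the kind of program that could sit in `rcpu->prog`: it sees each frame as a normal `xdp_md` context on the remote CPU, and its verdicts feed the `pass`/`drop`/`redirect` counters. The program name, section and filtering logic are illustrative; the program must be loaded with `expected_attach_type == BPF_XDP_CPUMAP` (enforced in a later hunk).

```c
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>

/* Loaded with expected_attach_type BPF_XDP_CPUMAP, e.g. via
 * bpf_program__set_expected_attach_type() before bpf_object__load().
 */
SEC("xdp")
int cpumap_filter(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* XDP_DROP is counted in stats->drop by cpu_map_bpf_prog_run_xdp();
	 * XDP_PASS frames continue into the SKB path on this CPU.
	 */
	if (data + sizeof(struct ethhdr) > data_end)
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
```
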
| 296 | + |
|---|
| 297 | +#define CPUMAP_BATCH 8 |
|---|
| 242 | 298 | |
|---|
| 243 | 299 | static int cpu_map_kthread_run(void *data) |
|---|
| 244 | 300 | { |
|---|
| .. | .. |
|---|
| 252 | 308 | * kthread_stop signal until queue is empty. |
|---|
| 253 | 309 | */ |
|---|
| 254 | 310 | while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { |
|---|
| 255 | | - unsigned int processed = 0, drops = 0, sched = 0; |
|---|
| 256 | | - struct xdp_frame *xdpf; |
|---|
| 311 | + struct xdp_cpumap_stats stats = {}; /* zero stats */ |
|---|
| 312 | + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; |
|---|
| 313 | + unsigned int drops = 0, sched = 0; |
|---|
| 314 | + void *frames[CPUMAP_BATCH]; |
|---|
| 315 | + void *skbs[CPUMAP_BATCH]; |
|---|
| 316 | + int i, n, m, nframes; |
|---|
| 257 | 317 | |
|---|
| 258 | 318 | /* Release CPU reschedule checks */ |
|---|
| 259 | 319 | if (__ptr_ring_empty(rcpu->queue)) { |
|---|
| .. | .. |
|---|
| 269 | 329 | sched = cond_resched(); |
|---|
| 270 | 330 | } |
|---|
| 271 | 331 | |
|---|
| 272 | | - /* Process packets in rcpu->queue */ |
|---|
| 273 | | - local_bh_disable(); |
|---|
| 274 | 332 | /* |
|---|
| 275 | 333 | * The bpf_cpu_map_entry is single consumer, with this |
|---|
| 276 | 334 | * kthread CPU pinned. Lockless access to ptr_ring |
|---|
| 277 | 335 | * consume side valid as no-resize allowed of queue. |
|---|
| 278 | 336 | */ |
|---|
| 279 | | - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { |
|---|
| 280 | | - struct sk_buff *skb; |
|---|
| 337 | + n = __ptr_ring_consume_batched(rcpu->queue, frames, |
|---|
| 338 | + CPUMAP_BATCH); |
|---|
| 339 | + for (i = 0; i < n; i++) { |
|---|
| 340 | + void *f = frames[i]; |
|---|
| 341 | + struct page *page = virt_to_page(f); |
|---|
| 342 | + |
|---|
| 343 | + /* Bring struct page memory area to curr CPU. Read by |
|---|
| 344 | + * build_skb_around via page_is_pfmemalloc(), and when |
|---|
| 345 | + * freed written by page_frag_free call. |
|---|
| 346 | + */ |
|---|
| 347 | + prefetchw(page); |
|---|
| 348 | + } |
|---|
| 349 | + |
|---|
| 350 | + /* Support running another XDP prog on this CPU */ |
|---|
| 351 | + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); |
|---|
| 352 | + if (nframes) { |
|---|
| 353 | + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); |
|---|
| 354 | + if (unlikely(m == 0)) { |
|---|
| 355 | + for (i = 0; i < nframes; i++) |
|---|
| 356 | + skbs[i] = NULL; /* effect: xdp_return_frame */ |
|---|
| 357 | + drops += nframes; |
|---|
| 358 | + } |
|---|
| 359 | + } |
|---|
| 360 | + |
|---|
| 361 | + local_bh_disable(); |
|---|
| 362 | + for (i = 0; i < nframes; i++) { |
|---|
| 363 | + struct xdp_frame *xdpf = frames[i]; |
|---|
| 364 | + struct sk_buff *skb = skbs[i]; |
|---|
| 281 | 365 | int ret; |
|---|
| 282 | 366 | |
|---|
| 283 | | - skb = cpu_map_build_skb(rcpu, xdpf); |
|---|
| 367 | + skb = cpu_map_build_skb(xdpf, skb); |
|---|
| 284 | 368 | if (!skb) { |
|---|
| 285 | 369 | xdp_return_frame(xdpf); |
|---|
| 286 | 370 | continue; |
|---|
| .. | .. |
|---|
| 290 | 374 | ret = netif_receive_skb_core(skb); |
|---|
| 291 | 375 | if (ret == NET_RX_DROP) |
|---|
| 292 | 376 | drops++; |
|---|
| 293 | | - |
|---|
| 294 | | - /* Limit BH-disable period */ |
|---|
| 295 | | - if (++processed == 8) |
|---|
| 296 | | - break; |
|---|
| 297 | 377 | } |
|---|
| 298 | 378 | /* Feedback loop via tracepoint */ |
|---|
| 299 | | - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); |
|---|
| 379 | + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); |
|---|
| 300 | 380 | |
|---|
| 301 | 381 | local_bh_enable(); /* resched point, may call do_softirq() */ |
|---|
| 302 | 382 | } |
|---|
| .. | .. |
|---|
| 306 | 386 | return 0; |
|---|
| 307 | 387 | } |
|---|
| 308 | 388 | |
|---|
| 309 | | -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, |
|---|
| 310 | | - int map_id) |
|---|
| 389 | +bool cpu_map_prog_allowed(struct bpf_map *map) |
|---|
| 311 | 390 | { |
|---|
| 391 | + return map->map_type == BPF_MAP_TYPE_CPUMAP && |
|---|
| 392 | + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); |
|---|
| 393 | +} |
|---|
| 394 | + |
|---|
| 395 | +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) |
|---|
| 396 | +{ |
|---|
| 397 | + struct bpf_prog *prog; |
|---|
| 398 | + |
|---|
| 399 | + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); |
|---|
| 400 | + if (IS_ERR(prog)) |
|---|
| 401 | + return PTR_ERR(prog); |
|---|
| 402 | + |
|---|
| 403 | + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { |
|---|
| 404 | + bpf_prog_put(prog); |
|---|
| 405 | + return -EINVAL; |
|---|
| 406 | + } |
|---|
| 407 | + |
|---|
| 408 | + rcpu->value.bpf_prog.id = prog->aux->id; |
|---|
| 409 | + rcpu->prog = prog; |
|---|
| 410 | + |
|---|
| 411 | + return 0; |
|---|
| 412 | +} |
|---|
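
From user space, the check above means the program has to be loaded with the matching attach type before its fd can be written into the map. Below is a sketch using libbpf; the object, program and map names are hypothetical and error handling is minimal.

```c
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int install_cpumap_prog(struct bpf_object *obj, __u32 cpu)
{
	struct bpf_cpumap_val val = { .qsize = 2048 };
	struct bpf_program *prog;
	int map_fd;

	prog = bpf_object__find_program_by_name(obj, "cpumap_filter");
	if (!prog)
		return -1;

	/* Must happen before load, otherwise the kernel-side check in
	 * __cpu_map_load_bpf_program() fails with -EINVAL.
	 */
	bpf_program__set_expected_attach_type(prog, BPF_XDP_CPUMAP);

	if (bpf_object__load(obj))
		return -1;

	map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
	if (map_fd < 0)
		return -1;

	val.bpf_prog.fd = bpf_program__fd(prog);
	return bpf_map_update_elem(map_fd, &cpu, &val, 0);
}
```
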
| 413 | + |
|---|
| 414 | +static struct bpf_cpu_map_entry * |
|---|
| 415 | +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) |
|---|
| 416 | +{ |
|---|
| 417 | + int numa, err, i, fd = value->bpf_prog.fd; |
|---|
| 312 | 418 | gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; |
|---|
| 313 | 419 | struct bpf_cpu_map_entry *rcpu; |
|---|
| 314 | | - int numa, err; |
|---|
| 420 | + struct xdp_bulk_queue *bq; |
|---|
| 315 | 421 | |
|---|
| 316 | 422 | /* Have map->numa_node, but choose node of redirect target CPU */ |
|---|
| 317 | 423 | numa = cpu_to_node(cpu); |
|---|
| .. | .. |
|---|
| 326 | 432 | if (!rcpu->bulkq) |
|---|
| 327 | 433 | goto free_rcu; |
|---|
| 328 | 434 | |
|---|
| 435 | + for_each_possible_cpu(i) { |
|---|
| 436 | + bq = per_cpu_ptr(rcpu->bulkq, i); |
|---|
| 437 | + bq->obj = rcpu; |
|---|
| 438 | + } |
|---|
| 439 | + |
|---|
| 329 | 440 | /* Alloc queue */ |
|---|
| 330 | 441 | rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); |
|---|
| 331 | 442 | if (!rcpu->queue) |
|---|
| 332 | 443 | goto free_bulkq; |
|---|
| 333 | 444 | |
|---|
| 334 | | - err = ptr_ring_init(rcpu->queue, qsize, gfp); |
|---|
| 445 | + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); |
|---|
| 335 | 446 | if (err) |
|---|
| 336 | 447 | goto free_queue; |
|---|
| 337 | 448 | |
|---|
| 338 | 449 | rcpu->cpu = cpu; |
|---|
| 339 | 450 | rcpu->map_id = map_id; |
|---|
| 340 | | - rcpu->qsize = qsize; |
|---|
| 451 | + rcpu->value.qsize = value->qsize; |
|---|
| 452 | + |
|---|
| 453 | + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) |
|---|
| 454 | + goto free_ptr_ring; |
|---|
| 341 | 455 | |
|---|
| 342 | 456 | /* Setup kthread */ |
|---|
| 343 | 457 | rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, |
|---|
| 344 | 458 | "cpumap/%d/map:%d", cpu, map_id); |
|---|
| 345 | 459 | if (IS_ERR(rcpu->kthread)) |
|---|
| 346 | | - goto free_ptr_ring; |
|---|
| 460 | + goto free_prog; |
|---|
| 347 | 461 | |
|---|
| 348 | 462 | get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ |
|---|
| 349 | 463 | get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ |
|---|
| .. | .. |
|---|
| 354 | 468 | |
|---|
| 355 | 469 | return rcpu; |
|---|
| 356 | 470 | |
|---|
| 471 | +free_prog: |
|---|
| 472 | + if (rcpu->prog) |
|---|
| 473 | + bpf_prog_put(rcpu->prog); |
|---|
| 357 | 474 | free_ptr_ring: |
|---|
| 358 | 475 | ptr_ring_cleanup(rcpu->queue, NULL); |
|---|
| 359 | 476 | free_queue: |
|---|
| .. | .. |
|---|
| 368 | 485 | static void __cpu_map_entry_free(struct rcu_head *rcu) |
|---|
| 369 | 486 | { |
|---|
| 370 | 487 | struct bpf_cpu_map_entry *rcpu; |
|---|
| 371 | | - int cpu; |
|---|
| 372 | 488 | |
|---|
| 373 | 489 | /* This cpu_map_entry have been disconnected from map and one |
|---|
| 374 | | - * RCU graze-period have elapsed. Thus, XDP cannot queue any |
|---|
| 490 | + * RCU grace-period have elapsed. Thus, XDP cannot queue any |
|---|
| 375 | 491 | * new packets and cannot change/set flush_needed that can |
|---|
| 376 | 492 | * find this entry. |
|---|
| 377 | 493 | */ |
|---|
| 378 | 494 | rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); |
|---|
| 379 | 495 | |
|---|
| 380 | | - /* Flush remaining packets in percpu bulkq */ |
|---|
| 381 | | - for_each_online_cpu(cpu) { |
|---|
| 382 | | - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); |
|---|
| 383 | | - |
|---|
| 384 | | - /* No concurrent bq_enqueue can run at this point */ |
|---|
| 385 | | - bq_flush_to_queue(rcpu, bq, false); |
|---|
| 386 | | - } |
|---|
| 387 | 496 | free_percpu(rcpu->bulkq); |
|---|
| 388 | 497 | /* Cannot kthread_stop() here, last put free rcpu resources */ |
|---|
| 389 | 498 | put_cpu_map_entry(rcpu); |
|---|
| .. | .. |
|---|
| 405 | 514 | * percpu bulkq to queue. Due to caller map_delete_elem() disable |
|---|
| 406 | 515 | * preemption, cannot call kthread_stop() to make sure queue is empty. |
|---|
| 407 | 516 | * Instead a work_queue is started for stopping kthread, |
|---|
| 408 | | - * cpu_map_kthread_stop, which waits for an RCU graze period before |
|---|
| 517 | + * cpu_map_kthread_stop, which waits for an RCU grace period before |
|---|
| 409 | 518 | * stopping kthread, emptying the queue. |
|---|
| 410 | 519 | */ |
|---|
| 411 | 520 | static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, |
|---|
| .. | .. |
|---|
| 438 | 547 | u64 map_flags) |
|---|
| 439 | 548 | { |
|---|
| 440 | 549 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
|---|
| 550 | + struct bpf_cpumap_val cpumap_value = {}; |
|---|
| 441 | 551 | struct bpf_cpu_map_entry *rcpu; |
|---|
| 442 | | - |
|---|
| 443 | 552 | /* Array index key correspond to CPU number */ |
|---|
| 444 | 553 | u32 key_cpu = *(u32 *)key; |
|---|
| 445 | | - /* Value is the queue size */ |
|---|
| 446 | | - u32 qsize = *(u32 *)value; |
|---|
| 554 | + |
|---|
| 555 | + memcpy(&cpumap_value, value, map->value_size); |
|---|
| 447 | 556 | |
|---|
| 448 | 557 | if (unlikely(map_flags > BPF_EXIST)) |
|---|
| 449 | 558 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 451 | 560 | return -E2BIG; |
|---|
| 452 | 561 | if (unlikely(map_flags == BPF_NOEXIST)) |
|---|
| 453 | 562 | return -EEXIST; |
|---|
| 454 | | - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ |
|---|
| 563 | + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ |
|---|
| 455 | 564 | return -EOVERFLOW; |
|---|
| 456 | 565 | |
|---|
| 457 | 566 | /* Make sure CPU is a valid possible cpu */ |
|---|
| 458 | 567 | if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) |
|---|
| 459 | 568 | return -ENODEV; |
|---|
| 460 | 569 | |
|---|
| 461 | | - if (qsize == 0) { |
|---|
| 570 | + if (cpumap_value.qsize == 0) { |
|---|
| 462 | 571 | rcpu = NULL; /* Same as deleting */ |
|---|
| 463 | 572 | } else { |
|---|
| 464 | 573 | /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ |
|---|
| 465 | | - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); |
|---|
| 574 | + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); |
|---|
| 466 | 575 | if (!rcpu) |
|---|
| 467 | 576 | return -ENOMEM; |
|---|
| 577 | + rcpu->cmap = cmap; |
|---|
| 468 | 578 | } |
|---|
| 469 | 579 | rcu_read_lock(); |
|---|
| 470 | 580 | __cpu_map_entry_replace(cmap, key_cpu, rcpu); |
|---|
| .. | .. |
|---|
| 475 | 585 | static void cpu_map_free(struct bpf_map *map) |
|---|
| 476 | 586 | { |
|---|
| 477 | 587 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
|---|
| 478 | | - int cpu; |
|---|
| 479 | 588 | u32 i; |
|---|
| 480 | 589 | |
|---|
| 481 | 590 | /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, |
|---|
| .. | .. |
|---|
| 490 | 599 | bpf_clear_redirect_map(map); |
|---|
| 491 | 600 | synchronize_rcu(); |
|---|
| 492 | 601 | |
|---|
| 493 | | - /* To ensure all pending flush operations have completed wait for flush |
|---|
| 494 | | - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. |
|---|
| 495 | | - * Because the above synchronize_rcu() ensures the map is disconnected |
|---|
| 496 | | - * from the program we can assume no new bits will be set. |
|---|
| 497 | | - */ |
|---|
| 498 | | - for_each_online_cpu(cpu) { |
|---|
| 499 | | - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); |
|---|
| 500 | | - |
|---|
| 501 | | - while (!bitmap_empty(bitmap, cmap->map.max_entries)) |
|---|
| 502 | | - cond_resched(); |
|---|
| 503 | | - } |
|---|
| 504 | | - |
|---|
| 505 | 602 | /* For cpu_map the remote CPUs can still be using the entries |
|---|
| 506 | 603 | * (struct bpf_cpu_map_entry). |
|---|
| 507 | 604 | */ |
|---|
| .. | .. |
|---|
| 512 | 609 | if (!rcpu) |
|---|
| 513 | 610 | continue; |
|---|
| 514 | 611 | |
|---|
| 515 | | - /* bq flush and cleanup happens after RCU graze-period */ |
|---|
| 612 | + /* bq flush and cleanup happens after RCU grace-period */ |
|---|
| 516 | 613 | __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ |
|---|
| 517 | 614 | } |
|---|
| 518 | | - free_percpu(cmap->flush_needed); |
|---|
| 519 | 615 | bpf_map_area_free(cmap->cpu_map); |
|---|
| 520 | 616 | kfree(cmap); |
|---|
| 521 | 617 | } |
|---|
| .. | .. |
|---|
| 537 | 633 | struct bpf_cpu_map_entry *rcpu = |
|---|
| 538 | 634 | __cpu_map_lookup_elem(map, *(u32 *)key); |
|---|
| 539 | 635 | |
|---|
| 540 | | - return rcpu ? &rcpu->qsize : NULL; |
|---|
| 636 | + return rcpu ? &rcpu->value : NULL; |
|---|
| 541 | 637 | } |
|---|
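
Since lookups now return the whole `struct bpf_cpumap_val`, user space can read back both the queue size and the id of any attached program (on read, the union carries the program id set in `__cpu_map_load_bpf_program()`, not an fd). A small illustrative reader, assuming `map_fd` refers to a cpumap created with the extended value layout:

```c
#include <stdio.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

static void dump_cpumap_entry(int map_fd, __u32 cpu)
{
	struct bpf_cpumap_val val = {};

	if (bpf_map_lookup_elem(map_fd, &cpu, &val))
		return;		/* no entry configured for this CPU */

	printf("cpu %u: qsize=%u prog_id=%u\n", cpu, val.qsize, val.bpf_prog.id);
}
```
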
| 542 | 638 | |
|---|
| 543 | 639 | static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
|---|
| .. | .. |
|---|
| 557 | 653 | return 0; |
|---|
| 558 | 654 | } |
|---|
| 559 | 655 | |
|---|
| 656 | +static int cpu_map_btf_id; |
|---|
| 560 | 657 | const struct bpf_map_ops cpu_map_ops = { |
|---|
| 658 | + .map_meta_equal = bpf_map_meta_equal, |
|---|
| 561 | 659 | .map_alloc = cpu_map_alloc, |
|---|
| 562 | 660 | .map_free = cpu_map_free, |
|---|
| 563 | 661 | .map_delete_elem = cpu_map_delete_elem, |
|---|
| .. | .. |
|---|
| 565 | 663 | .map_lookup_elem = cpu_map_lookup_elem, |
|---|
| 566 | 664 | .map_get_next_key = cpu_map_get_next_key, |
|---|
| 567 | 665 | .map_check_btf = map_check_no_btf, |
|---|
| 666 | + .map_btf_name = "bpf_cpu_map", |
|---|
| 667 | + .map_btf_id = &cpu_map_btf_id, |
|---|
| 568 | 668 | }; |
|---|
| 569 | 669 | |
|---|
| 570 | | -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, |
|---|
| 571 | | - struct xdp_bulk_queue *bq, bool in_napi_ctx) |
|---|
| 670 | +static void bq_flush_to_queue(struct xdp_bulk_queue *bq) |
|---|
| 572 | 671 | { |
|---|
| 672 | + struct bpf_cpu_map_entry *rcpu = bq->obj; |
|---|
| 573 | 673 | unsigned int processed = 0, drops = 0; |
|---|
| 574 | 674 | const int to_cpu = rcpu->cpu; |
|---|
| 575 | 675 | struct ptr_ring *q; |
|---|
| 576 | 676 | int i; |
|---|
| 577 | 677 | |
|---|
| 578 | 678 | if (unlikely(!bq->count)) |
|---|
| 579 | | - return 0; |
|---|
| 679 | + return; |
|---|
| 580 | 680 | |
|---|
| 581 | 681 | q = rcpu->queue; |
|---|
| 582 | 682 | spin_lock(&q->producer_lock); |
|---|
| .. | .. |
|---|
| 588 | 688 | err = __ptr_ring_produce(q, xdpf); |
|---|
| 589 | 689 | if (err) { |
|---|
| 590 | 690 | drops++; |
|---|
| 591 | | - if (likely(in_napi_ctx)) |
|---|
| 592 | | - xdp_return_frame_rx_napi(xdpf); |
|---|
| 593 | | - else |
|---|
| 594 | | - xdp_return_frame(xdpf); |
|---|
| 691 | + xdp_return_frame_rx_napi(xdpf); |
|---|
| 595 | 692 | } |
|---|
| 596 | 693 | processed++; |
|---|
| 597 | 694 | } |
|---|
| 598 | 695 | bq->count = 0; |
|---|
| 599 | 696 | spin_unlock(&q->producer_lock); |
|---|
| 600 | 697 | |
|---|
| 698 | + __list_del_clearprev(&bq->flush_node); |
|---|
| 699 | + |
|---|
| 601 | 700 | /* Feedback loop via tracepoints */ |
|---|
| 602 | 701 | trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); |
|---|
| 603 | | - return 0; |
|---|
| 604 | 702 | } |
|---|
| 605 | 703 | |
|---|
| 606 | 704 | /* Runs under RCU-read-side, plus in softirq under NAPI protection. |
|---|
| 607 | 705 | * Thus, safe percpu variable access. |
|---|
| 608 | 706 | */ |
|---|
| 609 | | -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) |
|---|
| 707 | +static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) |
|---|
| 610 | 708 | { |
|---|
| 709 | + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); |
|---|
| 611 | 710 | struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); |
|---|
| 612 | 711 | |
|---|
| 613 | 712 | if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) |
|---|
| 614 | | - bq_flush_to_queue(rcpu, bq, true); |
|---|
| 713 | + bq_flush_to_queue(bq); |
|---|
| 615 | 714 | |
|---|
| 616 | 715 | /* Notice, xdp_buff/page MUST be queued here, long enough for |
|---|
| 617 | 716 | * driver to code invoking us to finished, due to driver |
|---|
| .. | .. |
|---|
| 623 | 722 | * operation, when completing napi->poll call. |
|---|
| 624 | 723 | */ |
|---|
| 625 | 724 | bq->q[bq->count++] = xdpf; |
|---|
| 626 | | - return 0; |
|---|
| 725 | + |
|---|
| 726 | + if (!bq->flush_node.prev) |
|---|
| 727 | + list_add(&bq->flush_node, flush_list); |
|---|
| 627 | 728 | } |
|---|
| 628 | 729 | |
|---|
| 629 | 730 | int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, |
|---|
| .. | .. |
|---|
| 631 | 732 | { |
|---|
| 632 | 733 | struct xdp_frame *xdpf; |
|---|
| 633 | 734 | |
|---|
| 634 | | - xdpf = convert_to_xdp_frame(xdp); |
|---|
| 735 | + xdpf = xdp_convert_buff_to_frame(xdp); |
|---|
| 635 | 736 | if (unlikely(!xdpf)) |
|---|
| 636 | 737 | return -EOVERFLOW; |
|---|
| 637 | 738 | |
|---|
| .. | .. |
|---|
| 642 | 743 | return 0; |
|---|
| 643 | 744 | } |
|---|
| 644 | 745 | |
|---|
| 645 | | -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) |
|---|
| 746 | +void __cpu_map_flush(void) |
|---|
| 646 | 747 | { |
|---|
| 647 | | - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
|---|
| 648 | | - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); |
|---|
| 748 | + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); |
|---|
| 749 | + struct xdp_bulk_queue *bq, *tmp; |
|---|
| 649 | 750 | |
|---|
| 650 | | - __set_bit(bit, bitmap); |
|---|
| 651 | | -} |
|---|
| 652 | | - |
|---|
| 653 | | -void __cpu_map_flush(struct bpf_map *map) |
|---|
| 654 | | -{ |
|---|
| 655 | | - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
|---|
| 656 | | - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); |
|---|
| 657 | | - u32 bit; |
|---|
| 658 | | - |
|---|
| 659 | | - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() |
|---|
| 660 | | - * and __cpu_map_flush() happen on same CPU. Thus, the percpu |
|---|
| 661 | | - * bitmap indicate which percpu bulkq have packets. |
|---|
| 662 | | - */ |
|---|
| 663 | | - for_each_set_bit(bit, bitmap, map->max_entries) { |
|---|
| 664 | | - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); |
|---|
| 665 | | - struct xdp_bulk_queue *bq; |
|---|
| 666 | | - |
|---|
| 667 | | - /* This is possible if entry is removed by user space |
|---|
| 668 | | - * between xdp redirect and flush op. |
|---|
| 669 | | - */ |
|---|
| 670 | | - if (unlikely(!rcpu)) |
|---|
| 671 | | - continue; |
|---|
| 672 | | - |
|---|
| 673 | | - __clear_bit(bit, bitmap); |
|---|
| 674 | | - |
|---|
| 675 | | - /* Flush all frames in bulkq to real queue */ |
|---|
| 676 | | - bq = this_cpu_ptr(rcpu->bulkq); |
|---|
| 677 | | - bq_flush_to_queue(rcpu, bq, true); |
|---|
| 751 | + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { |
|---|
| 752 | + bq_flush_to_queue(bq); |
|---|
| 678 | 753 | |
|---|
| 679 | 754 | /* If already running, costs spin_lock_irqsave + smp_mb */ |
|---|
| 680 | | - wake_up_process(rcpu->kthread); |
|---|
| 755 | + wake_up_process(bq->obj->kthread); |
|---|
| 681 | 756 | } |
|---|
| 682 | 757 | } |
|---|
| 758 | + |
|---|
| 759 | +static int __init cpu_map_init(void) |
|---|
| 760 | +{ |
|---|
| 761 | + int cpu; |
|---|
| 762 | + |
|---|
| 763 | + for_each_possible_cpu(cpu) |
|---|
| 764 | + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); |
|---|
| 765 | + return 0; |
|---|
| 766 | +} |
|---|
| 767 | + |
|---|
| 768 | +subsys_initcall(cpu_map_init); |
|---|
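
For completeness, the per-CPU flush list above only works because of a contract with the XDP driver: enqueue and flush happen on the same CPU within one NAPI poll, exactly as the comment at the top of the file states. The following is a hedged kernel-side sketch of that contract, not real driver code; the `mydrv_*` names are hypothetical.

```c
#include <linux/netdevice.h>
#include <linux/filter.h>	/* xdp_do_flush_map() */

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		/* ...receive a frame, build an xdp_buff, run the XDP program.
		 * An XDP_REDIRECT into a cpumap ends up in cpu_map_enqueue()
		 * -> bq_enqueue(), which only fills the per-CPU bulk queue
		 * and links it on cpu_map_flush_list.
		 */
		work++;
	}

	/* Drains the flush lists; for cpumap this reaches __cpu_map_flush(),
	 * pushing the bulk queues into the remote ptr_rings and waking the
	 * target kthreads.
	 */
	xdp_do_flush_map();

	return work;
}
```
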