2024-05-10 10ebd8556b7990499c896a550e3d416b444211e6
kernel/include/uapi/linux/bpf.h
....@@ -14,6 +14,7 @@
1414 /* Extended instruction set based on top of classic BPF */
1515
1616 /* instruction classes */
17 +#define BPF_JMP32 0x06 /* jmp mode in word width */
1718 #define BPF_ALU64 0x07 /* alu mode in double word width */
1819
1920 /* ld/ldx fields */
....@@ -80,6 +81,12 @@
8081 __u32 attach_type; /* program attach type */
8182 };
8283
84 +union bpf_iter_link_info {
85 + struct {
86 + __u32 map_fd;
87 + } map;
88 +};
89 +
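
The new union is tiny but load-bearing for the iterator API. A minimal user-space sketch (editor's illustration, not kernel text) of how a loader might wire it into BPF_LINK_CREATE, assuming prog_fd refers to an already-loaded BPF_TRACE_ITER program:

/* Editor's sketch: create a map-element iterator link. */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int create_map_iter_link(int prog_fd, int map_fd)
{
	union bpf_iter_link_info linfo;
	union bpf_attr attr;

	memset(&linfo, 0, sizeof(linfo));
	linfo.map.map_fd = map_fd;		/* iterate this map's elements */

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	attr.link_create.iter_info = (__u64)(unsigned long)&linfo;
	attr.link_create.iter_info_len = sizeof(linfo);

	/* On success returns a link fd; BPF_ITER_CREATE can then turn it
	 * into a readable iterator fd. */
	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}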
8390 /* BPF syscall commands, see bpf(2) man-page for details. */
8491 enum bpf_cmd {
8592 BPF_MAP_CREATE,
....@@ -103,6 +110,21 @@
103110 BPF_BTF_LOAD,
104111 BPF_BTF_GET_FD_BY_ID,
105112 BPF_TASK_FD_QUERY,
113 + BPF_MAP_LOOKUP_AND_DELETE_ELEM,
114 + BPF_MAP_FREEZE,
115 + BPF_BTF_GET_NEXT_ID,
116 + BPF_MAP_LOOKUP_BATCH,
117 + BPF_MAP_LOOKUP_AND_DELETE_BATCH,
118 + BPF_MAP_UPDATE_BATCH,
119 + BPF_MAP_DELETE_BATCH,
120 + BPF_LINK_CREATE,
121 + BPF_LINK_UPDATE,
122 + BPF_LINK_GET_FD_BY_ID,
123 + BPF_LINK_GET_NEXT_ID,
124 + BPF_ENABLE_STATS,
125 + BPF_ITER_CREATE,
126 + BPF_LINK_DETACH,
127 + BPF_PROG_BIND_MAP,
106128 };
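
Each bpf_cmd value is the first argument to the bpf(2) syscall. As a hedged illustration (editor's sketch, not kernel text, includes as in the sketch above), the newly added BPF_MAP_FREEZE needs only the map_fd field of union bpf_attr:

static int bpf_map_freeze_fd(int map_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;

	/* 0 on success; the map becomes read-only from the syscall side. */
	return syscall(__NR_bpf, BPF_MAP_FREEZE, &attr, sizeof(attr));
}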
107129
108130 enum bpf_map_type {
....@@ -127,8 +149,24 @@
127149 BPF_MAP_TYPE_SOCKHASH,
128150 BPF_MAP_TYPE_CGROUP_STORAGE,
129151 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
152 + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
153 + BPF_MAP_TYPE_QUEUE,
154 + BPF_MAP_TYPE_STACK,
155 + BPF_MAP_TYPE_SK_STORAGE,
156 + BPF_MAP_TYPE_DEVMAP_HASH,
157 + BPF_MAP_TYPE_STRUCT_OPS,
158 + BPF_MAP_TYPE_RINGBUF,
159 + BPF_MAP_TYPE_INODE_STORAGE,
130160 };
131161
162 +/* Note that tracing related programs such as
163 + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
164 + * are not subject to a stable API since kernel internal data
165 + * structures can change from release to release and may
166 + * therefore break existing tracing BPF programs. Tracing BPF
167 + * programs correspond to /a/ specific kernel which is to be
168 + * analyzed, and not /a/ specific kernel /and/ all future ones.
169 + */
132170 enum bpf_prog_type {
133171 BPF_PROG_TYPE_UNSPEC,
134172 BPF_PROG_TYPE_SOCKET_FILTER,
....@@ -152,6 +190,15 @@
152190 BPF_PROG_TYPE_LWT_SEG6LOCAL,
153191 BPF_PROG_TYPE_LIRC_MODE2,
154192 BPF_PROG_TYPE_SK_REUSEPORT,
193 + BPF_PROG_TYPE_FLOW_DISSECTOR,
194 + BPF_PROG_TYPE_CGROUP_SYSCTL,
195 + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
196 + BPF_PROG_TYPE_CGROUP_SOCKOPT,
197 + BPF_PROG_TYPE_TRACING,
198 + BPF_PROG_TYPE_STRUCT_OPS,
199 + BPF_PROG_TYPE_EXT,
200 + BPF_PROG_TYPE_LSM,
201 + BPF_PROG_TYPE_SK_LOOKUP,
155202 };
156203
157204 enum bpf_attach_type {
....@@ -172,12 +219,43 @@
172219 BPF_CGROUP_UDP4_SENDMSG,
173220 BPF_CGROUP_UDP6_SENDMSG,
174221 BPF_LIRC_MODE2,
175 - BPF_CGROUP_UDP4_RECVMSG = 19,
222 + BPF_FLOW_DISSECTOR,
223 + BPF_CGROUP_SYSCTL,
224 + BPF_CGROUP_UDP4_RECVMSG,
176225 BPF_CGROUP_UDP6_RECVMSG,
226 + BPF_CGROUP_GETSOCKOPT,
227 + BPF_CGROUP_SETSOCKOPT,
228 + BPF_TRACE_RAW_TP,
229 + BPF_TRACE_FENTRY,
230 + BPF_TRACE_FEXIT,
231 + BPF_MODIFY_RETURN,
232 + BPF_LSM_MAC,
233 + BPF_TRACE_ITER,
234 + BPF_CGROUP_INET4_GETPEERNAME,
235 + BPF_CGROUP_INET6_GETPEERNAME,
236 + BPF_CGROUP_INET4_GETSOCKNAME,
237 + BPF_CGROUP_INET6_GETSOCKNAME,
238 + BPF_XDP_DEVMAP,
239 + BPF_CGROUP_INET_SOCK_RELEASE,
240 + BPF_XDP_CPUMAP,
241 + BPF_SK_LOOKUP,
242 + BPF_XDP,
177243 __MAX_BPF_ATTACH_TYPE
178244 };
179245
180246 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
247 +
248 +enum bpf_link_type {
249 + BPF_LINK_TYPE_UNSPEC = 0,
250 + BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
251 + BPF_LINK_TYPE_TRACING = 2,
252 + BPF_LINK_TYPE_CGROUP = 3,
253 + BPF_LINK_TYPE_ITER = 4,
254 + BPF_LINK_TYPE_NETNS = 5,
255 + BPF_LINK_TYPE_XDP = 6,
256 +
257 + MAX_BPF_LINK_TYPE,
258 +};
181259
182260 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
183261 *
....@@ -202,6 +280,11 @@
202280 * When a child program makes a decision (like picking TCP CA or sock bind),
203281 * the parent program has a chance to override it.
204282 *
283 + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of
284 + * programs for a cgroup. An old program at any position can be replaced by
285 + * also specifying the BPF_F_REPLACE flag and passing the fd of the program
286 + * to replace in the replace_bpf_fd attribute; the old one is then released.
287 + *
205288 * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
206289 * A cgroup with NONE doesn't allow any programs in sub-cgroups.
207290 * Ex1:
....@@ -220,6 +303,7 @@
220303 */
221304 #define BPF_F_ALLOW_OVERRIDE (1U << 0)
222305 #define BPF_F_ALLOW_MULTI (1U << 1)
306 +#define BPF_F_REPLACE (1U << 2)
223307
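
Putting BPF_F_REPLACE together with the replace_bpf_fd attribute described above (editor's sketch, not kernel text; includes as in the earlier sketches, and BPF_CGROUP_INET_INGRESS is just one example attach type):

static int replace_cgroup_prog(int cgroup_fd, int new_prog_fd, int old_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;
	attr.attach_bpf_fd = new_prog_fd;
	attr.attach_type = BPF_CGROUP_INET_INGRESS;
	attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
	attr.replace_bpf_fd = old_prog_fd;	/* program being swapped out */

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}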
224308 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
225309 * verifier will perform strict alignment checking as if the kernel
....@@ -242,8 +326,66 @@
242326 */
243327 #define BPF_F_ANY_ALIGNMENT (1U << 1)
244328
245 -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
329 +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purposes.
330 + * Verifier does sub-register def/use analysis and identifies instructions whose
331 + * def only matters for the low 32-bit; the high 32-bit is never referenced later
332 + * through implicit zero extension. Therefore verifier notifies JIT back-ends
333 + * that it is safe to ignore clearing high 32-bit for these instructions. This
334 + * saves some back-ends a lot of code-gen. However, such optimization is not
335 + * necessary on some arches, for example x86_64 and arm64, whose JIT back-ends
336 + * hence haven't used the verifier's analysis result. But we really want a
337 + * way to verify the correctness of the described optimization on
338 + * x86_64, on which testsuites are frequently exercised.
339 + *
340 + * So, this flag is introduced. Once it is set, the verifier will randomize the
341 + * high 32-bit for those instructions that have been identified as safe to
342 + * ignore. Then, if the verifier is not doing correct analysis, such
343 + * randomization will regress tests and expose bugs.
344 + */
345 +#define BPF_F_TEST_RND_HI32 (1U << 2)
346 +
347 +/* The verifier internal test flag. Behavior is undefined */
348 +#define BPF_F_TEST_STATE_FREQ (1U << 3)
349 +
350 +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
351 + * restrict map and helper usage for such programs. Sleepable BPF programs can
352 + * only be attached to hooks where kernel execution context allows sleeping.
353 + * Such programs are allowed to use helpers that may sleep like
354 + * bpf_copy_from_user().
355 + */
356 +#define BPF_F_SLEEPABLE (1U << 4)
357 +
358 +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
359 + * the following extensions:
360 + *
361 + * insn[0].src_reg: BPF_PSEUDO_MAP_FD
362 + * insn[0].imm: map fd
363 + * insn[1].imm: 0
364 + * insn[0].off: 0
365 + * insn[1].off: 0
366 + * ldimm64 rewrite: address of map
367 + * verifier type: CONST_PTR_TO_MAP
368 + */
246369 #define BPF_PSEUDO_MAP_FD 1
370 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE
371 + * insn[0].imm: map fd
372 + * insn[1].imm: offset into value
373 + * insn[0].off: 0
374 + * insn[1].off: 0
375 + * ldimm64 rewrite: address of map[0]+offset
376 + * verifier type: PTR_TO_MAP_VALUE
377 + */
378 +#define BPF_PSEUDO_MAP_VALUE 2
379 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
380 + * insn[0].imm: kernel btf id of VAR
381 + * insn[1].imm: 0
382 + * insn[0].off: 0
383 + * insn[1].off: 0
384 + * ldimm64 rewrite: address of the kernel variable
385 + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
386 + * is a struct/union.
387 + */
388 +#define BPF_PSEUDO_BTF_ID 3
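
What the two-instruction ldimm64 encoding described above looks like when a loader emits it — an editor's sketch for the BPF_PSEUDO_MAP_FD case, not kernel text (insn[1] is the pseudo instruction carrying the upper half of the 64-bit immediate):

static void emit_ld_map_fd(struct bpf_insn *insn, int map_fd)
{
	insn[0].code = BPF_LD | BPF_DW | BPF_IMM;
	insn[0].dst_reg = BPF_REG_1;
	insn[0].src_reg = BPF_PSEUDO_MAP_FD;	/* imm is a map fd, not a constant */
	insn[0].imm = map_fd;
	insn[1] = (struct bpf_insn) {};		/* insn[1].imm and both .off stay 0 */
}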
247389
248390 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
249391 * offset to another bpf function
....@@ -251,33 +393,71 @@
251393 #define BPF_PSEUDO_CALL 1
252394
253395 /* flags for BPF_MAP_UPDATE_ELEM command */
254 -#define BPF_ANY 0 /* create new element or update existing */
255 -#define BPF_NOEXIST 1 /* create new element if it didn't exist */
256 -#define BPF_EXIST 2 /* update existing element */
396 +enum {
397 + BPF_ANY = 0, /* create new element or update existing */
398 + BPF_NOEXIST = 1, /* create new element if it didn't exist */
399 + BPF_EXIST = 2, /* update existing element */
400 + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
401 +};
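
These flags feed both the BPF_MAP_UPDATE_ELEM syscall command and the in-program bpf_map_update_elem() helper. A hedged user-space sketch (editor's illustration, includes as in the earlier sketches) that fails with EEXIST when the key is already present:

static int map_insert_new(int map_fd, const void *key, const void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)key;
	attr.value = (__u64)(unsigned long)value;
	attr.flags = BPF_NOEXIST;	/* create only; BPF_F_LOCK would take the spin lock */

	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}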
257402
258403 /* flags for BPF_MAP_CREATE command */
259 -#define BPF_F_NO_PREALLOC (1U << 0)
404 +enum {
405 + BPF_F_NO_PREALLOC = (1U << 0),
260406 /* Instead of having one common LRU list in the
261407 * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
262408 * which can scale and perform better.
263409 * Note, the LRU nodes (including free nodes) cannot be moved
264410 * across different LRU lists.
265411 */
266 -#define BPF_F_NO_COMMON_LRU (1U << 1)
412 + BPF_F_NO_COMMON_LRU = (1U << 1),
267413 /* Specify numa node during map creation */
268 -#define BPF_F_NUMA_NODE (1U << 2)
414 + BPF_F_NUMA_NODE = (1U << 2),
269415
270 -/* flags for BPF_PROG_QUERY */
271 -#define BPF_F_QUERY_EFFECTIVE (1U << 0)
272 -
273 -#define BPF_OBJ_NAME_LEN 16U
274 -
275 -/* Flags for accessing BPF object */
276 -#define BPF_F_RDONLY (1U << 3)
277 -#define BPF_F_WRONLY (1U << 4)
416 +/* Flags for accessing BPF object from syscall side. */
417 + BPF_F_RDONLY = (1U << 3),
418 + BPF_F_WRONLY = (1U << 4),
278419
279420 /* Flag for stack_map, store build_id+offset instead of pointer */
280 -#define BPF_F_STACK_BUILD_ID (1U << 5)
421 + BPF_F_STACK_BUILD_ID = (1U << 5),
422 +
423 +/* Zero-initialize hash function seed. This should only be used for testing. */
424 + BPF_F_ZERO_SEED = (1U << 6),
425 +
426 +/* Flags for accessing BPF object from program side. */
427 + BPF_F_RDONLY_PROG = (1U << 7),
428 + BPF_F_WRONLY_PROG = (1U << 8),
429 +
430 +/* Clone map from listener for newly accepted socket */
431 + BPF_F_CLONE = (1U << 9),
432 +
433 +/* Enable memory-mapping BPF map */
434 + BPF_F_MMAPABLE = (1U << 10),
435 +
436 +/* Share perf_event among processes */
437 + BPF_F_PRESERVE_ELEMS = (1U << 11),
438 +
439 +/* Create a map that is suitable to be an inner map with dynamic max entries */
440 + BPF_F_INNER_MAP = (1U << 12),
441 +};
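
BPF_F_MMAPABLE above lets user space map an array map's value area directly instead of issuing lookup/update syscalls for every access. An editor's sketch under that assumption (the map must have been created with map_flags = BPF_F_MMAPABLE):

#include <sys/mman.h>

static void *mmap_array_map(int map_fd, size_t value_sz, unsigned int nr_entries)
{
	/* mmap() rounds the length up to a page; writes through the
	 * returned pointer are visible to the BPF program and vice versa. */
	return mmap(NULL, value_sz * nr_entries, PROT_READ | PROT_WRITE,
		    MAP_SHARED, map_fd, 0);
}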
442 +
443 +/* Flags for BPF_PROG_QUERY. */
444 +
445 +/* Query effective (directly attached + inherited from ancestor cgroups)
446 + * programs that will be executed for events within a cgroup.
447 + * attach_flags with this flag are returned only for directly attached programs.
448 + */
449 +#define BPF_F_QUERY_EFFECTIVE (1U << 0)
450 +
451 +/* Flags for BPF_PROG_TEST_RUN */
452 +
453 +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */
454 +#define BPF_F_TEST_RUN_ON_CPU (1U << 0)
455 +
456 +/* type for BPF_ENABLE_STATS */
457 +enum bpf_stats_type {
458 + /* enabled run_time_ns and run_cnt */
459 + BPF_STATS_RUN_TIME = 0,
460 +};
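
BPF_ENABLE_STATS hands back a file descriptor, and run_time_ns/run_cnt accounting stays enabled until that fd is closed. Editor's sketch (includes as in the earlier sketches):

static int enable_run_time_stats(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.enable_stats.type = BPF_STATS_RUN_TIME;

	/* Returns an fd; close() it to stop the accounting. */
	return syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
}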
281461
282462 enum bpf_stack_build_id_status {
283463 /* user space need an empty entry to identify end of a trace */
....@@ -298,6 +478,8 @@
298478 };
299479 };
300480
481 +#define BPF_OBJ_NAME_LEN 16U
482 +
301483 union bpf_attr {
302484 struct { /* anonymous struct used by BPF_MAP_CREATE command */
303485 __u32 map_type; /* one of enum bpf_map_type */
....@@ -316,6 +498,10 @@
316498 __u32 btf_fd; /* fd pointing to a BTF type data */
317499 __u32 btf_key_type_id; /* BTF type_id of the key */
318500 __u32 btf_value_type_id; /* BTF type_id of the value */
501 + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
502 + * struct stored as the
503 + * map value
504 + */
319505 };
320506
321507 struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
....@@ -328,6 +514,23 @@
328514 __u64 flags;
329515 };
330516
517 + struct { /* struct used by BPF_MAP_*_BATCH commands */
518 + __aligned_u64 in_batch; /* start batch,
519 + * NULL to start from beginning
520 + */
521 + __aligned_u64 out_batch; /* output: next start batch */
522 + __aligned_u64 keys;
523 + __aligned_u64 values;
524 + __u32 count; /* input/output:
525 + * input: # of key/value
526 + * elements
527 + * output: # of filled elements
528 + */
529 + __u32 map_fd;
530 + __u64 elem_flags;
531 + __u64 flags;
532 + } batch;
533 +
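The in_batch/out_batch cursors above are what make the batch commands resumable. A hedged sketch (editor's illustration, includes as in the earlier sketches) of paging through a map with BPF_MAP_LOOKUP_BATCH; the kernel returns -ENOENT once iteration is complete:

static int dump_map(int map_fd, void *keys, void *values, __u32 max_count)
{
	union bpf_attr attr;
	__u64 in = 0, out = 0;
	int first = 1, err;

	do {
		memset(&attr, 0, sizeof(attr));
		attr.batch.map_fd = map_fd;
		attr.batch.in_batch = first ? 0 : (__u64)(unsigned long)&in;
		attr.batch.out_batch = (__u64)(unsigned long)&out;
		attr.batch.keys = (__u64)(unsigned long)keys;
		attr.batch.values = (__u64)(unsigned long)values;
		attr.batch.count = max_count;	/* in: capacity, out: filled */

		err = syscall(__NR_bpf, BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
		/* consume attr.batch.count key/value pairs here */
		in = out;
		first = 0;
	} while (!err);

	return err;
}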
331534 struct { /* anonymous struct used by BPF_PROG_LOAD command */
332535 __u32 prog_type; /* one of enum bpf_prog_type */
333536 __u32 insn_cnt;
....@@ -336,7 +539,7 @@
336539 __u32 log_level; /* verbosity level of verifier */
337540 __u32 log_size; /* size of user buffer */
338541 __aligned_u64 log_buf; /* user supplied buffer */
339 - __u32 kern_version; /* checked when prog_type=kprobe */
542 + __u32 kern_version; /* not used */
340543 __u32 prog_flags;
341544 char prog_name[BPF_OBJ_NAME_LEN];
342545 __u32 prog_ifindex; /* ifindex of netdev to prep for */
....@@ -345,6 +548,15 @@
345548 * (context accesses, allowed helpers, etc).
346549 */
347550 __u32 expected_attach_type;
551 + __u32 prog_btf_fd; /* fd pointing to BTF type data */
552 + __u32 func_info_rec_size; /* userspace bpf_func_info size */
553 + __aligned_u64 func_info; /* func info */
554 + __u32 func_info_cnt; /* number of bpf_func_info records */
555 + __u32 line_info_rec_size; /* userspace bpf_line_info size */
556 + __aligned_u64 line_info; /* line info */
557 + __u32 line_info_cnt; /* number of bpf_line_info records */
558 + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */
559 + __u32 attach_prog_fd; /* 0 to attach to vmlinux */
348560 };
349561
350562 struct { /* anonymous struct used by BPF_OBJ_* commands */
....@@ -358,17 +570,33 @@
358570 __u32 attach_bpf_fd; /* eBPF program to attach */
359571 __u32 attach_type;
360572 __u32 attach_flags;
573 + __u32 replace_bpf_fd; /* previously attached eBPF
574 + * program to replace if
575 + * BPF_F_REPLACE is used
576 + */
361577 };
362578
363579 struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
364580 __u32 prog_fd;
365581 __u32 retval;
366 - __u32 data_size_in;
367 - __u32 data_size_out;
582 + __u32 data_size_in; /* input: len of data_in */
583 + __u32 data_size_out; /* input/output: len of data_out
584 + * returns ENOSPC if data_out
585 + * is too small.
586 + */
368587 __aligned_u64 data_in;
369588 __aligned_u64 data_out;
370589 __u32 repeat;
371590 __u32 duration;
591 + __u32 ctx_size_in; /* input: len of ctx_in */
592 + __u32 ctx_size_out; /* input/output: len of ctx_out
593 + * returns ENOSPC if ctx_out
594 + * is too small.
595 + */
596 + __aligned_u64 ctx_in;
597 + __aligned_u64 ctx_out;
598 + __u32 flags;
599 + __u32 cpu;
372600 } test;
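
The new test fields let a test pass a context object alongside packet data and pin the run to one CPU via BPF_F_TEST_RUN_ON_CPU. Editor's sketch (includes as in the earlier sketches):

static int test_run_on_cpu(int prog_fd, void *ctx, __u32 ctx_sz, int cpu)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd = prog_fd;
	attr.test.ctx_in = (__u64)(unsigned long)ctx;
	attr.test.ctx_size_in = ctx_sz;
	attr.test.flags = BPF_F_TEST_RUN_ON_CPU;
	attr.test.cpu = cpu;		/* honored because of the flag above */
	attr.test.repeat = 1;

	return syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr));
}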
373601
374602 struct { /* anonymous struct used by BPF_*_GET_*_ID */
....@@ -377,6 +605,7 @@
377605 __u32 prog_id;
378606 __u32 map_id;
379607 __u32 btf_id;
608 + __u32 link_id;
380609 };
381610 __u32 next_id;
382611 __u32 open_flags;
....@@ -397,7 +626,7 @@
397626 __u32 prog_cnt;
398627 } query;
399628
400 - struct {
629 + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
401630 __u64 name;
402631 __u32 prog_fd;
403632 } raw_tracepoint;
....@@ -425,6 +654,53 @@
425654 __u64 probe_offset; /* output: probe_offset */
426655 __u64 probe_addr; /* output: probe_addr */
427656 } task_fd_query;
657 +
658 + struct { /* struct used by BPF_LINK_CREATE command */
659 + __u32 prog_fd; /* eBPF program to attach */
660 + union {
661 + __u32 target_fd; /* object to attach to */
662 + __u32 target_ifindex; /* target ifindex */
663 + };
664 + __u32 attach_type; /* attach type */
665 + __u32 flags; /* extra flags */
666 + union {
667 + __u32 target_btf_id; /* btf_id of target to attach to */
668 + struct {
669 + __aligned_u64 iter_info; /* extra bpf_iter_link_info */
670 + __u32 iter_info_len; /* iter_info length */
671 + };
672 + };
673 + } link_create;
674 +
675 + struct { /* struct used by BPF_LINK_UPDATE command */
676 + __u32 link_fd; /* link fd */
677 + /* new program fd to update link with */
678 + __u32 new_prog_fd;
679 + __u32 flags; /* extra flags */
680 + /* expected link's program fd; is specified only if
681 + * BPF_F_REPLACE flag is set in flags */
682 + __u32 old_prog_fd;
683 + } link_update;
684 +
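BPF_LINK_UPDATE is the race-free way to swap the program behind a link: with BPF_F_REPLACE set, the kernel checks that old_prog_fd is still the attached program before switching. Editor's sketch (includes as in the earlier sketches):

static int link_swap_prog(int link_fd, int new_prog_fd, int old_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_update.link_fd = link_fd;
	attr.link_update.new_prog_fd = new_prog_fd;
	attr.link_update.flags = BPF_F_REPLACE;
	attr.link_update.old_prog_fd = old_prog_fd;	/* expected current program */

	return syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));
}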
685 + struct {
686 + __u32 link_fd;
687 + } link_detach;
688 +
689 + struct { /* struct used by BPF_ENABLE_STATS command */
690 + __u32 type;
691 + } enable_stats;
692 +
693 + struct { /* struct used by BPF_ITER_CREATE command */
694 + __u32 link_fd;
695 + __u32 flags;
696 + } iter_create;
697 +
698 + struct { /* struct used by BPF_PROG_BIND_MAP command */
699 + __u32 prog_fd;
700 + __u32 map_fd;
701 + __u32 flags; /* extra flags */
702 + } prog_bind_map;
703 +
428704 } __attribute__((aligned(8)));
429705
430706 /* The description below is an attempt at providing documentation to eBPF
....@@ -451,7 +727,7 @@
451727 * Map value associated to *key*, or **NULL** if no entry was
452728 * found.
453729 *
454 - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
730 + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
455731 * Description
456732 * Add or update the value of the entry associated to *key* in
457733 * *map* with *value*. *flags* is one of:
....@@ -469,16 +745,19 @@
469745 * Return
470746 * 0 on success, or a negative error in case of failure.
471747 *
472 - * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
748 + * long bpf_map_delete_elem(struct bpf_map *map, const void *key)
473749 * Description
474750 * Delete entry with *key* from *map*.
475751 * Return
476752 * 0 on success, or a negative error in case of failure.
477753 *
478 - * int bpf_probe_read(void *dst, u32 size, const void *src)
754 + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr)
479755 * Description
480756 * For tracing programs, safely attempt to read *size* bytes from
481 - * address *src* and store the data in *dst*.
757 + * kernel space address *unsafe_ptr* and store the data in *dst*.
758 + *
759 + * Generally, use **bpf_probe_read_user**\ () or
760 + * **bpf_probe_read_kernel**\ () instead.
482761 * Return
483762 * 0 on success, or a negative error in case of failure.
484763 *
....@@ -486,11 +765,11 @@
486765 * Description
487766 * Return the time elapsed since system boot, in nanoseconds.
488767 * Does not include time the system was suspended.
489 - * See: clock_gettime(CLOCK_MONOTONIC)
768 + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
490769 * Return
491770 * Current *ktime*.
492771 *
493 - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
772 + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
494773 * Description
495774 * This helper is a "printk()-like" facility for debugging. It
496775 * prints a message defined by format *fmt* (of size *fmt_size*)
....@@ -500,6 +779,8 @@
500779 * limited to five).
501780 *
502781 * Each time the helper is called, it appends a line to the trace.
782 + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
783 + * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
503784 * The format of the trace is customizable, and the exact output
504785 * one will get depends on the options set in
505786 * *\/sys/kernel/debug/tracing/trace_options* (see also the
....@@ -538,7 +819,7 @@
538819 *
539820 * Also, note that **bpf_trace_printk**\ () is slow, and should
540821 * only be used for debugging purposes. For this reason, a notice
541 - * bloc (spanning several lines) is printed to kernel logs and
822 + * block (spanning several lines) is printed to kernel logs and
542823 * states that the helper should not be used "for production use"
543824 * the first time this helper is used (or more precisely, when
544825 * **trace_printk**\ () buffers are allocated). For passing values
....@@ -568,7 +849,7 @@
568849 * Return
569850 * The SMP id of the processor running the program.
570851 *
571 - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
852 + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
572853 * Description
573854 * Store *len* bytes from address *from* into the packet
574855 * associated to *skb*, at *offset*. *flags* are a combination of
....@@ -577,7 +858,7 @@
577858 * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
578859 * **->swhash** and *skb*\ **->l4hash** to 0).
579860 *
580 - * A call to this helper is susceptible to change the underlaying
861 + * A call to this helper is susceptible to change the underlying
581862 * packet buffer. Therefore, at load time, all checks on pointers
582863 * previously done by the verifier are invalidated and must be
583864 * performed again, if the helper is used in combination with
....@@ -585,7 +866,7 @@
585866 * Return
586867 * 0 on success, or a negative error in case of failure.
587868 *
588 - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
869 + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
589870 * Description
590871 * Recompute the layer 3 (e.g. IP) checksum for the packet
591872 * associated to *skb*. Computation is incremental, so the helper
....@@ -602,7 +883,7 @@
602883 * flexibility and can handle sizes larger than 2 or 4 for the
603884 * checksum to update.
604885 *
605 - * A call to this helper is susceptible to change the underlaying
886 + * A call to this helper is susceptible to change the underlying
606887 * packet buffer. Therefore, at load time, all checks on pointers
607888 * previously done by the verifier are invalidated and must be
608889 * performed again, if the helper is used in combination with
....@@ -610,7 +891,7 @@
610891 * Return
611892 * 0 on success, or a negative error in case of failure.
612893 *
613 - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
894 + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
614895 * Description
615896 * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
616897 * packet associated to *skb*. Computation is incremental, so the
....@@ -634,7 +915,7 @@
634915 * flexibility and can handle sizes larger than 2 or 4 for the
635916 * checksum to update.
636917 *
637 - * A call to this helper is susceptible to change the underlaying
918 + * A call to this helper is susceptible to change the underlying
638919 * packet buffer. Therefore, at load time, all checks on pointers
639920 * previously done by the verifier are invalidated and must be
640921 * performed again, if the helper is used in combination with
....@@ -642,7 +923,7 @@
642923 * Return
643924 * 0 on success, or a negative error in case of failure.
644925 *
645 - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
926 + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
646927 * Description
647928 * This special helper is used to trigger a "tail call", or in
648929 * other words, to jump into another eBPF program. The same stack
....@@ -673,7 +954,7 @@
673954 * Return
674955 * 0 on success, or a negative error in case of failure.
675956 *
676 - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
957 + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
677958 * Description
678959 * Clone and redirect the packet associated to *skb* to another
679960 * net device of index *ifindex*. Both ingress and egress
....@@ -689,13 +970,15 @@
689970 * efficient, but it is handled through an action code where the
690971 * redirection happens only after the eBPF program has returned.
691972 *
692 - * A call to this helper is susceptible to change the underlaying
973 + * A call to this helper is susceptible to change the underlying
693974 * packet buffer. Therefore, at load time, all checks on pointers
694975 * previously done by the verifier are invalidated and must be
695976 * performed again, if the helper is used in combination with
696977 * direct packet access.
697978 * Return
698 - * 0 on success, or a negative error in case of failure. Positive
979 + * 0 on success, or a negative error in case of failure. Positive
980 + * error indicates a potential drop or congestion in the target
981 + * device. The particular positive error codes are not defined.
699982 *
700983 * u64 bpf_get_current_pid_tgid(void)
701984 * Return
....@@ -709,7 +992,7 @@
709992 * A 64-bit integer containing the current GID and UID, and
710993 * created as such: *current_gid* **<< 32 \|** *current_uid*.
711994 *
712 - * int bpf_get_current_comm(char *buf, u32 size_of_buf)
995 + * long bpf_get_current_comm(void *buf, u32 size_of_buf)
713996 * Description
714997 * Copy the **comm** attribute of the current task into *buf* of
715998 * *size_of_buf*. The **comm** attribute contains the name of
....@@ -731,7 +1014,7 @@
7311014 * based on a user-provided identifier for all traffic coming from
7321015 * the tasks belonging to the related cgroup. See also the related
7331016 * kernel documentation, available from the Linux sources in file
734 - * *Documentation/cgroup-v1/net_cls.txt*.
1017 + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*.
7351018 *
7361019 * The Linux kernel has two versions for cgroups: there are
7371020 * cgroups v1 and cgroups v2. Both are available to users, who can
....@@ -746,7 +1029,7 @@
7461029 * Return
7471030 * The classid, or 0 for the default unconfigured classid.
7481031 *
749 - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
1032 + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
7501033 * Description
7511034 * Push a *vlan_tci* (VLAN tag control information) of protocol
7521035 * *vlan_proto* to the packet associated to *skb*, then update
....@@ -754,7 +1037,7 @@
7541037 * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
7551038 * be **ETH_P_8021Q**.
7561039 *
757 - * A call to this helper is susceptible to change the underlaying
1040 + * A call to this helper is susceptible to change the underlying
7581041 * packet buffer. Therefore, at load time, all checks on pointers
7591042 * previously done by the verifier are invalidated and must be
7601043 * performed again, if the helper is used in combination with
....@@ -762,11 +1045,11 @@
7621045 * Return
7631046 * 0 on success, or a negative error in case of failure.
7641047 *
765 - * int bpf_skb_vlan_pop(struct sk_buff *skb)
1048 + * long bpf_skb_vlan_pop(struct sk_buff *skb)
7661049 * Description
7671050 * Pop a VLAN header from the packet associated to *skb*.
7681051 *
769 - * A call to this helper is susceptible to change the underlaying
1052 + * A call to this helper is susceptible to change the underlying
7701053 * packet buffer. Therefore, at load time, all checks on pointers
7711054 * previously done by the verifier are invalidated and must be
7721055 * performed again, if the helper is used in combination with
....@@ -774,7 +1057,7 @@
7741057 * Return
7751058 * 0 on success, or a negative error in case of failure.
7761059 *
777 - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
1060 + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
7781061 * Description
7791062 * Get tunnel metadata. This helper takes a pointer *key* to an
7801063 * empty **struct bpf_tunnel_key** of **size**, that will be
....@@ -804,14 +1087,14 @@
8041087 *
8051088 * int ret;
8061089 * struct bpf_tunnel_key key = {};
807 - *
1090 + *
8081091 * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
8091092 * if (ret < 0)
8101093 * return TC_ACT_SHOT; // drop packet
811 - *
1094 + *
8121095 * if (key.remote_ipv4 != 0x0a000001)
8131096 * return TC_ACT_SHOT; // drop packet
814 - *
1097 + *
8151098 * return TC_ACT_OK; // accept packet
8161099 *
8171100 * This interface can also be used with all encapsulation devices
....@@ -825,7 +1108,7 @@
8251108 * Return
8261109 * 0 on success, or a negative error in case of failure.
8271110 *
828 - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
1111 + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
8291112 * Description
8301113 * Populate tunnel metadata for packet associated to *skb.* The
8311114 * tunnel metadata is set to the contents of *key*, of *size*. The
....@@ -891,7 +1174,7 @@
8911174 * The value of the perf event counter read from the map, or a
8921175 * negative error code in case of failure.
8931176 *
894 - * int bpf_redirect(u32 ifindex, u64 flags)
1177 + * long bpf_redirect(u32 ifindex, u64 flags)
8951178 * Description
8961179 * Redirect the packet to another net device of index *ifindex*.
8971180 * This helper is somewhat similar to **bpf_clone_redirect**\
....@@ -905,9 +1188,9 @@
9051188 * supports redirection to the egress interface, and accepts no
9061189 * flag at all.
9071190 *
908 - * The same effect can be attained with the more generic
909 - * **bpf_redirect_map**\ (), which requires specific maps to be
910 - * used but offers better performance.
1191 + * The same effect can also be attained with the more generic
1192 + * **bpf_redirect_map**\ (), which uses a BPF map to store the
1193 + * redirect target instead of providing it directly to the helper.
9111194 * Return
9121195 * For XDP, the helper returns **XDP_REDIRECT** on success or
9131196 * **XDP_ABORTED** on error. For other program types, the values
....@@ -918,7 +1201,7 @@
9181201 * Description
9191202 * Retrieve the realm or the route, that is to say the
9201203 * **tclassid** field of the destination for the *skb*. The
921 - * indentifier retrieved is a user-provided tag, similar to the
1204 + * identifier retrieved is a user-provided tag, similar to the
9221205 * one used with the net_cls cgroup (see description for
9231206 * **bpf_get_cgroup_classid**\ () helper), but here this tag is
9241207 * held by a route (a destination entry), not by a task.
....@@ -938,7 +1221,7 @@
9381221 * The realm of the route for the packet associated to *skb*, or 0
9391222 * if none was found.
9401223 *
941 - * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
1224 + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
9421225 * Description
9431226 * Write raw *data* blob into a special BPF perf event held by
9441227 * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
....@@ -983,7 +1266,7 @@
9831266 * Return
9841267 * 0 on success, or a negative error in case of failure.
9851268 *
986 - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
1269 + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len)
9871270 * Description
9881271 * This helper was provided as an easy way to load data from a
9891272 * packet. It can be used to load *len* bytes from *offset* from
....@@ -1000,7 +1283,7 @@
10001283 * Return
10011284 * 0 on success, or a negative error in case of failure.
10021285 *
1003 - * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
1286 + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags)
10041287 * Description
10051288 * Walk a user or a kernel stack and return its id. To achieve
10061289 * this, the helper needs *ctx*, which is a pointer to the context
....@@ -1069,7 +1352,7 @@
10691352 * The checksum result, or a negative error code in case of
10701353 * failure.
10711354 *
1072 - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
1355 + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
10731356 * Description
10741357 * Retrieve tunnel options metadata for the packet associated to
10751358 * *skb*, and store the raw tunnel option data to the buffer *opt*
....@@ -1087,7 +1370,7 @@
10871370 * Return
10881371 * The size of the option data retrieved.
10891372 *
1090 - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
1373 + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
10911374 * Description
10921375 * Set tunnel options metadata for the packet associated to *skb*
10931376 * to the option data contained in the raw buffer *opt* of *size*.
....@@ -1097,7 +1380,7 @@
10971380 * Return
10981381 * 0 on success, or a negative error in case of failure.
10991382 *
1100 - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
1383 + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
11011384 * Description
11021385 * Change the protocol of the *skb* to *proto*. Currently
11031386 * supported are transition from IPv4 to IPv6, and from IPv6 to
....@@ -1116,7 +1399,7 @@
11161399 * All values for *flags* are reserved for future usage, and must
11171400 * be left at zero.
11181401 *
1119 - * A call to this helper is susceptible to change the underlaying
1402 + * A call to this helper is susceptible to change the underlying
11201403 * packet buffer. Therefore, at load time, all checks on pointers
11211404 * previously done by the verifier are invalidated and must be
11221405 * performed again, if the helper is used in combination with
....@@ -1124,7 +1407,7 @@
11241407 * Return
11251408 * 0 on success, or a negative error in case of failure.
11261409 *
1127 - * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
1410 + * long bpf_skb_change_type(struct sk_buff *skb, u32 type)
11281411 * Description
11291412 * Change the packet type for the packet associated to *skb*. This
11301413 * comes down to setting *skb*\ **->pkt_type** to *type*, except
....@@ -1151,7 +1434,7 @@
11511434 * Return
11521435 * 0 on success, or a negative error in case of failure.
11531436 *
1154 - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
1437 + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
11551438 * Description
11561439 * Check whether *skb* is a descendant of the cgroup2 held by
11571440 * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
....@@ -1182,7 +1465,7 @@
11821465 * Return
11831466 * A pointer to the current task struct.
11841467 *
1185 - * int bpf_probe_write_user(void *dst, const void *src, u32 len)
1468 + * long bpf_probe_write_user(void *dst, const void *src, u32 len)
11861469 * Description
11871470 * Attempt in a safe way to write *len* bytes from the buffer
11881471 * *src* to *dst* in memory. It only works for threads that are in
....@@ -1201,7 +1484,7 @@
12011484 * Return
12021485 * 0 on success, or a negative error in case of failure.
12031486 *
1204 - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
1487 + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
12051488 * Description
12061489 * Check whether the probe is being run in the context of a given
12071490 * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
....@@ -1209,11 +1492,11 @@
12091492 * Return
12101493 * The return value depends on the result of the test, and can be:
12111494 *
1212 - * * 0, if current task belongs to the cgroup2.
1213 - * * 1, if current task does not belong to the cgroup2.
1495 + * * 1, if current task belongs to the cgroup2.
1496 + * * 0, if current task does not belong to the cgroup2.
12141497 * * A negative error code, if an error occurred.
12151498 *
1216 - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
1499 + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
12171500 * Description
12181501 * Resize (trim or grow) the packet associated to *skb* to the
12191502 * new *len*. The *flags* are reserved for future usage, and must
....@@ -1229,7 +1512,7 @@
12291512 * implicitly linearizes, unclones and drops offloads from the
12301513 * *skb*.
12311514 *
1232 - * A call to this helper is susceptible to change the underlaying
1515 + * A call to this helper is susceptible to change the underlying
12331516 * packet buffer. Therefore, at load time, all checks on pointers
12341517 * previously done by the verifier are invalidated and must be
12351518 * performed again, if the helper is used in combination with
....@@ -1237,7 +1520,7 @@
12371520 * Return
12381521 * 0 on success, or a negative error in case of failure.
12391522 *
1240 - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
1523 + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len)
12411524 * Description
12421525 * Pull in non-linear data in case the *skb* is non-linear and not
12431526 * all of *len* are part of the linear section. Make *len* bytes
....@@ -1265,7 +1548,7 @@
12651548 * **bpf_skb_pull_data()** to effectively unclone the *skb* from
12661549 * the very beginning in case it is indeed cloned.
12671550 *
1268 - * A call to this helper is susceptible to change the underlaying
1551 + * A call to this helper is susceptible to change the underlying
12691552 * packet buffer. Therefore, at load time, all checks on pointers
12701553 * previously done by the verifier are invalidated and must be
12711554 * performed again, if the helper is used in combination with
....@@ -1293,7 +1576,7 @@
12931576 * recalculation the next time the kernel tries to access this
12941577 * hash or when the **bpf_get_hash_recalc**\ () helper is called.
12951578 *
1296 - * int bpf_get_numa_node_id(void)
1579 + * long bpf_get_numa_node_id(void)
12971580 * Description
12981581 * Return the id of the current NUMA node. The primary use case
12991582 * for this helper is the selection of sockets for the local NUMA
....@@ -1304,7 +1587,7 @@
13041587 * Return
13051588 * The id of current NUMA node.
13061589 *
1307 - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
1590 + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
13081591 * Description
13091592 * Grows headroom of packet associated to *skb* and adjusts the
13101593 * offset of the MAC header accordingly, adding *len* bytes of
....@@ -1317,7 +1600,7 @@
13171600 * All values for *flags* are reserved for future usage, and must
13181601 * be left at zero.
13191602 *
1320 - * A call to this helper is susceptible to change the underlaying
1603 + * A call to this helper is susceptible to change the underlying
13211604 * packet buffer. Therefore, at load time, all checks on pointers
13221605 * previously done by the verifier are invalidated and must be
13231606 * performed again, if the helper is used in combination with
....@@ -1325,14 +1608,14 @@
13251608 * Return
13261609 * 0 on success, or a negative error in case of failure.
13271610 *
1328 - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
1611 + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
13291612 * Description
13301613 * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
13311614 * it is possible to use a negative value for *delta*. This helper
13321615 * can be used to prepare the packet for pushing or popping
13331616 * headers.
13341617 *
1335 - * A call to this helper is susceptible to change the underlaying
1618 + * A call to this helper is susceptible to change the underlying
13361619 * packet buffer. Therefore, at load time, all checks on pointers
13371620 * previously done by the verifier are invalidated and must be
13381621 * performed again, if the helper is used in combination with
....@@ -1340,45 +1623,14 @@
13401623 * Return
13411624 * 0 on success, or a negative error in case of failure.
13421625 *
1343 - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
1626 + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
13441627 * Description
1345 - * Copy a NUL terminated string from an unsafe address
1346 - * *unsafe_ptr* to *dst*. The *size* should include the
1347 - * terminating NUL byte. In case the string length is smaller than
1348 - * *size*, the target is not padded with further NUL bytes. If the
1349 - * string length is larger than *size*, just *size*-1 bytes are
1350 - * copied and the last byte is set to NUL.
1628 + * Copy a NUL terminated string from an unsafe kernel address
1629 + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
1630 + * more details.
13511631 *
1352 - * On success, the length of the copied string is returned. This
1353 - * makes this helper useful in tracing programs for reading
1354 - * strings, and more importantly to get its length at runtime. See
1355 - * the following snippet:
1356 - *
1357 - * ::
1358 - *
1359 - * SEC("kprobe/sys_open")
1360 - * void bpf_sys_open(struct pt_regs *ctx)
1361 - * {
1362 - * char buf[PATHLEN]; // PATHLEN is defined to 256
1363 - * int res = bpf_probe_read_str(buf, sizeof(buf),
1364 - * ctx->di);
1365 - *
1366 - * // Consume buf, for example push it to
1367 - * // userspace via bpf_perf_event_output(); we
1368 - * // can use res (the string length) as event
1369 - * // size, after checking its boundaries.
1370 - * }
1371 - *
1372 - * In comparison, using **bpf_probe_read()** helper here instead
1373 - * to read the string would require to estimate the length at
1374 - * compile time, and would often result in copying more memory
1375 - * than necessary.
1376 - *
1377 - * Another useful use case is when parsing individual process
1378 - * arguments or individual environment variables navigating
1379 - * *current*\ **->mm->arg_start** and *current*\
1380 - * **->mm->env_start**: using this helper and the return value,
1381 - * one can quickly iterate at the right offset of the memory area.
1632 + * Generally, use **bpf_probe_read_user_str**\ () or
1633 + * **bpf_probe_read_kernel_str**\ () instead.
13821634 * Return
13831635 * On success, the strictly positive length of the string,
13841636 * including the trailing NUL character. On error, a negative
....@@ -1391,8 +1643,8 @@
13911643 * If no cookie has been set yet, generate a new cookie. Once
13921644 * generated, the socket cookie remains stable for the life of the
13931645 * socket. This helper can be useful for monitoring per socket
1394 - * networking traffic statistics as it provides a unique socket
1395 - * identifier per namespace.
1646 + * networking traffic statistics as it provides a global socket
1647 + * identifier that can be assumed unique.
13961648 * Return
13971649 * A 8-byte long non-decreasing number on success, or 0 if the
13981650 * socket field is missing inside *skb*.
....@@ -1400,14 +1652,14 @@
14001652 * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
14011653 * Description
14021654 * Equivalent to bpf_get_socket_cookie() helper that accepts
1403 - * *skb*, but gets socket from **struct bpf_sock_addr** contex.
1655 + * *skb*, but gets socket from **struct bpf_sock_addr** context.
14041656 * Return
14051657 * A 8-byte long non-decreasing number.
14061658 *
14071659 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
14081660 * Description
1409 - * Equivalent to bpf_get_socket_cookie() helper that accepts
1410 - * *skb*, but gets socket from **struct bpf_sock_ops** contex.
1661 + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
1662 + * *skb*, but gets socket from **struct bpf_sock_ops** context.
14111663 * Return
14121664 * A 8-byte long non-decreasing number.
14131665 *
....@@ -1419,14 +1671,14 @@
14191671 * is returned (note that **overflowuid** might also be the actual
14201672 * UID value for the socket).
14211673 *
1422 - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
1674 + * long bpf_set_hash(struct sk_buff *skb, u32 hash)
14231675 * Description
14241676 * Set the full hash for *skb* (set the field *skb*\ **->hash**)
14251677 * to value *hash*.
14261678 * Return
14271679 * 0
14281680 *
1429 - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1681 + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
14301682 * Description
14311683 * Emulate a call to **setsockopt()** on the socket associated to
14321684 * *bpf_socket*, which must be a full socket. The *level* at
....@@ -1434,34 +1686,68 @@
14341686 * must be specified, see **setsockopt(2)** for more information.
14351687 * The option value of length *optlen* is pointed by *optval*.
14361688 *
1689 + * *bpf_socket* should be one of the following:
1690 + *
1691 + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
1692 + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
1693 + * and **BPF_CGROUP_INET6_CONNECT**.
1694 + *
14371695 * This helper actually implements a subset of **setsockopt()**.
14381696 * It supports the following *level*\ s:
14391697 *
14401698 * * **SOL_SOCKET**, which supports the following *optname*\ s:
14411699 * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
1442 - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
1700 + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
1701 + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**.
14431702 * * **IPPROTO_TCP**, which supports the following *optname*\ s:
14441703 * **TCP_CONGESTION**, **TCP_BPF_IW**,
1445 - * **TCP_BPF_SNDCWND_CLAMP**.
1704 + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
1705 + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
1706 + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
14461707 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
14471708 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
14481709 * Return
14491710 * 0 on success, or a negative error in case of failure.
14501711 *
1451 - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
1712 + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
14521713 * Description
14531714 * Grow or shrink the room for data in the packet associated to
14541715 * *skb* by *len_diff*, and according to the selected *mode*.
14551716 *
1456 - * There is a single supported mode at this time:
1717 + * By default, the helper will reset any offloaded checksum
1718 + * indicator of the skb to CHECKSUM_NONE. This can be avoided
1719 + * by the following flag:
1720 + *
1721 + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
1722 + * checksum data of the skb to CHECKSUM_NONE.
1723 + *
1724 + * There are two supported modes at this time:
1725 + *
1726 + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
1727 + * (room space is added or removed below the layer 2 header).
14571728 *
14581729 * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
14591730 * (room space is added or removed below the layer 3 header).
14601731 *
1461 - * All values for *flags* are reserved for future usage, and must
1462 - * be left at zero.
1732 + * The following flags are supported at this time:
14631733 *
1464 - * A call to this helper is susceptible to change the underlaying
1734 + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
1735 + * Adjusting mss in this way is not allowed for datagrams.
1736 + *
1737 + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**,
1738 + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
1739 + * Any new space is reserved to hold a tunnel header.
1740 + * Configure skb offsets and other fields accordingly.
1741 + *
1742 + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**,
1743 + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
1744 + * Use with ENCAP_L3 flags to further specify the tunnel type.
1745 + *
1746 + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*):
1747 + * Use with ENCAP_L3/L4 flags to further specify the tunnel
1748 + * type; *len* is the length of the inner MAC header.
1749 + *
1750 + * A call to this helper is susceptible to change the underlying
14651751 * packet buffer. Therefore, at load time, all checks on pointers
14661752 * previously done by the verifier are invalidated and must be
14671753 * performed again, if the helper is used in combination with
....@@ -1469,7 +1755,7 @@
14691755 * Return
14701756 * 0 on success, or a negative error in case of failure.
14711757 *
1472 - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
1758 + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
14731759 * Description
14741760 * Redirect the packet to the endpoint referenced by *map* at
14751761 * index *key*. Depending on its type, this *map* can contain
....@@ -1478,18 +1764,19 @@
14781764 * but this is only implemented for native XDP (with driver
14791765 * support) as of this writing).
14801766 *
1481 - * All values for *flags* are reserved for future usage, and must
1482 - * be left at zero.
1767 + * The lower two bits of *flags* are used as the return code if
1768 + * the map lookup fails. This is so that the return value can be
1769 + * one of the XDP program return codes up to **XDP_TX**, as chosen
1770 + * by the caller. Any higher bits in the *flags* argument must be
1771 + * unset.
14831772 *
1484 - * When used to redirect packets to net devices, this helper
1485 - * provides a high performance increase over **bpf_redirect**\ ().
1486 - * This is due to various implementation details of the underlying
1487 - * mechanisms, one of which is the fact that **bpf_redirect_map**\
1488 - * () tries to send packet as a "bulk" to the device.
1773 + * See also **bpf_redirect**\ (), which only supports redirecting
1774 + * to an ifindex, but doesn't require a map to do so.
14891775 * Return
1490 - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
1776 + * **XDP_REDIRECT** on success, or the value of the two lower bits
1777 + * of the *flags* argument on error.
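 *		A minimal illustration (editor's sketch, not kernel text) of
 *		the lower-bits convention from an XDP program, assuming a
 *		devmap named tx_port:
 *
 *		::
 *
 *			// pass the packet up the stack if the lookup fails
 *			return bpf_redirect_map(&tx_port, key, XDP_PASS);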
14911778 *
1492 - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
1779 + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
14931780 * Description
14941781 * Redirect the packet to the socket referenced by *map* (of type
14951782 * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
....@@ -1500,7 +1787,7 @@
15001787 * Return
15011788 * **SK_PASS** on success, or **SK_DROP** on error.
15021789 *
1503 - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
1790 + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
15041791 * Description
15051792 * Add an entry to, or update a *map* referencing sockets. The
15061793 * *skops* is used as a new value for the entry associated to
....@@ -1519,7 +1806,7 @@
15191806 * Return
15201807 * 0 on success, or a negative error in case of failure.
15211808 *
1522 - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
1809 + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
15231810 * Description
15241811 * Adjust the address pointed by *xdp_md*\ **->data_meta** by
15251812 * *delta* (which can be positive or negative). Note that this
....@@ -1540,7 +1827,7 @@
15401827 * more flexibility as the user is free to store whatever meta
15411828 * data they need.
15421829 *
1543 - * A call to this helper is susceptible to change the underlaying
1830 + * A call to this helper is susceptible to change the underlying
15441831 * packet buffer. Therefore, at load time, all checks on pointers
15451832 * previously done by the verifier are invalidated and must be
15461833 * performed again, if the helper is used in combination with
....@@ -1548,7 +1835,7 @@
15481835 * Return
15491836 * 0 on success, or a negative error in case of failure.
15501837 *
1551 - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
1838 + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
15521839 * Description
15531840 * Read the value of a perf event counter, and store it into *buf*
15541841 * of size *buf_size*. This helper relies on a *map* of type
....@@ -1592,13 +1879,13 @@
15921879 * the time running for event since last normalization. The
15931880 * enabled and running times are accumulated since the perf event
15941881 * open. To achieve scaling factor between two invocations of an
1595 - * eBPF program, users can can use CPU id as the key (which is
1882 + * eBPF program, users can use CPU id as the key (which is
15961883 * typical for perf array usage model) to remember the previous
15971884 * value and do the calculation inside the eBPF program.
15981885 * Return
15991886 * 0 on success, or a negative error in case of failure.
16001887 *
1601 - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
1888 + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
16021889 * Description
16031890 * For an eBPF program attached to a perf event, retrieve the
16041891 * value of the event counter associated to *ctx* and store it in
....@@ -1609,7 +1896,7 @@
16091896 * Return
16101897 * 0 on success, or a negative error in case of failure.
16111898 *
1612 - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
1899 + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
16131900 * Description
16141901 * Emulate a call to **getsockopt()** on the socket associated to
16151902 * *bpf_socket*, which must be a full socket. The *level* at
....@@ -1617,6 +1904,12 @@
16171904 * must be specified, see **getsockopt(2)** for more information.
16181905 * The retrieved value is stored in the structure pointed by
16191906 * *opval* and of length *optlen*.
1907 + *
1908 + * *bpf_socket* should be one of the following:
1909 + *
1910 + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
1911 + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
1912 + * and **BPF_CGROUP_INET6_CONNECT**.
16201913 *
16211914 * This helper actually implements a subset of **getsockopt()**.
16221915 * It supports the following *level*\ s:
....@@ -1628,14 +1921,14 @@
16281921 * Return
16291922 * 0 on success, or a negative error in case of failure.
16301923 *
1631 - * int bpf_override_return(struct pt_reg *regs, u64 rc)
1924 + * long bpf_override_return(struct pt_regs *regs, u64 rc)
16321925 * Description
16331926 * Used for error injection, this helper uses kprobes to override
16341927 * the return value of the probed function, and to set it to *rc*.
16351928 * The first argument is the context *regs* on which the kprobe
16361929 * works.
16371930 *
1638 - * This helper works by setting setting the PC (program counter)
1931 + * This helper works by setting the PC (program counter)
16391932 * to an override function which is run in place of the original
16401933 * probed function. This means the probed function is not run at
16411934 * all. The replacement function just returns with the required
....@@ -1653,7 +1946,7 @@
16531946 * Return
16541947 * 0
16551948 *
1656 - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
1949 + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
16571950 * Description
16581951 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
16591952 * for the full TCP socket associated to *bpf_sock_ops* to
....@@ -1669,11 +1962,19 @@
16691962 * error if an eBPF program tries to set a callback that is not
16701963 * supported in the current kernel.
16711964 *
1672 - * The supported callback values that *argval* can combine are:
1965 + * *argval* is a flag array which can combine these flags:
16731966 *
16741967 * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
16751968 * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
16761969 * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
1970 + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
1971 + *
1972 + * Therefore, this function can be used to clear a callback flag by
1973 + * setting the appropriate bit to zero, e.g. to disable the RTO
1974 + * callback:
1975 + *
1976 + * **bpf_sock_ops_cb_flags_set(bpf_sock,**
1977 + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
16771978 *
16781979 * Here are some examples of where one could call such eBPF
16791980 * program:
....@@ -1689,7 +1990,7 @@
16891990 * be set is returned (which comes down to 0 if all bits were set
16901991 * as required).
16911992 *
1692 - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
1993 + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
16931994 * Description
16941995 * This helper is used in programs implementing policies at the
16951996 * socket level. If the message *msg* is allowed to pass (i.e. if
....@@ -1703,7 +2004,7 @@
17032004 * Return
17042005 * **SK_PASS** on success, or **SK_DROP** on error.
17052006 *
1706 - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
2007 + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
17072008 * Description
17082009 * For socket policies, apply the verdict of the eBPF program to
17092010 * the next *bytes* (number of bytes) of message *msg*.
....@@ -1737,7 +2038,7 @@
17372038 * Return
17382039 * 0
17392040 *
1740
- * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
2041
+ * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
17412042 * Description
17422043 * For socket policies, prevent the execution of the verdict eBPF
17432044 * program for message *msg* until *bytes* (byte number) have been
....@@ -1755,7 +2056,7 @@
17552056 * Return
17562057 * 0
17572058 *
1758
- * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
2059
+ * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
17592060 * Description
17602061 * For socket policies, pull in non-linear data from user space
17612062 * for *msg* and set pointers *msg*\ **->data** and *msg*\
....@@ -1775,7 +2076,7 @@
17752076 * copied if necessary (i.e. if data was not linear and if start
17762077 * and end pointers do not point to the same chunk).
17772078 *
1778
- * A call to this helper is susceptible to change the underlaying
2079
+ * A call to this helper is susceptible to change the underlying
17792080 * packet buffer. Therefore, at load time, all checks on pointers
17802081 * previously done by the verifier are invalidated and must be
17812082 * performed again, if the helper is used in combination with
....@@ -1786,7 +2087,7 @@
17862087 * Return
17872088 * 0 on success, or a negative error in case of failure.
17882089 *
1789
- * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
2090
+ * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
17902091 * Description
17912092 * Bind the socket associated to *ctx* to the address pointed by
17922093 * *addr*, of length *addr_len*. This allows for making outgoing
....@@ -1796,20 +2097,21 @@
17962097 *
17972098 * This helper works for IPv4 and IPv6, TCP and UDP sockets. The
17982099 * domain (*addr*\ **->sa_family**) must be **AF_INET** (or
1799
- * **AF_INET6**). Looking for a free port to bind to can be
1800
- * expensive, therefore binding to port is not permitted by the
1801
- * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
1802
- * must be set to zero.
2100
+ * **AF_INET6**). It's advised to pass zero port (**sin_port**
2101
+ * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
2102
+ * behavior and lets the kernel efficiently pick up an unused
2103
+ * port as long as 4-tuple is unique. Passing non-zero port might
2104
+ * lead to degraded performance.
18032105 * Return
18042106 * 0 on success, or a negative error in case of failure.
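+ *
+ *		A minimal sketch, assuming a **BPF_CGROUP_INET4_CONNECT**
+ *		program and the usual <linux/in.h> and bpf_endian.h
+ *		definitions (the address is an arbitrary example)::
+ *
+ *			SEC("cgroup/connect4")
+ *			int pin_src_addr(struct bpf_sock_addr *ctx)
+ *			{
+ *				struct sockaddr_in sa = {
+ *					.sin_family = AF_INET,
+ *					.sin_port = 0,	// let the kernel pick a port
+ *					.sin_addr.s_addr = bpf_htonl(0x0a000001), // 10.0.0.1
+ *				};
+ *
+ *				if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
+ *					return 0;	// reject on failure
+ *				return 1;
+ *			}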
18052107 *
1806
- * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
2108
+ * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
18072109 * Description
18082110 * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
1809
- * only possible to shrink the packet as of this writing,
1810
- * therefore *delta* must be a negative integer.
2111
+ * possible to both shrink and grow the packet tail.
2112
+ * Shrinking is done by passing a negative *delta*.
18112113 *
1812
- * A call to this helper is susceptible to change the underlaying
2114
+ * A call to this helper is susceptible to change the underlying
18132115 * packet buffer. Therefore, at load time, all checks on pointers
18142116 * previously done by the verifier are invalidated and must be
18152117 * performed again, if the helper is used in combination with
....@@ -1817,7 +2119,7 @@
18172119 * Return
18182120 * 0 on success, or a negative error in case of failure.
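+ *
+ *		As a hedged illustration (the SEC name follows the usual
+ *		libbpf convention), a sketch trimming four bytes off every
+ *		packet::
+ *
+ *			SEC("xdp")
+ *			int trim_tail(struct xdp_md *ctx)
+ *			{
+ *				// negative delta shrinks, positive delta grows
+ *				if (bpf_xdp_adjust_tail(ctx, -4) < 0)
+ *					return XDP_ABORTED;
+ *				return XDP_PASS;
+ *			}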
18192121 *
1820
- * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
2122
+ * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
18212123 * Description
18222124 * Retrieve the XFRM state (IP transform framework, see also
18232125 * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
....@@ -1833,7 +2135,7 @@
18332135 * Return
18342136 * 0 on success, or a negative error in case of failure.
18352137 *
1836
- * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
2138
+ * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags)
18372139 * Description
18382140 * Return a user or a kernel stack in bpf program provided buffer.
18392141 * To achieve this, the helper needs *ctx*, which is a pointer
....@@ -1863,10 +2165,10 @@
18632165 *
18642166 * # sysctl kernel.perf_event_max_stack=<new value>
18652167 * Return
1866
- * A non-negative value equal to or less than *size* on success,
1867
- * or a negative error in case of failure.
2168
+ * The non-negative copied *buf* length equal to or less than
2169
+ * *size* on success, or a negative error in case of failure.
18682170 *
1869
- * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
2171
+ * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
18702172 * Description
18712173 * This helper is similar to **bpf_skb_load_bytes**\ () in that
18722174 * it provides an easy way to load *len* bytes from *offset*
....@@ -1888,7 +2190,7 @@
18882190 * Return
18892191 * 0 on success, or a negative error in case of failure.
18902192 *
1891
- * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
2193
+ * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
18922194 * Description
18932195 * Do FIB lookup in kernel tables using parameters in *params*.
18942196 * If lookup is successful and result shows packet is to be
....@@ -1900,9 +2202,9 @@
19002202 * is set to metric from route (IPv4/IPv6 only), and ifindex
19012203 * is set to the device index of the nexthop from the FIB lookup.
19022204 *
1903
- * *plen* argument is the size of the passed in struct.
1904
- * *flags* argument can be a combination of one or more of the
1905
- * following values:
2205
+ * *plen* argument is the size of the passed in struct.
2206
+ * *flags* argument can be a combination of one or more of the
2207
+ * following values:
19062208 *
19072209 * **BPF_FIB_LOOKUP_DIRECT**
19082210 * Do a direct table lookup vs full lookup using FIB
....@@ -1911,15 +2213,15 @@
19112213 * Perform lookup from an egress perspective (default is
19122214 * ingress).
19132215 *
1914
- * *ctx* is either **struct xdp_md** for XDP programs or
1915
- * **struct sk_buff** tc cls_act programs.
1916
- * Return
2216
+ * *ctx* is either **struct xdp_md** for XDP programs or
2217
+ * **struct sk_buff** for tc cls_act programs.
2218
+ * Return
19172219 * * < 0 if any input argument is invalid
19182220 * * 0 on success (packet is forwarded, nexthop neighbor exists)
19192221 * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
19202222 * packet is not forwarded or needs assist from full stack
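+ *
+ *		A hedged sketch of the common XDP forwarding pattern (packet
+ *		parsing and MAC rewriting are elided; *ctx* is the
+ *		**struct xdp_md** context)::
+ *
+ *			struct bpf_fib_lookup params = {};
+ *			int rc;
+ *
+ *			params.family = AF_INET;
+ *			params.ifindex = ctx->ingress_ifindex;
+ *			// fill l4_protocol, ipv4_src/ipv4_dst, ... from the packet
+ *
+ *			rc = bpf_fib_lookup(ctx, &params, sizeof(params), 0);
+ *			if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
+ *				// copy params.smac/params.dmac into the Ethernet
+ *				// header, then bpf_redirect(params.ifindex, 0)
+ *			}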
19212223 *
1922
- * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
2224
+ * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
19232225 * Description
19242226 * Add an entry to, or update a sockhash *map* referencing sockets.
19252227 * The *skops* is used as a new value for the entry associated to
....@@ -1938,7 +2240,7 @@
19382240 * Return
19392241 * 0 on success, or a negative error in case of failure.
19402242 *
1941
- * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
2243
+ * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
19422244 * Description
19432245 * This helper is used in programs implementing policies at the
19442246 * socket level. If the message *msg* is allowed to pass (i.e. if
....@@ -1952,11 +2254,11 @@
19522254 * Return
19532255 * **SK_PASS** on success, or **SK_DROP** on error.
19542256 *
1955
- * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
2257
+ * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
19562258 * Description
19572259 * This helper is used in programs implementing policies at the
19582260 * skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
1959
- * if the verdeict eBPF program returns **SK_PASS**), redirect it
2261
+ * if the verdict eBPF program returns **SK_PASS**), redirect it
19602262 * to the socket referenced by *map* (of type
19612263 * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
19622264 * egress interfaces can be used for redirection. The
....@@ -1966,7 +2268,7 @@
19662268 * Return
19672269 * **SK_PASS** on success, or **SK_DROP** on error.
19682270 *
1969
- * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
2271
+ * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
19702272 * Description
19712273 * Encapsulate the packet associated to *skb* within a Layer 3
19722274 * protocol header. This header is provided in the buffer at
....@@ -1981,8 +2283,21 @@
19812283 * Only works if *skb* contains an IPv6 packet. Insert a
19822284 * Segment Routing Header (**struct ipv6_sr_hdr**) inside
19832285 * the IPv6 header.
2286
+ * **BPF_LWT_ENCAP_IP**
2287
+ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header
2288
+ * must be IPv4 or IPv6, followed by zero or more
2289
+ * additional headers, up to **LWT_BPF_MAX_HEADROOM**
2290
+ * total bytes in all prepended headers. Please note that
2291
+ * if **skb_is_gso**\ (*skb*) is true, no more than two
2292
+ * headers can be prepended, and the inner header, if
2293
+ * present, should be either GRE or UDP/GUE.
19842294 *
1985
- * A call to this helper is susceptible to change the underlaying
2295
+ * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs
2296
+ * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can
2297
+ * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and
2298
+ * **BPF_PROG_TYPE_LWT_XMIT**.
2299
+ *
2300
+ * A call to this helper is susceptible to change the underlying
19862301 * packet buffer. Therefore, at load time, all checks on pointers
19872302 * previously done by the verifier are invalidated and must be
19882303 * performed again, if the helper is used in combination with
....@@ -1990,14 +2305,14 @@
19902305 * Return
19912306 * 0 on success, or a negative error in case of failure.
19922307 *
1993
- * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
2308
+ * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
19942309 * Description
19952310 * Store *len* bytes from address *from* into the packet
19962311 * associated to *skb*, at *offset*. Only the flags, tag and TLVs
19972312 * inside the outermost IPv6 Segment Routing Header can be
19982313 * modified through this helper.
19992314 *
2000
- * A call to this helper is susceptible to change the underlaying
2315
+ * A call to this helper is susceptible to change the underlying
20012316 * packet buffer. Therefore, at load time, all checks on pointers
20022317 * previously done by the verifier are invalidated and must be
20032318 * performed again, if the helper is used in combination with
....@@ -2005,7 +2320,7 @@
20052320 * Return
20062321 * 0 on success, or a negative error in case of failure.
20072322 *
2008
- * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
2323
+ * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
20092324 * Description
20102325 * Adjust the size allocated to TLVs in the outermost IPv6
20112326 * Segment Routing Header contained in the packet associated to
....@@ -2013,7 +2328,7 @@
20132328 * after the segments are accepted. *delta* can be as well
20142329 * positive (growing) as negative (shrinking).
20152330 *
2016
- * A call to this helper is susceptible to change the underlaying
2331
+ * A call to this helper is susceptible to change the underlying
20172332 * packet buffer. Therefore, at load time, all checks on pointers
20182333 * previously done by the verifier are invalidated and must be
20192334 * performed again, if the helper is used in combination with
....@@ -2021,7 +2336,7 @@
20212336 * Return
20222337 * 0 on success, or a negative error in case of failure.
20232338 *
2024
- * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
2339
+ * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
20252340 * Description
20262341 * Apply an IPv6 Segment Routing action of type *action* to the
20272342 * packet associated to *skb*. Each action takes a parameter
....@@ -2036,13 +2351,13 @@
20362351 * Type of *param*: **int**.
20372352 * **SEG6_LOCAL_ACTION_END_B6**
20382353 * End.B6 action: Endpoint bound to an SRv6 policy.
2039
- * Type of param: **struct ipv6_sr_hdr**.
2354
+ * Type of *param*: **struct ipv6_sr_hdr**.
20402355 * **SEG6_LOCAL_ACTION_END_B6_ENCAP**
20412356 * End.B6.Encap action: Endpoint bound to an SRv6
20422357 * encapsulation policy.
2043
- * Type of param: **struct ipv6_sr_hdr**.
2358
+ * Type of *param*: **struct ipv6_sr_hdr**.
20442359 *
2045
- * A call to this helper is susceptible to change the underlaying
2360
+ * A call to this helper is susceptible to change the underlying
20462361 * packet buffer. Therefore, at load time, all checks on pointers
20472362 * previously done by the verifier are invalidated and must be
20482363 * performed again, if the helper is used in combination with
....@@ -2050,33 +2365,7 @@
20502365 * Return
20512366 * 0 on success, or a negative error in case of failure.
20522367 *
2053
- * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
2054
- * Description
2055
- * This helper is used in programs implementing IR decoding, to
2056
- * report a successfully decoded key press with *scancode*,
2057
- * *toggle* value in the given *protocol*. The scancode will be
2058
- * translated to a keycode using the rc keymap, and reported as
2059
- * an input key down event. After a period a key up event is
2060
- * generated. This period can be extended by calling either
2061
- * **bpf_rc_keydown** () again with the same values, or calling
2062
- * **bpf_rc_repeat** ().
2063
- *
2064
- * Some protocols include a toggle bit, in case the button was
2065
- * released and pressed again between consecutive scancodes.
2066
- *
2067
- * The *ctx* should point to the lirc sample as passed into
2068
- * the program.
2069
- *
2070
- * The *protocol* is the decoded protocol number (see
2071
- * **enum rc_proto** for some predefined values).
2072
- *
2073
- * This helper is only available is the kernel was compiled with
2074
- * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2075
- * "**y**".
2076
- * Return
2077
- * 0
2078
- *
2079
- * int bpf_rc_repeat(void *ctx)
2368
+ * long bpf_rc_repeat(void *ctx)
20802369 * Description
20812370 * This helper is used in programs implementing IR decoding, to
20822371 * report a successfully decoded repeat key message. This delays
....@@ -2095,7 +2384,33 @@
20952384 * Return
20962385 * 0
20972386 *
2098
- * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
2387
+ * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
2388
+ * Description
2389
+ * This helper is used in programs implementing IR decoding, to
2390
+ * report a successfully decoded key press with *scancode*,
2391
+ * *toggle* value in the given *protocol*. The scancode will be
2392
+ * translated to a keycode using the rc keymap, and reported as
2393
+ * an input key down event. After a period a key up event is
2394
+ * generated. This period can be extended by calling either
2395
+ * **bpf_rc_keydown**\ () again with the same values, or calling
2396
+ * **bpf_rc_repeat**\ ().
2397
+ *
2398
+ * Some protocols include a toggle bit, in case the button was
2399
+ * released and pressed again between consecutive scancodes.
2400
+ *
2401
+ * The *ctx* should point to the lirc sample as passed into
2402
+ * the program.
2403
+ *
2404
+ * The *protocol* is the decoded protocol number (see
2405
+ * **enum rc_proto** for some predefined values).
2406
+ *
2407
+ * This helper is only available if the kernel was compiled with
2408
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2409
+ * "**y**".
2410
+ * Return
2411
+ * 0
2412
+ *
2413
+ * u64 bpf_skb_cgroup_id(struct sk_buff *skb)
20992414 * Description
21002415 * Return the cgroup v2 id of the socket associated with the *skb*.
21012416 * This is roughly similar to the **bpf_get_cgroup_classid**\ ()
....@@ -2110,6 +2425,38 @@
21102425 * **CONFIG_SOCK_CGROUP_DATA** configuration option.
21112426 * Return
21122427 * The id is returned or 0 in case the id could not be retrieved.
2428
+ *
2429
+ * u64 bpf_get_current_cgroup_id(void)
2430
+ * Return
2431
+ * A 64-bit integer containing the current cgroup id based
2432
+ * on the cgroup within which the current task is running.
2433
+ *
2434
+ * void *bpf_get_local_storage(void *map, u64 flags)
2435
+ * Description
2436
+ * Get the pointer to the local storage area.
2437
+ * The type and the size of the local storage is defined
2438
+ * by the *map* argument.
2439
+ * The *flags* meaning is specific for each map type,
2440
+ * and has to be 0 for cgroup local storage.
2441
+ *
2442
+ * Depending on the BPF program type, a local storage area
2443
+ * can be shared between multiple instances of the BPF program,
2444
+ * running simultaneously.
2445
+ *
2446
+ * The user is responsible for synchronization, for example
2447
+ * by using the **BPF_STX_XADD** instruction to alter
2448
+ * the shared data.
2449
+ * Return
2450
+ * A pointer to the local storage area.
2451
+ *
2452
+ * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
2453
+ * Description
2454
+ * Select a **SO_REUSEPORT** socket from a
2455
+ * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
2456
+ * It checks that the selected socket matches the incoming
2457
+ * request in the socket buffer.
2458
+ * Return
2459
+ * 0 on success, or a negative error in case of failure.
21132460 *
21142461 * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
21152462 * Description
....@@ -2129,44 +2476,1274 @@
21292476 * Return
21302477 * The id is returned or 0 in case the id could not be retrieved.
21312478 *
2132
- * u64 bpf_get_current_cgroup_id(void)
2133
- * Return
2134
- * A 64-bit integer containing the current cgroup id based
2135
- * on the cgroup within which the current task is running.
2136
- *
2137
- * void* get_local_storage(void *map, u64 flags)
2479
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
21382480 * Description
2139
- * Get the pointer to the local storage area.
2140
- * The type and the size of the local storage is defined
2141
- * by the *map* argument.
2142
- * The *flags* meaning is specific for each map type,
2143
- * and has to be 0 for cgroup local storage.
2481
+ * Look for TCP socket matching *tuple*, optionally in a child
2482
+ * network namespace *netns*. The return value must be checked,
2483
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
21442484 *
2145
- * Depending on the bpf program type, a local storage area
2146
- * can be shared between multiple instances of the bpf program,
2147
- * running simultaneously.
2485
+ * The *ctx* should point to the context of the program, such as
2486
+ * the skb or socket (depending on the hook in use). This is used
2487
+ * to determine the base network namespace for the lookup.
21482488 *
2149
- * A user should care about the synchronization by himself.
2150
- * For example, by using the BPF_STX_XADD instruction to alter
2151
- * the shared data.
2489
+ * *tuple_size* must be one of:
2490
+ *
2491
+ * **sizeof**\ (*tuple*\ **->ipv4**)
2492
+ * Look for an IPv4 socket.
2493
+ * **sizeof**\ (*tuple*\ **->ipv6**)
2494
+ * Look for an IPv6 socket.
2495
+ *
2496
+ * If the *netns* is a negative signed 32-bit integer, then the
2497
+ * socket lookup table in the netns associated with the *ctx*
2498
+ * will be used. For the TC hooks, this is the netns of the device
2499
+ * in the skb. For socket hooks, this is the netns of the socket.
2500
+ * If *netns* is any other signed 32-bit value greater than or
2501
+ * equal to zero then it specifies the ID of the netns relative to
2502
+ * the netns associated with the *ctx*. *netns* values beyond the
2503
+ * range of 32-bit integers are reserved for future use.
2504
+ *
2505
+ * All values for *flags* are reserved for future usage, and must
2506
+ * be left at zero.
2507
+ *
2508
+ * This helper is available only if the kernel was compiled with
2509
+ * **CONFIG_NET** configuration option.
21522510 * Return
2153
- * Pointer to the local storage area.
2511
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2512
+ * For sockets with reuseport option, the **struct bpf_sock**
2513
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2514
+ * tuple.
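+ *
+ *		A sketch of the mandatory acquire/release pattern (tuple
+ *		filling elided; **BPF_F_CURRENT_NETNS** selects the netns of
+ *		the *ctx*, here assumed to be an skb)::
+ *
+ *			struct bpf_sock_tuple tuple = {};
+ *			struct bpf_sock *sk;
+ *
+ *			// fill tuple.ipv4.{saddr,daddr,sport,dport} from the packet
+ *			sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
+ *					       BPF_F_CURRENT_NETNS, 0);
+ *			if (sk) {
+ *				// ... inspect the socket ...
+ *				bpf_sk_release(sk);
+ *			}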
21542515 *
2155
- * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
2516
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
21562517 * Description
2157
- * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
2158
- * It checks the selected sk is matching the incoming
2159
- * request in the skb.
2518
+ * Look for UDP socket matching *tuple*, optionally in a child
2519
+ * network namespace *netns*. The return value must be checked,
2520
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
2521
+ *
2522
+ * The *ctx* should point to the context of the program, such as
2523
+ * the skb or socket (depending on the hook in use). This is used
2524
+ * to determine the base network namespace for the lookup.
2525
+ *
2526
+ * *tuple_size* must be one of:
2527
+ *
2528
+ * **sizeof**\ (*tuple*\ **->ipv4**)
2529
+ * Look for an IPv4 socket.
2530
+ * **sizeof**\ (*tuple*\ **->ipv6**)
2531
+ * Look for an IPv6 socket.
2532
+ *
2533
+ * If the *netns* is a negative signed 32-bit integer, then the
2534
+ * socket lookup table in the netns associated with the *ctx*
2535
+ * will be used. For the TC hooks, this is the netns of the device
2536
+ * in the skb. For socket hooks, this is the netns of the socket.
2537
+ * If *netns* is any other signed 32-bit value greater than or
2538
+ * equal to zero then it specifies the ID of the netns relative to
2539
+ * the netns associated with the *ctx*. *netns* values beyond the
2540
+ * range of 32-bit integers are reserved for future use.
2541
+ *
2542
+ * All values for *flags* are reserved for future usage, and must
2543
+ * be left at zero.
2544
+ *
2545
+ * This helper is available only if the kernel was compiled with
2546
+ * **CONFIG_NET** configuration option.
2547
+ * Return
2548
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2549
+ * For sockets with reuseport option, the **struct bpf_sock**
2550
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2551
+ * tuple.
2552
+ *
2553
+ * long bpf_sk_release(void *sock)
2554
+ * Description
2555
+ * Release the reference held by *sock*. *sock* must be a
2556
+ * non-**NULL** pointer that was returned from
2557
+ * **bpf_sk_lookup_xxx**\ ().
21602558 * Return
21612559 * 0 on success, or a negative error in case of failure.
2560
+ *
2561
+ * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
2562
+ * Description
2563
+ * Push an element *value* in *map*. *flags* is one of:
2564
+ *
2565
+ * **BPF_EXIST**
2566
+ * If the queue/stack is full, the oldest element is
2567
+ * removed to make room for this.
2568
+ * Return
2569
+ * 0 on success, or a negative error in case of failure.
2570
+ *
2571
+ * long bpf_map_pop_elem(struct bpf_map *map, void *value)
2572
+ * Description
2573
+ * Pop an element from *map*.
2574
+ * Return
2575
+ * 0 on success, or a negative error in case of failure.
2576
+ *
2577
+ * long bpf_map_peek_elem(struct bpf_map *map, void *value)
2578
+ * Description
2579
+ * Get an element from *map* without removing it.
2580
+ * Return
2581
+ * 0 on success, or a negative error in case of failure.
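+ *
+ *		An illustrative pairing of the three helpers above with a
+ *		**BPF_MAP_TYPE_QUEUE** (the map definition follows the common
+ *		libbpf convention and is an assumption of this sketch)::
+ *
+ *			struct {
+ *				__uint(type, BPF_MAP_TYPE_QUEUE);
+ *				__uint(max_entries, 64);
+ *				__type(value, __u32);
+ *			} events SEC(".maps");
+ *
+ *			__u32 v = 42;
+ *
+ *			// evict the oldest entry if the queue is full
+ *			bpf_map_push_elem(&events, &v, BPF_EXIST);
+ *			if (bpf_map_peek_elem(&events, &v) == 0) {
+ *				// v holds the head element, still queued
+ *			}
+ *			bpf_map_pop_elem(&events, &v);	// now it is removed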
2582
+ *
2583
+ * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
2584
+ * Description
2585
+ * For socket policies, insert *len* bytes into *msg* at offset
2586
+ * *start*.
2587
+ *
2588
+ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
2589
+ * *msg* it may want to insert metadata or options into the *msg*.
2590
+ * This can later be read and used by any of the lower layer BPF
2591
+ * hooks.
2592
+ *
2593
+ * This helper may fail under memory pressure (if a malloc
2594
+ * fails); in these cases the BPF program will get an appropriate
2595
+ * error and will need to handle it.
2596
+ * Return
2597
+ * 0 on success, or a negative error in case of failure.
2598
+ *
2599
+ * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
2600
+ * Description
2601
+ * Will remove *len* bytes from a *msg* starting at byte *start*.
2602
+ * This may result in **ENOMEM** errors under certain situations if
2603
+ * an allocation and copy are required due to a full ring buffer.
2604
+ * However, the helper will try to avoid doing the allocation
2605
+ * if possible. Other errors can occur if input parameters are
2606
+ * invalid, either due to the *start* byte not being a valid part
2607
+ * of the *msg* payload and/or the *len* value being too large.
2608
+ * Return
2609
+ * 0 on success, or a negative error in case of failure.
2610
+ *
2611
+ * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
2612
+ * Description
2613
+ * This helper is used in programs implementing IR decoding, to
2614
+ * report a successfully decoded pointer movement.
2615
+ *
2616
+ * The *ctx* should point to the lirc sample as passed into
2617
+ * the program.
2618
+ *
2619
+ * This helper is only available is the kernel was compiled with
2620
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2621
+ * "**y**".
2622
+ * Return
2623
+ * 0
2624
+ *
2625
+ * long bpf_spin_lock(struct bpf_spin_lock *lock)
2626
+ * Description
2627
+ * Acquire a spinlock represented by the pointer *lock*, which is
2628
+ * stored as part of a map value. Taking the lock allows one to
2629
+ * safely update the rest of the fields in that value. The
2630
+ * spinlock can (and must) later be released with a call to
2631
+ * **bpf_spin_unlock**\ (\ *lock*\ ).
2632
+ *
2633
+ * Spinlocks in BPF programs come with a number of restrictions
2634
+ * and constraints:
2635
+ *
2636
+ * * **bpf_spin_lock** objects are only allowed inside maps of
2637
+ * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
2638
+ * list could be extended in the future).
2639
+ * * BTF description of the map is mandatory.
2640
+ * * The BPF program can take ONE lock at a time, since taking two
2641
+ * or more could cause deadlocks.
2642
+ * * Only one **struct bpf_spin_lock** is allowed per map element.
2643
+ * * When the lock is taken, calls (either BPF to BPF or helpers)
2644
+ * are not allowed.
2645
+ * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
2646
+ * allowed inside a spinlock-ed region.
2647
+ * * The BPF program MUST call **bpf_spin_unlock**\ () to release
2648
+ * the lock, on all execution paths, before it returns.
2649
+ * * The BPF program can access **struct bpf_spin_lock** only via
2650
+ * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
2651
+ * helpers. Loading or storing data into the **struct
2652
+ * bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
2653
+ * * To use the **bpf_spin_lock**\ () helper, the BTF description
2654
+ * of the map value must be a struct and have **struct
2655
+ * bpf_spin_lock** *anyname*\ **;** field at the top level.
2656
+ * Nested lock inside another struct is not allowed.
2657
+ * * The **struct bpf_spin_lock** *lock* field in a map value must
2658
+ * be aligned on a multiple of 4 bytes in that value.
2659
+ * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
2660
+ * the **bpf_spin_lock** field to user space.
2661
+ * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
2662
+ * a BPF program, do not update the **bpf_spin_lock** field.
2663
+ * * **bpf_spin_lock** cannot be on the stack or inside a
2664
+ * networking packet (it can only be inside of a map value).
2665
+ * * **bpf_spin_lock** is available to root only.
2666
+ * * Tracing programs and socket filter programs cannot use
2667
+ * **bpf_spin_lock**\ () due to insufficient preemption checks
2668
+ * (but this may change in the future).
2669
+ * * **bpf_spin_lock** is not allowed in inner maps of map-in-map.
2670
+ * Return
2671
+ * 0
2672
+ *
2673
+ * long bpf_spin_unlock(struct bpf_spin_lock *lock)
2674
+ * Description
2675
+ * Release the *lock* previously locked by a call to
2676
+ * **bpf_spin_lock**\ (\ *lock*\ ).
2677
+ * Return
2678
+ * 0
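+ *
+ *		A minimal sketch of the intended locking pattern (*my_map* and
+ *		*key* are hypothetical; the map value's BTF must expose the
+ *		lock field as described above)::
+ *
+ *			struct val {
+ *				struct bpf_spin_lock lock;
+ *				__u64 counter;
+ *			};
+ *
+ *			struct val *v = bpf_map_lookup_elem(&my_map, &key);
+ *			if (v) {
+ *				bpf_spin_lock(&v->lock);
+ *				v->counter++;	// safely update protected fields
+ *				bpf_spin_unlock(&v->lock);
+ *			}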
2679
+ *
2680
+ * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
2681
+ * Description
2682
+ * This helper gets a **struct bpf_sock** pointer such
2683
+ * that all the fields in this **bpf_sock** can be accessed.
2684
+ * Return
2685
+ * A **struct bpf_sock** pointer on success, or **NULL** in
2686
+ * case of failure.
2687
+ *
2688
+ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
2689
+ * Description
2690
+ * This helper gets a **struct bpf_tcp_sock** pointer from a
2691
+ * **struct bpf_sock** pointer.
2692
+ * Return
2693
+ * A **struct bpf_tcp_sock** pointer on success, or **NULL** in
2694
+ * case of failure.
2695
+ *
2696
+ * long bpf_skb_ecn_set_ce(struct sk_buff *skb)
2697
+ * Description
2698
+ * Set ECN (Explicit Congestion Notification) field of IP header
2699
+ * to **CE** (Congestion Encountered) if current value is **ECT**
2700
+ * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6
2701
+ * and IPv4.
2702
+ * Return
2703
+ * 1 if the **CE** flag is set (either by the current helper call
2704
+ * or because it was already present), 0 if it is not set.
2705
+ *
2706
+ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
2707
+ * Description
2708
+ * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
2709
+ * **bpf_sk_release**\ () is unnecessary and not allowed.
2710
+ * Return
2711
+ * A **struct bpf_sock** pointer on success, or **NULL** in
2712
+ * case of failure.
2713
+ *
2714
+ * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
2715
+ * Description
2716
+ * Look for TCP socket matching *tuple*, optionally in a child
2717
+ * network namespace *netns*. The return value must be checked,
2718
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
2719
+ *
2720
+ * This function is identical to **bpf_sk_lookup_tcp**\ (), except
2721
+ * that it also returns timewait or request sockets. Use
2722
+ * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
2723
+ * full structure.
2724
+ *
2725
+ * This helper is available only if the kernel was compiled with
2726
+ * **CONFIG_NET** configuration option.
2727
+ * Return
2728
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2729
+ * For sockets with reuseport option, the **struct bpf_sock**
2730
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2731
+ * tuple.
2732
+ *
2733
+ * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
2734
+ * Description
2735
+ * Check whether *iph* and *th* contain a valid SYN cookie ACK for
2736
+ * the listening socket in *sk*.
2737
+ *
2738
+ * *iph* points to the start of the IPv4 or IPv6 header, while
2739
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
2740
+ * **sizeof**\ (**struct ip6hdr**).
2741
+ *
2742
+ * *th* points to the start of the TCP header, while *th_len*
2743
+ * contains **sizeof**\ (**struct tcphdr**).
2744
+ * Return
2745
+ * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
2746
+ * error otherwise.
2747
+ *
2748
+ * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags)
2749
+ * Description
2750
+ * Get name of sysctl in /proc/sys/ and copy it into provided by
2751
+ * program buffer *buf* of size *buf_len*.
2752
+ *
2753
+ * The buffer is always NUL terminated, unless it's zero-sized.
2754
+ *
2755
+ * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
2756
+ * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
2757
+ * only (e.g. "tcp_mem").
2758
+ * Return
2759
+ * Number of characters copied (not including the trailing NUL).
2760
+ *
2761
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2762
+ * truncated name in this case).
2763
+ *
2764
+ * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
2765
+ * Description
2766
+ * Get current value of sysctl as it is presented in /proc/sys
2767
+ * (incl. newline, etc), and copy it as a string into the
2768
+ * buffer *buf* of size *buf_len* provided by the program.
2769
+ *
2770
+ * The whole value is copied, no matter at what file position
2771
+ * user space issued e.g. **sys_read**.
2772
+ *
2773
+ * The buffer is always NUL terminated, unless it's zero-sized.
2774
+ * Return
2775
+ * Number of characters copied (not including the trailing NUL).
2776
+ *
2777
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2778
+ * truncated name in this case).
2779
+ *
2780
+ * **-EINVAL** if current value was unavailable, e.g. because
2781
+ * sysctl is uninitialized and read returns -EIO for it.
2782
+ *
2783
+ * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
2784
+ * Description
2785
+ * Get new value being written by user space to sysctl (before
2786
+ * the actual write happens) and copy it as a string into
2787
+ * provided by program buffer *buf* of size *buf_len*.
2788
+ *
2789
+ * User space may write new value at file position > 0.
2790
+ *
2791
+ * The buffer is always NUL terminated, unless it's zero-sized.
2792
+ * Return
2793
+ * Number of characters copied (not including the trailing NUL).
2794
+ *
2795
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2796
+ * truncated name in this case).
2797
+ *
2798
+ * **-EINVAL** if sysctl is being read.
2799
+ *
2800
+ * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len)
2801
+ * Description
2802
+ * Override new value being written by user space to sysctl with
2803
+ * value provided by program in buffer *buf* of size *buf_len*.
2804
+ *
2805
+ * *buf* should contain a string in same form as provided by user
2806
+ * space on sysctl write.
2807
+ *
2808
+ * User space may write new value at file position > 0. To override
2809
+ * the whole sysctl value, the file position should be set to zero.
2810
+ * Return
2811
+ * 0 on success.
2812
+ *
2813
+ * **-E2BIG** if the *buf_len* is too big.
2814
+ *
2815
+ * **-EINVAL** if sysctl is being read.
2816
+ *
2817
+ * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res)
2818
+ * Description
2819
+ * Convert the initial part of the string from buffer *buf* of
2820
+ * size *buf_len* to a long integer according to the given base
2821
+ * and save the result in *res*.
2822
+ *
2823
+ * The string may begin with an arbitrary amount of white space
2824
+ * (as determined by **isspace**\ (3)) followed by a single
2825
+ * optional '**-**' sign.
2826
+ *
2827
+ * Five least significant bits of *flags* encode base, other bits
2828
+ * are currently unused.
2829
+ *
2830
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
2831
+ * similar to user space **strtol**\ (3).
2832
+ * Return
2833
+ * Number of characters consumed on success. Must be positive but
2834
+ * no more than *buf_len*.
2835
+ *
2836
+ * **-EINVAL** if no valid digits were found or unsupported base
2837
+ * was provided.
2838
+ *
2839
+ * **-ERANGE** if resulting value was out of range.
2840
+ *
2841
+ * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res)
2842
+ * Description
2843
+ * Convert the initial part of the string from buffer *buf* of
2844
+ * size *buf_len* to an unsigned long integer according to the
2845
+ * given base and save the result in *res*.
2846
+ *
2847
+ * The string may begin with an arbitrary amount of white space
2848
+ * (as determined by **isspace**\ (3)).
2849
+ *
2850
+ * Five least significant bits of *flags* encode base, other bits
2851
+ * are currently unused.
2852
+ *
2853
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
2854
+ * similar to user space **strtoul**\ (3).
2855
+ * Return
2856
+ * Number of characters consumed on success. Must be positive but
2857
+ * no more than *buf_len*.
2858
+ *
2859
+ * **-EINVAL** if no valid digits were found or unsupported base
2860
+ * was provided.
2861
+ *
2862
+ * **-ERANGE** if resulting value was out of range.
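+ *
+ *		For instance, parsing a number previously fetched with
+ *		**bpf_sysctl_get_new_value**\ () might be sketched as::
+ *
+ *			char buf[16] = {};
+ *			long val;
+ *
+ *			// base 0 auto-detects "0x"/"0" prefixes, like strtol(3)
+ *			if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
+ *				return 0;	// no valid number found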
2863
+ *
2864
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
2865
+ * Description
2866
+ * Get a bpf-local-storage from a *sk*.
2867
+ *
2868
+ * Logically, it could be thought of as getting the value from
2869
+ * a *map* with *sk* as the **key**. From this
2870
+ * perspective, the usage is not much different from
2871
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this
2872
+ * helper enforces the key must be a full socket and the map must
2873
+ * be a **BPF_MAP_TYPE_SK_STORAGE** also.
2874
+ *
2875
+ * Underneath, the value is stored locally at *sk* instead of
2876
+ * the *map*. The *map* is used as the bpf-local-storage
2877
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
2878
+ * searched against all bpf-local-storages residing at *sk*.
2879
+ *
2880
+ * *sk* is a kernel **struct sock** pointer for LSM program.
2881
+ * *sk* is a **struct bpf_sock** pointer for other program types.
2882
+ *
2883
+ * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
2884
+ * used such that a new bpf-local-storage will be
2885
+ * created if one does not exist. *value* can be used
2886
+ * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify
2887
+ * the initial value of a bpf-local-storage. If *value* is
2888
+ * **NULL**, the new bpf-local-storage will be zero initialized.
2889
+ * Return
2890
+ * A bpf-local-storage pointer is returned on success.
2891
+ *
2892
+ * **NULL** if not found or there was an error in adding
2893
+ * a new bpf-local-storage.
2894
+ *
2895
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
2896
+ * Description
2897
+ * Delete a bpf-local-storage from a *sk*.
2898
+ * Return
2899
+ * 0 on success.
2900
+ *
2901
+ * **-ENOENT** if the bpf-local-storage cannot be found.
2902
+ * **-EINVAL** if sk is not a fullsock (e.g. a request_sock).
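+ *
+ *		A hedged sketch of per-socket accounting with
+ *		**bpf_sk_storage_get**\ () (the map definition follows the
+ *		common libbpf convention and is an assumption here)::
+ *
+ *			struct {
+ *				__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ *				__uint(map_flags, BPF_F_NO_PREALLOC);
+ *				__type(key, int);
+ *				__type(value, __u64);
+ *			} pkt_cnt SEC(".maps");
+ *
+ *			__u64 *cnt = bpf_sk_storage_get(&pkt_cnt, sk, NULL,
+ *						BPF_SK_STORAGE_GET_F_CREATE);
+ *			if (cnt)
+ *				(*cnt)++;	// zero-initialized when created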
2903
+ *
2904
+ * long bpf_send_signal(u32 sig)
2905
+ * Description
2906
+ * Send signal *sig* to the process of the current task.
2907
+ * The signal may be delivered to any of this process's threads.
2908
+ * Return
2909
+ * 0 on success or successfully queued.
2910
+ *
2911
+ * **-EBUSY** if work queue under nmi is full.
2912
+ *
2913
+ * **-EINVAL** if *sig* is invalid.
2914
+ *
2915
+ * **-EPERM** if no permission to send the *sig*.
2916
+ *
2917
+ * **-EAGAIN** if bpf program can try again.
2918
+ *
2919
+ * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
2920
+ * Description
2921
+ * Try to issue a SYN cookie for the packet with corresponding
2922
+ * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
2923
+ *
2924
+ * *iph* points to the start of the IPv4 or IPv6 header, while
2925
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
2926
+ * **sizeof**\ (**struct ip6hdr**).
2927
+ *
2928
+ * *th* points to the start of the TCP header, while *th_len*
2929
+ * contains the length of the TCP header.
2930
+ * Return
2931
+ * On success, lower 32 bits hold the generated SYN cookie,
2932
+ * followed by 16 bits which hold the MSS value for that cookie,
2933
+ * and the top 16 bits are unused.
2934
+ *
2935
+ * On failure, the returned value is one of the following:
2936
+ *
2937
+ * **-EINVAL** SYN cookie cannot be issued due to error
2938
+ *
2939
+ * **-ENOENT** SYN cookie should not be issued (no SYN flood)
2940
+ *
2941
+ * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
2942
+ *
2943
+ * **-EPROTONOSUPPORT** IP packet version is not 4 or 6
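+ *
+ *		Since a successful result packs two values, a short sketch of
+ *		unpacking it (*sk*, *iph* and *th* come from the surrounding
+ *		program)::
+ *
+ *			s64 ret = bpf_tcp_gen_syncookie(sk, iph, iph_len, th, th_len);
+ *			if (ret >= 0) {
+ *				__u32 cookie = (__u32)ret;	// lower 32 bits
+ *				__u16 mss = ret >> 32;		// next 16 bits
+ *				// use cookie/mss to build the SYN-ACK
+ *			}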
2944
+ *
2945
+ * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
2946
+ * Description
2947
+ * Write raw *data* blob into a special BPF perf event held by
2948
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
2949
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
2950
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
2951
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
2952
+ *
2953
+ * The *flags* are used to indicate the index in *map* for which
2954
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
2955
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
2956
+ * to indicate that the index of the current CPU core should be
2957
+ * used.
2958
+ *
2959
+ * The value to write, of *size*, is passed through eBPF stack and
2960
+ * pointed by *data*.
2961
+ *
2962
+ * *ctx* is a pointer to in-kernel struct sk_buff.
2963
+ *
2964
+ * This helper is similar to **bpf_perf_event_output**\ () but
2965
+ * restricted to raw_tracepoint bpf programs.
2966
+ * Return
2967
+ * 0 on success, or a negative error in case of failure.
2968
+ *
2969
+ * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
2970
+ * Description
2971
+ * Safely attempt to read *size* bytes from user space address
2972
+ * *unsafe_ptr* and store the data in *dst*.
2973
+ * Return
2974
+ * 0 on success, or a negative error in case of failure.
2975
+ *
2976
+ * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
2977
+ * Description
2978
+ * Safely attempt to read *size* bytes from kernel space address
2979
+ * *unsafe_ptr* and store the data in *dst*.
2980
+ * Return
2981
+ * 0 on success, or a negative error in case of failure.
2982
+ *
2983
+ * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
2984
+ * Description
2985
+ * Copy a NUL terminated string from an unsafe user address
2986
+ * *unsafe_ptr* to *dst*. The *size* should include the
2987
+ * terminating NUL byte. In case the string length is smaller than
2988
+ * *size*, the target is not padded with further NUL bytes. If the
2989
+ * string length is larger than *size*, just *size*-1 bytes are
2990
+ * copied and the last byte is set to NUL.
2991
+ *
2992
+ * On success, the length of the copied string is returned. This
2993
+ * makes this helper useful in tracing programs for reading
2994
+ * strings, and more importantly to get its length at runtime. See
2995
+ * the following snippet:
2996
+ *
2997
+ * ::
2998
+ *
2999
+ * SEC("kprobe/sys_open")
3000
+ * void bpf_sys_open(struct pt_regs *ctx)
3001
+ * {
3002
+ * char buf[PATHLEN]; // PATHLEN is defined to 256
3003
+ * int res = bpf_probe_read_user_str(buf, sizeof(buf),
3004
+ * ctx->di);
3005
+ *
3006
+ * // Consume buf, for example push it to
3007
+ * // userspace via bpf_perf_event_output(); we
3008
+ * // can use res (the string length) as event
3009
+ * // size, after checking its boundaries.
3010
+ * }
3011
+ *
3012
+ * In comparison, using **bpf_probe_read_user**\ () helper here
3013
+ * instead to read the string would require to estimate the length
3014
+ * at compile time, and would often result in copying more memory
3015
+ * than necessary.
3016
+ *
3017
+ * Another useful use case is when parsing individual process
3018
+ * arguments or individual environment variables navigating
3019
+ * *current*\ **->mm->arg_start** and *current*\
3020
+ * **->mm->env_start**: using this helper and the return value,
3021
+ * one can quickly iterate at the right offset of the memory area.
3022
+ * Return
3023
+ * On success, the strictly positive length of the string,
3024
+ * including the trailing NUL character. On error, a negative
3025
+ * value.
3026
+ *
3027
+ * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
3028
+ * Description
3029
+ * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
3030
+ * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
3031
+ * Return
3032
+ * On success, the strictly positive length of the string, including
3033
+ * the trailing NUL character. On error, a negative value.
3034
+ *
3035
+ * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
3036
+ * Description
3037
+ * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
3038
+ * *rcv_nxt* is the ack_seq to be sent out.
3039
+ * Return
3040
+ * 0 on success, or a negative error in case of failure.
3041
+ *
3042
+ * long bpf_send_signal_thread(u32 sig)
3043
+ * Description
3044
+ * Send signal *sig* to the thread corresponding to the current task.
3045
+ * Return
3046
+ * 0 on success or successfully queued.
3047
+ *
3048
+ * **-EBUSY** if work queue under nmi is full.
3049
+ *
3050
+ * **-EINVAL** if *sig* is invalid.
3051
+ *
3052
+ * **-EPERM** if no permission to send the *sig*.
3053
+ *
3054
+ * **-EAGAIN** if bpf program can try again.
3055
+ *
3056
+ * u64 bpf_jiffies64(void)
3057
+ * Description
3058
+ * Obtain the 64-bit jiffies.
3058
+ * Return
3059
+ * The 64-bit jiffies.
3061
+ *
3062
+ * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
3063
+ * Description
3064
+ * For an eBPF program attached to a perf event, retrieve the
3065
+ * branch records (**struct perf_branch_entry**) associated to *ctx*
3066
+ * and store it in the buffer pointed by *buf* up to size
3067
+ * *size* bytes.
3068
+ * Return
3069
+ * On success, number of bytes written to *buf*. On error, a
3070
+ * negative value.
3071
+ *
3072
+ * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
3073
+ * instead return the number of bytes required to store all the
3074
+ * branch entries. If this flag is set, *buf* may be NULL.
3075
+ *
3076
+ * **-EINVAL** if arguments invalid or **size** not a multiple
3077
+ * of **sizeof**\ (**struct perf_branch_entry**\ ).
3078
+ *
3079
+ * **-ENOENT** if architecture does not support branch records.
3080
+ *
3081
+ * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size)
3082
+ * Description
3083
+ * Returns 0 on success; the values for *pid* and *tgid* as seen from the
3084
+ * current *namespace* will be returned in *nsdata*.
3085
+ * Return
3086
+ * 0 on success, or one of the following in case of failure:
3087
+ *
3088
+ * **-EINVAL** if dev and inum supplied don't match dev_t and inode number
3089
+ * with nsfs of current task, or if dev conversion to dev_t lost high bits.
3090
+ *
3091
+ * **-ENOENT** if pidns does not exist for the current task.
3092
+ *
3093
+ * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
3094
+ * Description
3095
+ * Write raw *data* blob into a special BPF perf event held by
3096
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
3097
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
3098
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
3099
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
3100
+ *
3101
+ * The *flags* are used to indicate the index in *map* for which
3102
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
3103
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
3104
+ * to indicate that the index of the current CPU core should be
3105
+ * used.
3106
+ *
3107
+ * The value to write, of *size*, is passed through eBPF stack and
3108
+ * pointed by *data*.
3109
+ *
3110
+ * *ctx* is a pointer to in-kernel struct xdp_buff.
3111
+ *
3112
+ * This helper is similar to **bpf_perf_event_output**\ () but
3113
+ * restricted to raw_tracepoint bpf programs.
3114
+ * Return
3115
+ * 0 on success, or a negative error in case of failure.
3116
+ *
3117
+ * u64 bpf_get_netns_cookie(void *ctx)
3118
+ * Description
3119
+ * Retrieve the cookie (generated by the kernel) of the network
3120
+ * namespace the input *ctx* is associated with. The network
3121
+ * namespace cookie remains stable for its lifetime and provides
3122
+ * a global identifier that can be assumed unique. If *ctx* is
3123
+ * NULL, then the helper returns the cookie for the initial
3124
+ * network namespace. The cookie itself is very similar to that
3125
+ * of **bpf_get_socket_cookie**\ () helper, but for network
3126
+ * namespaces instead of sockets.
3127
+ * Return
3128
+ * An 8-byte long opaque number.
3129
+ *
3130
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
3131
+ * Description
3132
+ * Return id of cgroup v2 that is ancestor of the cgroup associated
3133
+ * with the current task at the *ancestor_level*. The root cgroup
3134
+ * is at *ancestor_level* zero and each step down the hierarchy
3135
+ * increments the level. If *ancestor_level* == level of cgroup
3136
+ * associated with the current task, then return value will be the
3137
+ * same as that of **bpf_get_current_cgroup_id**\ ().
3138
+ *
3139
+ * The helper is useful to implement policies based on cgroups
3140
+ * that are upper in hierarchy than immediate cgroup associated
3141
+ * with the current task.
3142
+ *
3143
+ * The format of returned id and helper limitations are same as in
3144
+ * **bpf_get_current_cgroup_id**\ ().
3145
+ * Return
3146
+ * The id is returned or 0 in case the id could not be retrieved.
3147
+ *
3148
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
3149
+ * Description
3150
+ * Helper is overloaded depending on BPF program type. This
3151
+ * description applies to **BPF_PROG_TYPE_SCHED_CLS** and
3152
+ * **BPF_PROG_TYPE_SCHED_ACT** programs.
3153
+ *
3154
+ * Assign the *sk* to the *skb*. When combined with appropriate
3155
+ * routing configuration to receive the packet towards the socket,
3156
+ * this will cause *skb* to be delivered to the specified socket.
3157
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
3158
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
3159
+ * interfere with successful delivery to the socket.
3160
+ *
3161
+ * This operation is only valid from TC ingress path.
3162
+ *
3163
+ * The *flags* argument must be zero.
3164
+ * Return
3165
+ * 0 on success, or a negative error in case of failure:
3166
+ *
3167
+ * **-EINVAL** if specified *flags* are not supported.
3168
+ *
3169
+ * **-ENOENT** if the socket is unavailable for assignment.
3170
+ *
3171
+ * **-ENETUNREACH** if the socket is unreachable (wrong netns).
3172
+ *
3173
+ * **-EOPNOTSUPP** if the operation is not supported, for example
3174
+ * a call from outside of TC ingress.
3175
+ *
3176
+ * **-ESOCKTNOSUPPORT** if the socket type is not supported
3177
+ * (reuseport).
3178
+ *
3179
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
3180
+ * Description
3181
+ * Helper is overloaded depending on BPF program type. This
3182
+ * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
3183
+ *
3184
+ * Select the *sk* as a result of a socket lookup.
3185
+ *
3186
+ * For the operation to succeed, the passed socket must be compatible
3187
+ * with the packet description provided by the *ctx* object.
3188
+ *
3189
+ * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
3190
+ * be an exact match. While IP family (**AF_INET** or
3191
+ * **AF_INET6**) must be compatible, that is IPv6 sockets
3192
+ * that are not v6-only can be selected for IPv4 packets.
3193
+ *
3194
+ * Only TCP listeners and UDP unconnected sockets can be
3195
+ * selected. *sk* can also be NULL to reset any previous
3196
+ * selection.
3197
+ *
3198
+ * *flags* argument can be a combination of the following values:
3199
+ *
3200
+ * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
3201
+ * socket selection, potentially done by a BPF program
3202
+ * that ran before us.
3203
+ *
3204
+ * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
3205
+ * load-balancing within reuseport group for the socket
3206
+ * being selected.
3207
+ *
3208
+ * On success *ctx->sk* will point to the selected socket.
3209
+ *
3210
+ * Return
3211
+ * 0 on success, or a negative errno in case of failure.
3212
+ *
3213
+ * * **-EAFNOSUPPORT** if socket family (*sk->family*) is
3214
+ * not compatible with packet family (*ctx->family*).
3215
+ *
3216
+ * * **-EEXIST** if socket has been already selected,
3217
+ * potentially by another program, and
3218
+ * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
3219
+ *
3220
+ * * **-EINVAL** if unsupported flags were specified.
3221
+ *
3222
+ * * **-EPROTOTYPE** if socket L4 protocol
3223
+ * (*sk->protocol*) doesn't match packet protocol
3224
+ * (*ctx->protocol*).
3225
+ *
3226
+ * * **-ESOCKTNOSUPPORT** if socket is not in allowed
3227
+ * state (TCP listening or UDP unconnected).
21623228 *
21633229 * u64 bpf_ktime_get_boot_ns(void)
21643230 * Description
21653231 * Return the time elapsed since system boot, in nanoseconds.
21663232 * Does include the time the system was suspended.
2167
- * See: clock_gettime(CLOCK_BOOTTIME)
3233
+ * See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
21683234 * Return
21693235 * Current *ktime*.
3236
+ *
3237
+ * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
3238
+ * Description
3239
+ * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
3240
+ * out the format string.
3241
+ * The *m* represents the seq_file. The *fmt* and *fmt_size* are for
3242
+ * the format string itself. The *data* and *data_len* are format string
3243
+ * arguments. The *data* are a **u64** array and corresponding format string
3244
+ * values are stored in the array. For strings and pointers where pointees
3245
+ * are accessed, only the pointer values are stored in the *data* array.
3246
+ * The *data_len* is the size of *data* in bytes.
3247
+ *
3248
+ * Formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
3249
+ * Reading kernel memory may fail due to either invalid address or
3250
+ * valid address but requiring a major memory fault. If reading kernel memory
3251
+ * fails, the string for **%s** will be an empty string, and the ip
3252
+ * address for **%p{i,I}{4,6}** will be 0. Not returning error to
3253
+ * bpf program is consistent with what **bpf_trace_printk**\ () does for now.
3254
+ * Return
3255
+ * 0 on success, or a negative error in case of failure:
3256
+ *
3257
+ * **-EBUSY** if per-CPU memory copy buffer is busy, can try again
3258
+ * by returning 1 from bpf program.
3259
+ *
3260
+ * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
3261
+ *
3262
+ * **-E2BIG** if *fmt* contains too many format specifiers.
3263
+ *
3264
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
3265
+ *
3266
+ * long bpf_seq_write(struct seq_file *m, const void *data, u32 len)
3267
+ * Description
3268
+ * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
3269
+ * The *m* represents the seq_file. The *data* and *len* represent the
3270
+ * data to write in bytes.
3271
+ * Return
3272
+ * 0 on success, or a negative error in case of failure:
3273
+ *
3274
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
3275
+ *
3276
+ * u64 bpf_sk_cgroup_id(void *sk)
3277
+ * Description
3278
+ * Return the cgroup v2 id of the socket *sk*.
3279
+ *
3280
+ * *sk* must be a non-**NULL** pointer to a socket, e.g. one
3281
+ * returned from **bpf_sk_lookup_xxx**\ (),
3282
+ * **bpf_sk_fullsock**\ (), etc. The format of returned id is
3283
+ * same as in **bpf_skb_cgroup_id**\ ().
3284
+ *
3285
+ * This helper is available only if the kernel was compiled with
3286
+ * the **CONFIG_SOCK_CGROUP_DATA** configuration option.
3287
+ * Return
3288
+ * The id is returned or 0 in case the id could not be retrieved.
3289
+ *
3290
+ * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level)
3291
+ * Description
3292
+ * Return id of cgroup v2 that is ancestor of cgroup associated
3293
+ * with the *sk* at the *ancestor_level*. The root cgroup is at
3294
+ * *ancestor_level* zero and each step down the hierarchy
3295
+ * increments the level. If *ancestor_level* equals the level of
3296
+ * the cgroup associated with *sk*, then the return value will be
3297
+ * the same as that of **bpf_sk_cgroup_id**\ ().
3298
+ *
3299
+ * The helper is useful to implement policies based on cgroups
3300
+ * that are higher in the hierarchy than the immediate cgroup
3301
+ * associated with *sk*.
3302
+ *
3303
+ * The format of the returned id and the helper limitations are the same as in
3304
+ * **bpf_sk_cgroup_id**\ ().
3305
+ * Return
3306
+ * The id is returned or 0 in case the id could not be retrieved.
3307
+ *
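As an illustration of the two helpers above, a minimal egress-filter sketch (assuming a cgroup-attached skb program; EXPECTED_CG_ID is a hypothetical placeholder for a real cgroup v2 id):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    #define EXPECTED_CG_ID 0x100000001ULL   /* hypothetical cgroup v2 id */

    SEC("cgroup_skb/egress")
    int allow_subtree_only(struct __sk_buff *skb)
    {
        struct bpf_sock *sk = skb->sk;

        if (!sk)
            return 1;
        sk = bpf_sk_fullsock(sk);   /* helper needs a full socket */
        if (!sk)
            return 1;
        /* Compare against the ancestor two levels below the root. */
        return bpf_sk_ancestor_cgroup_id(sk, 2) == EXPECTED_CG_ID;
    }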
3308
+ * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
3309
+ * Description
3310
+ * Copy *size* bytes from *data* into a ring buffer *ringbuf*.
3311
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3312
+ * of new data availability is sent.
3313
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3314
+ * of new data availability is sent unconditionally.
3315
+ * Return
3316
+ * 0 on success, or a negative error in case of failure.
3317
+ *
3318
+ * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
3319
+ * Description
3320
+ * Reserve *size* bytes of payload in a ring buffer *ringbuf*.
3321
+ * Return
3322
+ * A valid pointer with *size* bytes of memory available, or
3323
+ * **NULL** otherwise.
3324
+ *
3325
+ * void bpf_ringbuf_submit(void *data, u64 flags)
3326
+ * Description
3327
+ * Submit reserved ring buffer sample, pointed to by *data*.
3328
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3329
+ * of new data availability is sent.
3330
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3331
+ * of new data availability is sent unconditionally.
3332
+ * Return
3333
+ * Nothing. Always succeeds.
3334
+ *
3335
+ * void bpf_ringbuf_discard(void *data, u64 flags)
3336
+ * Description
3337
+ * Discard reserved ring buffer sample, pointed to by *data*.
3338
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3339
+ * of new data availability is sent.
3340
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3341
+ * of new data availability is sent unconditionally.
3342
+ * Return
3343
+ * Nothing. Always succeeds.
3344
+ *
3345
+ * u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
3346
+ * Description
3347
+ * Query various characteristics of the provided ring buffer. What
3348
+ * exactly is queried is determined by *flags*:
3349
+ *
3350
+ * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed.
3351
+ * * **BPF_RB_RING_SIZE**: The size of ring buffer.
3352
+ * * **BPF_RB_CONS_POS**: Consumer position (can wrap around).
3353
+ * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around).
3354
+ *
3355
+ * Data returned is just a momentary snapshot of actual values
3356
+ * and could be inaccurate, so this facility should be used to
3357
+ * power heuristics and for reporting, not to make 100% correct
3358
+ * calculations.
3359
+ * Return
3360
+ * Requested value, or 0, if *flags* are not recognized.
3361
+ *
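The reserve/submit pair above avoids the extra copy that **bpf_ringbuf_output**\ () performs. A minimal sketch (the map size and event layout are illustrative only):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 256 * 1024);    /* power-of-2 multiple of page size */
    } rb SEC(".maps");

    struct event {
        __u32 pid;
    };

    SEC("tracepoint/syscalls/sys_enter_execve")
    int trace_execve(void *ctx)
    {
        struct event *e;

        e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
        if (!e)                         /* reservation failed, e.g. ring full */
            return 0;
        e->pid = bpf_get_current_pid_tgid() >> 32;
        bpf_ringbuf_submit(e, 0);       /* 0: default wakeup policy */
        return 0;
    }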
3362
+ * long bpf_csum_level(struct sk_buff *skb, u64 level)
3363
+ * Description
3364
+ * Change the skb's checksum level by one layer up or down, or
3365
+ * reset it entirely to none in order to have the stack perform
3366
+ * checksum validation. The level is applicable to the following
3367
+ * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of
3368
+ * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP |
3369
+ * through **bpf_skb_adjust_room**\ () helper with passing in
3370
+ * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call
3371
+ * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since
3372
+ * the UDP header is removed. Similarly, an encap of the latter
3373
+ * into the former could be accompanied by a helper call to
3374
+ * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the
3375
+ * skb is still intended to be processed in higher layers of the
3376
+ * stack instead of just egressing at tc.
3377
+ *
3378
+ * Three level settings plus a query are supported at this time:
3379
+ *
3380
+ * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs
3381
+ * with CHECKSUM_UNNECESSARY.
3382
+ * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs
3383
+ * with CHECKSUM_UNNECESSARY.
3384
+ * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and
3385
+ * sets CHECKSUM_NONE to force checksum validation by the stack.
3386
+ * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current
3387
+ * skb->csum_level.
3388
+ * Return
3389
+ * 0 on success, or a negative error in case of failure. In the
3390
+ * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
3391
+ * is returned or the error code -EACCES in case the skb is not
3392
+ * subject to CHECKSUM_UNNECESSARY.
3393
+ *
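The decap scenario above might look like the following tc sketch (the 16-byte outer header length is a made-up placeholder):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    #define TC_ACT_OK   0
    #define TC_ACT_SHOT 2

    char LICENSE[] SEC("license") = "GPL";

    SEC("tc")
    int decap(struct __sk_buff *skb)
    {
        const __s32 outer_len = 16;     /* hypothetical tunnel header size */

        /* Pop the outer header but keep the inherited csum state... */
        if (bpf_skb_adjust_room(skb, -outer_len, BPF_ADJ_ROOM_MAC,
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET))
            return TC_ACT_SHOT;
        /* ...then account for the removed UDP layer, as described above. */
        if (bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
            return TC_ACT_SHOT;
        return TC_ACT_OK;
    }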
3394
+ * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk)
3395
+ * Description
3396
+ * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
3397
+ * Return
3398
+ * *sk* if casting is valid, or **NULL** otherwise.
3399
+ *
3400
+ * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk)
3401
+ * Description
3402
+ * Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
3403
+ * Return
3404
+ * *sk* if casting is valid, or **NULL** otherwise.
3405
+ *
3406
+ * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk)
3407
+ * Description
3408
+ * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
3409
+ * Return
3410
+ * *sk* if casting is valid, or **NULL** otherwise.
3411
+ *
3412
+ * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk)
3413
+ * Description
3414
+ * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
3415
+ * Return
3416
+ * *sk* if casting is valid, or **NULL** otherwise.
3417
+ *
3418
+ * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk)
3419
+ * Description
3420
+ * Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
3421
+ * Return
3422
+ * *sk* if casting is valid, or **NULL** otherwise.
3423
+ *
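These casting helpers give BTF-typed access to socket internals. A minimal TCP-iterator sketch (assuming a BTF-enabled kernel with the iter/tcp target):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("iter/tcp")
    int dump_cwnd(struct bpf_iter__tcp *ctx)
    {
        static const char fmt[] = "cwnd=%u\n";
        struct sock_common *skc = ctx->sk_common;
        struct tcp_sock *tp;
        __u64 cwnd;

        if (!skc)
            return 0;
        tp = bpf_skc_to_tcp_sock(skc);
        if (!tp)                        /* not a full TCP socket */
            return 0;
        cwnd = tp->snd_cwnd;
        bpf_seq_printf(ctx->meta->seq, fmt, sizeof(fmt), &cwnd, sizeof(cwnd));
        return 0;
    }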
3424
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
3425
+ * Description
3426
+ * Return a user or a kernel stack in a bpf-program-provided buffer.
3427
+ * To achieve this, the helper needs *task*, which is a valid
3428
+ * pointer to **struct task_struct**. To store the stacktrace, the
3429
+ * bpf program provides *buf* with a nonnegative *size*.
3430
+ *
3431
+ * The last argument, *flags*, holds the number of stack frames to
3432
+ * skip (from 0 to 255), masked with
3433
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
3434
+ * the following flags:
3435
+ *
3436
+ * **BPF_F_USER_STACK**
3437
+ * Collect a user space stack instead of a kernel stack.
3438
+ * **BPF_F_USER_BUILD_ID**
3439
+ * Collect buildid+offset instead of ips for user stack,
3440
+ * only valid if **BPF_F_USER_STACK** is also specified.
3441
+ *
3442
+ * **bpf_get_task_stack**\ () can collect up to
3443
+ * **PERF_MAX_STACK_DEPTH** kernel and user frames, provided
3444
+ * the buffer size is sufficiently large. Note that
3445
+ * this limit can be controlled with the **sysctl** program, and
3446
+ * that it should be manually increased in order to profile long
3447
+ * user stacks (such as stacks for Java programs). To do so, use:
3448
+ *
3449
+ * ::
3450
+ *
3451
+ * # sysctl kernel.perf_event_max_stack=<new value>
3452
+ * Return
3453
+ * The non-negative copied *buf* length equal to or less than
3454
+ * *size* on success, or a negative error in case of failure.
3455
+ *
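A sketch pairing **bpf_get_task_stack**\ () with **bpf_seq_write**\ () in a task iterator (the 32-frame buffer is an arbitrary choice):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    static __u64 ips[32];   /* global scratch buffer for one stack */

    SEC("iter/task")
    int dump_kstack(struct bpf_iter__task *ctx)
    {
        struct task_struct *task = ctx->task;
        long n;

        if (!task)
            return 0;
        /* flags == 0: kernel stack, skip no frames */
        n = bpf_get_task_stack(task, ips, sizeof(ips), 0);
        if (n <= 0 || n > sizeof(ips))  /* bound n for the verifier */
            return 0;
        bpf_seq_write(ctx->meta->seq, ips, n);
        return 0;
    }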
3456
+ * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
3457
+ * Description
3458
+ * Load header option. Supports reading a particular TCP header
3459
+ * option for a bpf program (**BPF_PROG_TYPE_SOCK_OPS**).
3460
+ *
3461
+ * If *flags* is 0, it will search the option from the
3462
+ * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops**
3463
+ * has details on what skb_data contains under different
3464
+ * *skops*\ **->op**.
3465
+ *
3466
+ * The first byte of the *searchby_res* specifies the
3467
+ * kind that it wants to search.
3468
+ *
3469
+ * If the searching kind is an experimental kind
3470
+ * (i.e. 253 or 254 according to RFC6994), it also
3471
+ * needs to specify the "magic", which is either
3472
+ * 2 bytes or 4 bytes. It then also needs to
3473
+ * specify the size of the magic by using
3474
+ * the 2nd byte, which is the "kind-length" of a TCP
3475
+ * header option; as with a normal TCP header
3476
+ * option, the "kind-length" also counts the first
3477
+ * 2 bytes, "kind" and "kind-length" itself.
3478
+ *
3479
+ * For example, to search experimental kind 254 with
3480
+ * 2 byte magic 0xeB9F, the searchby_res should be
3481
+ * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ].
3482
+ *
3483
+ * To search for the standard window scale option (3),
3484
+ * the *searchby_res* should be [ 3, 0, 0, .... 0 ].
3485
+ * Note that kind-length must be 0 for a regular option.
3486
+ *
3487
+ * Searching for No-Op (0) and End-of-Option-List (1) is
3488
+ * not supported.
3489
+ *
3490
+ * *len* must be at least 2 bytes, which is the minimal size
3491
+ * of a header option.
3492
+ *
3493
+ * Supported flags:
3494
+ *
3495
+ * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
3496
+ * saved_syn packet or the just-received syn packet.
3497
+ *
3498
+ * Return
3499
+ * > 0 when found, the header option is copied to *searchby_res*.
3500
+ * The return value is the total length copied. On failure, a
3501
+ * negative error code is returned:
3502
+ *
3503
+ * **-EINVAL** if a parameter is invalid.
3504
+ *
3505
+ * **-ENOMSG** if the option is not found.
3506
+ *
3507
+ * **-ENOENT** if no syn packet is available when
3508
+ * **BPF_LOAD_HDR_OPT_TCP_SYN** is used.
3509
+ *
3510
+ * **-ENOSPC** if there is not enough space. Only *len* number of
3511
+ * bytes are copied.
3512
+ *
3513
+ * **-EFAULT** on failure to parse the header options in the
3514
+ * packet.
3515
+ *
3516
+ * **-EPERM** if the helper cannot be used under the current
3517
+ * *skops*\ **->op**.
3518
+ *
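The experimental-kind search described above, as a sockops sketch (assuming one of the BPF_SOCK_OPS_PARSE_*_HDR_OPT_CB_FLAG bits has been enabled via bpf_sock_ops_cb_flags_set()):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("sockops")
    int parse_opt(struct bpf_sock_ops *skops)
    {
        /* kind 254, kind-length 4, 2-byte magic 0xeB9F; rest zeroed */
        __u8 opt[8] = { 254, 4, 0xeB, 0x9F };
        long ret;

        if (skops->op != BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
            return 1;
        ret = bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
        if (ret > 0) {
            /* opt[] now holds the whole option; ret bytes were copied */
        }
        return 1;
    }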
3519
+ * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
3520
+ * Description
3521
+ * Store header option. The data will be copied
3522
+ * from buffer *from* with length *len* to the TCP header.
3523
+ *
3524
+ * The buffer *from* should have the whole option that
3525
+ * includes the kind, kind-length, and the actual
3526
+ * option data. The *len* must be at least kind-length
3527
+ * long. The kind-length does not have to be 4-byte
3528
+ * aligned. The kernel will take care of the padding
3529
+ * and of setting the 4-byte-aligned value in th->doff.
3530
+ *
3531
+ * This helper will check for a duplicated option
3532
+ * by searching for the same option in the outgoing skb.
3533
+ *
3534
+ * This helper can only be called during
3535
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
3536
+ *
3537
+ * Return
3538
+ * 0 on success, or negative error in case of failure:
3539
+ *
3540
+ * **-EINVAL** if a parameter is invalid.
3541
+ *
3542
+ * **-ENOSPC** if there is not enough space in the header.
3543
+ * Nothing has been written.
3544
+ *
3545
+ * **-EEXIST** if the option already exists.
3546
+ *
3547
+ * **-EFAULT** on failure to parse the existing header options.
3548
+ *
3549
+ * **-EPERM** if the helper cannot be used under the current
3550
+ * *skops*\ **->op**.
3551
+ *
3552
+ * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
3553
+ * Description
3554
+ * Reserve *len* bytes for the bpf header option. The
3555
+ * space will be used by **bpf_store_hdr_opt**\ () later in
3556
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
3557
+ *
3558
+ * If **bpf_reserve_hdr_opt**\ () is called multiple times,
3559
+ * the total number of bytes from all calls will be reserved.
3560
+ *
3561
+ * This helper can only be called during
3562
+ * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**.
3563
+ *
3564
+ * Return
3565
+ * 0 on success, or negative error in case of failure:
3566
+ *
3567
+ * **-EINVAL** if a parameter is invalid.
3568
+ *
3569
+ * **-ENOSPC** if there is not enough space in the header.
3570
+ *
3571
+ * **-EPERM** if the helper cannot be used under the current
3572
+ * *skops*\ **->op**.
3573
+ *
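The reserve-then-store protocol spelled out above, as one sockops sketch (assuming BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG was set earlier with bpf_sock_ops_cb_flags_set()):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("sockops")
    int write_opt(struct bpf_sock_ops *skops)
    {
        /* kind 254, kind-length 4, 2-byte magic 0xeB9F */
        const __u8 opt[4] = { 254, 4, 0xeB, 0x9F };

        switch (skops->op) {
        case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
            /* First pass: reserve room for the option. */
            bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
            break;
        case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
            /* Second pass: write it into the reserved space. */
            bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
            break;
        }
        return 1;
    }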
3574
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
3575
+ * Description
3576
+ * Get a bpf_local_storage from an *inode*.
3577
+ *
3578
+ * Logically, it could be thought of as getting the value from
3579
+ * a *map* with *inode* as the **key**. From this
3580
+ * perspective, the usage is not much different from
3581
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
3582
+ * helper enforces that the key must be an inode and the map must also
3583
+ * be a **BPF_MAP_TYPE_INODE_STORAGE**.
3584
+ *
3585
+ * Underneath, the value is stored locally at *inode* instead of
3586
+ * the *map*. The *map* is used as the bpf-local-storage
3587
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
3588
+ * searched against all bpf_local_storage residing at *inode*.
3589
+ *
3590
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
3591
+ * used such that a new bpf_local_storage will be
3592
+ * created if one does not exist. *value* can be used
3593
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
3594
+ * the initial value of a bpf_local_storage. If *value* is
3595
+ * **NULL**, the new bpf_local_storage will be zero initialized.
3596
+ * Return
3597
+ * A bpf_local_storage pointer is returned on success.
3598
+ *
3599
+ * **NULL** if not found or there was an error in adding
3600
+ * a new bpf_local_storage.
3601
+ *
3602
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
3603
+ * Description
3604
+ * Delete a bpf_local_storage from an *inode*.
3605
+ * Return
3606
+ * 0 on success.
3607
+ *
3608
+ * **-ENOENT** if the bpf_local_storage cannot be found.
3609
+ *
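A minimal LSM sketch of the inode-storage pair above (assuming CONFIG_BPF_LSM on a BTF-enabled kernel; the map and hook choice are illustrative):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    struct {
        __uint(type, BPF_MAP_TYPE_INODE_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, __u64);
    } unlink_cnt SEC(".maps");

    SEC("lsm/inode_unlink")
    int BPF_PROG(count_unlink, struct inode *dir, struct dentry *victim)
    {
        __u64 *cnt;

        cnt = bpf_inode_storage_get(&unlink_cnt, victim->d_inode, 0,
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (cnt)        /* NULL if lookup and creation both failed */
            __sync_fetch_and_add(cnt, 1);
        return 0;       /* 0: allow the operation */
    }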
3610
+ * long bpf_d_path(struct path *path, char *buf, u32 sz)
3611
+ * Description
3612
+ * Return the full path for the given **struct path** object, which
3613
+ * needs to be the kernel BTF *path* object. The path is
3614
+ * returned in the provided buffer *buf* of size *sz* and
3615
+ * is zero terminated.
3616
+ *
3617
+ * Return
3618
+ * On success, the strictly positive length of the string,
3619
+ * including the trailing NUL character. On error, a negative
3620
+ * value.
3621
+ *
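A tracing sketch of **bpf_d_path**\ () (the helper is restricted to an allow-listed set of attach points; filp_close is assumed here to be one of them):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("fentry/filp_close")
    int BPF_PROG(trace_close, struct file *filp)
    {
        char buf[64];
        long len;

        len = bpf_d_path(&filp->f_path, buf, sizeof(buf));
        if (len > 0)    /* len counts the trailing NUL */
            bpf_printk("closed: %s", buf);
        return 0;
    }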
3622
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
3623
+ * Description
3624
+ * Read *size* bytes from user space address *user_ptr* and store
3625
+ * the data in *dst*. This is a wrapper of **copy_from_user**\ ().
3626
+ * Return
3627
+ * 0 on success, or a negative error in case of failure.
3628
+ *
3629
+ * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags)
3630
+ * Description
3631
+ * Use BTF to store a string representation of *ptr*->ptr in *str*,
3632
+ * using *ptr*->type_id. This value should specify the type
3633
+ * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1)
3634
+ * can be used to look up vmlinux BTF type ids. Traversing the
3635
+ * data structure using BTF, the type information and values are
3636
+ * stored in the first *str_size* - 1 bytes of *str*. Safe copy of
3637
+ * the pointer data is carried out to avoid kernel crashes during
3638
+ * operation. Smaller types can use string space on the stack,
3639
+ * while larger types can use map data to store the string
3640
+ * representation.
3641
+ *
3642
+ * The string can be subsequently shared with userspace via
3643
+ * bpf_perf_event_output() or ring buffer interfaces.
3644
+ * bpf_trace_printk() is to be avoided as it places too small
3645
+ * a limit on string size to be useful.
3646
+ *
3647
+ * *flags* is a combination of
3648
+ *
3649
+ * **BTF_F_COMPACT**
3650
+ * no formatting around type information
3651
+ * **BTF_F_NONAME**
3652
+ * no struct/union member names/types
3653
+ * **BTF_F_PTR_RAW**
3654
+ * show raw (unobfuscated) pointer values;
3655
+ * equivalent to printk specifier %px.
3656
+ * **BTF_F_ZERO**
3657
+ * show zero-valued struct/union members; they
3658
+ * are not displayed by default
3659
+ *
3660
+ * Return
3661
+ * The number of bytes that were written (or would have been
3662
+ * written if output had to be truncated due to string size),
3663
+ * or a negative error in cases of failure.
3664
+ *
3665
+ * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags)
3666
+ * Description
3667
+ * Use BTF to write to the seq_file a string representation of
3668
+ * *ptr*->ptr, using *ptr*->type_id, as per **bpf_snprintf_btf**\ ().
3669
+ * *flags* are identical to those used for **bpf_snprintf_btf**\ ().
3670
+ * Return
3671
+ * 0 on success or a negative error in case of failure.
3672
+ *
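A sketch tying **bpf_seq_printf_btf**\ () to the *btf_ptr* convention it shares with **bpf_snprintf_btf**\ () (assuming struct btf_ptr and the BTF_F_* flags are visible to the program, e.g. via vmlinux.h, and using libbpf's bpf_core_type_id_kernel() to resolve the type id):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_core_read.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("iter/task")
    int dump_task_btf(struct bpf_iter__task *ctx)
    {
        struct task_struct *task = ctx->task;
        struct btf_ptr ptr = {};

        if (!task)
            return 0;
        ptr.ptr = task;
        /* vmlinux BTF id of struct task_struct, resolved at load time */
        ptr.type_id = bpf_core_type_id_kernel(struct task_struct);
        bpf_seq_printf_btf(ctx->meta->seq, &ptr, sizeof(ptr), BTF_F_COMPACT);
        return 0;
    }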
3673
+ * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
3674
+ * Description
3675
+ * See **bpf_get_cgroup_classid**\ () for the main description.
3676
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that
3677
+ * the cgroup v1 net_cls class is retrieved only from the *skb*'s
3678
+ * associated socket instead of the current process.
3679
+ * Return
3680
+ * The id is returned or 0 in case the id could not be retrieved.
3681
+ *
3682
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
3683
+ * Description
3684
+ * Redirect the packet to another net device of index *ifindex*
3685
+ * and fill in L2 addresses from neighboring subsystem. This helper
3686
+ * is somewhat similar to **bpf_redirect**\ (), except that it
3687
+ * populates L2 addresses as well, meaning, internally, the helper
3688
+ * relies on the neighbor lookup for the L2 address of the nexthop.
3689
+ *
3690
+ * The helper will perform a FIB lookup based on the skb's
3691
+ * networking header to get the address of the next hop, unless
3692
+ * this is supplied by the caller in the *params* argument. The
3693
+ * *plen* argument indicates the length of *params* and should be set
3694
+ * to 0 if *params* is NULL.
3695
+ *
3696
+ * The *flags* argument is reserved and must be 0. The helper is
3697
+ * currently only supported for tc BPF program types, and enabled
3698
+ * for IPv4 and IPv6 protocols.
3699
+ * Return
3700
+ * The helper returns **TC_ACT_REDIRECT** on success or
3701
+ * **TC_ACT_SHOT** on error.
3702
+ *
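A tc egress sketch of **bpf_redirect_neigh**\ () (target_ifindex is a placeholder expected to be filled in by the loader):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    volatile const __u32 target_ifindex = 0;    /* set by the loader */

    SEC("tc")
    int redirect_out(struct __sk_buff *skb)
    {
        /* params == NULL, plen == 0: let the kernel do the FIB lookup. */
        return bpf_redirect_neigh(target_ifindex, NULL, 0, 0);
    }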
3703
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
3704
+ * Description
3705
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
3706
+ * pointer to the percpu kernel variable on *cpu*. A ksym is an
3707
+ * extern variable decorated with '__ksym'. For each ksym, there
3708
+ * is a global variable (either static or global) of the same name
3709
+ * defined in the kernel. The ksym is percpu if the global variable is percpu.
3710
+ * The returned pointer points to the global percpu var on *cpu*.
3711
+ *
3712
+ * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
3713
+ * kernel, except that bpf_per_cpu_ptr() may return NULL. This
3714
+ * happens if *cpu* is larger than nr_cpu_ids. The caller of
3715
+ * bpf_per_cpu_ptr() must check the returned value.
3716
+ * Return
3717
+ * A pointer pointing to the kernel percpu variable on *cpu*, or
3718
+ * NULL, if *cpu* is invalid.
3719
+ *
3720
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
3721
+ * Description
3722
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
3723
+ * pointer to the percpu kernel variable on this cpu. See the
3724
+ * description of 'ksym' in **bpf_per_cpu_ptr**\ ().
3725
+ *
3726
+ * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
3727
+ * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would
3728
+ * never return NULL.
3729
+ * Return
3730
+ * A pointer pointing to the kernel percpu variable on this cpu.
3731
+ *
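A tracing sketch of the ksym mechanism described above (assuming the running kernel exposes the percpu variable runqueues in its BTF, as the selftests do):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    extern const struct rq runqueues __ksym;    /* percpu kernel variable */

    SEC("raw_tp/sched_switch")
    int probe(void *ctx)
    {
        const struct rq *rq;

        rq = bpf_this_cpu_ptr(&runqueues);      /* never returns NULL */
        bpf_printk("nr_running=%u", rq->nr_running);
        return 0;
    }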
3732
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
3733
+ * Description
3734
+ * Redirect the packet to another net device of index *ifindex*.
3735
+ * This helper is somewhat similar to **bpf_redirect**\ (), except
3736
+ * that the redirection happens to the *ifindex*'s peer device and
3737
+ * the netns switch takes place from ingress to ingress without
3738
+ * going through the CPU's backlog queue.
3739
+ *
3740
+ * The *flags* argument is reserved and must be 0. The helper is
3741
+ * currently only supported for tc BPF program types at the ingress
3742
+ * hook and for veth device types. The peer device must reside in a
3743
+ * different network namespace.
3744
+ * Return
3745
+ * The helper returns **TC_ACT_REDIRECT** on success or
3746
+ * **TC_ACT_SHOT** on error.
21703747 */
21713748 #define __BPF_FUNC_MAPPER(FN) \
21723749 FN(unspec), \
....@@ -2294,7 +3871,38 @@
22943871 FN(get_netns_cookie), \
22953872 FN(get_current_ancestor_cgroup_id), \
22963873 FN(sk_assign), \
2297
- FN(ktime_get_boot_ns),
3874
+ FN(ktime_get_boot_ns), \
3875
+ FN(seq_printf), \
3876
+ FN(seq_write), \
3877
+ FN(sk_cgroup_id), \
3878
+ FN(sk_ancestor_cgroup_id), \
3879
+ FN(ringbuf_output), \
3880
+ FN(ringbuf_reserve), \
3881
+ FN(ringbuf_submit), \
3882
+ FN(ringbuf_discard), \
3883
+ FN(ringbuf_query), \
3884
+ FN(csum_level), \
3885
+ FN(skc_to_tcp6_sock), \
3886
+ FN(skc_to_tcp_sock), \
3887
+ FN(skc_to_tcp_timewait_sock), \
3888
+ FN(skc_to_tcp_request_sock), \
3889
+ FN(skc_to_udp6_sock), \
3890
+ FN(get_task_stack), \
3891
+ FN(load_hdr_opt), \
3892
+ FN(store_hdr_opt), \
3893
+ FN(reserve_hdr_opt), \
3894
+ FN(inode_storage_get), \
3895
+ FN(inode_storage_delete), \
3896
+ FN(d_path), \
3897
+ FN(copy_from_user), \
3898
+ FN(snprintf_btf), \
3899
+ FN(seq_printf_btf), \
3900
+ FN(skb_cgroup_classid), \
3901
+ FN(redirect_neigh), \
3902
+ FN(per_cpu_ptr), \
3903
+ FN(this_cpu_ptr), \
3904
+ FN(redirect_peer), \
3905
+ /* */
22983906
22993907 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
23003908 * function eBPF program intends to call
....@@ -2309,50 +3917,147 @@
23093917 /* All flags used by eBPF helper functions, placed here. */
23103918
23113919 /* BPF_FUNC_skb_store_bytes flags. */
2312
-#define BPF_F_RECOMPUTE_CSUM (1ULL << 0)
2313
-#define BPF_F_INVALIDATE_HASH (1ULL << 1)
3920
+enum {
3921
+ BPF_F_RECOMPUTE_CSUM = (1ULL << 0),
3922
+ BPF_F_INVALIDATE_HASH = (1ULL << 1),
3923
+};
23143924
23153925 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
23163926 * First 4 bits are for passing the header field size.
23173927 */
2318
-#define BPF_F_HDR_FIELD_MASK 0xfULL
3928
+enum {
3929
+ BPF_F_HDR_FIELD_MASK = 0xfULL,
3930
+};
23193931
23203932 /* BPF_FUNC_l4_csum_replace flags. */
2321
-#define BPF_F_PSEUDO_HDR (1ULL << 4)
2322
-#define BPF_F_MARK_MANGLED_0 (1ULL << 5)
2323
-#define BPF_F_MARK_ENFORCE (1ULL << 6)
3933
+enum {
3934
+ BPF_F_PSEUDO_HDR = (1ULL << 4),
3935
+ BPF_F_MARK_MANGLED_0 = (1ULL << 5),
3936
+ BPF_F_MARK_ENFORCE = (1ULL << 6),
3937
+};
23243938
23253939 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
2326
-#define BPF_F_INGRESS (1ULL << 0)
3940
+enum {
3941
+ BPF_F_INGRESS = (1ULL << 0),
3942
+};
23273943
23283944 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
2329
-#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
3945
+enum {
3946
+ BPF_F_TUNINFO_IPV6 = (1ULL << 0),
3947
+};
23303948
23313949 /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
2332
-#define BPF_F_SKIP_FIELD_MASK 0xffULL
2333
-#define BPF_F_USER_STACK (1ULL << 8)
3950
+enum {
3951
+ BPF_F_SKIP_FIELD_MASK = 0xffULL,
3952
+ BPF_F_USER_STACK = (1ULL << 8),
23343953 /* flags used by BPF_FUNC_get_stackid only. */
2335
-#define BPF_F_FAST_STACK_CMP (1ULL << 9)
2336
-#define BPF_F_REUSE_STACKID (1ULL << 10)
3954
+ BPF_F_FAST_STACK_CMP = (1ULL << 9),
3955
+ BPF_F_REUSE_STACKID = (1ULL << 10),
23373956 /* flags used by BPF_FUNC_get_stack only. */
2338
-#define BPF_F_USER_BUILD_ID (1ULL << 11)
3957
+ BPF_F_USER_BUILD_ID = (1ULL << 11),
3958
+};
23393959
23403960 /* BPF_FUNC_skb_set_tunnel_key flags. */
2341
-#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
2342
-#define BPF_F_DONT_FRAGMENT (1ULL << 2)
2343
-#define BPF_F_SEQ_NUMBER (1ULL << 3)
3961
+enum {
3962
+ BPF_F_ZERO_CSUM_TX = (1ULL << 1),
3963
+ BPF_F_DONT_FRAGMENT = (1ULL << 2),
3964
+ BPF_F_SEQ_NUMBER = (1ULL << 3),
3965
+};
23443966
23453967 /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
23463968 * BPF_FUNC_perf_event_read_value flags.
23473969 */
2348
-#define BPF_F_INDEX_MASK 0xffffffffULL
2349
-#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
3970
+enum {
3971
+ BPF_F_INDEX_MASK = 0xffffffffULL,
3972
+ BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK,
23503973 /* BPF_FUNC_perf_event_output for sk_buff input context. */
2351
-#define BPF_F_CTXLEN_MASK (0xfffffULL << 32)
3974
+ BPF_F_CTXLEN_MASK = (0xfffffULL << 32),
3975
+};
3976
+
3977
+/* Current network namespace */
3978
+enum {
3979
+ BPF_F_CURRENT_NETNS = (-1L),
3980
+};
3981
+
3982
+/* BPF_FUNC_csum_level level values. */
3983
+enum {
3984
+ BPF_CSUM_LEVEL_QUERY,
3985
+ BPF_CSUM_LEVEL_INC,
3986
+ BPF_CSUM_LEVEL_DEC,
3987
+ BPF_CSUM_LEVEL_RESET,
3988
+};
3989
+
3990
+/* BPF_FUNC_skb_adjust_room flags. */
3991
+enum {
3992
+ BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0),
3993
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1),
3994
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2),
3995
+ BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
3996
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
3997
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
3998
+};
3999
+
4000
+enum {
4001
+ BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff,
4002
+ BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56,
4003
+};
4004
+
4005
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \
4006
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) \
4007
+ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
4008
+
4009
+/* BPF_FUNC_sysctl_get_name flags. */
4010
+enum {
4011
+ BPF_F_SYSCTL_BASE_NAME = (1ULL << 0),
4012
+};
4013
+
4014
+/* BPF_FUNC_<kernel_obj>_storage_get flags */
4015
+enum {
4016
+ BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0),
4017
+ /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
4018
+ * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
4019
+ */
4020
+ BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE,
4021
+};
4022
+
4023
+/* BPF_FUNC_read_branch_records flags. */
4024
+enum {
4025
+ BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0),
4026
+};
4027
+
4028
+/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and
4029
+ * BPF_FUNC_bpf_ringbuf_output flags.
4030
+ */
4031
+enum {
4032
+ BPF_RB_NO_WAKEUP = (1ULL << 0),
4033
+ BPF_RB_FORCE_WAKEUP = (1ULL << 1),
4034
+};
4035
+
4036
+/* BPF_FUNC_bpf_ringbuf_query flags */
4037
+enum {
4038
+ BPF_RB_AVAIL_DATA = 0,
4039
+ BPF_RB_RING_SIZE = 1,
4040
+ BPF_RB_CONS_POS = 2,
4041
+ BPF_RB_PROD_POS = 3,
4042
+};
4043
+
4044
+/* BPF ring buffer constants */
4045
+enum {
4046
+ BPF_RINGBUF_BUSY_BIT = (1U << 31),
4047
+ BPF_RINGBUF_DISCARD_BIT = (1U << 30),
4048
+ BPF_RINGBUF_HDR_SZ = 8,
4049
+};
4050
+
4051
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
4052
+enum {
4053
+ BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0),
4054
+ BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1),
4055
+};
23524056
23534057 /* Mode for BPF_FUNC_skb_adjust_room helper. */
23544058 enum bpf_adj_room_mode {
23554059 BPF_ADJ_ROOM_NET,
4060
+ BPF_ADJ_ROOM_MAC,
23564061 };
23574062
23584063 /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
....@@ -2364,8 +4069,15 @@
23644069 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
23654070 enum bpf_lwt_encap_mode {
23664071 BPF_LWT_ENCAP_SEG6,
2367
- BPF_LWT_ENCAP_SEG6_INLINE
4072
+ BPF_LWT_ENCAP_SEG6_INLINE,
4073
+ BPF_LWT_ENCAP_IP,
23684074 };
4075
+
4076
+#define __bpf_md_ptr(type, name) \
4077
+union { \
4078
+ type name; \
4079
+ __u64 :64; \
4080
+} __attribute__((aligned(8)))
23694081
23704082 /* user accessible mirror of in-kernel sk_buff.
23714083 * new fields can only be added to the end of this structure
....@@ -2401,6 +4113,12 @@
24014113 /* ... here. */
24024114
24034115 __u32 data_meta;
4116
+ __bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
4117
+ __u64 tstamp;
4118
+ __u32 wire_len;
4119
+ __u32 gso_segs;
4120
+ __bpf_md_ptr(struct bpf_sock *, sk);
4121
+ __u32 gso_size;
24044122 };
24054123
24064124 struct bpf_tunnel_key {
....@@ -2442,7 +4160,15 @@
24424160 BPF_DROP = 2,
24434161 /* 3-6 reserved */
24444162 BPF_REDIRECT = 7,
2445
- /* >127 are reserved for prog type specific return codes */
4163
+ /* >127 are reserved for prog type specific return codes.
4164
+ *
4165
+ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
4166
+ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
4167
+ * changed and should be routed based on its new L3 header.
4168
+ * (This is an L3 redirect, as opposed to L2 redirect
4169
+ * represented by BPF_REDIRECT above).
4170
+ */
4171
+ BPF_LWT_REROUTE = 128,
24464172 };
24474173
24484174 struct bpf_sock {
....@@ -2452,15 +4178,82 @@
24524178 __u32 protocol;
24534179 __u32 mark;
24544180 __u32 priority;
2455
- __u32 src_ip4; /* Allows 1,2,4-byte read.
2456
- * Stored in network byte order.
4181
+ /* IP address also allows 1 and 2 bytes access */
4182
+ __u32 src_ip4;
4183
+ __u32 src_ip6[4];
4184
+ __u32 src_port; /* host byte order */
4185
+ __be16 dst_port; /* network byte order */
4186
+ __u16 :16; /* zero padding */
4187
+ __u32 dst_ip4;
4188
+ __u32 dst_ip6[4];
4189
+ __u32 state;
4190
+ __s32 rx_queue_mapping;
4191
+};
4192
+
4193
+struct bpf_tcp_sock {
4194
+ __u32 snd_cwnd; /* Sending congestion window */
4195
+ __u32 srtt_us; /* smoothed round trip time << 3 in usecs */
4196
+ __u32 rtt_min;
4197
+ __u32 snd_ssthresh; /* Slow start size threshold */
4198
+ __u32 rcv_nxt; /* What we want to receive next */
4199
+ __u32 snd_nxt; /* Next sequence we send */
4200
+ __u32 snd_una; /* First byte we want an ack for */
4201
+ __u32 mss_cache; /* Cached effective mss, not including SACKS */
4202
+ __u32 ecn_flags; /* ECN status bits. */
4203
+ __u32 rate_delivered; /* saved rate sample: packets delivered */
4204
+ __u32 rate_interval_us; /* saved rate sample: time elapsed */
4205
+ __u32 packets_out; /* Packets which are "in flight" */
4206
+ __u32 retrans_out; /* Retransmitted packets out */
4207
+ __u32 total_retrans; /* Total retransmits for entire connection */
4208
+ __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
4209
+ * total number of segments in.
24574210 */
2458
- __u32 src_ip6[4]; /* Allows 1,2,4-byte read.
2459
- * Stored in network byte order.
4211
+ __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
4212
+ * total number of data segments in.
24604213 */
2461
- __u32 src_port; /* Allows 4-byte read.
2462
- * Stored in host byte order
4214
+ __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
4215
+ * The total number of segments sent.
24634216 */
4217
+ __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
4218
+ * total number of data segments sent.
4219
+ */
4220
+ __u32 lost_out; /* Lost packets */
4221
+ __u32 sacked_out; /* SACK'd packets */
4222
+ __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
4223
+ * sum(delta(rcv_nxt)), or how many bytes
4224
+ * were acked.
4225
+ */
4226
+ __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
4227
+ * sum(delta(snd_una)), or how many bytes
4228
+ * were acked.
4229
+ */
4230
+ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
4231
+ * total number of DSACK blocks received
4232
+ */
4233
+ __u32 delivered; /* Total data packets delivered incl. rexmits */
4234
+ __u32 delivered_ce; /* Like the above but only ECE marked packets */
4235
+ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
4236
+};
4237
+
4238
+struct bpf_sock_tuple {
4239
+ union {
4240
+ struct {
4241
+ __be32 saddr;
4242
+ __be32 daddr;
4243
+ __be16 sport;
4244
+ __be16 dport;
4245
+ } ipv4;
4246
+ struct {
4247
+ __be32 saddr[4];
4248
+ __be32 daddr[4];
4249
+ __be16 sport;
4250
+ __be16 dport;
4251
+ } ipv6;
4252
+ };
4253
+};
4254
+
4255
+struct bpf_xdp_sock {
4256
+ __u32 queue_id;
24644257 };
24654258
24664259 #define XDP_PACKET_HEADROOM 256
....@@ -2488,6 +4281,34 @@
24884281 /* Below access go through struct xdp_rxq_info */
24894282 __u32 ingress_ifindex; /* rxq->dev->ifindex */
24904283 __u32 rx_queue_index; /* rxq->queue_index */
4284
+
4285
+ __u32 egress_ifindex; /* txq->dev->ifindex */
4286
+};
4287
+
4288
+/* DEVMAP map-value layout
4289
+ *
4290
+ * The struct data-layout of map-value is a configuration interface.
4291
+ * New members can only be added to the end of this structure.
4292
+ */
4293
+struct bpf_devmap_val {
4294
+ __u32 ifindex; /* device index */
4295
+ union {
4296
+ int fd; /* prog fd on map write */
4297
+ __u32 id; /* prog id on map read */
4298
+ } bpf_prog;
4299
+};
4300
+
4301
+/* CPUMAP map-value layout
4302
+ *
4303
+ * The struct data-layout of map-value is a configuration interface.
4304
+ * New members can only be added to the end of this structure.
4305
+ */
4306
+struct bpf_cpumap_val {
4307
+ __u32 qsize; /* queue size to remote target CPU */
4308
+ union {
4309
+ int fd; /* prog fd on map write */
4310
+ __u32 id; /* prog id on map read */
4311
+ } bpf_prog;
24914312 };
24924313
24934314 enum sk_action {
....@@ -2499,8 +4320,8 @@
24994320 * be added to the end of this structure
25004321 */
25014322 struct sk_msg_md {
2502
- void *data;
2503
- void *data_end;
4323
+ __bpf_md_ptr(void *, data);
4324
+ __bpf_md_ptr(void *, data_end);
25044325
25054326 __u32 family;
25064327 __u32 remote_ip4; /* Stored in network byte order */
....@@ -2509,6 +4330,9 @@
25094330 __u32 local_ip6[4]; /* Stored in network byte order */
25104331 __u32 remote_port; /* Stored in network byte order */
25114332 __u32 local_port; /* stored in host byte order */
4333
+ __u32 size; /* Total size of sk_msg */
4334
+
4335
+ __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
25124336 };
25134337
25144338 struct sk_reuseport_md {
....@@ -2516,8 +4340,9 @@
25164340 * Start of directly accessible data. It begins from
25174341 * the tcp/udp header.
25184342 */
2519
- void *data;
2520
- void *data_end; /* End of directly accessible data */
4343
+ __bpf_md_ptr(void *, data);
4344
+ /* End of directly accessible data */
4345
+ __bpf_md_ptr(void *, data_end);
25214346 /*
25224347 * Total length of packet (starting from the tcp/udp header).
25234348 * Note that the directly accessible bytes (data_end - data)
....@@ -2559,6 +4384,20 @@
25594384 __u32 nr_jited_func_lens;
25604385 __aligned_u64 jited_ksyms;
25614386 __aligned_u64 jited_func_lens;
4387
+ __u32 btf_id;
4388
+ __u32 func_info_rec_size;
4389
+ __aligned_u64 func_info;
4390
+ __u32 nr_func_info;
4391
+ __u32 nr_line_info;
4392
+ __aligned_u64 line_info;
4393
+ __aligned_u64 jited_line_info;
4394
+ __u32 nr_jited_line_info;
4395
+ __u32 line_info_rec_size;
4396
+ __u32 jited_line_info_rec_size;
4397
+ __u32 nr_prog_tags;
4398
+ __aligned_u64 prog_tags;
4399
+ __u64 run_time_ns;
4400
+ __u64 run_cnt;
25624401 } __attribute__((aligned(8)));
25634402
25644403 struct bpf_map_info {
....@@ -2570,7 +4409,7 @@
25704409 __u32 map_flags;
25714410 char name[BPF_OBJ_NAME_LEN];
25724411 __u32 ifindex;
2573
- __u32 :32;
4412
+ __u32 btf_vmlinux_value_type_id;
25744413 __u64 netns_dev;
25754414 __u64 netns_ino;
25764415 __u32 btf_id;
....@@ -2584,30 +4423,66 @@
25844423 __u32 id;
25854424 } __attribute__((aligned(8)));
25864425
4426
+struct bpf_link_info {
4427
+ __u32 type;
4428
+ __u32 id;
4429
+ __u32 prog_id;
4430
+ union {
4431
+ struct {
4432
+ __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
4433
+ __u32 tp_name_len; /* in/out: tp_name buffer len */
4434
+ } raw_tracepoint;
4435
+ struct {
4436
+ __u32 attach_type;
4437
+ } tracing;
4438
+ struct {
4439
+ __u64 cgroup_id;
4440
+ __u32 attach_type;
4441
+ } cgroup;
4442
+ struct {
4443
+ __aligned_u64 target_name; /* in/out: target_name buffer ptr */
4444
+ __u32 target_name_len; /* in/out: target_name buffer len */
4445
+ union {
4446
+ struct {
4447
+ __u32 map_id;
4448
+ } map;
4449
+ };
4450
+ } iter;
4451
+ struct {
4452
+ __u32 netns_ino;
4453
+ __u32 attach_type;
4454
+ } netns;
4455
+ struct {
4456
+ __u32 ifindex;
4457
+ } xdp;
4458
+ };
4459
+} __attribute__((aligned(8)));
4460
+
25874461 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
25884462 * by user and intended to be used by socket (e.g. to bind to, depends on
2589
- * attach attach type).
4463
+ * attach type).
25904464 */
25914465 struct bpf_sock_addr {
25924466 __u32 user_family; /* Allows 4-byte read, but no write. */
25934467 __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
25944468 * Stored in network byte order.
25954469 */
2596
- __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write.
4470
+ __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
25974471 * Stored in network byte order.
25984472 */
2599
- __u32 user_port; /* Allows 4-byte read and write.
4473
+ __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
26004474 * Stored in network byte order
26014475 */
26024476 __u32 family; /* Allows 4-byte read, but no write */
26034477 __u32 type; /* Allows 4-byte read, but no write */
26044478 __u32 protocol; /* Allows 4-byte read, but no write */
2605
- __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write.
4479
+ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write.
26064480 * Stored in network byte order.
26074481 */
2608
- __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write.
4482
+ __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
26094483 * Stored in network byte order.
26104484 */
4485
+ __bpf_md_ptr(struct bpf_sock *, sk);
26114486 };
26124487
26134488 /* User bpf_sock_ops struct to access socket values and specify request ops
....@@ -2659,15 +4534,91 @@
26594534 __u32 sk_txhash;
26604535 __u64 bytes_received;
26614536 __u64 bytes_acked;
4537
+ __bpf_md_ptr(struct bpf_sock *, sk);
4538
+ /* [skb_data, skb_data_end) covers the whole TCP header.
4539
+ *
4540
+ * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received
4541
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the
4542
+ * header has not been written.
4543
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have
4544
+ * been written so far.
4545
+ * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes
4546
+ * the 3WHS.
4547
+ * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes
4548
+ * the 3WHS.
4549
+ *
4550
+ * bpf_load_hdr_opt() can also be used to read a particular option.
4551
+ */
4552
+ __bpf_md_ptr(void *, skb_data);
4553
+ __bpf_md_ptr(void *, skb_data_end);
4554
+ __u32 skb_len; /* The total length of a packet.
4555
+ * It includes the header, options,
4556
+ * and payload.
4557
+ */
4558
+ __u32 skb_tcp_flags; /* tcp_flags of the header. It provides
4559
+ * an easy way to check for tcp_flags
4560
+ * without parsing skb_data.
4561
+ *
4562
+ * In particular, the skb_tcp_flags
4563
+ * will still be available in
4564
+ * BPF_SOCK_OPS_HDR_OPT_LEN even though
4565
+ * the outgoing header has not
4566
+ * been written yet.
4567
+ */
26624568 };
26634569
26644570 /* Definitions for bpf_sock_ops_cb_flags */
2665
-#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
2666
-#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
2667
-#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
2668
-#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
2669
- * supported cb flags
2670
- */
4571
+enum {
4572
+ BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0),
4573
+ BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1),
4574
+ BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2),
4575
+ BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3),
4576
+ /* Call bpf for all received TCP headers. The bpf prog will be
4577
+ * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4578
+ *
4579
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4580
+ * for the header option related helpers that will be useful
4581
+ * to the bpf programs.
4582
+ *
4583
+ * It could be used at the client/active side (i.e. connect() side)
4584
+ * when the server told it that the server was in syncookie
4585
+ * mode and required the active side to resend the bpf-written
4586
+ * options. The active side can keep writing the bpf-options until
4587
+ * it received a valid packet from the server side to confirm
4588
+ * the earlier packet (and options) has been received. The later
4589
+ * example patch is using it like this at the active side when the
4590
+ * server is in syncookie mode.
4591
+ *
4592
+ * The bpf prog will usually turn this off in the common cases.
4593
+ */
4594
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4),
4595
+ /* Call bpf when kernel has received a header option that
4596
+ * the kernel cannot handle. The bpf prog will be called under
4597
+ * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB.
4598
+ *
4599
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4600
+ * for the header option related helpers that will be useful
4601
+ * to the bpf programs.
4602
+ */
4603
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
4604
+ /* Call bpf when the kernel is writing header options for the
4605
+ * outgoing packet. The bpf prog will first be called
4606
+ * to reserve space in a skb under
4607
+ * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then
4608
+ * the bpf prog will be called to write the header option(s)
4609
+ * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4610
+ *
4611
+ * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB
4612
+ * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option
4613
+ * related helpers that will be useful to the bpf programs.
4614
+ *
4615
+ * The kernel gets its chance to reserve space and write
4616
+ * options first before the BPF program does.
4617
+ */
4618
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
4619
+/* Mask of all currently supported cb flags */
4620
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
4621
+};
26714622
26724623 /* List of known BPF sock_ops operators.
26734624 * New entries can only be added at the end
....@@ -2720,6 +4671,65 @@
27204671 BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
27214672 * socket transition to LISTEN state.
27224673 */
4674
+ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
4675
+ */
4676
+ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option.
4677
+ * It will be called to handle
4678
+ * the packets received at
4679
+ * an already established
4680
+ * connection.
4681
+ *
4682
+ * sock_ops->skb_data:
4683
+ * Referring to the received skb.
4684
+ * It covers the TCP header only.
4685
+ *
4686
+ * bpf_load_hdr_opt() can also
4687
+ * be used to search for a
4688
+ * particular option.
4689
+ */
4690
+ BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the
4691
+ * header option later in
4692
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4693
+ * Arg1: bool want_cookie. (in
4694
+ * writing SYNACK only)
4695
+ *
4696
+ * sock_ops->skb_data:
4697
+ * Not available because no header has
4698
+ * been written yet.
4699
+ *
4700
+ * sock_ops->skb_tcp_flags:
4701
+ * The tcp_flags of the
4702
+ * outgoing skb. (e.g. SYN, ACK, FIN).
4703
+ *
4704
+ * bpf_reserve_hdr_opt() should
4705
+ * be used to reserve space.
4706
+ */
4707
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options
4708
+ * Arg1: bool want_cookie. (in
4709
+ * writing SYNACK only)
4710
+ *
4711
+ * sock_ops->skb_data:
4712
+ * Referring to the outgoing skb.
4713
+ * It covers the TCP header
4714
+ * that has already been written
4715
+ * by the kernel and the
4716
+ * earlier bpf-progs.
4717
+ *
4718
+ * sock_ops->skb_tcp_flags:
4719
+ * The tcp_flags of the outgoing
4720
+ * skb. (e.g. SYN, ACK, FIN).
4721
+ *
4722
+ * bpf_store_hdr_opt() should
4723
+ * be used to write the
4724
+ * option.
4725
+ *
4726
+ * bpf_load_hdr_opt() can also
4727
+ * be used to search for a
4728
+ * particular option that
4729
+ * has already been written
4730
+ * by the kernel or the
4731
+ * earlier bpf-progs.
4732
+ */
27234733 };
27244734
27254735 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
....@@ -2744,8 +4754,67 @@
27444754 BPF_TCP_MAX_STATES /* Leave at the end! */
27454755 };
27464756
2747
-#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
2748
-#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
4757
+enum {
4758
+ TCP_BPF_IW = 1001, /* Set TCP initial congestion window */
4759
+ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */
4760
+ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */
4761
+ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */
4762
+ /* Copy the SYN pkt to optval
4763
+ *
4764
+ * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the
4765
+ * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit
4766
+ * to only getting from the saved_syn. It can either get the
4767
+ * syn packet from:
4768
+ *
4769
+ * 1. the just-received SYN packet (only available when writing the
4770
+ * SYNACK). It will be useful when it is not necessary to
4771
+ * save the SYN packet for latter use. It is also the only way
4772
+ * to get the SYN during syncookie mode because the syn
4773
+ * packet cannot be saved during syncookie.
4774
+ *
4775
+ * OR
4776
+ *
4777
+ * 2. the earlier saved syn which was done by
4778
+ * bpf_setsockopt(TCP_SAVE_SYN).
4779
+ *
4780
+ * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the
4781
+ * SYN packet is obtained.
4782
+ *
4783
+ * If the bpf-prog does not need the IP[46] header, the
4784
+ * bpf-prog can avoid parsing the IP header by using
4785
+ * TCP_BPF_SYN. Otherwise, the bpf-prog can get both
4786
+ * IP[46] and TCP header by using TCP_BPF_SYN_IP.
4787
+ *
4788
+ * >0: Total number of bytes copied
4789
+ * -ENOSPC: Not enough space in optval. Only optlen number of
4790
+ * bytes is copied.
4791
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt
4792
+ * is not saved by setsockopt(TCP_SAVE_SYN).
4793
+ */
4794
+ TCP_BPF_SYN = 1005, /* Copy the TCP header */
4795
+ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
4796
+ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
4797
+};
4798
+
4799
+enum {
4800
+ BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0),
4801
+};
4802
+
4803
+/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and
4804
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4805
+ */
4806
+enum {
4807
+ BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the
4808
+ * total option spaces
4809
+ * required for an established
4810
+ * sk in order to calculate the
4811
+ * MSS. No skb is actually
4812
+ * sent.
4813
+ */
4814
+ BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode
4815
+ * when sending a SYN.
4816
+ */
4817
+};
27494818
27504819 struct bpf_perf_event_value {
27514820 __u64 counter;
....@@ -2753,12 +4822,16 @@
27534822 __u64 running;
27544823 };
27554824
2756
-#define BPF_DEVCG_ACC_MKNOD (1ULL << 0)
2757
-#define BPF_DEVCG_ACC_READ (1ULL << 1)
2758
-#define BPF_DEVCG_ACC_WRITE (1ULL << 2)
4825
+enum {
4826
+ BPF_DEVCG_ACC_MKNOD = (1ULL << 0),
4827
+ BPF_DEVCG_ACC_READ = (1ULL << 1),
4828
+ BPF_DEVCG_ACC_WRITE = (1ULL << 2),
4829
+};
27594830
2760
-#define BPF_DEVCG_DEV_BLOCK (1ULL << 0)
2761
-#define BPF_DEVCG_DEV_CHAR (1ULL << 1)
4831
+enum {
4832
+ BPF_DEVCG_DEV_BLOCK = (1ULL << 0),
4833
+ BPF_DEVCG_DEV_CHAR = (1ULL << 1),
4834
+};
27624835
27634836 struct bpf_cgroup_dev_ctx {
27644837 /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
....@@ -2774,8 +4847,10 @@
27744847 /* DIRECT: Skip the FIB rules and go to FIB table associated with device
27754848 * OUTPUT: Do lookup from egress perspective; default is ingress
27764849 */
2777
-#define BPF_FIB_LOOKUP_DIRECT (1U << 0)
2778
-#define BPF_FIB_LOOKUP_OUTPUT (1U << 1)
4850
+enum {
4851
+ BPF_FIB_LOOKUP_DIRECT = (1U << 0),
4852
+ BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
4853
+};
27794854
27804855 enum {
27814856 BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
....@@ -2838,6 +4913,16 @@
28384913 __u8 dmac[6]; /* ETH_ALEN */
28394914 };
28404915
4916
+struct bpf_redir_neigh {
4917
+ /* network family for lookup (AF_INET, AF_INET6) */
4918
+ __u32 nh_family;
4919
+ /* network address of nexthop; skips fib lookup to find gateway */
4920
+ union {
4921
+ __be32 ipv4_nh;
4922
+ __u32 ipv6_nh[4]; /* in6_addr; network order */
4923
+ };
4924
+};
4925
+
28414926 enum bpf_task_fd_type {
28424927 BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
28434928 BPF_FD_TYPE_TRACEPOINT, /* tp name */
....@@ -2847,4 +4932,126 @@
28474932 BPF_FD_TYPE_URETPROBE, /* filename + offset */
28484933 };
28494934
4935
+enum {
4936
+ BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0),
4937
+ BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1),
4938
+ BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2),
4939
+};
4940
+
4941
+struct bpf_flow_keys {
4942
+ __u16 nhoff;
4943
+ __u16 thoff;
4944
+ __u16 addr_proto; /* ETH_P_* of valid addrs */
4945
+ __u8 is_frag;
4946
+ __u8 is_first_frag;
4947
+ __u8 is_encap;
4948
+ __u8 ip_proto;
4949
+ __be16 n_proto;
4950
+ __be16 sport;
4951
+ __be16 dport;
4952
+ union {
4953
+ struct {
4954
+ __be32 ipv4_src;
4955
+ __be32 ipv4_dst;
4956
+ };
4957
+ struct {
4958
+ __u32 ipv6_src[4]; /* in6_addr; network order */
4959
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
4960
+ };
4961
+ };
4962
+ __u32 flags;
4963
+ __be32 flow_label;
4964
+};
4965
+
4966
+struct bpf_func_info {
4967
+ __u32 insn_off;
4968
+ __u32 type_id;
4969
+};
4970
+
4971
+#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10)
4972
+#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
4973
+
4974
+struct bpf_line_info {
4975
+ __u32 insn_off;
4976
+ __u32 file_name_off;
4977
+ __u32 line_off;
4978
+ __u32 line_col;
4979
+};
4980
+
4981
+struct bpf_spin_lock {
4982
+ __u32 val;
4983
+};
4984
+
4985
+struct bpf_sysctl {
4986
+ __u32 write; /* Sysctl is being read (= 0) or written (= 1).
4987
+ * Allows 1,2,4-byte read, but no write.
4988
+ */
4989
+ __u32 file_pos; /* Sysctl file position to read from, write to.
4990
+ * Allows 1,2,4-byte read and 4-byte write.
4991
+ */
4992
+};
4993
+
4994
+struct bpf_sockopt {
4995
+ __bpf_md_ptr(struct bpf_sock *, sk);
4996
+ __bpf_md_ptr(void *, optval);
4997
+ __bpf_md_ptr(void *, optval_end);
4998
+
4999
+ __s32 level;
5000
+ __s32 optname;
5001
+ __s32 optlen;
5002
+ __s32 retval;
5003
+};
5004
+
5005
+struct bpf_pidns_info {
5006
+ __u32 pid;
5007
+ __u32 tgid;
5008
+};
5009
+
5010
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
5011
+struct bpf_sk_lookup {
5012
+ union {
5013
+ __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
5014
+ __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
5015
+ };
5016
+
5017
+ __u32 family; /* Protocol family (AF_INET, AF_INET6) */
5018
+ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
5019
+ __u32 remote_ip4; /* Network byte order */
5020
+ __u32 remote_ip6[4]; /* Network byte order */
5021
+ __u32 remote_port; /* Network byte order */
5022
+ __u32 local_ip4; /* Network byte order */
5023
+ __u32 local_ip6[4]; /* Network byte order */
5024
+ __u32 local_port; /* Host byte order */
5025
+};
5026
+
5027
+/*
5028
+ * struct btf_ptr is used for typed pointer representation; the
5029
+ * type id is used to render the pointer data as the appropriate type
5030
+ * via the bpf_snprintf_btf() helper described above. A flags field -
5031
+ * potentially to specify additional details about the BTF pointer
5032
+ * (rather than its mode of display) - is included for future use.
5033
+ * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately.
5034
+ */
5035
+struct btf_ptr {
5036
+ void *ptr;
5037
+ __u32 type_id;
5038
+ __u32 flags; /* BTF ptr flags; unused at present. */
5039
+};
5040
+
5041
+/*
5042
+ * Flags to control bpf_snprintf_btf() behaviour.
5043
+ * - BTF_F_COMPACT: no formatting around type information
5044
+ * - BTF_F_NONAME: no struct/union member names/types
5045
+ * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
5046
+ * equivalent to %px.
5047
+ * - BTF_F_ZERO: show zero-valued struct/union members; they
5048
+ * are not displayed by default
5049
+ */
5050
+enum {
5051
+ BTF_F_COMPACT = (1ULL << 0),
5052
+ BTF_F_NONAME = (1ULL << 1),
5053
+ BTF_F_PTR_RAW = (1ULL << 2),
5054
+ BTF_F_ZERO = (1ULL << 3),
5055
+};
5056
+
28505057 #endif /* _UAPI__LINUX_BPF_H__ */