2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/include/uapi/linux/bpf.h
@@ -14,6 +14,7 @@
 /* Extended instruction set based on top of classic BPF */
 
 /* instruction classes */
+#define BPF_JMP32 0x06 /* jmp mode in word width */
 #define BPF_ALU64 0x07 /* alu mode in double word width */
 
 /* ld/ldx fields */
@@ -80,6 +81,12 @@
         __u32 attach_type; /* program attach type */
 };
 
+union bpf_iter_link_info {
+        struct {
+                __u32 map_fd;
+        } map;
+};
+
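The union added above is consumed by the BPF_LINK_CREATE command (via the link_create struct further down in this diff) when wiring up a map-element iterator. A minimal userspace sketch, assuming prog_fd refers to an already-loaded iterator program attached as BPF_TRACE_ITER and map_fd to an iterable map:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int create_map_iter_link(int prog_fd, int map_fd)
    {
            union bpf_iter_link_info linfo;
            union bpf_attr attr;

            memset(&linfo, 0, sizeof(linfo));
            linfo.map.map_fd = map_fd;      /* iterate this map's elements */

            memset(&attr, 0, sizeof(attr));
            attr.link_create.prog_fd = prog_fd;
            attr.link_create.attach_type = BPF_TRACE_ITER;
            attr.link_create.iter_info = (__u64)(unsigned long)&linfo;
            attr.link_create.iter_info_len = sizeof(linfo);

            return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
    }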
 /* BPF syscall commands, see bpf(2) man-page for details. */
 enum bpf_cmd {
         BPF_MAP_CREATE,
@@ -103,6 +110,21 @@
         BPF_BTF_LOAD,
         BPF_BTF_GET_FD_BY_ID,
         BPF_TASK_FD_QUERY,
+        BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+        BPF_MAP_FREEZE,
+        BPF_BTF_GET_NEXT_ID,
+        BPF_MAP_LOOKUP_BATCH,
+        BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+        BPF_MAP_UPDATE_BATCH,
+        BPF_MAP_DELETE_BATCH,
+        BPF_LINK_CREATE,
+        BPF_LINK_UPDATE,
+        BPF_LINK_GET_FD_BY_ID,
+        BPF_LINK_GET_NEXT_ID,
+        BPF_ENABLE_STATS,
+        BPF_ITER_CREATE,
+        BPF_LINK_DETACH,
+        BPF_PROG_BIND_MAP,
 };
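Every command in this enum goes through the single bpf(2) entry point. A small wrapper plus the simplest of the new commands, BPF_MAP_FREEZE (which takes only a map fd and write-protects the map from the syscall side), as a hedged sketch:

    static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
    {
            return syscall(__NR_bpf, cmd, attr, size);
    }

    static int map_freeze(int map_fd)
    {
            union bpf_attr attr = {};

            attr.map_fd = map_fd;   /* reuses the BPF_MAP_*_ELEM attr fields */
            return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
    }

The later sketches below reuse this sys_bpf() helper.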
 
 enum bpf_map_type {
@@ -127,8 +149,24 @@
         BPF_MAP_TYPE_SOCKHASH,
         BPF_MAP_TYPE_CGROUP_STORAGE,
         BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+        BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+        BPF_MAP_TYPE_QUEUE,
+        BPF_MAP_TYPE_STACK,
+        BPF_MAP_TYPE_SK_STORAGE,
+        BPF_MAP_TYPE_DEVMAP_HASH,
+        BPF_MAP_TYPE_STRUCT_OPS,
+        BPF_MAP_TYPE_RINGBUF,
+        BPF_MAP_TYPE_INODE_STORAGE,
 };
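As one example of the new types, BPF_MAP_TYPE_RINGBUF takes no key or value size; max_entries is the buffer size in bytes and must be a power-of-two multiple of the page size. A sketch, assuming a 4096-byte page:

    static int create_ringbuf_map(void)
    {
            union bpf_attr attr = {};

            attr.map_type = BPF_MAP_TYPE_RINGBUF;
            attr.key_size = 0;
            attr.value_size = 0;
            attr.max_entries = 4096;        /* one 4 KiB page, assumed */
            return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
    }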
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
         BPF_PROG_TYPE_UNSPEC,
         BPF_PROG_TYPE_SOCKET_FILTER,
@@ -152,6 +190,15 @@
         BPF_PROG_TYPE_LWT_SEG6LOCAL,
         BPF_PROG_TYPE_LIRC_MODE2,
         BPF_PROG_TYPE_SK_REUSEPORT,
+        BPF_PROG_TYPE_FLOW_DISSECTOR,
+        BPF_PROG_TYPE_CGROUP_SYSCTL,
+        BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+        BPF_PROG_TYPE_CGROUP_SOCKOPT,
+        BPF_PROG_TYPE_TRACING,
+        BPF_PROG_TYPE_STRUCT_OPS,
+        BPF_PROG_TYPE_EXT,
+        BPF_PROG_TYPE_LSM,
+        BPF_PROG_TYPE_SK_LOOKUP,
 };
 
 enum bpf_attach_type {
@@ -172,12 +219,43 @@
         BPF_CGROUP_UDP4_SENDMSG,
         BPF_CGROUP_UDP6_SENDMSG,
         BPF_LIRC_MODE2,
-        BPF_CGROUP_UDP4_RECVMSG = 19,
+        BPF_FLOW_DISSECTOR,
+        BPF_CGROUP_SYSCTL,
+        BPF_CGROUP_UDP4_RECVMSG,
         BPF_CGROUP_UDP6_RECVMSG,
+        BPF_CGROUP_GETSOCKOPT,
+        BPF_CGROUP_SETSOCKOPT,
+        BPF_TRACE_RAW_TP,
+        BPF_TRACE_FENTRY,
+        BPF_TRACE_FEXIT,
+        BPF_MODIFY_RETURN,
+        BPF_LSM_MAC,
+        BPF_TRACE_ITER,
+        BPF_CGROUP_INET4_GETPEERNAME,
+        BPF_CGROUP_INET6_GETPEERNAME,
+        BPF_CGROUP_INET4_GETSOCKNAME,
+        BPF_CGROUP_INET6_GETSOCKNAME,
+        BPF_XDP_DEVMAP,
+        BPF_CGROUP_INET_SOCK_RELEASE,
+        BPF_XDP_CPUMAP,
+        BPF_SK_LOOKUP,
+        BPF_XDP,
         __MAX_BPF_ATTACH_TYPE
 };
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+enum bpf_link_type {
+        BPF_LINK_TYPE_UNSPEC = 0,
+        BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
+        BPF_LINK_TYPE_TRACING = 2,
+        BPF_LINK_TYPE_CGROUP = 3,
+        BPF_LINK_TYPE_ITER = 4,
+        BPF_LINK_TYPE_NETNS = 5,
+        BPF_LINK_TYPE_XDP = 6,
+
+        MAX_BPF_LINK_TYPE,
+};
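The new BPF_XDP attach type pairs with BPF_LINK_TYPE_XDP through BPF_LINK_CREATE. A sketch attaching an already-loaded XDP program to a device by ifindex (prog_fd and ifindex assumed valid):

    static int xdp_link_attach(int prog_fd, int ifindex)
    {
            union bpf_attr attr = {};

            attr.link_create.prog_fd = prog_fd;
            attr.link_create.target_ifindex = ifindex;
            attr.link_create.attach_type = BPF_XDP;
            return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr));
    }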
 
 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
 *
@@ -202,6 +280,11 @@
 * When children program makes decision (like picking TCP CA or sock bind)
 * parent program has a chance to override it.
 *
+ * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of
+ * programs for a cgroup. Though it's possible to replace an old program at
+ * any position by also specifying BPF_F_REPLACE flag and position itself in
+ * replace_bpf_fd attribute. Old program at this position will be released.
+ *
 * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
 * A cgroup with NONE doesn't allow any programs in sub-cgroups.
 * Ex1:
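From the syscall side, the replacement path described in the comment above looks roughly like this (the cgroup fd, the BPF_CGROUP_INET_INGRESS hook and the program fds are assumed for illustration):

    static int cgroup_replace_prog(int cgroup_fd, int new_prog_fd, int old_prog_fd)
    {
            union bpf_attr attr = {};

            attr.target_fd = cgroup_fd;
            attr.attach_bpf_fd = new_prog_fd;
            attr.attach_type = BPF_CGROUP_INET_INGRESS;     /* assumed hook */
            attr.attach_flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
            attr.replace_bpf_fd = old_prog_fd;      /* program being swapped out */
            return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
    }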
@@ -220,6 +303,7 @@
 */
 #define BPF_F_ALLOW_OVERRIDE (1U << 0)
 #define BPF_F_ALLOW_MULTI (1U << 1)
+#define BPF_F_REPLACE (1U << 2)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
 * verifier will perform strict alignment checking as if the kernel
@@ -242,8 +326,66 @@
 */
 #define BPF_F_ANY_ALIGNMENT (1U << 1)
 
-/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
+/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose.
+ * Verifier does sub-register def/use analysis and identifies instructions whose
+ * def only matters for low 32-bit, high 32-bit is never referenced later
+ * through implicit zero extension. Therefore verifier notifies JIT back-ends
+ * that it is safe to ignore clearing high 32-bit for these instructions. This
+ * saves some back-ends a lot of code-gen. However such optimization is not
+ * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends
+ * hence hasn't used verifier's analysis result. But, we really want to have a
+ * way to be able to verify the correctness of the described optimization on
+ * x86_64 on which testsuites are frequently exercised.
+ *
+ * So, this flag is introduced. Once it is set, verifier will randomize high
+ * 32-bit for those instructions who has been identified as safe to ignore them.
+ * Then, if verifier is not doing correct analysis, such randomization will
+ * regress tests to expose bugs.
+ */
+#define BPF_F_TEST_RND_HI32 (1U << 2)
+
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_STATE_FREQ (1U << 3)
+
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
+/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
+ * the following extensions:
+ *
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_FD
+ * insn[0].imm:      map fd
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map
+ * verifier type:    CONST_PTR_TO_MAP
+ */
 #define BPF_PSEUDO_MAP_FD 1
+/* insn[0].src_reg:  BPF_PSEUDO_MAP_VALUE
+ * insn[0].imm:      map fd
+ * insn[1].imm:      offset into value
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map[0]+offset
+ * verifier type:    PTR_TO_MAP_VALUE
+ */
+#define BPF_PSEUDO_MAP_VALUE 2
+/* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
+ * insn[0].imm:      kernel btf id of VAR
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the kernel variable
+ * verifier type:    PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
+ *                   is struct/union.
+ */
+#define BPF_PSEUDO_BTF_ID 3
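A hedged sketch of the two-slot ldimm64 form described above, loading a map pointer into r1 (the fd value 42 is a stand-in for a real map fd):

    #include <linux/bpf.h>

    static const struct bpf_insn load_map_ptr[2] = {
            { .code    = BPF_LD | BPF_DW | BPF_IMM,
              .dst_reg = BPF_REG_1,
              .src_reg = BPF_PSEUDO_MAP_FD,
              .imm     = 42 },  /* map fd, assumed */
            { 0 },              /* insn[1]: imm and off stay 0 for BPF_PSEUDO_MAP_FD */
    };

The verifier rewrites the 16-byte pair into the 64-bit kernel address of the map before the program runs.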
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
 * offset to another bpf function
@@ -251,33 +393,71 @@
 #define BPF_PSEUDO_CALL 1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
-#define BPF_ANY 0 /* create new element or update existing */
-#define BPF_NOEXIST 1 /* create new element if it didn't exist */
-#define BPF_EXIST 2 /* update existing element */
+enum {
+        BPF_ANY = 0, /* create new element or update existing */
+        BPF_NOEXIST = 1, /* create new element if it didn't exist */
+        BPF_EXIST = 2, /* update existing element */
+        BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */
+};
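From the syscall side the flags are passed through the BPF_MAP_UPDATE_ELEM attr; a sketch (the key/value buffers and their layout are assumed):

    static int map_update(int map_fd, const void *key, const void *value)
    {
            union bpf_attr attr = {};

            attr.map_fd = map_fd;
            attr.key = (__u64)(unsigned long)key;
            attr.value = (__u64)(unsigned long)value;
            attr.flags = BPF_NOEXIST;       /* fail with EEXIST if the key exists */
            return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
    }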
 
 /* flags for BPF_MAP_CREATE command */
-#define BPF_F_NO_PREALLOC (1U << 0)
+enum {
+        BPF_F_NO_PREALLOC = (1U << 0),
 /* Instead of having one common LRU list in the
 * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
 * which can scale and perform better.
 * Note, the LRU nodes (including free nodes) cannot be moved
 * across different LRU lists.
 */
-#define BPF_F_NO_COMMON_LRU (1U << 1)
+        BPF_F_NO_COMMON_LRU = (1U << 1),
 /* Specify numa node during map creation */
-#define BPF_F_NUMA_NODE (1U << 2)
+        BPF_F_NUMA_NODE = (1U << 2),
 
-/* flags for BPF_PROG_QUERY */
-#define BPF_F_QUERY_EFFECTIVE (1U << 0)
-
-#define BPF_OBJ_NAME_LEN 16U
-
-/* Flags for accessing BPF object */
-#define BPF_F_RDONLY (1U << 3)
-#define BPF_F_WRONLY (1U << 4)
+/* Flags for accessing BPF object from syscall side. */
+        BPF_F_RDONLY = (1U << 3),
+        BPF_F_WRONLY = (1U << 4),
 
 /* Flag for stack_map, store build_id+offset instead of pointer */
-#define BPF_F_STACK_BUILD_ID (1U << 5)
+        BPF_F_STACK_BUILD_ID = (1U << 5),
+
+/* Zero-initialize hash function seed. This should only be used for testing. */
+        BPF_F_ZERO_SEED = (1U << 6),
+
+/* Flags for accessing BPF object from program side. */
+        BPF_F_RDONLY_PROG = (1U << 7),
+        BPF_F_WRONLY_PROG = (1U << 8),
+
+/* Clone map from listener for newly accepted socket */
+        BPF_F_CLONE = (1U << 9),
+
+/* Enable memory-mapping BPF map */
+        BPF_F_MMAPABLE = (1U << 10),
+
+/* Share perf_event among processes */
+        BPF_F_PRESERVE_ELEMS = (1U << 11),
+
+/* Create a map that is suitable to be an inner map with dynamic max entries */
+        BPF_F_INNER_MAP = (1U << 12),
+};
+
+/* Flags for BPF_PROG_QUERY. */
+
+/* Query effective (directly attached + inherited from ancestor cgroups)
+ * programs that will be executed for events within a cgroup.
+ * attach_flags with this flag are returned only for directly attached programs.
+ */
+#define BPF_F_QUERY_EFFECTIVE (1U << 0)
+
+/* Flags for BPF_PROG_TEST_RUN */
+
+/* If set, run the test on the cpu specified by bpf_attr.test.cpu */
+#define BPF_F_TEST_RUN_ON_CPU (1U << 0)
+
+/* type for BPF_ENABLE_STATS */
+enum bpf_stats_type {
+        /* enabled run_time_ns and run_cnt */
+        BPF_STATS_RUN_TIME = 0,
+};
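A sketch of BPF_ENABLE_STATS using this type; the returned fd keeps run_time_ns/run_cnt accounting enabled until it is closed:

    static int enable_run_time_stats(void)
    {
            union bpf_attr attr = {};

            attr.enable_stats.type = BPF_STATS_RUN_TIME;
            return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr));
    }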
 
 enum bpf_stack_build_id_status {
         /* user space need an empty entry to identify end of a trace */
@@ -298,6 +478,8 @@
         };
 };
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
         struct { /* anonymous struct used by BPF_MAP_CREATE command */
                 __u32 map_type; /* one of enum bpf_map_type */
@@ -316,6 +498,10 @@
                 __u32 btf_fd; /* fd pointing to a BTF type data */
                 __u32 btf_key_type_id; /* BTF type_id of the key */
                 __u32 btf_value_type_id; /* BTF type_id of the value */
+                __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
+                                                 * struct stored as the
+                                                 * map value
+                                                 */
         };
 
         struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -328,6 +514,23 @@
                 __u64 flags;
         };
 
+        struct { /* struct used by BPF_MAP_*_BATCH commands */
+                __aligned_u64 in_batch; /* start batch,
+                                         * NULL to start from beginning
+                                         */
+                __aligned_u64 out_batch; /* output: next start batch */
+                __aligned_u64 keys;
+                __aligned_u64 values;
+                __u32 count;            /* input/output:
+                                         * input: # of key/value
+                                         * elements
+                                         * output: # of filled elements
+                                         */
+                __u32 map_fd;
+                __u64 elem_flags;
+                __u64 flags;
+        } batch;
+
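A hedged sketch of driving BPF_MAP_LOOKUP_BATCH with this struct: the caller-provided buffers must hold *count* keys and values, and the opaque cookie written to out_batch is fed back through in_batch to continue (a NULL in_batch starts from the beginning; -ENOENT marks the end of the map):

    static int map_lookup_batch(int map_fd, void *in_batch, void *out_batch,
                                void *keys, void *values, __u32 *count)
    {
            union bpf_attr attr = {};
            int err;

            attr.batch.map_fd = map_fd;
            attr.batch.in_batch = (__u64)(unsigned long)in_batch;
            attr.batch.out_batch = (__u64)(unsigned long)out_batch;
            attr.batch.keys = (__u64)(unsigned long)keys;
            attr.batch.values = (__u64)(unsigned long)values;
            attr.batch.count = *count;

            err = sys_bpf(BPF_MAP_LOOKUP_BATCH, &attr, sizeof(attr));
            *count = attr.batch.count;      /* number of elements actually filled */
            return err;
    }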
         struct { /* anonymous struct used by BPF_PROG_LOAD command */
                 __u32 prog_type; /* one of enum bpf_prog_type */
                 __u32 insn_cnt;
@@ -336,7 +539,7 @@
                 __u32 log_level; /* verbosity level of verifier */
                 __u32 log_size; /* size of user buffer */
                 __aligned_u64 log_buf; /* user supplied buffer */
-                __u32 kern_version; /* checked when prog_type=kprobe */
+                __u32 kern_version; /* not used */
                 __u32 prog_flags;
                 char prog_name[BPF_OBJ_NAME_LEN];
                 __u32 prog_ifindex; /* ifindex of netdev to prep for */
@@ -345,6 +548,15 @@
                  * (context accesses, allowed helpers, etc).
                  */
                 __u32 expected_attach_type;
+                __u32 prog_btf_fd; /* fd pointing to BTF type data */
+                __u32 func_info_rec_size; /* userspace bpf_func_info size */
+                __aligned_u64 func_info; /* func info */
+                __u32 func_info_cnt; /* number of bpf_func_info records */
+                __u32 line_info_rec_size; /* userspace bpf_line_info size */
+                __aligned_u64 line_info; /* line info */
+                __u32 line_info_cnt; /* number of bpf_line_info records */
+                __u32 attach_btf_id; /* in-kernel BTF type id to attach to */
+                __u32 attach_prog_fd; /* 0 to attach to vmlinux */
         };
 
         struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -358,17 +570,33 @@
                 __u32 attach_bpf_fd; /* eBPF program to attach */
                 __u32 attach_type;
                 __u32 attach_flags;
+                __u32 replace_bpf_fd; /* previously attached eBPF
+                                       * program to replace if
+                                       * BPF_F_REPLACE is used
+                                       */
         };
 
         struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
                 __u32 prog_fd;
                 __u32 retval;
-                __u32 data_size_in;
-                __u32 data_size_out;
+                __u32 data_size_in;  /* input: len of data_in */
+                __u32 data_size_out; /* input/output: len of data_out
+                                      * returns ENOSPC if data_out
+                                      * is too small.
+                                      */
                 __aligned_u64 data_in;
                 __aligned_u64 data_out;
                 __u32 repeat;
                 __u32 duration;
+                __u32 ctx_size_in;  /* input: len of ctx_in */
+                __u32 ctx_size_out; /* input/output: len of ctx_out
+                                     * returns ENOSPC if ctx_out
+                                     * is too small.
+                                     */
+                __aligned_u64 ctx_in;
+                __aligned_u64 ctx_out;
+                __u32 flags;
+                __u32 cpu;
         } test;
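A sketch of the new context-passing fields together with BPF_F_TEST_RUN_ON_CPU (the program fd and a context blob matching the program type are assumed):

    static int prog_test_run_on_cpu0(int prog_fd, void *ctx, __u32 ctx_size)
    {
            union bpf_attr attr = {};

            attr.test.prog_fd = prog_fd;
            attr.test.ctx_in = (__u64)(unsigned long)ctx;
            attr.test.ctx_size_in = ctx_size;
            attr.test.flags = BPF_F_TEST_RUN_ON_CPU;
            attr.test.cpu = 0;      /* pin the run to CPU 0 */
            attr.test.repeat = 1;
            return sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
    }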
 
         struct { /* anonymous struct used by BPF_*_GET_*_ID */
@@ -377,6 +605,7 @@
                         __u32 prog_id;
                         __u32 map_id;
                         __u32 btf_id;
+                        __u32 link_id;
                 };
                 __u32 next_id;
                 __u32 open_flags;
@@ -397,7 +626,7 @@
                 __u32 prog_cnt;
         } query;
 
-        struct {
+        struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
                 __u64 name;
                 __u32 prog_fd;
         } raw_tracepoint;
@@ -425,6 +654,53 @@
                 __u64 probe_offset; /* output: probe_offset */
                 __u64 probe_addr; /* output: probe_addr */
         } task_fd_query;
+
+        struct { /* struct used by BPF_LINK_CREATE command */
+                __u32 prog_fd; /* eBPF program to attach */
+                union {
+                        __u32 target_fd; /* object to attach to */
+                        __u32 target_ifindex; /* target ifindex */
+                };
+                __u32 attach_type; /* attach type */
+                __u32 flags; /* extra flags */
+                union {
+                        __u32 target_btf_id; /* btf_id of target to attach to */
+                        struct {
+                                __aligned_u64 iter_info; /* extra bpf_iter_link_info */
+                                __u32 iter_info_len; /* iter_info length */
+                        };
+                };
+        } link_create;
+
+        struct { /* struct used by BPF_LINK_UPDATE command */
+                __u32 link_fd; /* link fd */
+                /* new program fd to update link with */
+                __u32 new_prog_fd;
+                __u32 flags; /* extra flags */
+                /* expected link's program fd; is specified only if
+                 * BPF_F_REPLACE flag is set in flags */
+                __u32 old_prog_fd;
+        } link_update;
+
+        struct {
+                __u32 link_fd;
+        } link_detach;
+
+        struct { /* struct used by BPF_ENABLE_STATS command */
+                __u32 type;
+        } enable_stats;
+
+        struct { /* struct used by BPF_ITER_CREATE command */
+                __u32 link_fd;
+                __u32 flags;
+        } iter_create;
+
+        struct { /* struct used by BPF_PROG_BIND_MAP command */
+                __u32 prog_fd;
+                __u32 map_fd;
+                __u32 flags; /* extra flags */
+        } prog_bind_map;
+
 } __attribute__((aligned(8)));
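Tying the link structs together, a sketch of BPF_LINK_UPDATE that atomically swaps the program behind an existing link while verifying which program is currently attached:

    static int link_replace_prog(int link_fd, int new_prog_fd, int old_prog_fd)
    {
            union bpf_attr attr = {};

            attr.link_update.link_fd = link_fd;
            attr.link_update.new_prog_fd = new_prog_fd;
            attr.link_update.flags = BPF_F_REPLACE;
            attr.link_update.old_prog_fd = old_prog_fd;     /* expected current prog */
            return sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr));
    }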
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -451,7 +727,7 @@
 * Map value associated to *key*, or **NULL** if no entry was
 * found.
 *
- * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
 * Description
 * Add or update the value of the entry associated to *key* in
 * *map* with *value*. *flags* is one of:
@@ -469,16 +745,19 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * long bpf_map_delete_elem(struct bpf_map *map, const void *key)
 * Description
 * Delete entry with *key* from *map*.
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr)
 * Description
 * For tracing programs, safely attempt to read *size* bytes from
- * address *src* and store the data in *dst*.
+ * kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * Generally, use **bpf_probe_read_user**\ () or
+ * **bpf_probe_read_kernel**\ () instead.
 * Return
 * 0 on success, or a negative error in case of failure.
@@ -486,11 +765,11 @@
 * Description
 * Return the time elapsed since system boot, in nanoseconds.
 * Does not include time the system was suspended.
- * See: clock_gettime(CLOCK_MONOTONIC)
+ * See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
 * Return
 * Current *ktime*.
 *
- * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
 * Description
 * This helper is a "printk()-like" facility for debugging. It
 * prints a message defined by format *fmt* (of size *fmt_size*)
@@ -500,6 +779,8 @@
 * limited to five).
 *
 * Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
 * The format of the trace is customizable, and the exact output
 * one will get depends on the options set in
 * *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -538,7 +819,7 @@
 *
 * Also, note that **bpf_trace_printk**\ () is slow, and should
 * only be used for debugging purposes. For this reason, a notice
- * bloc (spanning several lines) is printed to kernel logs and
+ * block (spanning several lines) is printed to kernel logs and
 * states that the helper should not be used "for production use"
 * the first time this helper is used (or more precisely, when
 * **trace_printk**\ () buffers are allocated). For passing values
@@ -568,7 +849,7 @@
 * Return
 * The SMP id of the processor running the program.
 *
- * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
 * Description
 * Store *len* bytes from address *from* into the packet
 * associated to *skb*, at *offset*. *flags* are a combination of
@@ -577,7 +858,7 @@
 * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
 * **->swhash** and *skb*\ **->l4hash** to 0).
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -585,7 +866,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
 * Description
 * Recompute the layer 3 (e.g. IP) checksum for the packet
 * associated to *skb*. Computation is incremental, so the helper
@@ -602,7 +883,7 @@
 * flexibility and can handle sizes larger than 2 or 4 for the
 * checksum to update.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -610,7 +891,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
 * Description
 * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
 * packet associated to *skb*. Computation is incremental, so the
@@ -634,7 +915,7 @@
 * flexibility and can handle sizes larger than 2 or 4 for the
 * checksum to update.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -642,7 +923,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
 * Description
 * This special helper is used to trigger a "tail call", or in
 * other words, to jump into another eBPF program. The same stack
@@ -673,7 +954,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
 * Description
 * Clone and redirect the packet associated to *skb* to another
 * net device of index *ifindex*. Both ingress and egress
@@ -689,7 +970,7 @@
 * efficient, but it is handled through an action code where the
 * redirection happens only after the eBPF program has returned.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -709,7 +990,7 @@
 * A 64-bit integer containing the current GID and UID, and
 * created as such: *current_gid* **<< 32 \|** *current_uid*.
 *
- * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * long bpf_get_current_comm(void *buf, u32 size_of_buf)
 * Description
 * Copy the **comm** attribute of the current task into *buf* of
 * *size_of_buf*. The **comm** attribute contains the name of
@@ -731,7 +1012,7 @@
 * based on a user-provided identifier for all traffic coming from
 * the tasks belonging to the related cgroup. See also the related
 * kernel documentation, available from the Linux sources in file
- * *Documentation/cgroup-v1/net_cls.txt*.
+ * *Documentation/admin-guide/cgroup-v1/net_cls.rst*.
 *
 * The Linux kernel has two versions for cgroups: there are
 * cgroups v1 and cgroups v2. Both are available to users, who can
@@ -746,7 +1027,7 @@
 * Return
 * The classid, or 0 for the default unconfigured classid.
 *
- * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
 * Description
 * Push a *vlan_tci* (VLAN tag control information) of protocol
 * *vlan_proto* to the packet associated to *skb*, then update
@@ -754,7 +1035,7 @@
 * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
 * be **ETH_P_8021Q**.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -762,11 +1043,11 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * long bpf_skb_vlan_pop(struct sk_buff *skb)
 * Description
 * Pop a VLAN header from the packet associated to *skb*.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -774,7 +1055,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
 * Description
 * Get tunnel metadata. This helper takes a pointer *key* to an
 * empty **struct bpf_tunnel_key** of **size**, that will be
@@ -804,14 +1085,14 @@
 *
 * int ret;
 * struct bpf_tunnel_key key = {};
- *
+ *
 * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
 * if (ret < 0)
 * return TC_ACT_SHOT; // drop packet
- *
+ *
 * if (key.remote_ipv4 != 0x0a000001)
 * return TC_ACT_SHOT; // drop packet
- *
+ *
 * return TC_ACT_OK; // accept packet
 *
 * This interface can also be used with all encapsulation devices
@@ -825,7 +1106,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
 * Description
 * Populate tunnel metadata for packet associated to *skb.* The
 * tunnel metadata is set to the contents of *key*, of *size*. The
@@ -891,7 +1172,7 @@
 * The value of the perf event counter read from the map, or a
 * negative error code in case of failure.
 *
- * int bpf_redirect(u32 ifindex, u64 flags)
+ * long bpf_redirect(u32 ifindex, u64 flags)
 * Description
 * Redirect the packet to another net device of index *ifindex*.
 * This helper is somewhat similar to **bpf_clone_redirect**\
@@ -905,9 +1186,9 @@
 * supports redirection to the egress interface, and accepts no
 * flag at all.
 *
- * The same effect can be attained with the more generic
- * **bpf_redirect_map**\ (), which requires specific maps to be
- * used but offers better performance.
+ * The same effect can also be attained with the more generic
+ * **bpf_redirect_map**\ (), which uses a BPF map to store the
+ * redirect target instead of providing it directly to the helper.
 * Return
 * For XDP, the helper returns **XDP_REDIRECT** on success or
 * **XDP_ABORTED** on error. For other program types, the values
@@ -918,7 +1199,7 @@
 * Description
 * Retrieve the realm or the route, that is to say the
 * **tclassid** field of the destination for the *skb*. The
- * indentifier retrieved is a user-provided tag, similar to the
+ * identifier retrieved is a user-provided tag, similar to the
 * one used with the net_cls cgroup (see description for
 * **bpf_get_cgroup_classid**\ () helper), but here this tag is
 * held by a route (a destination entry), not by a task.
@@ -938,7 +1219,7 @@
 * The realm of the route for the packet associated to *skb*, or 0
 * if none was found.
 *
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
 * Description
 * Write raw *data* blob into a special BPF perf event held by
 * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -983,7 +1264,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len)
 * Description
 * This helper was provided as an easy way to load data from a
 * packet. It can be used to load *len* bytes from *offset* from
@@ -1000,7 +1281,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags)
 * Description
 * Walk a user or a kernel stack and return its id. To achieve
 * this, the helper needs *ctx*, which is a pointer to the context
@@ -1069,7 +1350,7 @@
 * The checksum result, or a negative error code in case of
 * failure.
 *
- * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
 * Description
 * Retrieve tunnel options metadata for the packet associated to
 * *skb*, and store the raw tunnel option data to the buffer *opt*
@@ -1087,7 +1368,7 @@
 * Return
 * The size of the option data retrieved.
 *
- * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
 * Description
 * Set tunnel options metadata for the packet associated to *skb*
 * to the option data contained in the raw buffer *opt* of *size*.
@@ -1097,7 +1378,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
 * Description
 * Change the protocol of the *skb* to *proto*. Currently
 * supported are transition from IPv4 to IPv6, and from IPv6 to
@@ -1116,7 +1397,7 @@
 * All values for *flags* are reserved for future usage, and must
 * be left at zero.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1124,7 +1405,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * long bpf_skb_change_type(struct sk_buff *skb, u32 type)
 * Description
 * Change the packet type for the packet associated to *skb*. This
 * comes down to setting *skb*\ **->pkt_type** to *type*, except
@@ -1151,7 +1432,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
 * Description
 * Check whether *skb* is a descendant of the cgroup2 held by
 * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
@@ -1182,7 +1463,7 @@
 * Return
 * A pointer to the current task struct.
 *
- * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * long bpf_probe_write_user(void *dst, const void *src, u32 len)
 * Description
 * Attempt in a safe way to write *len* bytes from the buffer
 * *src* to *dst* in memory. It only works for threads that are in
@@ -1201,7 +1482,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
 * Description
 * Check whether the probe is being run in the context of a given
 * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
@@ -1209,11 +1490,11 @@
 * Return
 * The return value depends on the result of the test, and can be:
 *
- * * 0, if current task belongs to the cgroup2.
- * * 1, if current task does not belong to the cgroup2.
+ * * 1, if current task belongs to the cgroup2.
+ * * 0, if current task does not belong to the cgroup2.
 * * A negative error code, if an error occurred.
 *
- * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
 * Description
 * Resize (trim or grow) the packet associated to *skb* to the
 * new *len*. The *flags* are reserved for future usage, and must
@@ -1229,7 +1510,7 @@
 * implicitly linearizes, unclones and drops offloads from the
 * *skb*.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1237,7 +1518,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * long bpf_skb_pull_data(struct sk_buff *skb, u32 len)
 * Description
 * Pull in non-linear data in case the *skb* is non-linear and not
 * all of *len* are part of the linear section. Make *len* bytes
@@ -1265,7 +1546,7 @@
 * **bpf_skb_pull_data()** to effectively unclone the *skb* from
 * the very beginning in case it is indeed cloned.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1293,7 +1574,7 @@
 * recalculation the next time the kernel tries to access this
 * hash or when the **bpf_get_hash_recalc**\ () helper is called.
 *
- * int bpf_get_numa_node_id(void)
+ * long bpf_get_numa_node_id(void)
 * Description
 * Return the id of the current NUMA node. The primary use case
 * for this helper is the selection of sockets for the local NUMA
@@ -1304,7 +1585,7 @@
 * Return
 * The id of current NUMA node.
 *
- * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
 * Description
 * Grows headroom of packet associated to *skb* and adjusts the
 * offset of the MAC header accordingly, adding *len* bytes of
@@ -1317,7 +1598,7 @@
 * All values for *flags* are reserved for future usage, and must
 * be left at zero.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1325,14 +1606,14 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
 * Description
 * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
 * it is possible to use a negative value for *delta*. This helper
 * can be used to prepare the packet for pushing or popping
 * headers.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1340,45 +1621,14 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
 * Description
- * Copy a NUL terminated string from an unsafe address
- * *unsafe_ptr* to *dst*. The *size* should include the
- * terminating NUL byte. In case the string length is smaller than
- * *size*, the target is not padded with further NUL bytes. If the
- * string length is larger than *size*, just *size*-1 bytes are
- * copied and the last byte is set to NUL.
+ * Copy a NUL terminated string from an unsafe kernel address
+ * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
+ * more details.
 *
- * On success, the length of the copied string is returned. This
- * makes this helper useful in tracing programs for reading
- * strings, and more importantly to get its length at runtime. See
- * the following snippet:
- *
- * ::
- *
- * SEC("kprobe/sys_open")
- * void bpf_sys_open(struct pt_regs *ctx)
- * {
- * char buf[PATHLEN]; // PATHLEN is defined to 256
- * int res = bpf_probe_read_str(buf, sizeof(buf),
- * ctx->di);
- *
- * // Consume buf, for example push it to
- * // userspace via bpf_perf_event_output(); we
- * // can use res (the string length) as event
- * // size, after checking its boundaries.
- * }
- *
- * In comparison, using **bpf_probe_read()** helper here instead
- * to read the string would require to estimate the length at
- * compile time, and would often result in copying more memory
- * than necessary.
- *
- * Another useful use case is when parsing individual process
- * arguments or individual environment variables navigating
- * *current*\ **->mm->arg_start** and *current*\
- * **->mm->env_start**: using this helper and the return value,
- * one can quickly iterate at the right offset of the memory area.
+ * Generally, use **bpf_probe_read_user_str**\ () or
+ * **bpf_probe_read_kernel_str**\ () instead.
 * Return
 * On success, the strictly positive length of the string,
 * including the trailing NUL character. On error, a negative
@@ -1391,8 +1641,8 @@
 * If no cookie has been set yet, generate a new cookie. Once
 * generated, the socket cookie remains stable for the life of the
 * socket. This helper can be useful for monitoring per socket
- * networking traffic statistics as it provides a unique socket
- * identifier per namespace.
+ * networking traffic statistics as it provides a global socket
+ * identifier that can be assumed unique.
 * Return
 * An 8-byte long non-decreasing number on success, or 0 if the
 * socket field is missing inside *skb*.
@@ -1400,14 +1650,14 @@
 * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
 * Description
 * Equivalent to bpf_get_socket_cookie() helper that accepts
- * *skb*, but gets socket from **struct bpf_sock_addr** contex.
+ * *skb*, but gets socket from **struct bpf_sock_addr** context.
 * Return
 * An 8-byte long non-decreasing number.
 *
 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
 * Description
- * Equivalent to bpf_get_socket_cookie() helper that accepts
- * *skb*, but gets socket from **struct bpf_sock_ops** contex.
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock_ops** context.
 * Return
 * An 8-byte long non-decreasing number.
 *
@@ -1419,14 +1669,14 @@
 * is returned (note that **overflowuid** might also be the actual
 * UID value for the socket).
 *
- * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * long bpf_set_hash(struct sk_buff *skb, u32 hash)
 * Description
 * Set the full hash for *skb* (set the field *skb*\ **->hash**)
 * to value *hash*.
 * Return
 * 0
 *
- * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
 * Description
 * Emulate a call to **setsockopt()** on the socket associated to
 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1434,34 +1684,68 @@
 * must be specified, see **setsockopt(2)** for more information.
 * The option value of length *optlen* is pointed by *optval*.
 *
+ * *bpf_socket* should be one of the following:
+ *
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
 * This helper actually implements a subset of **setsockopt()**.
 * It supports the following *level*\ s:
 *
 * * **SOL_SOCKET**, which supports the following *optname*\ s:
 * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
- * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
+ * **SO_BINDTODEVICE**, **SO_KEEPALIVE**.
 * * **IPPROTO_TCP**, which supports the following *optname*\ s:
 * **TCP_CONGESTION**, **TCP_BPF_IW**,
- * **TCP_BPF_SNDCWND_CLAMP**.
+ * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
+ * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
+ * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
 * Return
 * 0 on success, or a negative error in case of failure.
 *
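A hedged sock_ops-side sketch of one supported combination (IPPROTO_TCP / TCP_CONGESTION); the numeric constants mirror the socket headers and are assumptions of this example:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #define IPPROTO_TCP     6       /* assumed, from the socket headers */
    #define TCP_CONGESTION  13      /* assumed, from netinet/tcp.h */

    SEC("sockops")
    int set_cc(struct bpf_sock_ops *skops)
    {
            char cc[] = "cubic";

            bpf_setsockopt(skops, IPPROTO_TCP, TCP_CONGESTION, cc, sizeof(cc));
            return 1;
    }

    char LICENSE[] SEC("license") = "GPL";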
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
 * Description
 * Grow or shrink the room for data in the packet associated to
 * *skb* by *len_diff*, and according to the selected *mode*.
 *
- * There is a single supported mode at this time:
+ * By default, the helper will reset any offloaded checksum
+ * indicator of the skb to CHECKSUM_NONE. This can be avoided
+ * by the following flag:
+ *
+ * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
+ * checksum data of the skb to CHECKSUM_NONE.
+ *
+ * There are two supported modes at this time:
+ *
+ * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ * (room space is added or removed below the layer 2 header).
 *
 * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
 * (room space is added or removed below the layer 3 header).
 *
- * All values for *flags* are reserved for future usage, and must
- * be left at zero.
+ * The following flags are supported at this time:
 *
- * A call to this helper is susceptible to change the underlaying
+ * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
+ * Adjusting mss in this way is not allowed for datagrams.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**,
+ * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
+ * Any new space is reserved to hold a tunnel header.
+ * Configure skb offsets and other fields accordingly.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**,
+ * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
+ * Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*):
+ * Use with ENCAP_L3/L4 flags to further specify the tunnel
+ * type; *len* is the length of the inner MAC header.
+ *
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1469,7 +1753,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
 * Description
 * Redirect the packet to the endpoint referenced by *map* at
 * index *key*. Depending on its type, this *map* can contain
@@ -1478,18 +1762,19 @@
 * but this is only implemented for native XDP (with driver
 * support) as of this writing).
 *
- * All values for *flags* are reserved for future usage, and must
- * be left at zero.
+ * The lower two bits of *flags* are used as the return code if
+ * the map lookup fails. This is so that the return value can be
+ * one of the XDP program return codes up to **XDP_TX**, as chosen
+ * by the caller. Any higher bits in the *flags* argument must be
+ * unset.
 *
- * When used to redirect packets to net devices, this helper
- * provides a high performance increase over **bpf_redirect**\ ().
- * This is due to various implementation details of the underlying
- * mechanisms, one of which is the fact that **bpf_redirect_map**\
- * () tries to send packet as a "bulk" to the device.
+ * See also **bpf_redirect**\ (), which only supports redirecting
+ * to an ifindex, but doesn't require a map to do so.
 * Return
- * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ * **XDP_REDIRECT** on success, or the value of the two lower bits
+ * of the *flags* argument on error.
 *
- * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
 * Description
 * Redirect the packet to the socket referenced by *map* (of type
 * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
@@ -1500,7 +1785,7 @@
 * Return
 * **SK_PASS** on success, or **SK_DROP** on error.
 *
- * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+ * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
 * Description
 * Add an entry to, or update a *map* referencing sockets. The
 * *skops* is used as a new value for the entry associated to
@@ -1519,7 +1804,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
 * Description
 * Adjust the address pointed by *xdp_md*\ **->data_meta** by
 * *delta* (which can be positive or negative). Note that this
@@ -1540,7 +1825,7 @@
 * more flexibility as the user is free to store whatever meta
 * data they need.
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1548,7 +1833,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
 * Description
 * Read the value of a perf event counter, and store it into *buf*
 * of size *buf_size*. This helper relies on a *map* of type
@@ -1592,13 +1877,13 @@
 * the time running for event since last normalization. The
 * enabled and running times are accumulated since the perf event
 * open. To achieve scaling factor between two invocations of an
- * eBPF program, users can can use CPU id as the key (which is
+ * eBPF program, users can use CPU id as the key (which is
 * typical for perf array usage model) to remember the previous
 * value and do the calculation inside the eBPF program.
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
 * Description
 * For an eBPF program attached to a perf event, retrieve the
 * value of the event counter associated to *ctx* and store it in
@@ -1609,7 +1894,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
 * Description
 * Emulate a call to **getsockopt()** on the socket associated to
 * *bpf_socket*, which must be a full socket. The *level* at
@@ -1617,6 +1902,12 @@
 * must be specified, see **getsockopt(2)** for more information.
 * The retrieved value is stored in the structure pointed by
 * *opval* and of length *optlen*.
+ *
+ * *bpf_socket* should be one of the following:
+ *
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
 *
 * This helper actually implements a subset of **getsockopt()**.
 * It supports the following *level*\ s:
@@ -1628,14 +1919,14 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * long bpf_override_return(struct pt_regs *regs, u64 rc)
 * Description
 * Used for error injection, this helper uses kprobes to override
 * the return value of the probed function, and to set it to *rc*.
 * The first argument is the context *regs* on which the kprobe
 * works.
 *
- * This helper works by setting setting the PC (program counter)
+ * This helper works by setting the PC (program counter)
 * to an override function which is run in place of the original
 * probed function. This means the probed function is not run at
 * all. The replacement function just returns with the required
@@ -1653,7 +1944,7 @@
 * Return
 * 0
 *
- * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
+ * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
 * Description
 * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
 * for the full TCP socket associated to *bpf_sock_ops* to
@@ -1669,11 +1960,19 @@
 * error if an eBPF program tries to set a callback that is not
 * supported in the current kernel.
 *
- * The supported callback values that *argval* can combine are:
+ * *argval* is a flag array which can combine these flags:
 *
 * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
 * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
 * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
+ *
+ * Therefore, this function can be used to clear a callback flag by
+ * setting the appropriate bit to zero. e.g. to disable the RTO
+ * callback:
+ *
+ * **bpf_sock_ops_cb_flags_set(bpf_sock,**
+ * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
 *
 * Here are some examples of where one could call such eBPF
 * program:
@@ -1689,7 +1988,7 @@
 * be set is returned (which comes down to 0 if all bits were set
 * as required).
 *
- * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
 * Description
 * This helper is used in programs implementing policies at the
 * socket level. If the message *msg* is allowed to pass (i.e. if
@@ -1703,7 +2002,7 @@
 * Return
 * **SK_PASS** on success, or **SK_DROP** on error.
 *
- * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
 * Description
 * For socket policies, apply the verdict of the eBPF program to
 * the next *bytes* (number of bytes) of message *msg*.
@@ -1737,7 +2036,7 @@
 * Return
 * 0
 *
- * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
 * Description
 * For socket policies, prevent the execution of the verdict eBPF
 * program for message *msg* until *bytes* (byte number) have been
@@ -1755,7 +2054,7 @@
 * Return
 * 0
 *
- * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
 * Description
 * For socket policies, pull in non-linear data from user space
 * for *msg* and set pointers *msg*\ **->data** and *msg*\
@@ -1775,7 +2074,7 @@
 * copied if necessary (i.e. if data was not linear and if start
 * and end pointers do not point to the same chunk).
 *
- * A call to this helper is susceptible to change the underlaying
+ * A call to this helper is susceptible to change the underlying
 * packet buffer. Therefore, at load time, all checks on pointers
 * previously done by the verifier are invalidated and must be
 * performed again, if the helper is used in combination with
@@ -1786,7 +2085,7 @@
 * Return
 * 0 on success, or a negative error in case of failure.
 *
- * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
+ * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
 * Description
 * Bind the socket associated to *ctx* to the address pointed by
 * *addr*, of length *addr_len*. This allows for making outgoing
....@@ -1796,20 +2095,21 @@
17962095 *
17972096 * This helper works for IPv4 and IPv6, TCP and UDP sockets. The
17982097 * domain (*addr*\ **->sa_family**) must be **AF_INET** (or
1799
- * **AF_INET6**). Looking for a free port to bind to can be
1800
- * expensive, therefore binding to port is not permitted by the
1801
- * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
1802
- * must be set to zero.
2098
+ * **AF_INET6**). It's advised to pass zero port (**sin_port**
2099
+ * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
2100
+ * behavior and lets the kernel efficiently pick up an unused
2101
+ * port as long as 4-tuple is unique. Passing non-zero port might
2102
+ * lead to degraded performance.
18032103 * Return
18042104 * 0 on success, or a negative error in case of failure.
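 *
 * For instance, a **BPF_CGROUP_INET4_CONNECT** program may pin
 * outgoing connections to a source address while leaving the
 * port at zero, as advised above (a sketch; the address is
 * illustrative and **bpf_htonl**\ () is assumed from libbpf's
 * *bpf_endian.h*)::
 *
 *	SEC("cgroup/connect4")
 *	int bind_src(struct bpf_sock_addr *ctx)
 *	{
 *		struct sockaddr_in sa = {
 *			.sin_family = AF_INET,
 *			.sin_addr.s_addr = bpf_htonl(0x0a000001), /* 10.0.0.1 */
 *		};
 *
 *		/* sin_port stays 0: the kernel picks an unused port */
 *		bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
 *		return 1;
 *	}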
18052105 *
1806
- * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
2106
+ * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
18072107 * Description
18082108 * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
1809
- * only possible to shrink the packet as of this writing,
1810
- * therefore *delta* must be a negative integer.
2109
+ * possible to both shrink and grow the packet tail.
2110
+ * Shrinking is done by passing a negative *delta* value.
18112111 *
1812
- * A call to this helper is susceptible to change the underlaying
2112
+ * A call to this helper is susceptible to change the underlying
18132113 * packet buffer. Therefore, at load time, all checks on pointers
18142114 * previously done by the verifier are invalidated and must be
18152115 * performed again, if the helper is used in combination with
....@@ -1817,7 +2117,7 @@
18172117 * Return
18182118 * 0 on success, or a negative error in case of failure.
18192119 *
1820
- * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
2120
+ * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
18212121 * Description
18222122 * Retrieve the XFRM state (IP transform framework, see also
18232123 * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
....@@ -1833,7 +2133,7 @@
18332133 * Return
18342134 * 0 on success, or a negative error in case of failure.
18352135 *
1836
- * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
2136
+ * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags)
18372137 * Description
18382138 * Return a user or a kernel stack in bpf program provided buffer.
18392139 * To achieve this, the helper needs *ctx*, which is a pointer
....@@ -1863,10 +2163,10 @@
18632163 *
18642164 * # sysctl kernel.perf_event_max_stack=<new value>
18652165 * Return
1866
- * A non-negative value equal to or less than *size* on success,
1867
- * or a negative error in case of failure.
2166
+ * The non-negative copied *buf* length equal to or less than
2167
+ * *size* on success, or a negative error in case of failure.
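+ *
+ * A short sketch of capturing the user-space stack from a
+ * perf_event program (helper declarations assumed from
+ * *bpf_helpers.h*)::
+ *
+ *	SEC("perf_event")
+ *	int sample_stack(struct bpf_perf_event_data *ctx)
+ *	{
+ *		__u64 ips[32];
+ *		long len;
+ *
+ *		len = bpf_get_stack(ctx, ips, sizeof(ips),
+ *				    BPF_F_USER_STACK);
+ *		/* on success, len is the copied length in bytes */
+ *		return 0;
+ *	}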
18682168 *
1869
- * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
2169
+ * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
18702170 * Description
18712171 * This helper is similar to **bpf_skb_load_bytes**\ () in that
18722172 * it provides an easy way to load *len* bytes from *offset*
....@@ -1888,7 +2188,7 @@
18882188 * Return
18892189 * 0 on success, or a negative error in case of failure.
18902190 *
1891
- * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
2191
+ * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
18922192 * Description
18932193 * Do FIB lookup in kernel tables using parameters in *params*.
18942194 * If lookup is successful and result shows packet is to be
....@@ -1900,9 +2200,9 @@
19002200 * is set to metric from route (IPv4/IPv6 only), and ifindex
19012201 * is set to the device index of the nexthop from the FIB lookup.
19022202 *
1903
- * *plen* argument is the size of the passed in struct.
1904
- * *flags* argument can be a combination of one or more of the
1905
- * following values:
2203
+ * *plen* argument is the size of the passed in struct.
2204
+ * *flags* argument can be a combination of one or more of the
2205
+ * following values:
19062206 *
19072207 * **BPF_FIB_LOOKUP_DIRECT**
19082208 * Do a direct table lookup vs full lookup using FIB
....@@ -1911,15 +2211,15 @@
19112211 * Perform lookup from an egress perspective (default is
19122212 * ingress).
19132213 *
1914
- * *ctx* is either **struct xdp_md** for XDP programs or
1915
- * **struct sk_buff** tc cls_act programs.
1916
- * Return
2214
+ * *ctx* is either **struct xdp_md** for XDP programs or
2215
+ * **struct sk_buff** for tc cls_act programs.
2216
+ * Return
19172217 * * < 0 if any input argument is invalid
19182218 * * 0 on success (packet is forwarded, nexthop neighbor exists)
19192219 * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
19202220 * packet is not forwarded or needs assist from full stack
19212221 *
1922
- * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
2222
+ * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
19232223 * Description
19242224 * Add an entry to, or update a sockhash *map* referencing sockets.
19252225 * The *skops* is used as a new value for the entry associated to
....@@ -1938,7 +2238,7 @@
19382238 * Return
19392239 * 0 on success, or a negative error in case of failure.
19402240 *
1941
- * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
2241
+ * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
19422242 * Description
19432243 * This helper is used in programs implementing policies at the
19442244 * socket level. If the message *msg* is allowed to pass (i.e. if
....@@ -1952,11 +2252,11 @@
19522252 * Return
19532253 * **SK_PASS** on success, or **SK_DROP** on error.
19542254 *
1955
- * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
2255
+ * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
19562256 * Description
19572257 * This helper is used in programs implementing policies at the
19582258 * skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
1959
- * if the verdeict eBPF program returns **SK_PASS**), redirect it
2259
+ * if the verdict eBPF program returns **SK_PASS**), redirect it
19602260 * to the socket referenced by *map* (of type
19612261 * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
19622262 * egress interfaces can be used for redirection. The
....@@ -1966,7 +2266,7 @@
19662266 * Return
19672267 * **SK_PASS** on success, or **SK_DROP** on error.
19682268 *
1969
- * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
2269
+ * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
19702270 * Description
19712271 * Encapsulate the packet associated to *skb* within a Layer 3
19722272 * protocol header. This header is provided in the buffer at
....@@ -1981,8 +2281,21 @@
19812281 * Only works if *skb* contains an IPv6 packet. Insert a
19822282 * Segment Routing Header (**struct ipv6_sr_hdr**) inside
19832283 * the IPv6 header.
2284
+ * **BPF_LWT_ENCAP_IP**
2285
+ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header
2286
+ * must be IPv4 or IPv6, followed by zero or more
2287
+ * additional headers, up to **LWT_BPF_MAX_HEADROOM**
2288
+ * total bytes in all prepended headers. Please note that
2289
+ * if **skb_is_gso**\ (*skb*) is true, no more than two
2290
+ * headers can be prepended, and the inner header, if
2291
+ * present, should be either GRE or UDP/GUE.
19842292 *
1985
- * A call to this helper is susceptible to change the underlaying
2293
+ * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs
2294
+ * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can
2295
+ * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and
2296
+ * **BPF_PROG_TYPE_LWT_XMIT**.
2297
+ *
2298
+ * A call to this helper is susceptible to change the underlying
19862299 * packet buffer. Therefore, at load time, all checks on pointers
19872300 * previously done by the verifier are invalidated and must be
19882301 * performed again, if the helper is used in combination with
....@@ -1990,14 +2303,14 @@
19902303 * Return
19912304 * 0 on success, or a negative error in case of failure.
19922305 *
1993
- * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
2306
+ * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
19942307 * Description
19952308 * Store *len* bytes from address *from* into the packet
19962309 * associated to *skb*, at *offset*. Only the flags, tag and TLVs
19972310 * inside the outermost IPv6 Segment Routing Header can be
19982311 * modified through this helper.
19992312 *
2000
- * A call to this helper is susceptible to change the underlaying
2313
+ * A call to this helper is susceptible to change the underlying
20012314 * packet buffer. Therefore, at load time, all checks on pointers
20022315 * previously done by the verifier are invalidated and must be
20032316 * performed again, if the helper is used in combination with
....@@ -2005,7 +2318,7 @@
20052318 * Return
20062319 * 0 on success, or a negative error in case of failure.
20072320 *
2008
- * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
2321
+ * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
20092322 * Description
20102323 * Adjust the size allocated to TLVs in the outermost IPv6
20112324 * Segment Routing Header contained in the packet associated to
....@@ -2013,7 +2326,7 @@
20132326 * after the segments are accepted. *delta* can be as well
20142327 * positive (growing) as negative (shrinking).
20152328 *
2016
- * A call to this helper is susceptible to change the underlaying
2329
+ * A call to this helper is susceptible to change the underlying
20172330 * packet buffer. Therefore, at load time, all checks on pointers
20182331 * previously done by the verifier are invalidated and must be
20192332 * performed again, if the helper is used in combination with
....@@ -2021,7 +2334,7 @@
20212334 * Return
20222335 * 0 on success, or a negative error in case of failure.
20232336 *
2024
- * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
2337
+ * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
20252338 * Description
20262339 * Apply an IPv6 Segment Routing action of type *action* to the
20272340 * packet associated to *skb*. Each action takes a parameter
....@@ -2036,13 +2349,13 @@
20362349 * Type of *param*: **int**.
20372350 * **SEG6_LOCAL_ACTION_END_B6**
20382351 * End.B6 action: Endpoint bound to an SRv6 policy.
2039
- * Type of param: **struct ipv6_sr_hdr**.
2352
+ * Type of *param*: **struct ipv6_sr_hdr**.
20402353 * **SEG6_LOCAL_ACTION_END_B6_ENCAP**
20412354 * End.B6.Encap action: Endpoint bound to an SRv6
20422355 * encapsulation policy.
2043
- * Type of param: **struct ipv6_sr_hdr**.
2356
+ * Type of *param*: **struct ipv6_sr_hdr**.
20442357 *
2045
- * A call to this helper is susceptible to change the underlaying
2358
+ * A call to this helper is susceptible to change the underlying
20462359 * packet buffer. Therefore, at load time, all checks on pointers
20472360 * previously done by the verifier are invalidated and must be
20482361 * performed again, if the helper is used in combination with
....@@ -2050,33 +2363,7 @@
20502363 * Return
20512364 * 0 on success, or a negative error in case of failure.
20522365 *
2053
- * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
2054
- * Description
2055
- * This helper is used in programs implementing IR decoding, to
2056
- * report a successfully decoded key press with *scancode*,
2057
- * *toggle* value in the given *protocol*. The scancode will be
2058
- * translated to a keycode using the rc keymap, and reported as
2059
- * an input key down event. After a period a key up event is
2060
- * generated. This period can be extended by calling either
2061
- * **bpf_rc_keydown** () again with the same values, or calling
2062
- * **bpf_rc_repeat** ().
2063
- *
2064
- * Some protocols include a toggle bit, in case the button was
2065
- * released and pressed again between consecutive scancodes.
2066
- *
2067
- * The *ctx* should point to the lirc sample as passed into
2068
- * the program.
2069
- *
2070
- * The *protocol* is the decoded protocol number (see
2071
- * **enum rc_proto** for some predefined values).
2072
- *
2073
- * This helper is only available is the kernel was compiled with
2074
- * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2075
- * "**y**".
2076
- * Return
2077
- * 0
2078
- *
2079
- * int bpf_rc_repeat(void *ctx)
2366
+ * long bpf_rc_repeat(void *ctx)
20802367 * Description
20812368 * This helper is used in programs implementing IR decoding, to
20822369 * report a successfully decoded repeat key message. This delays
....@@ -2095,7 +2382,33 @@
20952382 * Return
20962383 * 0
20972384 *
2098
- * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
2385
+ * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
2386
+ * Description
2387
+ * This helper is used in programs implementing IR decoding, to
2388
+ * report a successfully decoded key press with *scancode*,
2389
+ * *toggle* value in the given *protocol*. The scancode will be
2390
+ * translated to a keycode using the rc keymap, and reported as
2391
+ * an input key down event. After a period a key up event is
2392
+ * generated. This period can be extended by calling either
2393
+ * **bpf_rc_keydown**\ () again with the same values, or calling
2394
+ * **bpf_rc_repeat**\ ().
2395
+ *
2396
+ * Some protocols include a toggle bit, in case the button was
2397
+ * released and pressed again between consecutive scancodes.
2398
+ *
2399
+ * The *ctx* should point to the lirc sample as passed into
2400
+ * the program.
2401
+ *
2402
+ * The *protocol* is the decoded protocol number (see
2403
+ * **enum rc_proto** for some predefined values).
2404
+ *
2405
+ * This helper is only available if the kernel was compiled with
2406
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2407
+ * "**y**".
2408
+ * Return
2409
+ * 0
2410
+ *
2411
+ * u64 bpf_skb_cgroup_id(struct sk_buff *skb)
20992412 * Description
21002413 * Return the cgroup v2 id of the socket associated with the *skb*.
21012414 * This is roughly similar to the **bpf_get_cgroup_classid**\ ()
....@@ -2110,6 +2423,38 @@
21102423 * **CONFIG_SOCK_CGROUP_DATA** configuration option.
21112424 * Return
21122425 * The id is returned or 0 in case the id could not be retrieved.
2426
+ *
2427
+ * u64 bpf_get_current_cgroup_id(void)
2428
+ * Return
2429
+ * A 64-bit integer containing the current cgroup id based
2430
+ * on the cgroup within which the current task is running.
2431
+ *
2432
+ * void *bpf_get_local_storage(void *map, u64 flags)
2433
+ * Description
2434
+ * Get the pointer to the local storage area.
2435
+ * The type and the size of the local storage is defined
2436
+ * by the *map* argument.
2437
+ * The *flags* meaning is specific for each map type,
2438
+ * and has to be 0 for cgroup local storage.
2439
+ *
2440
+ * Depending on the BPF program type, a local storage area
2441
+ * can be shared between multiple instances of the BPF program,
2442
+ * running simultaneously.
2443
+ *
2444
+ * Users are responsible for synchronizing access themselves,
2445
+ * for example by using the **BPF_STX_XADD** instruction to alter
2446
+ * the shared data.
2447
+ * Return
2448
+ * A pointer to the local storage area.
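+ *
+ * A minimal sketch (assuming *cg_storage* is a
+ * **BPF_MAP_TYPE_CGROUP_STORAGE** map defined elsewhere); the
+ * atomic add compiles down to the **BPF_STX_XADD** instruction
+ * mentioned above::
+ *
+ *	__u64 *counter;
+ *
+ *	counter = bpf_get_local_storage(&cg_storage, 0);
+ *	__sync_fetch_and_add(counter, 1);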
2449
+ *
2450
+ * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
2451
+ * Description
2452
+ * Select a **SO_REUSEPORT** socket from a
2453
+ * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
2454
+ * It checks that the selected socket matches the incoming
2455
+ * request in the socket buffer.
2456
+ * Return
2457
+ * 0 on success, or a negative error in case of failure.
21132458 *
21142459 * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
21152460 * Description
....@@ -2129,44 +2474,1274 @@
21292474 * Return
21302475 * The id is returned or 0 in case the id could not be retrieved.
21312476 *
2132
- * u64 bpf_get_current_cgroup_id(void)
2133
- * Return
2134
- * A 64-bit integer containing the current cgroup id based
2135
- * on the cgroup within which the current task is running.
2136
- *
2137
- * void* get_local_storage(void *map, u64 flags)
2477
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
21382478 * Description
2139
- * Get the pointer to the local storage area.
2140
- * The type and the size of the local storage is defined
2141
- * by the *map* argument.
2142
- * The *flags* meaning is specific for each map type,
2143
- * and has to be 0 for cgroup local storage.
2479
+ * Look for TCP socket matching *tuple*, optionally in a child
2480
+ * network namespace *netns*. The return value must be checked,
2481
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
21442482 *
2145
- * Depending on the bpf program type, a local storage area
2146
- * can be shared between multiple instances of the bpf program,
2147
- * running simultaneously.
2483
+ * The *ctx* should point to the context of the program, such as
2484
+ * the skb or socket (depending on the hook in use). This is used
2485
+ * to determine the base network namespace for the lookup.
21482486 *
2149
- * A user should care about the synchronization by himself.
2150
- * For example, by using the BPF_STX_XADD instruction to alter
2151
- * the shared data.
2487
+ * *tuple_size* must be one of:
2488
+ *
2489
+ * **sizeof**\ (*tuple*\ **->ipv4**)
2490
+ * Look for an IPv4 socket.
2491
+ * **sizeof**\ (*tuple*\ **->ipv6**)
2492
+ * Look for an IPv6 socket.
2493
+ *
2494
+ * If the *netns* is a negative signed 32-bit integer, then the
2495
+ * socket lookup table in the netns associated with the *ctx*
2496
+ * will be used. For the TC hooks, this is the netns of the device
2497
+ * in the skb. For socket hooks, this is the netns of the socket.
2498
+ * If *netns* is any other signed 32-bit value greater than or
2499
+ * equal to zero then it specifies the ID of the netns relative to
2500
+ * the netns associated with the *ctx*. *netns* values beyond the
2501
+ * range of 32-bit integers are reserved for future use.
2502
+ *
2503
+ * All values for *flags* are reserved for future usage, and must
2504
+ * be left at zero.
2505
+ *
2506
+ * This helper is available only if the kernel was compiled with
2507
+ * **CONFIG_NET** configuration option.
21522508 * Return
2153
- * Pointer to the local storage area.
2509
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2510
+ * For sockets with reuseport option, the **struct bpf_sock**
2511
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2512
+ * tuple.
21542513 *
2155
- * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
2514
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
21562515 * Description
2157
- * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map
2158
- * It checks the selected sk is matching the incoming
2159
- * request in the skb.
2516
+ * Look for UDP socket matching *tuple*, optionally in a child
2517
+ * network namespace *netns*. The return value must be checked,
2518
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
2519
+ *
2520
+ * The *ctx* should point to the context of the program, such as
2521
+ * the skb or socket (depending on the hook in use). This is used
2522
+ * to determine the base network namespace for the lookup.
2523
+ *
2524
+ * *tuple_size* must be one of:
2525
+ *
2526
+ * **sizeof**\ (*tuple*\ **->ipv4**)
2527
+ * Look for an IPv4 socket.
2528
+ * **sizeof**\ (*tuple*\ **->ipv6**)
2529
+ * Look for an IPv6 socket.
2530
+ *
2531
+ * If the *netns* is a negative signed 32-bit integer, then the
2532
+ * socket lookup table in the netns associated with the *ctx*
2533
+ * will be used. For the TC hooks, this is the netns of the device
2534
+ * in the skb. For socket hooks, this is the netns of the socket.
2535
+ * If *netns* is any other signed 32-bit value greater than or
2536
+ * equal to zero then it specifies the ID of the netns relative to
2537
+ * the netns associated with the *ctx*. *netns* values beyond the
2538
+ * range of 32-bit integers are reserved for future use.
2539
+ *
2540
+ * All values for *flags* are reserved for future usage, and must
2541
+ * be left at zero.
2542
+ *
2543
+ * This helper is available only if the kernel was compiled with
2544
+ * **CONFIG_NET** configuration option.
2545
+ * Return
2546
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2547
+ * For sockets with reuseport option, the **struct bpf_sock**
2548
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2549
+ * tuple.
2550
+ *
2551
+ * long bpf_sk_release(void *sock)
2552
+ * Description
2553
+ * Release the reference held by *sock*. *sock* must be a
2554
+ * non-**NULL** pointer that was returned from
2555
+ * **bpf_sk_lookup_xxx**\ ().
21602556 * Return
21612557 * 0 on success, or a negative error in case of failure.
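 *
 * The lookup/release pair is typically used as in this sketch
 * from a TC program (*tuple* is assumed to have been filled
 * from the packet beforehand)::
 *
 *	struct bpf_sock *sk;
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk) {
 *		/* inspect fields of *sk* here */
 *		bpf_sk_release(sk);
 *	}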
2558
+ *
2559
+ * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
2560
+ * Description
2561
+ * Push an element *value* into *map*. *flags* is one of:
2562
+ *
2563
+ * **BPF_EXIST**
2564
+ * If the queue/stack is full, the oldest element is
2565
+ * removed to make room for this.
2566
+ * Return
2567
+ * 0 on success, or a negative error in case of failure.
2568
+ *
2569
+ * long bpf_map_pop_elem(struct bpf_map *map, void *value)
2570
+ * Description
2571
+ * Pop an element from *map*.
2572
+ * Return
2573
+ * 0 on success, or a negative error in case of failure.
2574
+ *
2575
+ * long bpf_map_peek_elem(struct bpf_map *map, void *value)
2576
+ * Description
2577
+ * Get an element from *map* without removing it.
2578
+ * Return
2579
+ * 0 on success, or a negative error in case of failure.
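+ *
+ * These three helpers operate on **BPF_MAP_TYPE_QUEUE** and
+ * **BPF_MAP_TYPE_STACK** maps; a short sketch (*events* is
+ * assumed to be such a map defined elsewhere)::
+ *
+ *	__u64 v = 42;
+ *
+ *	bpf_map_push_elem(&events, &v, BPF_EXIST);
+ *	if (bpf_map_pop_elem(&events, &v) == 0) {
+ *		/* for a queue, v now holds the oldest element */
+ *	}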
2580
+ *
2581
+ * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
2582
+ * Description
2583
+ * For socket policies, insert *len* bytes into *msg* at offset
2584
+ * *start*.
2585
+ *
2586
+ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
2587
+ * *msg* it may want to insert metadata or options into the *msg*.
2588
+ * This can later be read and used by any of the lower layer BPF
2589
+ * hooks.
2590
+ *
2591
+ * This helper may fail under memory pressure (if a malloc
2592
+ * fails); in these cases the BPF program will get an appropriate
2593
+ * error and will need to handle it.
2594
+ * Return
2595
+ * 0 on success, or a negative error in case of failure.
2596
+ *
2597
+ * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
2598
+ * Description
2599
+ * Will remove *len* bytes from a *msg* starting at byte *start*.
2600
+ * This may result in **ENOMEM** errors under certain situations if
2601
+ * an allocation and copy are required due to a full ring buffer.
2602
+ * However, the helper will try to avoid doing the allocation
2603
+ * if possible. Other errors can occur if input parameters are
2604
+ * invalid, either due to the *start* byte not being a valid part of
2605
+ * the *msg* payload and/or the *len* value being too large.
2606
+ * Return
2607
+ * 0 on success, or a negative error in case of failure.
2608
+ *
2609
+ * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
2610
+ * Description
2611
+ * This helper is used in programs implementing IR decoding, to
2612
+ * report a successfully decoded pointer movement.
2613
+ *
2614
+ * The *ctx* should point to the lirc sample as passed into
2615
+ * the program.
2616
+ *
2617
+ * This helper is only available if the kernel was compiled with
2618
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
2619
+ * "**y**".
2620
+ * Return
2621
+ * 0
2622
+ *
2623
+ * long bpf_spin_lock(struct bpf_spin_lock *lock)
2624
+ * Description
2625
+ * Acquire a spinlock represented by the pointer *lock*, which is
2626
+ * stored as part of a map value. Taking the lock allows one to
2627
+ * safely update the rest of the fields in that value. The
2628
+ * spinlock can (and must) later be released with a call to
2629
+ * **bpf_spin_unlock**\ (\ *lock*\ ).
2630
+ *
2631
+ * Spinlocks in BPF programs come with a number of restrictions
2632
+ * and constraints:
2633
+ *
2634
+ * * **bpf_spin_lock** objects are only allowed inside maps of
2635
+ * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
2636
+ * list could be extended in the future).
2637
+ * * BTF description of the map is mandatory.
2638
+ * * The BPF program can take ONE lock at a time, since taking two
2639
+ * or more could cause deadlocks.
2640
+ * * Only one **struct bpf_spin_lock** is allowed per map element.
2641
+ * * When the lock is taken, calls (either BPF to BPF or helpers)
2642
+ * are not allowed.
2643
+ * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
2644
+ * allowed inside a spinlock-ed region.
2645
+ * * The BPF program MUST call **bpf_spin_unlock**\ () to release
2646
+ * the lock, on all execution paths, before it returns.
2647
+ * * The BPF program can access **struct bpf_spin_lock** only via
2648
+ * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
2649
+ * helpers. Loading or storing data into the **struct
2650
+ * bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
2651
+ * * To use the **bpf_spin_lock**\ () helper, the BTF description
2652
+ * of the map value must be a struct and have **struct
2653
+ * bpf_spin_lock** *anyname*\ **;** field at the top level.
2654
+ * Nested lock inside another struct is not allowed.
2655
+ * * The **struct bpf_spin_lock** *lock* field in a map value must
2656
+ * be aligned on a multiple of 4 bytes in that value.
2657
+ * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
2658
+ * the **bpf_spin_lock** field to user space.
2659
+ * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
2660
+ * a BPF program, do not update the **bpf_spin_lock** field.
2661
+ * * **bpf_spin_lock** cannot be on the stack or inside a
2662
+ * networking packet (it can only be inside a map value).
2663
+ * * **bpf_spin_lock** is available to root only.
2664
+ * * Tracing programs and socket filter programs cannot use
2665
+ * **bpf_spin_lock**\ () due to insufficient preemption checks
2666
+ * (but this may change in the future).
2667
+ * * **bpf_spin_lock** is not allowed in inner maps of map-in-map.
2668
+ * Return
2669
+ * 0
2670
+ *
2671
+ * long bpf_spin_unlock(struct bpf_spin_lock *lock)
2672
+ * Description
2673
+ * Release the *lock* previously locked by a call to
2674
+ * **bpf_spin_lock**\ (\ *lock*\ ).
2675
+ * Return
2676
+ * 0
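+ *
+ * A minimal usage sketch for the pair, assuming a map value
+ * struct with the lock at the top level as required above
+ * (*my_map* and *key* are illustrative)::
+ *
+ *	struct val {
+ *		struct bpf_spin_lock lock;
+ *		__u64 counter;
+ *	};
+ *
+ *	struct val *v = bpf_map_lookup_elem(&my_map, &key);
+ *	if (v) {
+ *		bpf_spin_lock(&v->lock);
+ *		v->counter++;
+ *		bpf_spin_unlock(&v->lock);
+ *	}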
2677
+ *
2678
+ * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
2679
+ * Description
2680
+ * This helper gets a **struct bpf_sock** pointer such
2681
+ * that all the fields in this **bpf_sock** can be accessed.
2682
+ * Return
2683
+ * A **struct bpf_sock** pointer on success, or **NULL** in
2684
+ * case of failure.
2685
+ *
2686
+ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
2687
+ * Description
2688
+ * This helper gets a **struct bpf_tcp_sock** pointer from a
2689
+ * **struct bpf_sock** pointer.
2690
+ * Return
2691
+ * A **struct bpf_tcp_sock** pointer on success, or **NULL** in
2692
+ * case of failure.
2693
+ *
2694
+ * long bpf_skb_ecn_set_ce(struct sk_buff *skb)
2695
+ * Description
2696
+ * Set ECN (Explicit Congestion Notification) field of IP header
2697
+ * to **CE** (Congestion Encountered) if current value is **ECT**
2698
+ * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6
2699
+ * and IPv4.
2700
+ * Return
2701
+ * 1 if the **CE** flag is set (either by the current helper call
2702
+ * or because it was already present), 0 if it is not set.
2703
+ *
2704
+ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
2705
+ * Description
2706
+ * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
2707
+ * **bpf_sk_release**\ () is unnecessary and not allowed.
2708
+ * Return
2709
+ * A **struct bpf_sock** pointer on success, or **NULL** in
2710
+ * case of failure.
2711
+ *
2712
+ * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
2713
+ * Description
2714
+ * Look for TCP socket matching *tuple*, optionally in a child
2715
+ * network namespace *netns*. The return value must be checked,
2716
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
2717
+ *
2718
+ * This function is identical to **bpf_sk_lookup_tcp**\ (), except
2719
+ * that it also returns timewait or request sockets. Use
2720
+ * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
2721
+ * full structure.
2722
+ *
2723
+ * This helper is available only if the kernel was compiled with
2724
+ * **CONFIG_NET** configuration option.
2725
+ * Return
2726
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
2727
+ * For sockets with reuseport option, the **struct bpf_sock**
2728
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
2729
+ * tuple.
2730
+ *
2731
+ * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
2732
+ * Description
2733
+ * Check whether *iph* and *th* contain a valid SYN cookie ACK for
2734
+ * the listening socket in *sk*.
2735
+ *
2736
+ * *iph* points to the start of the IPv4 or IPv6 header, while
2737
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
2738
+ * **sizeof**\ (**struct ip6hdr**).
2739
+ *
2740
+ * *th* points to the start of the TCP header, while *th_len*
2741
+ * contains **sizeof**\ (**struct tcphdr**).
2742
+ * Return
2743
+ * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
2744
+ * error otherwise.
2745
+ *
2746
+ * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags)
2747
+ * Description
2748
+ * Get the name of the sysctl in /proc/sys/ and copy it into the
2749
+ * buffer *buf* of size *buf_len* provided by the program.
2750
+ *
2751
+ * The buffer is always NUL terminated, unless it's zero-sized.
2752
+ *
2753
+ * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
2754
+ * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
2755
+ * only (e.g. "tcp_mem").
2756
+ * Return
2757
+ * Number of characters copied (not including the trailing NUL).
2758
+ *
2759
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2760
+ * truncated name in this case).
2761
+ *
2762
+ * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
2763
+ * Description
2764
+ * Get current value of sysctl as it is presented in /proc/sys
2765
+ * (incl. newline, etc), and copy it as a string into the
2766
+ * buffer *buf* of size *buf_len* provided by the program.
2767
+ *
2768
+ * The whole value is copied, no matter what file position user
2769
+ * space issued e.g. sys_read at.
2770
+ *
2771
+ * The buffer is always NUL terminated, unless it's zero-sized.
2772
+ * Return
2773
+ * Number of characters copied (not including the trailing NUL).
2774
+ *
2775
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2776
+ * truncated value in this case).
2777
+ *
2778
+ * **-EINVAL** if current value was unavailable, e.g. because
2779
+ * sysctl is uninitialized and read returns -EIO for it.
2780
+ *
2781
+ * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
2782
+ * Description
2783
+ * Get new value being written by user space to sysctl (before
2784
+ * the actual write happens) and copy it as a string into
2785
+ * the buffer *buf* of size *buf_len* provided by the program.
2786
+ *
2787
+ * User space may write new value at file position > 0.
2788
+ *
2789
+ * The buffer is always NUL terminated, unless it's zero-sized.
2790
+ * Return
2791
+ * Number of characters copied (not including the trailing NUL).
2792
+ *
2793
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
2794
+ * truncated value in this case).
2795
+ *
2796
+ * **-EINVAL** if sysctl is being read.
2797
+ *
2798
+ * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len)
2799
+ * Description
2800
+ * Override new value being written by user space to sysctl with
2801
+ * value provided by program in buffer *buf* of size *buf_len*.
2802
+ *
2803
+ * *buf* should contain a string in the same form as provided by user
2804
+ * space on sysctl write.
2805
+ *
2806
+ * User space may write new value at file position > 0. To override
2807
+ * the whole sysctl value, the file position should be set to zero.
2808
+ * Return
2809
+ * 0 on success.
2810
+ *
2811
+ * **-E2BIG** if the *buf_len* is too big.
2812
+ *
2813
+ * **-EINVAL** if sysctl is being read.
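+ *
+ * Together, these sysctl helpers allow a
+ * **BPF_PROG_TYPE_CGROUP_SYSCTL** program to filter accesses by
+ * name, as in this sketch (the program name is illustrative;
+ * returning 1 allows the access, 0 rejects it)::
+ *
+ *	SEC("cgroup/sysctl")
+ *	int sysctl_guard(struct bpf_sysctl *ctx)
+ *	{
+ *		char name[64];
+ *
+ *		if (bpf_sysctl_get_name(ctx, name, sizeof(name),
+ *					BPF_F_SYSCTL_BASE_NAME) < 0)
+ *			return 0;
+ *		/* compare name here and decide */
+ *		return 1;
+ *	}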
2814
+ *
2815
+ * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res)
2816
+ * Description
2817
+ * Convert the initial part of the string from buffer *buf* of
2818
+ * size *buf_len* to a long integer according to the given base
2819
+ * and save the result in *res*.
2820
+ *
2821
+ * The string may begin with an arbitrary amount of white space
2822
+ * (as determined by **isspace**\ (3)) followed by a single
2823
+ * optional '**-**' sign.
2824
+ *
2825
+ * Five least significant bits of *flags* encode base, other bits
2826
+ * are currently unused.
2827
+ *
2828
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
2829
+ * similar to user space **strtol**\ (3).
2830
+ * Return
2831
+ * Number of characters consumed on success. Must be positive but
2832
+ * no more than *buf_len*.
2833
+ *
2834
+ * **-EINVAL** if no valid digits were found or unsupported base
2835
+ * was provided.
2836
+ *
2837
+ * **-ERANGE** if resulting value was out of range.
2838
+ *
2839
+ * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res)
2840
+ * Description
2841
+ * Convert the initial part of the string from buffer *buf* of
2842
+ * size *buf_len* to an unsigned long integer according to the
2843
+ * given base and save the result in *res*.
2844
+ *
2845
+ * The string may begin with an arbitrary amount of white space
2846
+ * (as determined by **isspace**\ (3)).
2847
+ *
2848
+ * Five least significant bits of *flags* encode base, other bits
2849
+ * are currently unused.
2850
+ *
2851
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
2852
+ * similar to user space **strtoul**\ (3).
2853
+ * Return
2854
+ * Number of characters consumed on success. Must be positive but
2855
+ * no more than *buf_len*.
2856
+ *
2857
+ * **-EINVAL** if no valid digits were found or unsupported base
2858
+ * was provided.
2859
+ *
2860
+ * **-ERANGE** if resulting value was out of range.
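+ *
+ * For example, parsing a decimal number from a buffer (a
+ * sketch; the base is passed in the low bits of *flags*)::
+ *
+ *	const char buf[] = "  4096";
+ *	unsigned long val;
+ *	long n;
+ *
+ *	n = bpf_strtoul(buf, sizeof(buf) - 1, 10, &val);
+ *	if (n > 0) {
+ *		/* n characters consumed, val == 4096 */
+ *	}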
2861
+ *
2862
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
2863
+ * Description
2864
+ * Get a bpf-local-storage from a *sk*.
2865
+ *
2866
+ * Logically, it could be thought of as getting the value from
2867
+ * a *map* with *sk* as the **key**. From this
2868
+ * perspective, the usage is not much different from
2869
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this
2870
+ * helper enforces that the key must be a full socket and the map
2871
+ * must be of type **BPF_MAP_TYPE_SK_STORAGE**.
2872
+ *
2873
+ * Underneath, the value is stored locally at *sk* instead of
2874
+ * the *map*. The *map* is used as the bpf-local-storage
2875
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
2876
+ * searched against all bpf-local-storages residing at *sk*.
2877
+ *
2878
+ * *sk* is a kernel **struct sock** pointer for LSM program.
2879
+ * *sk* is a **struct bpf_sock** pointer for other program types.
2880
+ *
2881
+ * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
2882
+ * used such that a new bpf-local-storage will be
2883
+ * created if one does not exist. *value* can be used
2884
+ * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify
2885
+ * the initial value of a bpf-local-storage. If *value* is
2886
+ * **NULL**, the new bpf-local-storage will be zero initialized.
2887
+ * Return
2888
+ * A bpf-local-storage pointer is returned on success.
2889
+ *
2890
+ * **NULL** if not found or there was an error in adding
2891
+ * a new bpf-local-storage.
2892
+ *
2893
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
2894
+ * Description
2895
+ * Delete a bpf-local-storage from a *sk*.
2896
+ * Return
2897
+ * 0 on success.
2898
+ *
2899
+ * **-ENOENT** if the bpf-local-storage cannot be found.
2900
+ * **-EINVAL** if sk is not a fullsock (e.g. a request_sock).
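+ *
+ * The usual get-or-create pattern for socket storage looks like
+ * this sketch (*sk_data* and its value struct are illustrative)::
+ *
+ *	struct storage *s;
+ *
+ *	s = bpf_sk_storage_get(&sk_data, sk, NULL,
+ *			       BPF_SK_STORAGE_GET_F_CREATE);
+ *	if (s)
+ *		s->pkts++;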
2901
+ *
2902
+ * long bpf_send_signal(u32 sig)
2903
+ * Description
2904
+ * Send signal *sig* to the process of the current task.
2905
+ * The signal may be delivered to any of this process's threads.
2906
+ * Return
2907
+ * 0 on success or successfully queued.
2908
+ *
2909
+ * **-EBUSY** if work queue under NMI is full.
2910
+ *
2911
+ * **-EINVAL** if *sig* is invalid.
2912
+ *
2913
+ * **-EPERM** if no permission to send the *sig*.
2914
+ *
2915
+ * **-EAGAIN** if bpf program can try again.
2916
+ *
2917
+ * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
2918
+ * Description
2919
+ * Try to issue a SYN cookie for the packet with corresponding
2920
+ * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
2921
+ *
2922
+ * *iph* points to the start of the IPv4 or IPv6 header, while
2923
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
2924
+ * **sizeof**\ (**struct ip6hdr**).
2925
+ *
2926
+ * *th* points to the start of the TCP header, while *th_len*
2927
+ * contains the length of the TCP header.
2928
+ * Return
2929
+ * On success, the lower 32 bits hold the generated SYN cookie,
2930
+ * followed by 16 bits which hold the MSS value for that cookie,
2931
+ * and the top 16 bits are unused.
2932
+ *
2933
+ * On failure, the returned value is one of the following:
2934
+ *
2935
+ * **-EINVAL** SYN cookie cannot be issued due to error
2936
+ *
2937
+ * **-ENOENT** SYN cookie should not be issued (no SYN flood)
2938
+ *
2939
+ * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
2940
+ *
2941
+ * **-EPROTONOSUPPORT** IP packet version is not 4 or 6
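+ *
+ * A sketch of unpacking the combined return value (*sk*, *iph*
+ * and *th* as described above)::
+ *
+ *	__s64 ret = bpf_tcp_gen_syncookie(sk, iph, iph_len, th, th_len);
+ *	if (ret >= 0) {
+ *		__u32 cookie = (__u32)ret;	/* lower 32 bits */
+ *		__u16 mss = (__u16)(ret >> 32);	/* next 16 bits */
+ *	}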
2942
+ *
2943
+ * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
2944
+ * Description
2945
+ * Write raw *data* blob into a special BPF perf event held by
2946
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
2947
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
2948
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
2949
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
2950
+ *
2951
+ * The *flags* are used to indicate the index in *map* for which
2952
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
2953
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
2954
+ * to indicate that the index of the current CPU core should be
2955
+ * used.
2956
+ *
2957
+ * The value to write, of *size*, is passed through eBPF stack and
2958
+ * pointed by *data*.
2959
+ *
2960
+ * *ctx* is a pointer to in-kernel struct sk_buff.
2961
+ *
2962
+ * This helper is similar to **bpf_perf_event_output**\ () but
2963
+ * restricted to raw_tracepoint bpf programs.
2964
+ * Return
2965
+ * 0 on success, or a negative error in case of failure.
2966
+ *
2967
+ * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
2968
+ * Description
2969
+ * Safely attempt to read *size* bytes from user space address
2970
+ * *unsafe_ptr* and store the data in *dst*.
2971
+ * Return
2972
+ * 0 on success, or a negative error in case of failure.
2973
+ *
2974
+ * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
2975
+ * Description
2976
+ * Safely attempt to read *size* bytes from kernel space address
2977
+ * *unsafe_ptr* and store the data in *dst*.
2978
+ * Return
2979
+ * 0 on success, or a negative error in case of failure.
2980
+ *
2981
+ * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
2982
+ * Description
2983
+ * Copy a NUL terminated string from an unsafe user address
2984
+ * *unsafe_ptr* to *dst*. The *size* should include the
2985
+ * terminating NUL byte. In case the string length is smaller than
2986
+ * *size*, the target is not padded with further NUL bytes. If the
2987
+ * string length is larger than *size*, just *size*-1 bytes are
2988
+ * copied and the last byte is set to NUL.
2989
+ *
2990
+ * On success, the length of the copied string is returned. This
2991
+ * makes this helper useful in tracing programs for reading
2992
+ * strings, and more importantly to get its length at runtime. See
2993
+ * the following snippet:
2994
+ *
2995
+ * ::
2996
+ *
2997
+ * SEC("kprobe/sys_open")
2998
+ * void bpf_sys_open(struct pt_regs *ctx)
2999
+ * {
3000
+ * char buf[PATHLEN]; // PATHLEN is defined to 256
3001
+ * int res = bpf_probe_read_user_str(buf, sizeof(buf),
3002
+ * ctx->di);
3003
+ *
3004
+ * // Consume buf, for example push it to
3005
+ * // userspace via bpf_perf_event_output(); we
3006
+ * // can use res (the string length) as event
3007
+ * // size, after checking its boundaries.
3008
+ * }
3009
+ *
3010
+ * In comparison, using **bpf_probe_read_user**\ () helper here
3011
+ * instead to read the string would require to estimate the length
3012
+ * at compile time, and would often result in copying more memory
3013
+ * than necessary.
3014
+ *
3015
+ * Another useful use case is when parsing individual process
3016
+ * arguments or individual environment variables navigating
3017
+ * *current*\ **->mm->arg_start** and *current*\
3018
+ * **->mm->env_start**: using this helper and the return value,
3019
+ * one can quickly iterate at the right offset of the memory area.
3020
+ * Return
3021
+ * On success, the strictly positive length of the string,
3022
+ * including the trailing NUL character. On error, a negative
3023
+ * value.
3024
+ *
3025
+ * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
3026
+ * Description
3027
+ * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
3028
+ * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
3029
+ * Return
3030
+ * On success, the strictly positive length of the string, including
3031
+ * the trailing NUL character. On error, a negative value.
3032
+ *
3033
+ * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
3034
+ * Description
3035
+ * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
3036
+ * *rcv_nxt* is the ack_seq to be sent out.
3037
+ * Return
3038
+ * 0 on success, or a negative error in case of failure.
3039
+ *
3040
+ * long bpf_send_signal_thread(u32 sig)
3041
+ * Description
3042
+ * Send signal *sig* to the thread corresponding to the current task.
3043
+ * Return
3044
+ * 0 on success or successfully queued.
3045
+ *
3046
+ * **-EBUSY** if work queue under NMI is full.
3047
+ *
3048
+ * **-EINVAL** if *sig* is invalid.
3049
+ *
3050
+ * **-EPERM** if no permission to send the *sig*.
3051
+ *
3052
+ * **-EAGAIN** if bpf program can try again.
3053
+ *
3054
+ * u64 bpf_jiffies64(void)
3055
+ * Description
3056
+ * Obtain the 64-bit jiffies counter.
3057
+ * Return
3058
+ * The 64-bit jiffies value.
3059
+ *
3060
+ * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
3061
+ * Description
3062
+ * For an eBPF program attached to a perf event, retrieve the
3063
+ * branch records (**struct perf_branch_entry**) associated to *ctx*
3064
+ * and store them in the buffer pointed to by *buf*, up to
3065
+ * *size* bytes.
3066
+ * Return
3067
+ * On success, number of bytes written to *buf*. On error, a
3068
+ * negative value.
3069
+ *
3070
+ * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
3071
+ * instead return the number of bytes required to store all the
3072
+ * branch entries. If this flag is set, *buf* may be NULL.
3073
+ *
3074
+ * **-EINVAL** if arguments invalid or **size** not a multiple
3075
+ * of **sizeof**\ (**struct perf_branch_entry**\ ).
3076
+ *
3077
+ * **-ENOENT** if architecture does not support branch records.
3078
+ *
3079
+ * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size)
3080
+ * Description
3081
+ * Obtain the values for *pid* and *tgid* as seen from the current
3082
+ * *namespace*; they will be returned in *nsdata*.
3083
+ * Return
3084
+ * 0 on success, or one of the following in case of failure:
3085
+ *
3086
+ * **-EINVAL** if dev and inum supplied don't match dev_t and inode number
3087
+ * with nsfs of current task, or if dev conversion to dev_t lost high bits.
3088
+ *
3089
+ * **-ENOENT** if pidns does not exist for the current task.
3090
+ *
3091
+ * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
3092
+ * Description
3093
+ * Write raw *data* blob into a special BPF perf event held by
3094
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
3095
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
3096
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
3097
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
3098
+ *
3099
+ * The *flags* are used to indicate the index in *map* for which
3100
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
3101
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
3102
+ * to indicate that the index of the current CPU core should be
3103
+ * used.
3104
+ *
3105
+ * The value to write, of *size*, is passed through eBPF stack and
3106
+ * pointed by *data*.
3107
+ *
3108
+ * *ctx* is a pointer to in-kernel struct xdp_buff.
3109
+ *
3110
+ * This helper is similar to **bpf_perf_event_output**\ () but
3111
+ * restricted to raw_tracepoint bpf programs.
3112
+ * Return
3113
+ * 0 on success, or a negative error in case of failure.
3114
+ *
3115
+ * u64 bpf_get_netns_cookie(void *ctx)
3116
+ * Description
3117
+ * Retrieve the cookie (generated by the kernel) of the network
3118
+ * namespace the input *ctx* is associated with. The network
3119
+ * namespace cookie remains stable for its lifetime and provides
3120
+ * a global identifier that can be assumed unique. If *ctx* is
3121
+ * NULL, then the helper returns the cookie for the initial
3122
+ * network namespace. The cookie itself is very similar to that
3123
+ * of **bpf_get_socket_cookie**\ () helper, but for network
3124
+ * namespaces instead of sockets.
3125
+ * Return
3126
+ * An 8-byte long opaque number.
3127
+ *
3128
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
3129
+ * Description
3130
+ * Return id of cgroup v2 that is ancestor of the cgroup associated
3131
+ * with the current task at the *ancestor_level*. The root cgroup
3132
+ * is at *ancestor_level* zero and each step down the hierarchy
3133
+ * increments the level. If *ancestor_level* == level of cgroup
3134
+ * associated with the current task, then return value will be the
3135
+ * same as that of **bpf_get_current_cgroup_id**\ ().
3136
+ *
3137
+ * The helper is useful to implement policies based on cgroups
3138
+ * that are upper in hierarchy than immediate cgroup associated
3139
+ * with the current task.
3140
+ *
3141
+ * The format of returned id and helper limitations are same as in
3142
+ * **bpf_get_current_cgroup_id**\ ().
3143
+ * Return
3144
+ * The id is returned or 0 in case the id could not be retrieved.
3145
+ *
3146
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
3147
+ * Description
3148
+ * Helper is overloaded depending on BPF program type. This
3149
+ * description applies to **BPF_PROG_TYPE_SCHED_CLS** and
3150
+ * **BPF_PROG_TYPE_SCHED_ACT** programs.
3151
+ *
3152
+ * Assign the *sk* to the *skb*. When combined with appropriate
3153
+ * routing configuration to receive the packet towards the socket,
3154
+ * this will cause *skb* to be delivered to the specified socket.
3155
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
3156
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
3157
+ * interfere with successful delivery to the socket.
3158
+ *
3159
+ * This operation is only valid from TC ingress path.
3160
+ *
3161
+ * The *flags* argument must be zero.
3162
+ * Return
3163
+ * 0 on success, or a negative error in case of failure:
3164
+ *
3165
+ * **-EINVAL** if specified *flags* are not supported.
3166
+ *
3167
+ * **-ENOENT** if the socket is unavailable for assignment.
3168
+ *
3169
+ * **-ENETUNREACH** if the socket is unreachable (wrong netns).
3170
+ *
3171
+ * **-EOPNOTSUPP** if the operation is not supported, for example
3172
+ * a call from outside of TC ingress.
3173
+ *
3174
+ * **-ESOCKTNOSUPPORT** if the socket type is not supported
3175
+ * (reuseport).
3176
+ *
3177
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
3178
+ * Description
3179
+ * Helper is overloaded depending on BPF program type. This
3180
+ * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
3181
+ *
3182
+ * Select the *sk* as a result of a socket lookup.
3183
+ *
3184
+ * For the operation to succeed, the passed socket must be compatible
3185
+ * with the packet description provided by the *ctx* object.
3186
+ *
3187
+ * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
3188
+ * be an exact match. While IP family (**AF_INET** or
3189
+ * **AF_INET6**) must be compatible, that is IPv6 sockets
3190
+ * that are not v6-only can be selected for IPv4 packets.
3191
+ *
3192
+ * Only TCP listeners and UDP unconnected sockets can be
3193
+ * selected. *sk* can also be NULL to reset any previous
3194
+ * selection.
3195
+ *
3196
+ * The *flags* argument can be a combination of the following values:
3197
+ *
3198
+ * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
3199
+ * socket selection, potentially done by a BPF program
3200
+ * that ran before us.
3201
+ *
3202
+ * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
3203
+ * load-balancing within reuseport group for the socket
3204
+ * being selected.
3205
+ *
3206
+ * On success *ctx->sk* will point to the selected socket.
3207
+ *
3208
+ * Return
3209
+ * 0 on success, or a negative errno in case of failure.
3210
+ *
3211
+ * * **-EAFNOSUPPORT** if socket family (*sk->family*) is
3212
+ * not compatible with packet family (*ctx->family*).
3213
+ *
3214
+ * * **-EEXIST** if socket has been already selected,
3215
+ * potentially by another program, and
3216
+ * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
3217
+ *
3218
+ * * **-EINVAL** if unsupported flags were specified.
3219
+ *
3220
+ * * **-EPROTOTYPE** if socket L4 protocol
3221
+ * (*sk->protocol*) doesn't match packet protocol
3222
+ * (*ctx->protocol*).
3223
+ *
3224
+ * * **-ESOCKTNOSUPPORT** if socket is not in allowed
3225
+ * state (TCP listening or UDP unconnected).
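+ *
+ * A dispatch sketch for a **BPF_PROG_TYPE_SK_LOOKUP** program
+ * (the *dest_socks* sockmap and its key are illustrative)::
+ *
+ *	SEC("sk_lookup")
+ *	int dispatch(struct bpf_sk_lookup *ctx)
+ *	{
+ *		struct bpf_sock *sk;
+ *		__u32 key = 0;
+ *		long err;
+ *
+ *		sk = bpf_map_lookup_elem(&dest_socks, &key);
+ *		if (!sk)
+ *			return SK_PASS;
+ *		err = bpf_sk_assign(ctx, sk, 0);
+ *		bpf_sk_release(sk);
+ *		return err ? SK_DROP : SK_PASS;
+ *	}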
21623226 *
21633227 * u64 bpf_ktime_get_boot_ns(void)
21643228 * Description
21653229 * Return the time elapsed since system boot, in nanoseconds.
21663230 * Does include the time the system was suspended.
2167
- * See: clock_gettime(CLOCK_BOOTTIME)
3231
+ * See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
21683232 * Return
21693233 * Current *ktime*.
3234
+ *
3235
+ * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
3236
+ * Description
3237
+ * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
3238
+ * out the format string.
3239
+ * The *m* represents the seq_file. The *fmt* and *fmt_size* are for
3240
+ * the format string itself. The *data* and *data_len* are format string
3241
+ * arguments. The *data* are a **u64** array and corresponding format string
3242
+ * values are stored in the array. For strings and pointers where pointees
3243
+ * are accessed, only the pointer values are stored in the *data* array.
3244
+ * The *data_len* is the size of *data* in bytes.
3245
+ *
3246
+ * Formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
3247
+ * Reading kernel memory may fail due to either invalid address or
3248
+ * valid address but requiring a major memory fault. If reading kernel memory
3249
+ * fails, the string for **%s** will be an empty string, and the ip
3250
+ * address for **%p{i,I}{4,6}** will be 0. Not returning error to
3251
+ * bpf program is consistent with what **bpf_trace_printk**\ () does for now.
3252
+ * Return
3253
+ * 0 on success, or a negative error in case of failure:
3254
+ *
3255
+ * **-EBUSY** if per-CPU memory copy buffer is busy, can try again
3256
+ * by returning 1 from bpf program.
3257
+ *
3258
+ * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
3259
+ *
3260
+ * **-E2BIG** if *fmt* contains too many format specifiers.
3261
+ *
3262
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
3263
+ *
3264
+ * long bpf_seq_write(struct seq_file *m, const void *data, u32 len)
3265
+ * Description
3266
+ * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
3267
+ * The *m* represents the seq_file. The *data* and *len* represent the
3268
+ * data to write in bytes.
3269
+ * Return
3270
+ * 0 on success, or a negative error in case of failure:
3271
+ *
3272
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
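+ *
+ * Both seq_file helpers are typically called from **BPF_TRACE_ITER**
+ * programs; a minimal sketch of printing one value per task,
+ * following the documented *data*/*data_len* convention
+ * (iterator context types assumed from vmlinux.h)::
+ *
+ *	SEC("iter/task")
+ *	int dump_task(struct bpf_iter__task *ctx)
+ *	{
+ *		struct seq_file *m = ctx->meta->seq;
+ *		struct task_struct *task = ctx->task;
+ *		char fmt[] = "pid=%d\n";
+ *		__u64 pid;
+ *
+ *		if (!task)
+ *			return 0;
+ *		pid = task->pid;
+ *		bpf_seq_printf(m, fmt, sizeof(fmt), &pid, sizeof(pid));
+ *		return 0;
+ *	}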
3273
+ *
3274
+ * u64 bpf_sk_cgroup_id(void *sk)
3275
+ * Description
3276
+ * Return the cgroup v2 id of the socket *sk*.
3277
+ *
3278
+ * *sk* must be a non-**NULL** pointer to a socket, e.g. one
3279
+ * returned from **bpf_sk_lookup_xxx**\ (),
3280
+ * **bpf_sk_fullsock**\ (), etc. The format of returned id is
3281
+ * same as in **bpf_skb_cgroup_id**\ ().
3282
+ *
3283
+ * This helper is available only if the kernel was compiled with
3284
+ * the **CONFIG_SOCK_CGROUP_DATA** configuration option.
3285
+ * Return
3286
+ * The id is returned or 0 in case the id could not be retrieved.
3287
+ *
3288
+ * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level)
3289
+ * Description
3290
+ * Return id of cgroup v2 that is ancestor of cgroup associated
3291
+ * with the *sk* at the *ancestor_level*. The root cgroup is at
3292
+ * *ancestor_level* zero and each step down the hierarchy
3293
+ * increments the level. If *ancestor_level* == level of cgroup
3294
+ * associated with *sk*, then return value will be same as that
3295
+ * of **bpf_sk_cgroup_id**\ ().
3296
+ *
3297
+ * The helper is useful to implement policies based on cgroups
3298
+ * that are upper in hierarchy than immediate cgroup associated
3299
+ * with *sk*.
3300
+ *
3301
+ * The format of returned id and helper limitations are same as in
3302
+ * **bpf_sk_cgroup_id**\ ().
3303
+ * Return
3304
+ * The id is returned or 0 in case the id could not be retrieved.
3305
+ *
3306
+ * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
3307
+ * Description
3308
+ * Copy *size* bytes from *data* into a ring buffer *ringbuf*.
3309
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3310
+ * of new data availability is sent.
3311
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3312
+ * of new data availability is sent unconditionally.
3313
+ * Return
3314
+ * 0 on success, or a negative error in case of failure.
3315
+ *
3316
+ * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
3317
+ * Description
3318
+ * Reserve *size* bytes of payload in a ring buffer *ringbuf*.
3319
+ * Return
3320
+ * A valid pointer with *size* bytes of memory available; **NULL**
3321
+ * otherwise.
3322
+ *
3323
+ * void bpf_ringbuf_submit(void *data, u64 flags)
3324
+ * Description
3325
+ * Submit reserved ring buffer sample, pointed to by *data*.
3326
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3327
+ * of new data availability is sent.
3328
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3329
+ * of new data availability is sent unconditionally.
3330
+ * Return
3331
+ * Nothing. Always succeeds.
3332
+ *
3333
+ * void bpf_ringbuf_discard(void *data, u64 flags)
3334
+ * Description
3335
+ * Discard reserved ring buffer sample, pointed to by *data*.
3336
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
3337
+ * of new data availability is sent.
3338
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
3339
+ * of new data availability is sent unconditionally.
3340
+ * Return
3341
+ * Nothing. Always succeeds.
3342
+ *
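+ * A minimal reserve/submit sketch (illustrative only; libbpf map
+ * definition conventions are assumed and names are hypothetical)::
+ *
+ *        struct {
+ *                __uint(type, BPF_MAP_TYPE_RINGBUF);
+ *                __uint(max_entries, 4096);
+ *        } rb SEC(".maps");
+ *
+ *        SEC("tracepoint/syscalls/sys_enter_execve")
+ *        int trace_execve(void *ctx)
+ *        {
+ *                __u64 *e = bpf_ringbuf_reserve(&rb, sizeof(__u64), 0);
+ *
+ *                if (!e)
+ *                        return 0;
+ *                *e = bpf_get_current_pid_tgid();
+ *                bpf_ringbuf_submit(e, 0);
+ *                return 0;
+ *        }
+ *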
3343
+ * u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
3344
+ * Description
3345
+ * Query various characteristics of provided ring buffer. What
3346
+ * exactly is queried is determined by *flags*:
3347
+ *
3348
+ * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed.
3349
+ * * **BPF_RB_RING_SIZE**: The size of ring buffer.
3350
+ * * **BPF_RB_CONS_POS**: Consumer position (can wrap around).
3351
+ * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around).
3352
+ *
3353
+ * The data returned is just a momentary snapshot of the actual values
3354
+ * and could be inaccurate, so this facility should be used to
3355
+ * power heuristics and for reporting, not to make 100% correct
3356
+ * calculations.
3357
+ * Return
3358
+ * Requested value, or 0, if *flags* are not recognized.
3359
+ *
3360
+ * long bpf_csum_level(struct sk_buff *skb, u64 level)
3361
+ * Description
3362
+ * Change the skb's checksum level by one layer up or down, or
3363
+ * reset it entirely to none in order to have the stack perform
3364
+ * checksum validation. The level is applicable to the following
3365
+ * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of
3366
+ * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP |
3367
+ * through **bpf_skb_adjust_room**\ () helper with passing in
3368
+ * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call
3369
+ * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since
3370
+ * the UDP header is removed. Similarly, an encap of the latter
3371
+ * into the former could be accompanied by a helper call to
3372
+ * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the
3373
+ * skb is still intended to be processed in higher layers of the
3374
+ * stack instead of just egressing at tc.
3375
+ *
3376
+ * The following level operations are supported at this time:
3377
+ *
3378
+ * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs
3379
+ * with CHECKSUM_UNNECESSARY.
3380
+ * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs
3381
+ * with CHECKSUM_UNNECESSARY.
3382
+ * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and
3383
+ * sets CHECKSUM_NONE to force checksum validation by the stack.
3384
+ * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current
3385
+ * skb->csum_level.
3386
+ * Return
3387
+ * 0 on success, or a negative error in case of failure. In the
3388
+ * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
3389
+ * is returned or the error code -EACCES in case the skb is not
3390
+ * subject to CHECKSUM_UNNECESSARY.
3391
+ *
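+ * A hedged sketch of the decap case above from a tc program (the
+ * shrink length and room mode are illustrative; the TC_ACT_*
+ * values come from **linux/pkt_cls.h**)::
+ *
+ *        SEC("tc")
+ *        int decap_prog(struct __sk_buff *skb)
+ *        {
+ *                if (bpf_skb_adjust_room(skb, -8, BPF_ADJ_ROOM_MAC,
+ *                                        BPF_F_ADJ_ROOM_NO_CSUM_RESET) < 0)
+ *                        return TC_ACT_SHOT;
+ *                if (bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC) < 0)
+ *                        return TC_ACT_SHOT;
+ *                return TC_ACT_OK;
+ *        }
+ *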
3392
+ * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk)
3393
+ * Description
3394
+ * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
3395
+ * Return
3396
+ * *sk* if casting is valid, or **NULL** otherwise.
3397
+ *
3398
+ * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk)
3399
+ * Description
3400
+ * Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
3401
+ * Return
3402
+ * *sk* if casting is valid, or **NULL** otherwise.
3403
+ *
3404
+ * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk)
3405
+ * Description
3406
+ * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
3407
+ * Return
3408
+ * *sk* if casting is valid, or **NULL** otherwise.
3409
+ *
3410
+ * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk)
3411
+ * Description
3412
+ * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
3413
+ * Return
3414
+ * *sk* if casting is valid, or **NULL** otherwise.
3415
+ *
3416
+ * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk)
3417
+ * Description
3418
+ * Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
3419
+ * Return
3420
+ * *sk* if casting is valid, or **NULL** otherwise.
3421
+ *
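+ * The **bpf_skc_to_xxx**\ () casts are typically used from
+ * BTF-aware (tracing or iterator) programs; an illustrative
+ * fragment, assuming **vmlinux.h** types and libbpf's
+ * **bpf_printk** convenience macro::
+ *
+ *        struct tcp_sock *tp = bpf_skc_to_tcp_sock(sk);
+ *
+ *        if (tp)
+ *                bpf_printk("snd_cwnd %u", tp->snd_cwnd);
+ *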
3422
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
3423
+ * Description
3424
+ * Return a user or a kernel stack in a bpf program provided buffer.
3425
+ * To achieve this, the helper needs *task*, which is a valid
3426
+ * pointer to **struct task_struct**. To store the stacktrace, the
3427
+ * bpf program provides *buf* with a nonnegative *size*.
3428
+ *
3429
+ * The last argument, *flags*, holds the number of stack frames to
3430
+ * skip (from 0 to 255), masked with
3431
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
3432
+ * the following flags:
3433
+ *
3434
+ * **BPF_F_USER_STACK**
3435
+ * Collect a user space stack instead of a kernel stack.
3436
+ * **BPF_F_USER_BUILD_ID**
3437
+ * Collect buildid+offset instead of ips for user stack,
3438
+ * only valid if **BPF_F_USER_STACK** is also specified.
3439
+ *
3440
+ * **bpf_get_task_stack**\ () can collect up to
3441
+ * **PERF_MAX_STACK_DEPTH** frames for both kernel and user stacks, subject
3442
+ * to a sufficiently large buffer size. Note that
3443
+ * this limit can be controlled with the **sysctl** program, and
3444
+ * that it should be manually increased in order to profile long
3445
+ * user stacks (such as stacks for Java programs). To do so, use:
3446
+ *
3447
+ * ::
3448
+ *
3449
+ * # sysctl kernel.perf_event_max_stack=<new value>
3450
+ * Return
3451
+ * The non-negative copied *buf* length equal to or less than
3452
+ * *size* on success, or a negative error in case of failure.
3453
+ *
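+ * An illustrative fragment from a task iterator program (buffer
+ * sizing is the caller's choice; *m* and *task* come from the
+ * iterator context)::
+ *
+ *        __u64 ips[32];
+ *        long n;
+ *
+ *        n = bpf_get_task_stack(task, ips, sizeof(ips), 0);
+ *        if (n > 0)
+ *                bpf_seq_write(m, ips, n);
+ *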
3454
+ * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
3455
+ * Description
3456
+ * Load a header option. Supports reading a particular TCP header
3457
+ * option from a bpf program (**BPF_PROG_TYPE_SOCK_OPS**).
3458
+ *
3459
+ * If *flags* is 0, it will search the option from the
3460
+ * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops**
3461
+ * has details on what skb_data contains under different
3462
+ * *skops*\ **->op**.
3463
+ *
3464
+ * The first byte of the *searchby_res* specifies the
3465
+ * kind to search for.
3466
+ *
3467
+ * If the searched kind is an experimental kind
3468
+ * (i.e. 253 or 254 according to RFC6994), it also
3469
+ * needs to specify the "magic", which is either
3470
+ * 2 bytes or 4 bytes, and the size of that magic
3471
+ * by using the 2nd byte, which is the "kind-length"
3472
+ * of the TCP header option. As in a normal TCP
3473
+ * header option, the "kind-length" also includes
3474
+ * the first 2 bytes, "kind" and "kind-length"
3475
+ * itself.
3476
+ *
3477
+ * For example, to search experimental kind 254 with
3478
+ * 2 byte magic 0xeB9F, the searchby_res should be
3479
+ * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ].
3480
+ *
3481
+ * To search for the standard window scale option (3),
3482
+ * the *searchby_res* should be [ 3, 0, 0, .... 0 ].
3483
+ * Note that kind-length must be 0 for a regular option.
3484
+ *
3485
+ * Searching for No-Op (0) and End-of-Option-List (1) is
3486
+ * not supported.
3487
+ *
3488
+ * *len* must be at least 2 bytes, which is the minimal size
3489
+ * of a header option.
3490
+ *
3491
+ * Supported flags:
3492
+ *
3493
+ * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
3494
+ * saved_syn packet or the just-received syn packet.
3495
+ *
3496
+ * Return
3497
+ * > 0 when found, the header option is copied to *searchby_res*.
3498
+ * The return value is the total length copied. On failure, a
3499
+ * negative error code is returned:
3500
+ *
3501
+ * **-EINVAL** if a parameter is invalid.
3502
+ *
3503
+ * **-ENOMSG** if the option is not found.
3504
+ *
3505
+ * **-ENOENT** if no syn packet is available when
3506
+ * **BPF_LOAD_HDR_OPT_TCP_SYN** is used.
3507
+ *
3508
+ * **-ENOSPC** if there is not enough space. Only *len* number of
3509
+ * bytes are copied.
3510
+ *
3511
+ * **-EFAULT** on failure to parse the header options in the
3512
+ * packet.
3513
+ *
3514
+ * **-EPERM** if the helper cannot be used under the current
3515
+ * *skops*\ **->op**.
3516
+ *
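+ * A hedged fragment searching for the experimental option from
+ * the example above, inside a **BPF_PROG_TYPE_SOCK_OPS** program
+ * (**bpf_printk** is a libbpf convenience macro)::
+ *
+ *        __u8 opt[8] = { 254, 4, 0xeB, 0x9F, };
+ *        long ret;
+ *
+ *        ret = bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
+ *        if (ret > 0)
+ *                bpf_printk("found option, %ld bytes", ret);
+ *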
3517
+ * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
3518
+ * Description
3519
+ * Store header option. The data will be copied
3520
+ * from buffer *from* with length *len* to the TCP header.
3521
+ *
3522
+ * The buffer *from* should have the whole option that
3523
+ * includes the kind, kind-length, and the actual
3524
+ * option data. The *len* must be at least kind-length
3525
+ * long. The kind-length does not have to be 4-byte
3526
+ * aligned. The kernel will take care of the padding
3527
+ * and setting the 4-byte aligned value to th->doff.
3528
+ *
3529
+ * This helper will check for a duplicated option
3530
+ * by searching for the same option in the outgoing skb.
3531
+ *
3532
+ * This helper can only be called during
3533
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
3534
+ *
3535
+ * Return
3536
+ * 0 on success, or negative error in case of failure:
3537
+ *
3538
+ * **-EINVAL** if a parameter is invalid.
3539
+ *
3540
+ * **-ENOSPC** if there is not enough space in the header.
3541
+ * Nothing has been written.
3542
+ *
3543
+ * **-EEXIST** if the option already exists.
3544
+ *
3545
+ * **-EFAULT** on failure to parse the existing header options.
3546
+ *
3547
+ * **-EPERM** if the helper cannot be used under the current
3548
+ * *skops*\ **->op**.
3549
+ *
3550
+ * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
3551
+ * Description
3552
+ * Reserve *len* bytes for the bpf header option. The
3553
+ * space will be used by **bpf_store_hdr_opt**\ () later in
3554
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
3555
+ *
3556
+ * If **bpf_reserve_hdr_opt**\ () is called multiple times,
3557
+ * the total of all requested bytes will be reserved.
3558
+ *
3559
+ * This helper can only be called during
3560
+ * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**.
3561
+ *
3562
+ * Return
3563
+ * 0 on success, or negative error in case of failure:
3564
+ *
3565
+ * **-EINVAL** if a parameter is invalid.
3566
+ *
3567
+ * **-ENOSPC** if there is not enough space in the header.
3568
+ *
3569
+ * **-EPERM** if the helper cannot be used under the current
3570
+ * *skops*\ **->op**.
3571
+ *
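+ * The reserve/store pair is typically driven from a single
+ * sockops program; an abbreviated, illustrative sketch::
+ *
+ *        __u8 opt[4] = { 254, 4, 0xeB, 0x9F };
+ *
+ *        switch (skops->op) {
+ *        case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+ *                bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
+ *                break;
+ *        case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+ *                bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
+ *                break;
+ *        }
+ *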
3572
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
3573
+ * Description
3574
+ * Get a bpf_local_storage from an *inode*.
3575
+ *
3576
+ * Logically, it could be thought of as getting the value from
3577
+ * a *map* with *inode* as the **key**. From this
3578
+ * perspective, the usage is not much different from
3579
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
3580
+ * helper enforces that the key must be an inode and the map must
3581
+ * be of type **BPF_MAP_TYPE_INODE_STORAGE**.
3582
+ *
3583
+ * Underneath, the value is stored locally at *inode* instead of
3584
+ * the *map*. The *map* is used as the bpf-local-storage
3585
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
3586
+ * searched against all bpf_local_storage residing at *inode*.
3587
+ *
3588
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
3589
+ * used such that a new bpf_local_storage will be
3590
+ * created if one does not exist. *value* can be used
3591
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
3592
+ * the initial value of a bpf_local_storage. If *value* is
3593
+ * **NULL**, the new bpf_local_storage will be zero initialized.
3594
+ * Return
3595
+ * A bpf_local_storage pointer is returned on success.
3596
+ *
3597
+ * **NULL** if not found or there was an error in adding
3598
+ * a new bpf_local_storage.
3599
+ *
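+ * An illustrative LSM sketch (map, program, and hook names are
+ * examples only; **BPF_PROG** comes from libbpf's **bpf_tracing.h**)::
+ *
+ *        struct {
+ *                __uint(type, BPF_MAP_TYPE_INODE_STORAGE);
+ *                __uint(map_flags, BPF_F_NO_PREALLOC);
+ *                __type(key, int);
+ *                __type(value, __u32);
+ *        } inode_cnt SEC(".maps");
+ *
+ *        SEC("lsm/file_open")
+ *        int BPF_PROG(count_open, struct file *file)
+ *        {
+ *                __u32 *cnt;
+ *
+ *                cnt = bpf_inode_storage_get(&inode_cnt, file->f_inode, 0,
+ *                                            BPF_LOCAL_STORAGE_GET_F_CREATE);
+ *                if (cnt)
+ *                        __sync_fetch_and_add(cnt, 1);
+ *                return 0;
+ *        }
+ *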
3600
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
3601
+ * Description
3602
+ * Delete a bpf_local_storage from an *inode*.
3603
+ * Return
3604
+ * 0 on success.
3605
+ *
3606
+ * **-ENOENT** if the bpf_local_storage cannot be found.
3607
+ *
3608
+ * long bpf_d_path(struct path *path, char *buf, u32 sz)
3609
+ * Description
3610
+ * Return the full path for the given **struct path** object, which
3611
+ * needs to be the kernel BTF *path* object. The path is
3612
+ * returned in the provided buffer *buf* of size *sz* and
3613
+ * is zero terminated.
3614
+ *
3615
+ * Return
3616
+ * On success, the strictly positive length of the string,
3617
+ * including the trailing NUL character. On error, a negative
3618
+ * value.
3619
+ *
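+ * The helper is only allowed from a set of approved hooks; a
+ * hedged sketch from an LSM program (**bpf_printk** is a libbpf
+ * convenience macro)::
+ *
+ *        SEC("lsm/file_open")
+ *        int BPF_PROG(log_path, struct file *file)
+ *        {
+ *                char buf[64];
+ *                long n;
+ *
+ *                n = bpf_d_path(&file->f_path, buf, sizeof(buf));
+ *                if (n > 0)
+ *                        bpf_printk("open: %s", buf);
+ *                return 0;
+ *        }
+ *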
3620
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
3621
+ * Description
3622
+ * Read *size* bytes from user space address *user_ptr* and store
3623
+ * the data in *dst*. This is a wrapper of **copy_from_user**\ ().
3624
+ * Return
3625
+ * 0 on success, or a negative error in case of failure.
3626
+ *
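+ * Because the copy may fault user pages in, the helper is limited
+ * to sleepable programs; a hedged sketch reading from a user
+ * address carried by the hook's context (names are examples)::
+ *
+ *        SEC("lsm.s/bprm_committed_creds")
+ *        int BPF_PROG(read_stack, struct linux_binprm *bprm)
+ *        {
+ *                char first[16] = {};
+ *
+ *                bpf_copy_from_user(first, sizeof(first),
+ *                                   (const void *)bprm->p);
+ *                return 0;
+ *        }
+ *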
3627
+ * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags)
3628
+ * Description
3629
+ * Use BTF to store a string representation of *ptr*->ptr in *str*,
3630
+ * using *ptr*->type_id. This value should specify the type
3631
+ * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1)
3632
+ * can be used to look up vmlinux BTF type ids. Traversing the
3633
+ * data structure using BTF, the type information and values are
3634
+ * stored in the first *str_size* - 1 bytes of *str*. Safe copy of
3635
+ * the pointer data is carried out to avoid kernel crashes during
3636
+ * operation. Smaller types can use string space on the stack;
3637
+ * larger types can use map data to store the string
3638
+ * representation.
3639
+ *
3640
+ * The string can be subsequently shared with userspace via
3641
+ * bpf_perf_event_output() or ring buffer interfaces.
3642
+ * bpf_trace_printk() is to be avoided as it places too small
3643
+ * a limit on string size to be useful.
3644
+ *
3645
+ * *flags* is a combination of
3646
+ *
3647
+ * **BTF_F_COMPACT**
3648
+ * no formatting around type information
3649
+ * **BTF_F_NONAME**
3650
+ * no struct/union member names/types
3651
+ * **BTF_F_PTR_RAW**
3652
+ * show raw (unobfuscated) pointer values;
3653
+ * equivalent to printk specifier %px.
3654
+ * **BTF_F_ZERO**
3655
+ * show zero-valued struct/union members; they
3656
+ * are not displayed by default
3657
+ *
3658
+ * Return
3659
+ * The number of bytes that were written (or would have been
3660
+ * written if output had to be truncated due to string size),
3661
+ * or a negative error in case of failure.
3662
+ *
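+ * An illustrative fragment (assuming libbpf's
+ * **bpf_core_type_id_kernel**\ () to obtain the vmlinux type id;
+ * *task* is a **struct task_struct** pointer from context)::
+ *
+ *        static char out[256];
+ *        struct btf_ptr p = {
+ *                .ptr = task,
+ *                .type_id = bpf_core_type_id_kernel(struct task_struct),
+ *        };
+ *
+ *        bpf_snprintf_btf(out, sizeof(out), &p, sizeof(p), BTF_F_COMPACT);
+ *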
3663
+ * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags)
3664
+ * Description
3665
+ * Use BTF to write, via **seq_write**\ (), a string representation of
3666
+ * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf().
3667
+ * *flags* are identical to those used for bpf_snprintf_btf.
3668
+ * Return
3669
+ * 0 on success or a negative error in case of failure.
3670
+ *
3671
+ * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
3672
+ * Description
3673
+ * See **bpf_get_cgroup_classid**\ () for the main description.
3674
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that
3675
+ * the cgroup v1 net_cls class is retrieved only from the *skb*'s
3676
+ * associated socket instead of the current process.
3677
+ * Return
3678
+ * The id is returned or 0 in case the id could not be retrieved.
3679
+ *
3680
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
3681
+ * Description
3682
+ * Redirect the packet to another net device of index *ifindex*
3683
+ * and fill in L2 addresses from neighboring subsystem. This helper
3684
+ * is somewhat similar to **bpf_redirect**\ (), except that it
3685
+ * populates L2 addresses as well, meaning, internally, the helper
3686
+ * relies on the neighbor lookup for the L2 address of the nexthop.
3687
+ *
3688
+ * The helper will perform a FIB lookup based on the skb's
3689
+ * networking header to get the address of the next hop, unless
3690
+ * this is supplied by the caller in the *params* argument. The
3691
+ * *plen* argument indicates the length of *params* and should be set
3692
+ * to 0 if *params* is NULL.
3693
+ *
3694
+ * The *flags* argument is reserved and must be 0. The helper is
3695
+ * currently only supported for tc BPF program types, and enabled
3696
+ * for IPv4 and IPv6 protocols.
3697
+ * Return
3698
+ * The helper returns **TC_ACT_REDIRECT** on success or
3699
+ * **TC_ACT_SHOT** on error.
3700
+ *
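+ * An illustrative tc fragment (the ifindex value 42 is
+ * hypothetical)::
+ *
+ *        SEC("tc")
+ *        int redir_gw(struct __sk_buff *skb)
+ *        {
+ *                return bpf_redirect_neigh(42, NULL, 0, 0);
+ *        }
+ *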
3701
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
3702
+ * Description
3703
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
3704
+ * pointer to the percpu kernel variable on *cpu*. A ksym is an
3705
+ * extern variable decorated with '__ksym'. For each ksym, there is a
3706
+ * global variable (either static or global) defined with the same name
3707
+ * in the kernel. The ksym is percpu if the global var is percpu.
3708
+ * The returned pointer points to the global percpu var on *cpu*.
3709
+ *
3710
+ * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
3711
+ * kernel, except that bpf_per_cpu_ptr() may return NULL. This
3712
+ * happens if *cpu* is larger than nr_cpu_ids. The caller of
3713
+ * bpf_per_cpu_ptr() must check the returned value.
3714
+ * Return
3715
+ * A pointer pointing to the kernel percpu variable on *cpu*, or
3716
+ * NULL, if *cpu* is invalid.
3717
+ *
3718
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
3719
+ * Description
3720
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
3721
+ * pointer to the percpu kernel variable on this cpu. See the
3722
+ * description of 'ksym' in **bpf_per_cpu_ptr**\ ().
3723
+ *
3724
+ * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
3725
+ * the kernel. Unlike **bpf_per_cpu_ptr**\ (), it will
3726
+ * never return NULL.
3727
+ * Return
3728
+ * A pointer pointing to the kernel percpu variable on this cpu.
3729
+ *
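+ * A hedged fragment in the style of the kernel selftests (the
+ * percpu ksym shown is an example)::
+ *
+ *        extern const struct rq runqueues __ksym;
+ *
+ *        const struct rq *rq = bpf_per_cpu_ptr(&runqueues, 0);
+ *
+ *        if (!rq)
+ *                return 0;
+ *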
3730
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
3731
+ * Description
3732
+ * Redirect the packet to another net device of index *ifindex*.
3733
+ * This helper is somewhat similar to **bpf_redirect**\ (), except
3734
+ * that the redirection happens to the *ifindex*'s peer device and
3735
+ * the netns switch takes place from ingress to ingress without
3736
+ * going through the CPU's backlog queue.
3737
+ *
3738
+ * The *flags* argument is reserved and must be 0. The helper is
3739
+ * currently only supported for tc BPF program types at the ingress
3740
+ * hook and for veth device types. The peer device must reside in a
3741
+ * different network namespace.
3742
+ * Return
3743
+ * The helper returns **TC_ACT_REDIRECT** on success or
3744
+ * **TC_ACT_SHOT** on error.
21703745 */
21713746 #define __BPF_FUNC_MAPPER(FN) \
21723747 FN(unspec), \
....@@ -2294,7 +3869,38 @@
22943869 FN(get_netns_cookie), \
22953870 FN(get_current_ancestor_cgroup_id), \
22963871 FN(sk_assign), \
2297
- FN(ktime_get_boot_ns),
3872
+ FN(ktime_get_boot_ns), \
3873
+ FN(seq_printf), \
3874
+ FN(seq_write), \
3875
+ FN(sk_cgroup_id), \
3876
+ FN(sk_ancestor_cgroup_id), \
3877
+ FN(ringbuf_output), \
3878
+ FN(ringbuf_reserve), \
3879
+ FN(ringbuf_submit), \
3880
+ FN(ringbuf_discard), \
3881
+ FN(ringbuf_query), \
3882
+ FN(csum_level), \
3883
+ FN(skc_to_tcp6_sock), \
3884
+ FN(skc_to_tcp_sock), \
3885
+ FN(skc_to_tcp_timewait_sock), \
3886
+ FN(skc_to_tcp_request_sock), \
3887
+ FN(skc_to_udp6_sock), \
3888
+ FN(get_task_stack), \
3889
+ FN(load_hdr_opt), \
3890
+ FN(store_hdr_opt), \
3891
+ FN(reserve_hdr_opt), \
3892
+ FN(inode_storage_get), \
3893
+ FN(inode_storage_delete), \
3894
+ FN(d_path), \
3895
+ FN(copy_from_user), \
3896
+ FN(snprintf_btf), \
3897
+ FN(seq_printf_btf), \
3898
+ FN(skb_cgroup_classid), \
3899
+ FN(redirect_neigh), \
3900
+ FN(per_cpu_ptr), \
3901
+ FN(this_cpu_ptr), \
3902
+ FN(redirect_peer), \
3903
+ /* */
22983904
22993905 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
23003906 * function eBPF program intends to call
....@@ -2309,50 +3915,147 @@
23093915 /* All flags used by eBPF helper functions, placed here. */
23103916
23113917 /* BPF_FUNC_skb_store_bytes flags. */
2312
-#define BPF_F_RECOMPUTE_CSUM (1ULL << 0)
2313
-#define BPF_F_INVALIDATE_HASH (1ULL << 1)
3918
+enum {
3919
+ BPF_F_RECOMPUTE_CSUM = (1ULL << 0),
3920
+ BPF_F_INVALIDATE_HASH = (1ULL << 1),
3921
+};
23143922
23153923 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
23163924 * First 4 bits are for passing the header field size.
23173925 */
2318
-#define BPF_F_HDR_FIELD_MASK 0xfULL
3926
+enum {
3927
+ BPF_F_HDR_FIELD_MASK = 0xfULL,
3928
+};
23193929
23203930 /* BPF_FUNC_l4_csum_replace flags. */
2321
-#define BPF_F_PSEUDO_HDR (1ULL << 4)
2322
-#define BPF_F_MARK_MANGLED_0 (1ULL << 5)
2323
-#define BPF_F_MARK_ENFORCE (1ULL << 6)
3931
+enum {
3932
+ BPF_F_PSEUDO_HDR = (1ULL << 4),
3933
+ BPF_F_MARK_MANGLED_0 = (1ULL << 5),
3934
+ BPF_F_MARK_ENFORCE = (1ULL << 6),
3935
+};
23243936
23253937 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
2326
-#define BPF_F_INGRESS (1ULL << 0)
3938
+enum {
3939
+ BPF_F_INGRESS = (1ULL << 0),
3940
+};
23273941
23283942 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
2329
-#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
3943
+enum {
3944
+ BPF_F_TUNINFO_IPV6 = (1ULL << 0),
3945
+};
23303946
23313947 /* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
2332
-#define BPF_F_SKIP_FIELD_MASK 0xffULL
2333
-#define BPF_F_USER_STACK (1ULL << 8)
3948
+enum {
3949
+ BPF_F_SKIP_FIELD_MASK = 0xffULL,
3950
+ BPF_F_USER_STACK = (1ULL << 8),
23343951 /* flags used by BPF_FUNC_get_stackid only. */
2335
-#define BPF_F_FAST_STACK_CMP (1ULL << 9)
2336
-#define BPF_F_REUSE_STACKID (1ULL << 10)
3952
+ BPF_F_FAST_STACK_CMP = (1ULL << 9),
3953
+ BPF_F_REUSE_STACKID = (1ULL << 10),
23373954 /* flags used by BPF_FUNC_get_stack only. */
2338
-#define BPF_F_USER_BUILD_ID (1ULL << 11)
3955
+ BPF_F_USER_BUILD_ID = (1ULL << 11),
3956
+};
23393957
23403958 /* BPF_FUNC_skb_set_tunnel_key flags. */
2341
-#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
2342
-#define BPF_F_DONT_FRAGMENT (1ULL << 2)
2343
-#define BPF_F_SEQ_NUMBER (1ULL << 3)
3959
+enum {
3960
+ BPF_F_ZERO_CSUM_TX = (1ULL << 1),
3961
+ BPF_F_DONT_FRAGMENT = (1ULL << 2),
3962
+ BPF_F_SEQ_NUMBER = (1ULL << 3),
3963
+};
23443964
23453965 /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
23463966 * BPF_FUNC_perf_event_read_value flags.
23473967 */
2348
-#define BPF_F_INDEX_MASK 0xffffffffULL
2349
-#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
3968
+enum {
3969
+ BPF_F_INDEX_MASK = 0xffffffffULL,
3970
+ BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK,
23503971 /* BPF_FUNC_perf_event_output for sk_buff input context. */
2351
-#define BPF_F_CTXLEN_MASK (0xfffffULL << 32)
3972
+ BPF_F_CTXLEN_MASK = (0xfffffULL << 32),
3973
+};
3974
+
3975
+/* Current network namespace */
3976
+enum {
3977
+ BPF_F_CURRENT_NETNS = (-1L),
3978
+};
3979
+
3980
+/* BPF_FUNC_csum_level level values. */
3981
+enum {
3982
+ BPF_CSUM_LEVEL_QUERY,
3983
+ BPF_CSUM_LEVEL_INC,
3984
+ BPF_CSUM_LEVEL_DEC,
3985
+ BPF_CSUM_LEVEL_RESET,
3986
+};
3987
+
3988
+/* BPF_FUNC_skb_adjust_room flags. */
3989
+enum {
3990
+ BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0),
3991
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1),
3992
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2),
3993
+ BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
3994
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
3995
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
3996
+};
3997
+
3998
+enum {
3999
+ BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff,
4000
+ BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56,
4001
+};
4002
+
4003
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \
4004
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) \
4005
+ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
4006
+
4007
+/* BPF_FUNC_sysctl_get_name flags. */
4008
+enum {
4009
+ BPF_F_SYSCTL_BASE_NAME = (1ULL << 0),
4010
+};
4011
+
4012
+/* BPF_FUNC_<kernel_obj>_storage_get flags */
4013
+enum {
4014
+ BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0),
4015
+ /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
4016
+ * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
4017
+ */
4018
+ BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE,
4019
+};
4020
+
4021
+/* BPF_FUNC_read_branch_records flags. */
4022
+enum {
4023
+ BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0),
4024
+};
4025
+
4026
+/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and
4027
+ * BPF_FUNC_bpf_ringbuf_output flags.
4028
+ */
4029
+enum {
4030
+ BPF_RB_NO_WAKEUP = (1ULL << 0),
4031
+ BPF_RB_FORCE_WAKEUP = (1ULL << 1),
4032
+};
4033
+
4034
+/* BPF_FUNC_bpf_ringbuf_query flags */
4035
+enum {
4036
+ BPF_RB_AVAIL_DATA = 0,
4037
+ BPF_RB_RING_SIZE = 1,
4038
+ BPF_RB_CONS_POS = 2,
4039
+ BPF_RB_PROD_POS = 3,
4040
+};
4041
+
4042
+/* BPF ring buffer constants */
4043
+enum {
4044
+ BPF_RINGBUF_BUSY_BIT = (1U << 31),
4045
+ BPF_RINGBUF_DISCARD_BIT = (1U << 30),
4046
+ BPF_RINGBUF_HDR_SZ = 8,
4047
+};
4048
+
4049
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
4050
+enum {
4051
+ BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0),
4052
+ BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1),
4053
+};
23524054
23534055 /* Mode for BPF_FUNC_skb_adjust_room helper. */
23544056 enum bpf_adj_room_mode {
23554057 BPF_ADJ_ROOM_NET,
4058
+ BPF_ADJ_ROOM_MAC,
23564059 };
23574060
23584061 /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
....@@ -2364,8 +4067,15 @@
23644067 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
23654068 enum bpf_lwt_encap_mode {
23664069 BPF_LWT_ENCAP_SEG6,
2367
- BPF_LWT_ENCAP_SEG6_INLINE
4070
+ BPF_LWT_ENCAP_SEG6_INLINE,
4071
+ BPF_LWT_ENCAP_IP,
23684072 };
4073
+
4074
+#define __bpf_md_ptr(type, name) \
4075
+union { \
4076
+ type name; \
4077
+ __u64 :64; \
4078
+} __attribute__((aligned(8)))
23694079
23704080 /* user accessible mirror of in-kernel sk_buff.
23714081 * new fields can only be added to the end of this structure
....@@ -2401,6 +4111,12 @@
24014111 /* ... here. */
24024112
24034113 __u32 data_meta;
4114
+ __bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
4115
+ __u64 tstamp;
4116
+ __u32 wire_len;
4117
+ __u32 gso_segs;
4118
+ __bpf_md_ptr(struct bpf_sock *, sk);
4119
+ __u32 gso_size;
24044120 };
24054121
24064122 struct bpf_tunnel_key {
....@@ -2442,7 +4158,15 @@
24424158 BPF_DROP = 2,
24434159 /* 3-6 reserved */
24444160 BPF_REDIRECT = 7,
2445
- /* >127 are reserved for prog type specific return codes */
4161
+ /* >127 are reserved for prog type specific return codes.
4162
+ *
4163
+ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
4164
+ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
4165
+ * changed and should be routed based on its new L3 header.
4166
+ * (This is an L3 redirect, as opposed to L2 redirect
4167
+ * represented by BPF_REDIRECT above).
4168
+ */
4169
+ BPF_LWT_REROUTE = 128,
24464170 };
24474171
24484172 struct bpf_sock {
....@@ -2452,15 +4176,82 @@
24524176 __u32 protocol;
24534177 __u32 mark;
24544178 __u32 priority;
2455
- __u32 src_ip4; /* Allows 1,2,4-byte read.
2456
- * Stored in network byte order.
4179
+ /* IP address also allows 1 and 2 bytes access */
4180
+ __u32 src_ip4;
4181
+ __u32 src_ip6[4];
4182
+ __u32 src_port; /* host byte order */
4183
+ __be16 dst_port; /* network byte order */
4184
+ __u16 :16; /* zero padding */
4185
+ __u32 dst_ip4;
4186
+ __u32 dst_ip6[4];
4187
+ __u32 state;
4188
+ __s32 rx_queue_mapping;
4189
+};
4190
+
4191
+struct bpf_tcp_sock {
4192
+ __u32 snd_cwnd; /* Sending congestion window */
4193
+ __u32 srtt_us; /* smoothed round trip time << 3 in usecs */
4194
+ __u32 rtt_min;
4195
+ __u32 snd_ssthresh; /* Slow start size threshold */
4196
+ __u32 rcv_nxt; /* What we want to receive next */
4197
+ __u32 snd_nxt; /* Next sequence we send */
4198
+ __u32 snd_una; /* First byte we want an ack for */
4199
+ __u32 mss_cache; /* Cached effective mss, not including SACKS */
4200
+ __u32 ecn_flags; /* ECN status bits. */
4201
+ __u32 rate_delivered; /* saved rate sample: packets delivered */
4202
+ __u32 rate_interval_us; /* saved rate sample: time elapsed */
4203
+ __u32 packets_out; /* Packets which are "in flight" */
4204
+ __u32 retrans_out; /* Retransmitted packets out */
4205
+ __u32 total_retrans; /* Total retransmits for entire connection */
4206
+ __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
4207
+ * total number of segments in.
24574208 */
2458
- __u32 src_ip6[4]; /* Allows 1,2,4-byte read.
2459
- * Stored in network byte order.
4209
+ __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
4210
+ * total number of data segments in.
24604211 */
2461
- __u32 src_port; /* Allows 4-byte read.
2462
- * Stored in host byte order
4212
+ __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
4213
+ * The total number of segments sent.
24634214 */
4215
+ __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
4216
+ * total number of data segments sent.
4217
+ */
4218
+ __u32 lost_out; /* Lost packets */
4219
+ __u32 sacked_out; /* SACK'd packets */
4220
+ __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
4221
+ * sum(delta(rcv_nxt)), or how many bytes
4222
+ * were acked.
4223
+ */
4224
+ __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
4225
+ * sum(delta(snd_una)), or how many bytes
4226
+ * were acked.
4227
+ */
4228
+ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
4229
+ * total number of DSACK blocks received
4230
+ */
4231
+ __u32 delivered; /* Total data packets delivered incl. rexmits */
4232
+ __u32 delivered_ce; /* Like the above but only ECE marked packets */
4233
+ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
4234
+};
4235
+
4236
+struct bpf_sock_tuple {
4237
+ union {
4238
+ struct {
4239
+ __be32 saddr;
4240
+ __be32 daddr;
4241
+ __be16 sport;
4242
+ __be16 dport;
4243
+ } ipv4;
4244
+ struct {
4245
+ __be32 saddr[4];
4246
+ __be32 daddr[4];
4247
+ __be16 sport;
4248
+ __be16 dport;
4249
+ } ipv6;
4250
+ };
4251
+};
4252
+
4253
+struct bpf_xdp_sock {
4254
+ __u32 queue_id;
24644255 };
24654256
24664257 #define XDP_PACKET_HEADROOM 256
....@@ -2488,6 +4279,34 @@
24884279 /* Below access go through struct xdp_rxq_info */
24894280 __u32 ingress_ifindex; /* rxq->dev->ifindex */
24904281 __u32 rx_queue_index; /* rxq->queue_index */
4282
+
4283
+ __u32 egress_ifindex; /* txq->dev->ifindex */
4284
+};
4285
+
4286
+/* DEVMAP map-value layout
4287
+ *
4288
+ * The struct data-layout of map-value is a configuration interface.
4289
+ * New members can only be added to the end of this structure.
4290
+ */
4291
+struct bpf_devmap_val {
4292
+ __u32 ifindex; /* device index */
4293
+ union {
4294
+ int fd; /* prog fd on map write */
4295
+ __u32 id; /* prog id on map read */
4296
+ } bpf_prog;
4297
+};
4298
+
4299
+/* CPUMAP map-value layout
4300
+ *
4301
+ * The struct data-layout of map-value is a configuration interface.
4302
+ * New members can only be added to the end of this structure.
4303
+ */
4304
+struct bpf_cpumap_val {
4305
+ __u32 qsize; /* queue size to remote target CPU */
4306
+ union {
4307
+ int fd; /* prog fd on map write */
4308
+ __u32 id; /* prog id on map read */
4309
+ } bpf_prog;
24914310 };
24924311
24934312 enum sk_action {
....@@ -2499,8 +4318,8 @@
24994318 * be added to the end of this structure
25004319 */
25014320 struct sk_msg_md {
2502
- void *data;
2503
- void *data_end;
4321
+ __bpf_md_ptr(void *, data);
4322
+ __bpf_md_ptr(void *, data_end);
25044323
25054324 __u32 family;
25064325 __u32 remote_ip4; /* Stored in network byte order */
....@@ -2509,6 +4328,9 @@
25094328 __u32 local_ip6[4]; /* Stored in network byte order */
25104329 __u32 remote_port; /* Stored in network byte order */
25114330 __u32 local_port; /* stored in host byte order */
4331
+ __u32 size; /* Total size of sk_msg */
4332
+
4333
+ __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
25124334 };
25134335
25144336 struct sk_reuseport_md {
....@@ -2516,8 +4338,9 @@
25164338 * Start of directly accessible data. It begins from
25174339 * the tcp/udp header.
25184340 */
2519
- void *data;
2520
- void *data_end; /* End of directly accessible data */
4341
+ __bpf_md_ptr(void *, data);
4342
+ /* End of directly accessible data */
4343
+ __bpf_md_ptr(void *, data_end);
25214344 /*
25224345 * Total length of packet (starting from the tcp/udp header).
25234346 * Note that the directly accessible bytes (data_end - data)
....@@ -2559,6 +4382,20 @@
25594382 __u32 nr_jited_func_lens;
25604383 __aligned_u64 jited_ksyms;
25614384 __aligned_u64 jited_func_lens;
4385
+ __u32 btf_id;
4386
+ __u32 func_info_rec_size;
4387
+ __aligned_u64 func_info;
4388
+ __u32 nr_func_info;
4389
+ __u32 nr_line_info;
4390
+ __aligned_u64 line_info;
4391
+ __aligned_u64 jited_line_info;
4392
+ __u32 nr_jited_line_info;
4393
+ __u32 line_info_rec_size;
4394
+ __u32 jited_line_info_rec_size;
4395
+ __u32 nr_prog_tags;
4396
+ __aligned_u64 prog_tags;
4397
+ __u64 run_time_ns;
4398
+ __u64 run_cnt;
25624399 } __attribute__((aligned(8)));
25634400
25644401 struct bpf_map_info {
....@@ -2570,7 +4407,7 @@
25704407 __u32 map_flags;
25714408 char name[BPF_OBJ_NAME_LEN];
25724409 __u32 ifindex;
2573
- __u32 :32;
4410
+ __u32 btf_vmlinux_value_type_id;
25744411 __u64 netns_dev;
25754412 __u64 netns_ino;
25764413 __u32 btf_id;
....@@ -2584,30 +4421,66 @@
25844421 __u32 id;
25854422 } __attribute__((aligned(8)));
25864423
4424
+struct bpf_link_info {
4425
+ __u32 type;
4426
+ __u32 id;
4427
+ __u32 prog_id;
4428
+ union {
4429
+ struct {
4430
+ __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
4431
+ __u32 tp_name_len; /* in/out: tp_name buffer len */
4432
+ } raw_tracepoint;
4433
+ struct {
4434
+ __u32 attach_type;
4435
+ } tracing;
4436
+ struct {
4437
+ __u64 cgroup_id;
4438
+ __u32 attach_type;
4439
+ } cgroup;
4440
+ struct {
4441
+ __aligned_u64 target_name; /* in/out: target_name buffer ptr */
4442
+ __u32 target_name_len; /* in/out: target_name buffer len */
4443
+ union {
4444
+ struct {
4445
+ __u32 map_id;
4446
+ } map;
4447
+ };
4448
+ } iter;
4449
+ struct {
4450
+ __u32 netns_ino;
4451
+ __u32 attach_type;
4452
+ } netns;
4453
+ struct {
4454
+ __u32 ifindex;
4455
+ } xdp;
4456
+ };
4457
+} __attribute__((aligned(8)));
4458
+
25874459 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
25884460 * by user and intended to be used by socket (e.g. to bind to, depends on
2589
- * attach attach type).
4461
+ * attach type).
25904462 */
25914463 struct bpf_sock_addr {
25924464 __u32 user_family; /* Allows 4-byte read, but no write. */
25934465 __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
25944466 * Stored in network byte order.
25954467 */
2596
- __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write.
4468
+ __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
25974469 * Stored in network byte order.
25984470 */
2599
- __u32 user_port; /* Allows 4-byte read and write.
4471
+ __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
26004472 * Stored in network byte order
26014473 */
26024474 __u32 family; /* Allows 4-byte read, but no write */
26034475 __u32 type; /* Allows 4-byte read, but no write */
26044476 __u32 protocol; /* Allows 4-byte read, but no write */
2605
- __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write.
4477
+ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write.
26064478 * Stored in network byte order.
26074479 */
2608
- __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write.
4480
+ __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
26094481 * Stored in network byte order.
26104482 */
4483
+ __bpf_md_ptr(struct bpf_sock *, sk);
26114484 };
26124485
26134486 /* User bpf_sock_ops struct to access socket values and specify request ops
....@@ -2659,15 +4532,91 @@
26594532 __u32 sk_txhash;
26604533 __u64 bytes_received;
26614534 __u64 bytes_acked;
4535
+ __bpf_md_ptr(struct bpf_sock *, sk);
4536
+ /* [skb_data, skb_data_end) covers the whole TCP header.
4537
+ *
4538
+ * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received
4539
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the
4540
+ * header has not been written.
4541
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have
4542
+ * been written so far.
4543
+ * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes
4544
+ * the 3WHS.
4545
+ * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes
4546
+ * the 3WHS.
4547
+ *
4548
+ * bpf_load_hdr_opt() can also be used to read a particular option.
4549
+ */
4550
+ __bpf_md_ptr(void *, skb_data);
4551
+ __bpf_md_ptr(void *, skb_data_end);
4552
+ __u32 skb_len; /* The total length of a packet.
4553
+ * It includes the header, options,
4554
+ * and payload.
4555
+ */
4556
+ __u32 skb_tcp_flags; /* tcp_flags of the header. It provides
4557
+ * an easy way to check for tcp_flags
4558
+ * without parsing skb_data.
4559
+ *
4560
+ * In particular, the skb_tcp_flags
4561
+ * will still be available in
4562
+ * BPF_SOCK_OPS_HDR_OPT_LEN even though
4563
+ * the outgoing header has not
4564
+ * been written yet.
4565
+ */
26624566 };
26634567
26644568 /* Definitions for bpf_sock_ops_cb_flags */
2665
-#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
2666
-#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
2667
-#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
2668
-#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
2669
- * supported cb flags
2670
- */
4569
+enum {
4570
+ BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0),
4571
+ BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1),
4572
+ BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2),
4573
+ BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3),
4574
+ /* Call bpf for all received TCP headers. The bpf prog will be
4575
+ * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4576
+ *
4577
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4578
+ * for the header option related helpers that will be useful
4579
+ * to the bpf programs.
4580
+ *
4581
+ * It could be used at the client/active side (i.e. connect() side)
4582
+ * when the server has told it that the server is in syncookie
4583
+ * mode and requires the active side to resend the bpf-written
4584
+ * options. The active side can keep writing the bpf-options until
4585
+ * it receives a valid packet from the server side confirming
4586
+ * that the earlier packet (and options) has been received. A later
4587
+ * example patch uses it like this at the active side when the
4588
+ * server is in syncookie mode.
4589
+ *
4590
+ * The bpf prog will usually turn this off in the common cases.
4591
+ */
4592
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4),
4593
+ /* Call bpf when kernel has received a header option that
4594
+ * the kernel cannot handle. The bpf prog will be called under
4595
+ * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB.
4596
+ *
4597
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
4598
+ * for the header option related helpers that will be useful
4599
+ * to the bpf programs.
4600
+ */
4601
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
4602
+ /* Call bpf when the kernel is writing header options for the
4603
+ * outgoing packet. The bpf prog will first be called
4604
+ * to reserve space in a skb under
4605
+ * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then
4606
+ * the bpf prog will be called to write the header option(s)
4607
+ * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4608
+ *
4609
+ * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB
4610
+ * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option
4611
+ * related helpers that will be useful to the bpf programs.
4612
+ *
4613
+ * The kernel gets its chance to reserve space and write
4614
+ * options first before the BPF program does.
4615
+ */
4616
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
4617
+	/* Mask of all currently supported cb flags */
4618
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
4619
+};
26714620
26724621 /* List of known BPF sock_ops operators.
26734622 * New entries can only be added at the end
....@@ -2720,6 +4669,65 @@
27204669 BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
27214670 * socket transition to LISTEN state.
27224671 */
4672
+ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
4673
+ */
4674
+ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option.
4675
+ * It will be called to handle
4676
+ * the packets received at
4677
+ * an already established
4678
+ * connection.
4679
+ *
4680
+ * sock_ops->skb_data:
4681
+ * Referring to the received skb.
4682
+ * It covers the TCP header only.
4683
+ *
4684
+ * bpf_load_hdr_opt() can also
4685
+ * be used to search for a
4686
+ * particular option.
4687
+ */
4688
+ BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the
4689
+ * header option later in
4690
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4691
+ * Arg1: bool want_cookie. (in
4692
+ * writing SYNACK only)
4693
+ *
4694
+ * sock_ops->skb_data:
4695
+ * Not available because no header has
4696
+ * been written yet.
4697
+ *
4698
+ * sock_ops->skb_tcp_flags:
4699
+ * The tcp_flags of the
4700
+ * outgoing skb. (e.g. SYN, ACK, FIN).
4701
+ *
4702
+ * bpf_reserve_hdr_opt() should
4703
+ * be used to reserve space.
4704
+ */
4705
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options
4706
+ * Arg1: bool want_cookie. (in
4707
+ * writing SYNACK only)
4708
+ *
4709
+ * sock_ops->skb_data:
4710
+ * Referring to the outgoing skb.
4711
+ * It covers the TCP header
4712
+ * that has already been written
4713
+ * by the kernel and the
4714
+ * earlier bpf-progs.
4715
+ *
4716
+ * sock_ops->skb_tcp_flags:
4717
+ * The tcp_flags of the outgoing
4718
+ * skb. (e.g. SYN, ACK, FIN).
4719
+ *
4720
+ * bpf_store_hdr_opt() should
4721
+ * be used to write the
4722
+ * option.
4723
+ *
4724
+ * bpf_load_hdr_opt() can also
4725
+ * be used to search for a
4726
+ * particular option that
4727
+ * has already been written
4728
+ * by the kernel or the
4729
+ * earlier bpf-progs.
4730
+ */
27234731 };
27244732
27254733 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
....@@ -2744,8 +4752,67 @@
27444752 BPF_TCP_MAX_STATES /* Leave at the end! */
27454753 };
27464754
2747
-#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
2748
-#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
4755
+enum {
4756
+ TCP_BPF_IW = 1001, /* Set TCP initial congestion window */
4757
+ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */
4758
+ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */
4759
+ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */
4760
+ /* Copy the SYN pkt to optval
4761
+ *
4762
+ * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the
4763
+ * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit
4764
+ * to only getting from the saved_syn. It can either get the
4765
+ * syn packet from:
4766
+ *
4767
+ * 1. the just-received SYN packet (only available when writing the
4768
+ * SYNACK). It will be useful when it is not necessary to
4769
+ * save the SYN packet for latter use. It is also the only way
4770
+ * to get the SYN during syncookie mode because the syn
4771
+ * packet cannot be saved during syncookie.
4772
+ *
4773
+ * OR
4774
+ *
4775
+ * 2. the earlier saved syn which was done by
4776
+ * bpf_setsockopt(TCP_SAVE_SYN).
4777
+ *
4778
+ * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the
4779
+ * SYN packet is obtained.
4780
+ *
4781
+ * If the bpf-prog does not need the IP[46] header, the
4782
+ * bpf-prog can avoid parsing the IP header by using
4783
+ * TCP_BPF_SYN. Otherwise, the bpf-prog can get both
4784
+ * IP[46] and TCP header by using TCP_BPF_SYN_IP.
4785
+ *
4786
+ * >0: Total number of bytes copied
4787
+ * -ENOSPC: Not enough space in optval. Only optlen number of
4788
+ * bytes is copied.
4789
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt
4790
+ * is not saved by setsockopt(TCP_SAVE_SYN).
4791
+ */
4792
+ TCP_BPF_SYN = 1005, /* Copy the TCP header */
4793
+ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
4794
+ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
4795
+};
4796
+
4797
+enum {
4798
+ BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0),
4799
+};
4800
+
4801
+/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and
4802
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
4803
+ */
4804
+enum {
4805
+ BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the
4806
+ * total option spaces
4807
+ * required for an established
4808
+ * sk in order to calculate the
4809
+ * MSS. No skb is actually
4810
+ * sent.
4811
+ */
4812
+ BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode
4813
+ * when sending a SYN.
4814
+ */
4815
+};
27494816
27504817 struct bpf_perf_event_value {
27514818 __u64 counter;
....@@ -2753,12 +4820,16 @@
27534820 __u64 running;
27544821 };
27554822
2756
-#define BPF_DEVCG_ACC_MKNOD (1ULL << 0)
2757
-#define BPF_DEVCG_ACC_READ (1ULL << 1)
2758
-#define BPF_DEVCG_ACC_WRITE (1ULL << 2)
4823
+enum {
4824
+ BPF_DEVCG_ACC_MKNOD = (1ULL << 0),
4825
+ BPF_DEVCG_ACC_READ = (1ULL << 1),
4826
+ BPF_DEVCG_ACC_WRITE = (1ULL << 2),
4827
+};
27594828
2760
-#define BPF_DEVCG_DEV_BLOCK (1ULL << 0)
2761
-#define BPF_DEVCG_DEV_CHAR (1ULL << 1)
4829
+enum {
4830
+ BPF_DEVCG_DEV_BLOCK = (1ULL << 0),
4831
+ BPF_DEVCG_DEV_CHAR = (1ULL << 1),
4832
+};
27624833
27634834 struct bpf_cgroup_dev_ctx {
27644835 /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
....@@ -2774,8 +4845,10 @@
27744845 /* DIRECT: Skip the FIB rules and go to FIB table associated with device
27754846 * OUTPUT: Do lookup from egress perspective; default is ingress
27764847 */
2777
-#define BPF_FIB_LOOKUP_DIRECT (1U << 0)
2778
-#define BPF_FIB_LOOKUP_OUTPUT (1U << 1)
4848
+enum {
4849
+ BPF_FIB_LOOKUP_DIRECT = (1U << 0),
4850
+ BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
4851
+};
27794852
27804853 enum {
27814854 BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
....@@ -2838,6 +4911,16 @@
28384911 __u8 dmac[6]; /* ETH_ALEN */
28394912 };
28404913
4914
+struct bpf_redir_neigh {
4915
+ /* network family for lookup (AF_INET, AF_INET6) */
4916
+ __u32 nh_family;
4917
+ /* network address of nexthop; skips fib lookup to find gateway */
4918
+ union {
4919
+ __be32 ipv4_nh;
4920
+ __u32 ipv6_nh[4]; /* in6_addr; network order */
4921
+ };
4922
+};
4923
+
28414924 enum bpf_task_fd_type {
28424925 BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
28434926 BPF_FD_TYPE_TRACEPOINT, /* tp name */
....@@ -2847,4 +4930,126 @@
28474930 BPF_FD_TYPE_URETPROBE, /* filename + offset */
28484931 };
28494932
4933
+enum {
4934
+ BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0),
4935
+ BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1),
4936
+ BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2),
4937
+};
4938
+
4939
+struct bpf_flow_keys {
4940
+ __u16 nhoff;
4941
+ __u16 thoff;
4942
+ __u16 addr_proto; /* ETH_P_* of valid addrs */
4943
+ __u8 is_frag;
4944
+ __u8 is_first_frag;
4945
+ __u8 is_encap;
4946
+ __u8 ip_proto;
4947
+ __be16 n_proto;
4948
+ __be16 sport;
4949
+ __be16 dport;
4950
+ union {
4951
+ struct {
4952
+ __be32 ipv4_src;
4953
+ __be32 ipv4_dst;
4954
+ };
4955
+ struct {
4956
+ __u32 ipv6_src[4]; /* in6_addr; network order */
4957
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
4958
+ };
4959
+ };
4960
+ __u32 flags;
4961
+ __be32 flow_label;
4962
+};
4963
+
4964
+struct bpf_func_info {
4965
+ __u32 insn_off;
4966
+ __u32 type_id;
4967
+};
4968
+
4969
+#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10)
4970
+#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
4971
+
4972
+struct bpf_line_info {
4973
+ __u32 insn_off;
4974
+ __u32 file_name_off;
4975
+ __u32 line_off;
4976
+ __u32 line_col;
4977
+};
4978
+
4979
+struct bpf_spin_lock {
4980
+ __u32 val;
4981
+};
4982
+
4983
+struct bpf_sysctl {
4984
+ __u32 write; /* Sysctl is being read (= 0) or written (= 1).
4985
+ * Allows 1,2,4-byte read, but no write.
4986
+ */
4987
+ __u32 file_pos; /* Sysctl file position to read from, write to.
4988
+ * Allows 1,2,4-byte read and 4-byte write.
4989
+ */
4990
+};
4991
+
4992
+struct bpf_sockopt {
4993
+ __bpf_md_ptr(struct bpf_sock *, sk);
4994
+ __bpf_md_ptr(void *, optval);
4995
+ __bpf_md_ptr(void *, optval_end);
4996
+
4997
+ __s32 level;
4998
+ __s32 optname;
4999
+ __s32 optlen;
5000
+ __s32 retval;
5001
+};
5002
+
5003
+struct bpf_pidns_info {
5004
+ __u32 pid;
5005
+ __u32 tgid;
5006
+};
5007
+
5008
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
5009
+struct bpf_sk_lookup {
5010
+ union {
5011
+ __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
5012
+ __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
5013
+ };
5014
+
5015
+ __u32 family; /* Protocol family (AF_INET, AF_INET6) */
5016
+ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
5017
+ __u32 remote_ip4; /* Network byte order */
5018
+ __u32 remote_ip6[4]; /* Network byte order */
5019
+ __u32 remote_port; /* Network byte order */
5020
+ __u32 local_ip4; /* Network byte order */
5021
+ __u32 local_ip6[4]; /* Network byte order */
5022
+ __u32 local_port; /* Host byte order */
5023
+};
5024
+
5025
+/*
5026
+ * struct btf_ptr is used for typed pointer representation; the
5027
+ * type id is used to render the pointer data as the appropriate type
5028
+ * via the bpf_snprintf_btf() helper described above. A flags field -
5029
+ * potentially to specify additional details about the BTF pointer
5030
+ * (rather than its mode of display) - is included for future use.
5031
+ * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately.
5032
+ */
5033
+struct btf_ptr {
5034
+ void *ptr;
5035
+ __u32 type_id;
5036
+ __u32 flags; /* BTF ptr flags; unused at present. */
5037
+};
5038
+
5039
+/*
5040
+ * Flags to control bpf_snprintf_btf() behaviour.
5041
+ * - BTF_F_COMPACT: no formatting around type information
5042
+ * - BTF_F_NONAME: no struct/union member names/types
5043
+ * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
5044
+ * equivalent to %px.
5045
+ * - BTF_F_ZERO: show zero-valued struct/union members; they
5046
+ * are not displayed by default
5047
+ */
5048
+enum {
5049
+ BTF_F_COMPACT = (1ULL << 0),
5050
+ BTF_F_NONAME = (1ULL << 1),
5051
+ BTF_F_PTR_RAW = (1ULL << 2),
5052
+ BTF_F_ZERO = (1ULL << 3),
5053
+};
5054
+
28505055 #endif /* _UAPI__LINUX_BPF_H__ */