2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/kernel/bpf/verifier.c
....@@ -1,19 +1,14 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
23 * Copyright (c) 2016 Facebook
3
- *
4
- * This program is free software; you can redistribute it and/or
5
- * modify it under the terms of version 2 of the GNU General Public
6
- * License as published by the Free Software Foundation.
7
- *
8
- * This program is distributed in the hope that it will be useful, but
9
- * WITHOUT ANY WARRANTY; without even the implied warranty of
10
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
- * General Public License for more details.
4
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
125 */
6
+#include <uapi/linux/btf.h>
137 #include <linux/kernel.h>
148 #include <linux/types.h>
159 #include <linux/slab.h>
1610 #include <linux/bpf.h>
11
+#include <linux/btf.h>
1712 #include <linux/bpf_verifier.h>
1813 #include <linux/filter.h>
1914 #include <net/netlink.h>
....@@ -23,16 +18,22 @@
2318 #include <linux/bsearch.h>
2419 #include <linux/sort.h>
2520 #include <linux/perf_event.h>
21
+#include <linux/ctype.h>
22
+#include <linux/error-injection.h>
23
+#include <linux/bpf_lsm.h>
24
+#include <linux/btf_ids.h>
2625
2726 #include "disasm.h"
2827
2928 static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
30
-#define BPF_PROG_TYPE(_id, _name) \
29
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
3130 [_id] = & _name ## _verifier_ops,
3231 #define BPF_MAP_TYPE(_id, _ops)
32
+#define BPF_LINK_TYPE(_id, _name)
3333 #include <linux/bpf_types.h>
3434 #undef BPF_PROG_TYPE
3535 #undef BPF_MAP_TYPE
36
+#undef BPF_LINK_TYPE
3637 };
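The three macros above implement the usual x-macro trick: <linux/bpf_types.h> lists every program, map and link type once, and only BPF_PROG_TYPE() expands to an array element here. As a hedged illustration (the concrete entry lives in bpf_types.h; only its first two arguments matter for this table), a line such as

	BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, ...)

expands at this include site to

	[BPF_PROG_TYPE_SOCKET_FILTER] = &sk_filter_verifier_ops,

while BPF_MAP_TYPE() and BPF_LINK_TYPE() expand to nothing, so map and link entries contribute no elements to bpf_verifier_ops[].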
3738
3839 /* bpf_check() is a static code analyzer that walks eBPF program
....@@ -80,8 +81,8 @@
8081 * (like pointer plus pointer becomes SCALAR_VALUE type)
8182 *
8283 * When verifier sees load or store instructions the type of base register
83
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
84
- * types recognized by check_mem_access() function.
84
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
85
+ * four pointer types recognized by check_mem_access() function.
8586 *
8687 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
8788 * and the range of [ptr, ptr + map's value_size) is accessible.
....@@ -140,6 +141,24 @@
140141 *
141142 * After the call R0 is set to return type of the function and registers R1-R5
142143 * are set to NOT_INIT to indicate that they are no longer readable.
144
+ *
145
+ * The following reference types represent a potential reference to a kernel
146
+ * resource which, after first being allocated, must be checked and freed by
147
+ * the BPF program:
148
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
149
+ *
150
+ * When the verifier sees a helper call return a reference type, it allocates a
151
+ * pointer id for the reference and stores it in the current function state.
152
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
153
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
154
+ * passes through a NULL-check conditional. For the branch wherein the state is
155
+ * changed to CONST_IMM, the verifier releases the reference.
156
+ *
157
+ * For each helper function that allocates a reference, such as
158
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
159
+ * bpf_sk_release(). When a reference type passes into the release function,
160
+ * the verifier also releases the reference. If any unchecked or unreleased
161
+ * reference remains at the end of the program, the verifier rejects it.
143162 */
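As a hedged illustration of the reference rules above (not part of this file; it assumes a libbpf-style TC classifier, and names such as sk_ref_example are invented), a program that acquires a socket reference must NULL-check it and release it on every path, or the verifier rejects the load:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	SEC("tc")
	int sk_ref_example(struct __sk_buff *skb)
	{
		struct bpf_sock_tuple tuple = {};	/* real code would fill this from the packet */
		struct bpf_sock *sk;

		/* R0 is PTR_TO_SOCKET_OR_NULL here and a reference id is recorded */
		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!sk)		/* NULL check converts the type to PTR_TO_SOCKET */
			return TC_ACT_OK;
		/* ... use sk ... */
		bpf_sk_release(sk);	/* drop the reference; leaking it fails verification */
		return TC_ACT_OK;
	}

	char LICENSE[] SEC("license") = "GPL";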
144163
145164 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
....@@ -152,11 +171,15 @@
152171 int insn_idx;
153172 int prev_insn_idx;
154173 struct bpf_verifier_stack_elem *next;
174
+ /* length of verifier log at the time this state was pushed on stack */
175
+ u32 log_pos;
155176 };
156177
157
-#define BPF_COMPLEXITY_LIMIT_INSNS 131072
158
-#define BPF_COMPLEXITY_LIMIT_STACK 1024
178
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
159179 #define BPF_COMPLEXITY_LIMIT_STATES 64
180
+
181
+#define BPF_MAP_KEY_POISON (1ULL << 63)
182
+#define BPF_MAP_KEY_SEEN (1ULL << 62)
160183
161184 #define BPF_MAP_PTR_UNPRIV 1UL
162185 #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
....@@ -165,12 +188,12 @@
165188
166189 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
167190 {
168
- return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
191
+ return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
169192 }
170193
171194 static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
172195 {
173
- return aux->map_state & BPF_MAP_PTR_UNPRIV;
196
+ return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
174197 }
175198
176199 static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
....@@ -178,8 +201,31 @@
178201 {
179202 BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
180203 unpriv |= bpf_map_ptr_unpriv(aux);
181
- aux->map_state = (unsigned long)map |
182
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
204
+ aux->map_ptr_state = (unsigned long)map |
205
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
206
+}
207
+
208
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
209
+{
210
+ return aux->map_key_state & BPF_MAP_KEY_POISON;
211
+}
212
+
213
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
214
+{
215
+ return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
216
+}
217
+
218
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
219
+{
220
+ return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
221
+}
222
+
223
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
224
+{
225
+ bool poisoned = bpf_map_key_poisoned(aux);
226
+
227
+ aux->map_key_state = state | BPF_MAP_KEY_SEEN |
228
+ (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
183229 }
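The helpers above pack three things into one u64: whether a constant map key has been observed at this instruction (BPF_MAP_KEY_SEEN), whether conflicting keys forced a give-up (BPF_MAP_KEY_POISON), and the key value itself in the low bits. A hedged sketch of how a caller could combine them (record_map_key is an illustrative name; the real call sites appear later in this file):

	/* Illustrative only: remember a constant tail-call key, or poison the
	 * state when different keys reach the same instruction.
	 */
	static void record_map_key(struct bpf_insn_aux_data *aux, u64 key)
	{
		if (bpf_map_key_unseen(aux))
			bpf_map_key_store(aux, key);		/* first constant key seen */
		else if (!bpf_map_key_poisoned(aux) &&
			 bpf_map_key_immediate(aux) != key)
			bpf_map_key_store(aux, BPF_MAP_KEY_POISON); /* keys disagree */
	}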
184230
185231 struct bpf_call_arg_meta {
....@@ -188,10 +234,38 @@
188234 bool pkt_access;
189235 int regno;
190236 int access_size;
237
+ int mem_size;
191238 u64 msize_max_value;
239
+ int ref_obj_id;
240
+ int func_id;
241
+ u32 btf_id;
242
+ u32 ret_btf_id;
192243 };
193244
245
+struct btf *btf_vmlinux;
246
+
194247 static DEFINE_MUTEX(bpf_verifier_lock);
248
+
249
+static const struct bpf_line_info *
250
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
251
+{
252
+ const struct bpf_line_info *linfo;
253
+ const struct bpf_prog *prog;
254
+ u32 i, nr_linfo;
255
+
256
+ prog = env->prog;
257
+ nr_linfo = prog->aux->nr_linfo;
258
+
259
+ if (!nr_linfo || insn_off >= prog->len)
260
+ return NULL;
261
+
262
+ linfo = prog->aux->linfo;
263
+ for (i = 1; i < nr_linfo; i++)
264
+ if (insn_off < linfo[i].insn_off)
265
+ break;
266
+
267
+ return &linfo[i - 1];
268
+}
195269
196270 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
197271 va_list args)
....@@ -206,9 +280,25 @@
206280 n = min(log->len_total - log->len_used - 1, n);
207281 log->kbuf[n] = '\0';
208282
283
+ if (log->level == BPF_LOG_KERNEL) {
284
+ pr_err("BPF:%s\n", log->kbuf);
285
+ return;
286
+ }
209287 if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
210288 log->len_used += n;
211289 else
290
+ log->ubuf = NULL;
291
+}
292
+
293
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
294
+{
295
+ char zero = 0;
296
+
297
+ if (!bpf_verifier_log_needed(log))
298
+ return;
299
+
300
+ log->len_used = new_pos;
301
+ if (put_user(zero, log->ubuf + new_pos))
212302 log->ubuf = NULL;
213303 }
214304
....@@ -243,10 +333,167 @@
243333 va_end(args);
244334 }
245335
336
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
337
+ const char *fmt, ...)
338
+{
339
+ va_list args;
340
+
341
+ if (!bpf_verifier_log_needed(log))
342
+ return;
343
+
344
+ va_start(args, fmt);
345
+ bpf_verifier_vlog(log, fmt, args);
346
+ va_end(args);
347
+}
348
+
349
+static const char *ltrim(const char *s)
350
+{
351
+ while (isspace(*s))
352
+ s++;
353
+
354
+ return s;
355
+}
356
+
357
+__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
358
+ u32 insn_off,
359
+ const char *prefix_fmt, ...)
360
+{
361
+ const struct bpf_line_info *linfo;
362
+
363
+ if (!bpf_verifier_log_needed(&env->log))
364
+ return;
365
+
366
+ linfo = find_linfo(env, insn_off);
367
+ if (!linfo || linfo == env->prev_linfo)
368
+ return;
369
+
370
+ if (prefix_fmt) {
371
+ va_list args;
372
+
373
+ va_start(args, prefix_fmt);
374
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
375
+ va_end(args);
376
+ }
377
+
378
+ verbose(env, "%s\n",
379
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
380
+ linfo->line_off)));
381
+
382
+ env->prev_linfo = linfo;
383
+}
384
+
246385 static bool type_is_pkt_pointer(enum bpf_reg_type type)
247386 {
248387 return type == PTR_TO_PACKET ||
249388 type == PTR_TO_PACKET_META;
389
+}
390
+
391
+static bool type_is_sk_pointer(enum bpf_reg_type type)
392
+{
393
+ return type == PTR_TO_SOCKET ||
394
+ type == PTR_TO_SOCK_COMMON ||
395
+ type == PTR_TO_TCP_SOCK ||
396
+ type == PTR_TO_XDP_SOCK;
397
+}
398
+
399
+static bool reg_type_not_null(enum bpf_reg_type type)
400
+{
401
+ return type == PTR_TO_SOCKET ||
402
+ type == PTR_TO_TCP_SOCK ||
403
+ type == PTR_TO_MAP_VALUE ||
404
+ type == PTR_TO_SOCK_COMMON;
405
+}
406
+
407
+static bool reg_type_may_be_null(enum bpf_reg_type type)
408
+{
409
+ return type == PTR_TO_MAP_VALUE_OR_NULL ||
410
+ type == PTR_TO_SOCKET_OR_NULL ||
411
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
412
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
413
+ type == PTR_TO_BTF_ID_OR_NULL ||
414
+ type == PTR_TO_MEM_OR_NULL ||
415
+ type == PTR_TO_RDONLY_BUF_OR_NULL ||
416
+ type == PTR_TO_RDWR_BUF_OR_NULL;
417
+}
418
+
419
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
420
+{
421
+ return reg->type == PTR_TO_MAP_VALUE &&
422
+ map_value_has_spin_lock(reg->map_ptr);
423
+}
424
+
425
+static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
426
+{
427
+ return type == PTR_TO_SOCKET ||
428
+ type == PTR_TO_SOCKET_OR_NULL ||
429
+ type == PTR_TO_TCP_SOCK ||
430
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
431
+ type == PTR_TO_MEM ||
432
+ type == PTR_TO_MEM_OR_NULL;
433
+}
434
+
435
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
436
+{
437
+ return type == ARG_PTR_TO_SOCK_COMMON;
438
+}
439
+
440
+static bool arg_type_may_be_null(enum bpf_arg_type type)
441
+{
442
+ return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
443
+ type == ARG_PTR_TO_MEM_OR_NULL ||
444
+ type == ARG_PTR_TO_CTX_OR_NULL ||
445
+ type == ARG_PTR_TO_SOCKET_OR_NULL ||
446
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
447
+}
448
+
449
+/* Determine whether the function releases some resources allocated by another
450
+ * function call. The first reference type argument will be assumed to be
451
+ * released by release_reference().
452
+ */
453
+static bool is_release_function(enum bpf_func_id func_id)
454
+{
455
+ return func_id == BPF_FUNC_sk_release ||
456
+ func_id == BPF_FUNC_ringbuf_submit ||
457
+ func_id == BPF_FUNC_ringbuf_discard;
458
+}
459
+
460
+static bool may_be_acquire_function(enum bpf_func_id func_id)
461
+{
462
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
463
+ func_id == BPF_FUNC_sk_lookup_udp ||
464
+ func_id == BPF_FUNC_skc_lookup_tcp ||
465
+ func_id == BPF_FUNC_map_lookup_elem ||
466
+ func_id == BPF_FUNC_ringbuf_reserve;
467
+}
468
+
469
+static bool is_acquire_function(enum bpf_func_id func_id,
470
+ const struct bpf_map *map)
471
+{
472
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
473
+
474
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
475
+ func_id == BPF_FUNC_sk_lookup_udp ||
476
+ func_id == BPF_FUNC_skc_lookup_tcp ||
477
+ func_id == BPF_FUNC_ringbuf_reserve)
478
+ return true;
479
+
480
+ if (func_id == BPF_FUNC_map_lookup_elem &&
481
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
482
+ map_type == BPF_MAP_TYPE_SOCKHASH))
483
+ return true;
484
+
485
+ return false;
486
+}
487
+
488
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
489
+{
490
+ return func_id == BPF_FUNC_tcp_sock ||
491
+ func_id == BPF_FUNC_sk_fullsock ||
492
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
493
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
494
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
495
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
496
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
250497 }
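A hedged BPF-side fragment showing why bpf_map_lookup_elem() appears in the acquire lists above only together with sockmap/sockhash (sock_map is an assumed BPF_MAP_TYPE_SOCKMAP; the surrounding program is not shown): the lookup returns a referenced socket, so the result must flow into bpf_sk_release() just like a bpf_sk_lookup_tcp() result:

	__u32 key = 0;
	struct bpf_sock *sk;

	sk = bpf_map_lookup_elem(&sock_map, &key);	/* acquire (SOCKMAP/SOCKHASH only) */
	if (sk) {
		/* ... inspect sk ... */
		bpf_sk_release(sk);			/* required on every path */
	}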
251498
252499 /* string representation of 'enum bpf_reg_type' */
....@@ -261,17 +508,44 @@
261508 [PTR_TO_PACKET] = "pkt",
262509 [PTR_TO_PACKET_META] = "pkt_meta",
263510 [PTR_TO_PACKET_END] = "pkt_end",
511
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
512
+ [PTR_TO_SOCKET] = "sock",
513
+ [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
514
+ [PTR_TO_SOCK_COMMON] = "sock_common",
515
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
516
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
517
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
518
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
519
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
520
+ [PTR_TO_BTF_ID] = "ptr_",
521
+ [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
522
+ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
523
+ [PTR_TO_MEM] = "mem",
524
+ [PTR_TO_MEM_OR_NULL] = "mem_or_null",
525
+ [PTR_TO_RDONLY_BUF] = "rdonly_buf",
526
+ [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
527
+ [PTR_TO_RDWR_BUF] = "rdwr_buf",
528
+ [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
529
+};
530
+
531
+static char slot_type_char[] = {
532
+ [STACK_INVALID] = '?',
533
+ [STACK_SPILL] = 'r',
534
+ [STACK_MISC] = 'm',
535
+ [STACK_ZERO] = '0',
264536 };
265537
266538 static void print_liveness(struct bpf_verifier_env *env,
267539 enum bpf_reg_liveness live)
268540 {
269
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
541
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
270542 verbose(env, "_");
271543 if (live & REG_LIVE_READ)
272544 verbose(env, "r");
273545 if (live & REG_LIVE_WRITTEN)
274546 verbose(env, "w");
547
+ if (live & REG_LIVE_DONE)
548
+ verbose(env, "D");
275549 }
276550
277551 static struct bpf_func_state *func(struct bpf_verifier_env *env,
....@@ -280,6 +554,26 @@
280554 struct bpf_verifier_state *cur = env->cur_state;
281555
282556 return cur->frame[reg->frameno];
557
+}
558
+
559
+const char *kernel_type_name(u32 id)
560
+{
561
+ return btf_name_by_offset(btf_vmlinux,
562
+ btf_type_by_id(btf_vmlinux, id)->name_off);
563
+}
564
+
565
+/* The reg state of a pointer or a bounded scalar was saved when
566
+ * it was spilled to the stack.
567
+ */
568
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
569
+{
570
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
571
+}
572
+
573
+static void scrub_spilled_slot(u8 *stype)
574
+{
575
+ if (*stype != STACK_INVALID)
576
+ *stype = STACK_MISC;
283577 }
284578
285579 static void print_verifier_state(struct bpf_verifier_env *env,
....@@ -299,14 +593,20 @@
299593 verbose(env, " R%d", i);
300594 print_liveness(env, reg->live);
301595 verbose(env, "=%s", reg_type_str[t]);
596
+ if (t == SCALAR_VALUE && reg->precise)
597
+ verbose(env, "P");
302598 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
303599 tnum_is_const(reg->var_off)) {
304600 /* reg->off should be 0 for SCALAR_VALUE */
305601 verbose(env, "%lld", reg->var_off.value + reg->off);
306
- if (t == PTR_TO_STACK)
307
- verbose(env, ",call_%d", func(env, reg)->callsite);
308602 } else {
603
+ if (t == PTR_TO_BTF_ID ||
604
+ t == PTR_TO_BTF_ID_OR_NULL ||
605
+ t == PTR_TO_PERCPU_BTF_ID)
606
+ verbose(env, "%s", kernel_type_name(reg->btf_id));
309607 verbose(env, "(id=%d", reg->id);
608
+ if (reg_type_may_be_refcounted_or_null(t))
609
+ verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
310610 if (t != SCALAR_VALUE)
311611 verbose(env, ",off=%d", reg->off);
312612 if (type_is_pkt_pointer(t))
....@@ -344,77 +644,189 @@
344644 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
345645 verbose(env, ",var_off=%s", tn_buf);
346646 }
647
+ if (reg->s32_min_value != reg->smin_value &&
648
+ reg->s32_min_value != S32_MIN)
649
+ verbose(env, ",s32_min_value=%d",
650
+ (int)(reg->s32_min_value));
651
+ if (reg->s32_max_value != reg->smax_value &&
652
+ reg->s32_max_value != S32_MAX)
653
+ verbose(env, ",s32_max_value=%d",
654
+ (int)(reg->s32_max_value));
655
+ if (reg->u32_min_value != reg->umin_value &&
656
+ reg->u32_min_value != U32_MIN)
657
+ verbose(env, ",u32_min_value=%d",
658
+ (int)(reg->u32_min_value));
659
+ if (reg->u32_max_value != reg->umax_value &&
660
+ reg->u32_max_value != U32_MAX)
661
+ verbose(env, ",u32_max_value=%d",
662
+ (int)(reg->u32_max_value));
347663 }
348664 verbose(env, ")");
349665 }
350666 }
351667 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
352
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
353
- verbose(env, " fp%d",
354
- (-i - 1) * BPF_REG_SIZE);
355
- print_liveness(env, state->stack[i].spilled_ptr.live);
356
- verbose(env, "=%s",
357
- reg_type_str[state->stack[i].spilled_ptr.type]);
668
+ char types_buf[BPF_REG_SIZE + 1];
669
+ bool valid = false;
670
+ int j;
671
+
672
+ for (j = 0; j < BPF_REG_SIZE; j++) {
673
+ if (state->stack[i].slot_type[j] != STACK_INVALID)
674
+ valid = true;
675
+ types_buf[j] = slot_type_char[
676
+ state->stack[i].slot_type[j]];
358677 }
359
- if (state->stack[i].slot_type[0] == STACK_ZERO)
360
- verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
678
+ types_buf[BPF_REG_SIZE] = 0;
679
+ if (!valid)
680
+ continue;
681
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
682
+ print_liveness(env, state->stack[i].spilled_ptr.live);
683
+ if (is_spilled_reg(&state->stack[i])) {
684
+ reg = &state->stack[i].spilled_ptr;
685
+ t = reg->type;
686
+ verbose(env, "=%s", reg_type_str[t]);
687
+ if (t == SCALAR_VALUE && reg->precise)
688
+ verbose(env, "P");
689
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
690
+ verbose(env, "%lld", reg->var_off.value + reg->off);
691
+ } else {
692
+ verbose(env, "=%s", types_buf);
693
+ }
694
+ }
695
+ if (state->acquired_refs && state->refs[0].id) {
696
+ verbose(env, " refs=%d", state->refs[0].id);
697
+ for (i = 1; i < state->acquired_refs; i++)
698
+ if (state->refs[i].id)
699
+ verbose(env, ",%d", state->refs[i].id);
361700 }
362701 verbose(env, "\n");
363702 }
364703
365
-static int copy_stack_state(struct bpf_func_state *dst,
366
- const struct bpf_func_state *src)
367
-{
368
- if (!src->stack)
369
- return 0;
370
- if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
371
- /* internal bug, make state invalid to reject the program */
372
- memset(dst, 0, sizeof(*dst));
373
- return -EFAULT;
374
- }
375
- memcpy(dst->stack, src->stack,
376
- sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
377
- return 0;
704
+#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \
705
+static int copy_##NAME##_state(struct bpf_func_state *dst, \
706
+ const struct bpf_func_state *src) \
707
+{ \
708
+ if (!src->FIELD) \
709
+ return 0; \
710
+ if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \
711
+ /* internal bug, make state invalid to reject the program */ \
712
+ memset(dst, 0, sizeof(*dst)); \
713
+ return -EFAULT; \
714
+ } \
715
+ memcpy(dst->FIELD, src->FIELD, \
716
+ sizeof(*src->FIELD) * (src->COUNT / SIZE)); \
717
+ return 0; \
378718 }
719
+/* copy_reference_state() */
720
+COPY_STATE_FN(reference, acquired_refs, refs, 1)
721
+/* copy_stack_state() */
722
+COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
723
+#undef COPY_STATE_FN
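For reference, a sketch of what the first instantiation above generates once the preprocessor is done (modulo whitespace); the stack variant is identical with refs/acquired_refs replaced by stack/allocated_stack and SIZE = BPF_REG_SIZE:

	static int copy_reference_state(struct bpf_func_state *dst,
					const struct bpf_func_state *src)
	{
		if (!src->refs)
			return 0;
		if (WARN_ON_ONCE(dst->acquired_refs < src->acquired_refs)) {
			/* internal bug, make state invalid to reject the program */
			memset(dst, 0, sizeof(*dst));
			return -EFAULT;
		}
		memcpy(dst->refs, src->refs,
		       sizeof(*src->refs) * (src->acquired_refs / 1));
		return 0;
	}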
724
+
725
+#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
726
+static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
727
+ bool copy_old) \
728
+{ \
729
+ u32 old_size = state->COUNT; \
730
+ struct bpf_##NAME##_state *new_##FIELD; \
731
+ int slot = size / SIZE; \
732
+ \
733
+ if (size <= old_size || !size) { \
734
+ if (copy_old) \
735
+ return 0; \
736
+ state->COUNT = slot * SIZE; \
737
+ if (!size && old_size) { \
738
+ kfree(state->FIELD); \
739
+ state->FIELD = NULL; \
740
+ } \
741
+ return 0; \
742
+ } \
743
+ new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
744
+ GFP_KERNEL); \
745
+ if (!new_##FIELD) \
746
+ return -ENOMEM; \
747
+ if (copy_old) { \
748
+ if (state->FIELD) \
749
+ memcpy(new_##FIELD, state->FIELD, \
750
+ sizeof(*new_##FIELD) * (old_size / SIZE)); \
751
+ memset(new_##FIELD + old_size / SIZE, 0, \
752
+ sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
753
+ } \
754
+ state->COUNT = slot * SIZE; \
755
+ kfree(state->FIELD); \
756
+ state->FIELD = new_##FIELD; \
757
+ return 0; \
758
+}
759
+/* realloc_reference_state() */
760
+REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
761
+/* realloc_stack_state() */
762
+REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
763
+#undef REALLOC_STATE_FN
379764
380765 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
381766 * make it consume minimal amount of memory. check_stack_write() access from
382767 * the program calls into realloc_func_state() to grow the stack size.
383
- * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
384
- * which this function copies over. It points to corresponding reg in previous
385
- * bpf_verifier_state which is never reallocated
768
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
769
+ * which realloc_stack_state() copies over. It points to previous
770
+ * bpf_verifier_state which is never reallocated.
386771 */
387
-static int realloc_func_state(struct bpf_func_state *state, int size,
388
- bool copy_old)
772
+static int realloc_func_state(struct bpf_func_state *state, int stack_size,
773
+ int refs_size, bool copy_old)
389774 {
390
- u32 old_size = state->allocated_stack;
391
- struct bpf_stack_state *new_stack;
392
- int slot = size / BPF_REG_SIZE;
775
+ int err = realloc_reference_state(state, refs_size, copy_old);
776
+ if (err)
777
+ return err;
778
+ return realloc_stack_state(state, stack_size, copy_old);
779
+}
393780
394
- if (size <= old_size || !size) {
395
- if (copy_old)
781
+/* Acquire a pointer id from the env and update the state->refs to include
782
+ * this new pointer reference.
783
+ * On success, returns a valid pointer id to associate with the register
784
+ * On failure, returns a negative errno.
785
+ */
786
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
787
+{
788
+ struct bpf_func_state *state = cur_func(env);
789
+ int new_ofs = state->acquired_refs;
790
+ int id, err;
791
+
792
+ err = realloc_reference_state(state, state->acquired_refs + 1, true);
793
+ if (err)
794
+ return err;
795
+ id = ++env->id_gen;
796
+ state->refs[new_ofs].id = id;
797
+ state->refs[new_ofs].insn_idx = insn_idx;
798
+
799
+ return id;
800
+}
801
+
802
+/* release function corresponding to acquire_reference_state(). Idempotent. */
803
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
804
+{
805
+ int i, last_idx;
806
+
807
+ last_idx = state->acquired_refs - 1;
808
+ for (i = 0; i < state->acquired_refs; i++) {
809
+ if (state->refs[i].id == ptr_id) {
810
+ if (last_idx && i != last_idx)
811
+ memcpy(&state->refs[i], &state->refs[last_idx],
812
+ sizeof(*state->refs));
813
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
814
+ state->acquired_refs--;
396815 return 0;
397
- state->allocated_stack = slot * BPF_REG_SIZE;
398
- if (!size && old_size) {
399
- kfree(state->stack);
400
- state->stack = NULL;
401816 }
402
- return 0;
403817 }
404
- new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
405
- GFP_KERNEL);
406
- if (!new_stack)
407
- return -ENOMEM;
408
- if (copy_old) {
409
- if (state->stack)
410
- memcpy(new_stack, state->stack,
411
- sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
412
- memset(new_stack + old_size / BPF_REG_SIZE, 0,
413
- sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
414
- }
415
- state->allocated_stack = slot * BPF_REG_SIZE;
416
- kfree(state->stack);
417
- state->stack = new_stack;
818
+ return -EINVAL;
819
+}
820
+
821
+static int transfer_reference_state(struct bpf_func_state *dst,
822
+ struct bpf_func_state *src)
823
+{
824
+ int err = realloc_reference_state(dst, src->acquired_refs, false);
825
+ if (err)
826
+ return err;
827
+ err = copy_reference_state(dst, src);
828
+ if (err)
829
+ return err;
418830 return 0;
419831 }
420832
....@@ -422,8 +834,16 @@
422834 {
423835 if (!state)
424836 return;
837
+ kfree(state->refs);
425838 kfree(state->stack);
426839 kfree(state);
840
+}
841
+
842
+static void clear_jmp_history(struct bpf_verifier_state *state)
843
+{
844
+ kfree(state->jmp_history);
845
+ state->jmp_history = NULL;
846
+ state->jmp_history_cnt = 0;
427847 }
428848
429849 static void free_verifier_state(struct bpf_verifier_state *state,
....@@ -435,6 +855,7 @@
435855 free_func_state(state->frame[i]);
436856 state->frame[i] = NULL;
437857 }
858
+ clear_jmp_history(state);
438859 if (free_self)
439860 kfree(state);
440861 }
....@@ -447,10 +868,14 @@
447868 {
448869 int err;
449870
450
- err = realloc_func_state(dst, src->allocated_stack, false);
871
+ err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
872
+ false);
451873 if (err)
452874 return err;
453
- memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
875
+ memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
876
+ err = copy_reference_state(dst, src);
877
+ if (err)
878
+ return err;
454879 return copy_stack_state(dst, src);
455880 }
456881
....@@ -458,7 +883,17 @@
458883 const struct bpf_verifier_state *src)
459884 {
460885 struct bpf_func_state *dst;
886
+ u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
461887 int i, err;
888
+
889
+ if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
890
+ kfree(dst_state->jmp_history);
891
+ dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
892
+ if (!dst_state->jmp_history)
893
+ return -ENOMEM;
894
+ }
895
+ memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
896
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
462897
463898 /* if dst has more stack frames then src frame, free them */
464899 for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
....@@ -467,6 +902,11 @@
467902 }
468903 dst_state->speculative = src->speculative;
469904 dst_state->curframe = src->curframe;
905
+ dst_state->active_spin_lock = src->active_spin_lock;
906
+ dst_state->branches = src->branches;
907
+ dst_state->parent = src->parent;
908
+ dst_state->first_insn_idx = src->first_insn_idx;
909
+ dst_state->last_insn_idx = src->last_insn_idx;
470910 for (i = 0; i <= src->curframe; i++) {
471911 dst = dst_state->frame[i];
472912 if (!dst) {
....@@ -482,8 +922,25 @@
482922 return 0;
483923 }
484924
925
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
926
+{
927
+ while (st) {
928
+ u32 br = --st->branches;
929
+
930
+ /* WARN_ON(br > 1) technically makes sense here,
931
+ * but see comment in push_stack(), hence:
932
+ */
933
+ WARN_ONCE((int)br < 0,
934
+ "BUG update_branch_counts:branches_to_explore=%d\n",
935
+ br);
936
+ if (br)
937
+ break;
938
+ st = st->parent;
939
+ }
940
+}
941
+
485942 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
486
- int *insn_idx)
943
+ int *insn_idx, bool pop_log)
487944 {
488945 struct bpf_verifier_state *cur = env->cur_state;
489946 struct bpf_verifier_stack_elem *elem, *head = env->head;
....@@ -497,6 +954,8 @@
497954 if (err)
498955 return err;
499956 }
957
+ if (pop_log)
958
+ bpf_vlog_reset(&env->log, head->log_pos);
500959 if (insn_idx)
501960 *insn_idx = head->insn_idx;
502961 if (prev_insn_idx)
....@@ -524,22 +983,36 @@
524983 elem->insn_idx = insn_idx;
525984 elem->prev_insn_idx = prev_insn_idx;
526985 elem->next = env->head;
986
+ elem->log_pos = env->log.len_used;
527987 env->head = elem;
528988 env->stack_size++;
529989 err = copy_verifier_state(&elem->st, cur);
530990 if (err)
531991 goto err;
532992 elem->st.speculative |= speculative;
533
- if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
534
- verbose(env, "BPF program is too complex\n");
993
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
994
+ verbose(env, "The sequence of %d jumps is too complex.\n",
995
+ env->stack_size);
535996 goto err;
997
+ }
998
+ if (elem->st.parent) {
999
+ ++elem->st.parent->branches;
1000
+ /* WARN_ON(branches > 2) technically makes sense here,
1001
+ * but
1002
+ * 1. speculative states will bump 'branches' for non-branch
1003
+ * instructions
1004
+ * 2. is_state_visited() heuristics may decide not to create
1005
+ * a new state for a sequence of branches and all such current
1006
+ * and cloned states will be pointing to a single parent state
1007
+ * which might have large 'branches' count.
1008
+ */
5361009 }
5371010 return &elem->st;
5381011 err:
5391012 free_verifier_state(env->cur_state, true);
5401013 env->cur_state = NULL;
5411014 /* pop all elements and return */
542
- while (!pop_stack(env, NULL, NULL));
1015
+ while (!pop_stack(env, NULL, NULL, false));
5431016 return NULL;
5441017 }
5451018
....@@ -548,7 +1021,23 @@
5481021 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
5491022 };
5501023
551
-static void __mark_reg_not_init(struct bpf_reg_state *reg);
1024
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
1025
+ struct bpf_reg_state *reg);
1026
+
1027
+/* This helper doesn't clear reg->id */
1028
+static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
1029
+{
1030
+ reg->var_off = tnum_const(imm);
1031
+ reg->smin_value = (s64)imm;
1032
+ reg->smax_value = (s64)imm;
1033
+ reg->umin_value = imm;
1034
+ reg->umax_value = imm;
1035
+
1036
+ reg->s32_min_value = (s32)imm;
1037
+ reg->s32_max_value = (s32)imm;
1038
+ reg->u32_min_value = (u32)imm;
1039
+ reg->u32_max_value = (u32)imm;
1040
+}
5521041
5531042 /* Mark the unknown part of a register (variable offset or scalar value) as
5541043 * known to have the value @imm.
....@@ -558,11 +1047,16 @@
5581047 /* Clear id, off, and union(map_ptr, range) */
5591048 memset(((u8 *)reg) + sizeof(reg->type), 0,
5601049 offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
561
- reg->var_off = tnum_const(imm);
562
- reg->smin_value = (s64)imm;
563
- reg->smax_value = (s64)imm;
564
- reg->umin_value = imm;
565
- reg->umax_value = imm;
1050
+ ___mark_reg_known(reg, imm);
1051
+}
1052
+
1053
+static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
1054
+{
1055
+ reg->var_off = tnum_const_subreg(reg->var_off, imm);
1056
+ reg->s32_min_value = (s32)imm;
1057
+ reg->s32_max_value = (s32)imm;
1058
+ reg->u32_min_value = (u32)imm;
1059
+ reg->u32_max_value = (u32)imm;
5661060 }
5671061
5681062 /* Mark the 'variable offset' part of a register as zero. This should be
....@@ -586,7 +1080,7 @@
5861080 verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
5871081 /* Something bad happened, let's kill all regs */
5881082 for (regno = 0; regno < MAX_BPF_REG; regno++)
589
- __mark_reg_not_init(regs + regno);
1083
+ __mark_reg_not_init(env, regs + regno);
5901084 return;
5911085 }
5921086 __mark_reg_known_zero(regs + regno);
....@@ -617,8 +1111,52 @@
6171111 tnum_equals_const(reg->var_off, 0);
6181112 }
6191113
620
-/* Attempts to improve min/max values based on var_off information */
621
-static void __update_reg_bounds(struct bpf_reg_state *reg)
1114
+/* Reset the min/max bounds of a register */
1115
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
1116
+{
1117
+ reg->smin_value = S64_MIN;
1118
+ reg->smax_value = S64_MAX;
1119
+ reg->umin_value = 0;
1120
+ reg->umax_value = U64_MAX;
1121
+
1122
+ reg->s32_min_value = S32_MIN;
1123
+ reg->s32_max_value = S32_MAX;
1124
+ reg->u32_min_value = 0;
1125
+ reg->u32_max_value = U32_MAX;
1126
+}
1127
+
1128
+static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
1129
+{
1130
+ reg->smin_value = S64_MIN;
1131
+ reg->smax_value = S64_MAX;
1132
+ reg->umin_value = 0;
1133
+ reg->umax_value = U64_MAX;
1134
+}
1135
+
1136
+static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
1137
+{
1138
+ reg->s32_min_value = S32_MIN;
1139
+ reg->s32_max_value = S32_MAX;
1140
+ reg->u32_min_value = 0;
1141
+ reg->u32_max_value = U32_MAX;
1142
+}
1143
+
1144
+static void __update_reg32_bounds(struct bpf_reg_state *reg)
1145
+{
1146
+ struct tnum var32_off = tnum_subreg(reg->var_off);
1147
+
1148
+ /* min signed is max(sign bit) | min(other bits) */
1149
+ reg->s32_min_value = max_t(s32, reg->s32_min_value,
1150
+ var32_off.value | (var32_off.mask & S32_MIN));
1151
+ /* max signed is min(sign bit) | max(other bits) */
1152
+ reg->s32_max_value = min_t(s32, reg->s32_max_value,
1153
+ var32_off.value | (var32_off.mask & S32_MAX));
1154
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
1155
+ reg->u32_max_value = min(reg->u32_max_value,
1156
+ (u32)(var32_off.value | var32_off.mask));
1157
+}
1158
+
1159
+static void __update_reg64_bounds(struct bpf_reg_state *reg)
6221160 {
6231161 /* min signed is max(sign bit) | min(other bits) */
6241162 reg->smin_value = max_t(s64, reg->smin_value,
....@@ -631,8 +1169,48 @@
6311169 reg->var_off.value | reg->var_off.mask);
6321170 }
6331171
1172
+static void __update_reg_bounds(struct bpf_reg_state *reg)
1173
+{
1174
+ __update_reg32_bounds(reg);
1175
+ __update_reg64_bounds(reg);
1176
+}
1177
+
6341178 /* Uses signed min/max values to inform unsigned, and vice-versa */
635
-static void __reg_deduce_bounds(struct bpf_reg_state *reg)
1179
+static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
1180
+{
1181
+ /* Learn sign from signed bounds.
1182
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
1183
+ * are the same, so combine. This works even in the negative case, e.g.
1184
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
1185
+ */
1186
+ if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
1187
+ reg->s32_min_value = reg->u32_min_value =
1188
+ max_t(u32, reg->s32_min_value, reg->u32_min_value);
1189
+ reg->s32_max_value = reg->u32_max_value =
1190
+ min_t(u32, reg->s32_max_value, reg->u32_max_value);
1191
+ return;
1192
+ }
1193
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
1194
+ * boundary, so we must be careful.
1195
+ */
1196
+ if ((s32)reg->u32_max_value >= 0) {
1197
+ /* Positive. We can't learn anything from the smin, but smax
1198
+ * is positive, hence safe.
1199
+ */
1200
+ reg->s32_min_value = reg->u32_min_value;
1201
+ reg->s32_max_value = reg->u32_max_value =
1202
+ min_t(u32, reg->s32_max_value, reg->u32_max_value);
1203
+ } else if ((s32)reg->u32_min_value < 0) {
1204
+ /* Negative. We can't learn anything from the smax, but smin
1205
+ * is negative, hence safe.
1206
+ */
1207
+ reg->s32_min_value = reg->u32_min_value =
1208
+ max_t(u32, reg->s32_min_value, reg->u32_min_value);
1209
+ reg->s32_max_value = reg->u32_max_value;
1210
+ }
1211
+}
1212
+
1213
+static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
6361214 {
6371215 /* Learn sign from signed bounds.
6381216 * If we cannot cross the sign boundary, then signed and unsigned bounds
....@@ -666,25 +1244,112 @@
6661244 }
6671245 }
6681246
1247
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
1248
+{
1249
+ __reg32_deduce_bounds(reg);
1250
+ __reg64_deduce_bounds(reg);
1251
+}
1252
+
6691253 /* Attempts to improve var_off based on unsigned min/max information */
6701254 static void __reg_bound_offset(struct bpf_reg_state *reg)
6711255 {
672
- reg->var_off = tnum_intersect(reg->var_off,
673
- tnum_range(reg->umin_value,
674
- reg->umax_value));
1256
+ struct tnum var64_off = tnum_intersect(reg->var_off,
1257
+ tnum_range(reg->umin_value,
1258
+ reg->umax_value));
1259
+ struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
1260
+ tnum_range(reg->u32_min_value,
1261
+ reg->u32_max_value));
1262
+
1263
+ reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
6751264 }
6761265
677
-/* Reset the min/max bounds of a register */
678
-static void __mark_reg_unbounded(struct bpf_reg_state *reg)
1266
+static void reg_bounds_sync(struct bpf_reg_state *reg)
6791267 {
680
- reg->smin_value = S64_MIN;
681
- reg->smax_value = S64_MAX;
682
- reg->umin_value = 0;
683
- reg->umax_value = U64_MAX;
1268
+ /* We might have learned new bounds from the var_off. */
1269
+ __update_reg_bounds(reg);
1270
+ /* We might have learned something about the sign bit. */
1271
+ __reg_deduce_bounds(reg);
1272
+ /* We might have learned some bits from the bounds. */
1273
+ __reg_bound_offset(reg);
1274
+ /* Intersecting with the old var_off might have improved our bounds
1275
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
1276
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
1277
+ */
1278
+ __update_reg_bounds(reg);
1279
+}
1280
+
1281
+static bool __reg32_bound_s64(s32 a)
1282
+{
1283
+ return a >= 0 && a <= S32_MAX;
1284
+}
1285
+
1286
+static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
1287
+{
1288
+ reg->umin_value = reg->u32_min_value;
1289
+ reg->umax_value = reg->u32_max_value;
1290
+
1291
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
1292
+ * be positive otherwise set to worse case bounds and refine later
1293
+ * from tnum.
1294
+ */
1295
+ if (__reg32_bound_s64(reg->s32_min_value) &&
1296
+ __reg32_bound_s64(reg->s32_max_value)) {
1297
+ reg->smin_value = reg->s32_min_value;
1298
+ reg->smax_value = reg->s32_max_value;
1299
+ } else {
1300
+ reg->smin_value = 0;
1301
+ reg->smax_value = U32_MAX;
1302
+ }
1303
+}
1304
+
1305
+static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
1306
+{
1307
+ /* special case when 64-bit register has upper 32-bit register
1308
+ * zeroed. Typically happens after zext or <<32, >>32 sequence
1309
+ * allowing us to use 32-bit bounds directly,
1310
+ */
1311
+ if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
1312
+ __reg_assign_32_into_64(reg);
1313
+ } else {
1314
+ /* Otherwise the best we can do is push lower 32bit known and
1315
+ * unknown bits into register (var_off set from jmp logic)
1316
+ * then learn as much as possible from the 64-bit tnum
1317
+ * known and unknown bits. The previous smin/smax bounds are
1318
+ * invalid here because of jmp32 compare so mark them unknown
1319
+ * so they do not impact tnum bounds calculation.
1320
+ */
1321
+ __mark_reg64_unbounded(reg);
1322
+ }
1323
+ reg_bounds_sync(reg);
1324
+}
1325
+
1326
+static bool __reg64_bound_s32(s64 a)
1327
+{
1328
+ return a >= S32_MIN && a <= S32_MAX;
1329
+}
1330
+
1331
+static bool __reg64_bound_u32(u64 a)
1332
+{
1333
+ return a >= U32_MIN && a <= U32_MAX;
1334
+}
1335
+
1336
+static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
1337
+{
1338
+ __mark_reg32_unbounded(reg);
1339
+ if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
1340
+ reg->s32_min_value = (s32)reg->smin_value;
1341
+ reg->s32_max_value = (s32)reg->smax_value;
1342
+ }
1343
+ if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
1344
+ reg->u32_min_value = (u32)reg->umin_value;
1345
+ reg->u32_max_value = (u32)reg->umax_value;
1346
+ }
1347
+ reg_bounds_sync(reg);
6841348 }
6851349
6861350 /* Mark a register as having a completely unknown (scalar) value. */
687
-static void __mark_reg_unknown(struct bpf_reg_state *reg)
1351
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
1352
+ struct bpf_reg_state *reg)
6881353 {
6891354 /*
6901355 * Clear type, id, off, and union(map_ptr, range) and
....@@ -694,6 +1359,7 @@
6941359 reg->type = SCALAR_VALUE;
6951360 reg->var_off = tnum_unknown;
6961361 reg->frameno = 0;
1362
+ reg->precise = !env->bpf_capable;
6971363 __mark_reg_unbounded(reg);
6981364 }
6991365
....@@ -704,15 +1370,16 @@
7041370 verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
7051371 /* Something bad happened, let's kill all regs except FP */
7061372 for (regno = 0; regno < BPF_REG_FP; regno++)
707
- __mark_reg_not_init(regs + regno);
1373
+ __mark_reg_not_init(env, regs + regno);
7081374 return;
7091375 }
710
- __mark_reg_unknown(regs + regno);
1376
+ __mark_reg_unknown(env, regs + regno);
7111377 }
7121378
713
-static void __mark_reg_not_init(struct bpf_reg_state *reg)
1379
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
1380
+ struct bpf_reg_state *reg)
7141381 {
715
- __mark_reg_unknown(reg);
1382
+ __mark_reg_unknown(env, reg);
7161383 reg->type = NOT_INIT;
7171384 }
7181385
....@@ -723,12 +1390,26 @@
7231390 verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
7241391 /* Something bad happened, let's kill all regs except FP */
7251392 for (regno = 0; regno < BPF_REG_FP; regno++)
726
- __mark_reg_not_init(regs + regno);
1393
+ __mark_reg_not_init(env, regs + regno);
7271394 return;
7281395 }
729
- __mark_reg_not_init(regs + regno);
1396
+ __mark_reg_not_init(env, regs + regno);
7301397 }
7311398
1399
+static void mark_btf_ld_reg(struct bpf_verifier_env *env,
1400
+ struct bpf_reg_state *regs, u32 regno,
1401
+ enum bpf_reg_type reg_type, u32 btf_id)
1402
+{
1403
+ if (reg_type == SCALAR_VALUE) {
1404
+ mark_reg_unknown(env, regs, regno);
1405
+ return;
1406
+ }
1407
+ mark_reg_known_zero(env, regs, regno);
1408
+ regs[regno].type = PTR_TO_BTF_ID;
1409
+ regs[regno].btf_id = btf_id;
1410
+}
1411
+
1412
+#define DEF_NOT_SUBREG (0)
7321413 static void init_reg_state(struct bpf_verifier_env *env,
7331414 struct bpf_func_state *state)
7341415 {
....@@ -739,16 +1420,13 @@
7391420 mark_reg_not_init(env, regs, i);
7401421 regs[i].live = REG_LIVE_NONE;
7411422 regs[i].parent = NULL;
1423
+ regs[i].subreg_def = DEF_NOT_SUBREG;
7421424 }
7431425
7441426 /* frame pointer */
7451427 regs[BPF_REG_FP].type = PTR_TO_STACK;
7461428 mark_reg_known_zero(env, regs, BPF_REG_FP);
7471429 regs[BPF_REG_FP].frameno = state->frameno;
748
-
749
- /* 1st arg to a function */
750
- regs[BPF_REG_1].type = PTR_TO_CTX;
751
- mark_reg_known_zero(env, regs, BPF_REG_1);
7521430 }
7531431
7541432 #define BPF_MAIN_FUNC (-1)
....@@ -826,13 +1504,10 @@
8261504 continue;
8271505 if (insn[i].src_reg != BPF_PSEUDO_CALL)
8281506 continue;
829
- if (!env->allow_ptr_leaks) {
830
- verbose(env, "function calls to other bpf functions are allowed for root only\n");
1507
+ if (!env->bpf_capable) {
1508
+ verbose(env,
1509
+ "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
8311510 return -EPERM;
832
- }
833
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
834
- verbose(env, "function calls in offloaded programs are not supported yet\n");
835
- return -EINVAL;
8361511 }
8371512 ret = add_subprog(env, i + insn[i].imm + 1);
8381513 if (ret < 0)
....@@ -844,7 +1519,7 @@
8441519 */
8451520 subprog[env->subprog_cnt].start = insn_cnt;
8461521
847
- if (env->log.level > 1)
1522
+ if (env->log.level & BPF_LOG_LEVEL2)
8481523 for (i = 0; i < env->subprog_cnt; i++)
8491524 verbose(env, "func#%d @%d\n", i, subprog[i].start);
8501525
....@@ -854,7 +1529,14 @@
8541529 for (i = 0; i < insn_cnt; i++) {
8551530 u8 code = insn[i].code;
8561531
857
- if (BPF_CLASS(code) != BPF_JMP)
1532
+ if (code == (BPF_JMP | BPF_CALL) &&
1533
+ insn[i].imm == BPF_FUNC_tail_call &&
1534
+ insn[i].src_reg != BPF_PSEUDO_CALL)
1535
+ subprog[cur_subprog].has_tail_call = true;
1536
+ if (BPF_CLASS(code) == BPF_LD &&
1537
+ (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
1538
+ subprog[cur_subprog].has_ld_abs = true;
1539
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
8581540 goto next;
8591541 if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
8601542 goto next;
....@@ -888,21 +1570,165 @@
8881570 */
8891571 static int mark_reg_read(struct bpf_verifier_env *env,
8901572 const struct bpf_reg_state *state,
891
- struct bpf_reg_state *parent)
1573
+ struct bpf_reg_state *parent, u8 flag)
8921574 {
8931575 bool writes = parent == state->parent; /* Observe write marks */
1576
+ int cnt = 0;
8941577
8951578 while (parent) {
8961579 /* if read wasn't screened by an earlier write ... */
8971580 if (writes && state->live & REG_LIVE_WRITTEN)
8981581 break;
1582
+ if (parent->live & REG_LIVE_DONE) {
1583
+ verbose(env, "verifier BUG type %s var_off %lld off %d\n",
1584
+ reg_type_str[parent->type],
1585
+ parent->var_off.value, parent->off);
1586
+ return -EFAULT;
1587
+ }
1588
+ /* The first condition is more likely to be true than the
1589
+ * second, checked it first.
1590
+ */
1591
+ if ((parent->live & REG_LIVE_READ) == flag ||
1592
+ parent->live & REG_LIVE_READ64)
1593
+ /* The parentage chain never changes and
1594
+ * this parent was already marked as LIVE_READ.
1595
+ * There is no need to keep walking the chain again and
1596
+ * keep re-marking all parents as LIVE_READ.
1597
+ * This case happens when the same register is read
1598
+ * multiple times without writes into it in-between.
1599
+ * Also, if parent has the stronger REG_LIVE_READ64 set,
1600
+ * then no need to set the weak REG_LIVE_READ32.
1601
+ */
1602
+ break;
8991603 /* ... then we depend on parent's value */
900
- parent->live |= REG_LIVE_READ;
1604
+ parent->live |= flag;
1605
+ /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
1606
+ if (flag == REG_LIVE_READ64)
1607
+ parent->live &= ~REG_LIVE_READ32;
9011608 state = parent;
9021609 parent = state->parent;
9031610 writes = true;
1611
+ cnt++;
9041612 }
1613
+
1614
+ if (env->longest_mark_read_walk < cnt)
1615
+ env->longest_mark_read_walk = cnt;
9051616 return 0;
1617
+}
1618
+
1619
+/* This function is supposed to be used by the following 32-bit optimization
1620
+ * code only. It returns TRUE if the source or destination register operates
1621
+ * on 64-bit, otherwise return FALSE.
1622
+ */
1623
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
1624
+ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
1625
+{
1626
+ u8 code, class, op;
1627
+
1628
+ code = insn->code;
1629
+ class = BPF_CLASS(code);
1630
+ op = BPF_OP(code);
1631
+ if (class == BPF_JMP) {
1632
+ /* BPF_EXIT for "main" will reach here. Return TRUE
1633
+ * conservatively.
1634
+ */
1635
+ if (op == BPF_EXIT)
1636
+ return true;
1637
+ if (op == BPF_CALL) {
1638
+ /* BPF to BPF call will reach here because of marking
1639
+ * caller saved clobber with DST_OP_NO_MARK for which we
1640
+ * don't care the register def because they are anyway
1641
+ * marked as NOT_INIT already.
1642
+ */
1643
+ if (insn->src_reg == BPF_PSEUDO_CALL)
1644
+ return false;
1645
+ /* Helper call will reach here because of arg type
1646
+ * check, conservatively return TRUE.
1647
+ */
1648
+ if (t == SRC_OP)
1649
+ return true;
1650
+
1651
+ return false;
1652
+ }
1653
+ }
1654
+
1655
+ if (class == BPF_ALU64 || class == BPF_JMP ||
1656
+ /* BPF_END always use BPF_ALU class. */
1657
+ (class == BPF_ALU && op == BPF_END && insn->imm == 64))
1658
+ return true;
1659
+
1660
+ if (class == BPF_ALU || class == BPF_JMP32)
1661
+ return false;
1662
+
1663
+ if (class == BPF_LDX) {
1664
+ if (t != SRC_OP)
1665
+ return BPF_SIZE(code) == BPF_DW;
1666
+ /* LDX source must be ptr. */
1667
+ return true;
1668
+ }
1669
+
1670
+ if (class == BPF_STX) {
1671
+ if (reg->type != SCALAR_VALUE)
1672
+ return true;
1673
+ return BPF_SIZE(code) == BPF_DW;
1674
+ }
1675
+
1676
+ if (class == BPF_LD) {
1677
+ u8 mode = BPF_MODE(code);
1678
+
1679
+ /* LD_IMM64 */
1680
+ if (mode == BPF_IMM)
1681
+ return true;
1682
+
1683
+ /* Both LD_IND and LD_ABS return 32-bit data. */
1684
+ if (t != SRC_OP)
1685
+ return false;
1686
+
1687
+ /* Implicit ctx ptr. */
1688
+ if (regno == BPF_REG_6)
1689
+ return true;
1690
+
1691
+ /* Explicit source could be any width. */
1692
+ return true;
1693
+ }
1694
+
1695
+ if (class == BPF_ST)
1696
+ /* The only source register for BPF_ST is a ptr. */
1697
+ return true;
1698
+
1699
+ /* Conservatively return true at default. */
1700
+ return true;
1701
+}
1702
+
1703
+/* Return TRUE if INSN doesn't have explicit value define. */
1704
+static bool insn_no_def(struct bpf_insn *insn)
1705
+{
1706
+ u8 class = BPF_CLASS(insn->code);
1707
+
1708
+ return (class == BPF_JMP || class == BPF_JMP32 ||
1709
+ class == BPF_STX || class == BPF_ST);
1710
+}
1711
+
1712
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
1713
+static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
1714
+{
1715
+ if (insn_no_def(insn))
1716
+ return false;
1717
+
1718
+ return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
1719
+}
1720
+
1721
+static void mark_insn_zext(struct bpf_verifier_env *env,
1722
+ struct bpf_reg_state *reg)
1723
+{
1724
+ s32 def_idx = reg->subreg_def;
1725
+
1726
+ if (def_idx == DEF_NOT_SUBREG)
1727
+ return;
1728
+
1729
+ env->insn_aux_data[def_idx - 1].zext_dst = true;
1730
+ /* The dst will be zero extended, so won't be sub-register anymore. */
1731
+ reg->subreg_def = DEF_NOT_SUBREG;
9061732 }
9071733
9081734 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
....@@ -910,34 +1736,631 @@
9101736 {
9111737 struct bpf_verifier_state *vstate = env->cur_state;
9121738 struct bpf_func_state *state = vstate->frame[vstate->curframe];
913
- struct bpf_reg_state *regs = state->regs;
1739
+ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
1740
+ struct bpf_reg_state *reg, *regs = state->regs;
1741
+ bool rw64;
9141742
9151743 if (regno >= MAX_BPF_REG) {
9161744 verbose(env, "R%d is invalid\n", regno);
9171745 return -EINVAL;
9181746 }
9191747
1748
+ reg = &regs[regno];
1749
+ rw64 = is_reg64(env, insn, regno, reg, t);
9201750 if (t == SRC_OP) {
9211751 /* check whether register used as source operand can be read */
922
- if (regs[regno].type == NOT_INIT) {
1752
+ if (reg->type == NOT_INIT) {
9231753 verbose(env, "R%d !read_ok\n", regno);
9241754 return -EACCES;
9251755 }
9261756 /* We don't need to worry about FP liveness because it's read-only */
927
- if (regno != BPF_REG_FP)
928
- return mark_reg_read(env, &regs[regno],
929
- regs[regno].parent);
1757
+ if (regno == BPF_REG_FP)
1758
+ return 0;
1759
+
1760
+ if (rw64)
1761
+ mark_insn_zext(env, reg);
1762
+
1763
+ return mark_reg_read(env, reg, reg->parent,
1764
+ rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
9301765 } else {
9311766 /* check whether register used as dest operand can be written to */
9321767 if (regno == BPF_REG_FP) {
9331768 verbose(env, "frame pointer is read only\n");
9341769 return -EACCES;
9351770 }
936
- regs[regno].live |= REG_LIVE_WRITTEN;
1771
+ reg->live |= REG_LIVE_WRITTEN;
1772
+ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
9371773 if (t == DST_OP)
9381774 mark_reg_unknown(env, regs, regno);
9391775 }
9401776 return 0;
1777
+}
1778
+
1779
+/* for any branch, call, exit record the history of jmps in the given state */
1780
+static int push_jmp_history(struct bpf_verifier_env *env,
1781
+ struct bpf_verifier_state *cur)
1782
+{
1783
+ u32 cnt = cur->jmp_history_cnt;
1784
+ struct bpf_idx_pair *p;
1785
+
1786
+ cnt++;
1787
+ p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
1788
+ if (!p)
1789
+ return -ENOMEM;
1790
+ p[cnt - 1].idx = env->insn_idx;
1791
+ p[cnt - 1].prev_idx = env->prev_insn_idx;
1792
+ cur->jmp_history = p;
1793
+ cur->jmp_history_cnt = cnt;
1794
+ return 0;
1795
+}
1796
+
1797
+/* Backtrack one insn at a time. If idx is not at the top of recorded
1798
+ * history then previous instruction came from straight line execution.
1799
+ */
1800
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
1801
+ u32 *history)
1802
+{
1803
+ u32 cnt = *history;
1804
+
1805
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
1806
+ i = st->jmp_history[cnt - 1].prev_idx;
1807
+ (*history)--;
1808
+ } else {
1809
+ i--;
1810
+ }
1811
+ return i;
1812
+}
1813
+
1814
+/* For given verifier state backtrack_insn() is called from the last insn to
1815
+ * the first insn. Its purpose is to compute a bitmask of registers and
1816
+ * stack slots that needs precision in the parent verifier state.
1817
+ */
1818
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
1819
+ u32 *reg_mask, u64 *stack_mask)
1820
+{
1821
+ const struct bpf_insn_cbs cbs = {
1822
+ .cb_print = verbose,
1823
+ .private_data = env,
1824
+ };
1825
+ struct bpf_insn *insn = env->prog->insnsi + idx;
1826
+ u8 class = BPF_CLASS(insn->code);
1827
+ u8 opcode = BPF_OP(insn->code);
1828
+ u8 mode = BPF_MODE(insn->code);
1829
+ u32 dreg = 1u << insn->dst_reg;
1830
+ u32 sreg = 1u << insn->src_reg;
1831
+ u32 spi;
1832
+
1833
+ if (insn->code == 0)
1834
+ return 0;
1835
+ if (env->log.level & BPF_LOG_LEVEL) {
1836
+ verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
1837
+ verbose(env, "%d: ", idx);
1838
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
1839
+ }
1840
+
1841
+ if (class == BPF_ALU || class == BPF_ALU64) {
1842
+ if (!(*reg_mask & dreg))
1843
+ return 0;
1844
+ if (opcode == BPF_MOV) {
1845
+ if (BPF_SRC(insn->code) == BPF_X) {
1846
+ /* dreg = sreg
1847
+ * dreg needs precision after this insn
1848
+ * sreg needs precision before this insn
1849
+ */
1850
+ *reg_mask &= ~dreg;
1851
+ *reg_mask |= sreg;
1852
+ } else {
1853
+ /* dreg = K
1854
+ * dreg needs precision after this insn.
1855
+ * Corresponding register is already marked
1856
+ * as precise=true in this verifier state.
1857
+ * No further markings in parent are necessary
1858
+ */
1859
+ *reg_mask &= ~dreg;
1860
+ }
1861
+ } else {
1862
+ if (BPF_SRC(insn->code) == BPF_X) {
1863
+ /* dreg += sreg
1864
+ * both dreg and sreg need precision
1865
+ * before this insn
1866
+ */
1867
+ *reg_mask |= sreg;
1868
+ } /* else dreg += K
1869
+ * dreg still needs precision before this insn
1870
+ */
1871
+ }
1872
+ } else if (class == BPF_LDX) {
1873
+ if (!(*reg_mask & dreg))
1874
+ return 0;
1875
+ *reg_mask &= ~dreg;
1876
+
1877
+ /* scalars can only be spilled into stack w/o losing precision.
1878
+ * Load from any other memory can be zero extended.
1879
+ * The desire to keep that precision is already indicated
1880
+ * by 'precise' mark in corresponding register of this state.
1881
+ * No further tracking necessary.
1882
+ */
1883
+ if (insn->src_reg != BPF_REG_FP)
1884
+ return 0;
1885
+
1886
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
1887
+ * that [fp - off] slot contains scalar that needs to be
1888
+ * tracked with precision
1889
+ */
1890
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
1891
+ if (spi >= 64) {
1892
+ verbose(env, "BUG spi %d\n", spi);
1893
+ WARN_ONCE(1, "verifier backtracking bug");
1894
+ return -EFAULT;
1895
+ }
1896
+ *stack_mask |= 1ull << spi;
1897
+ } else if (class == BPF_STX || class == BPF_ST) {
1898
+ if (*reg_mask & dreg)
1899
+ /* stx & st shouldn't be using _scalar_ dst_reg
1900
+ * to access memory. It means backtracking
1901
+ * encountered a case of pointer subtraction.
1902
+ */
1903
+ return -ENOTSUPP;
1904
+ /* scalars can only be spilled into stack */
1905
+ if (insn->dst_reg != BPF_REG_FP)
1906
+ return 0;
1907
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
1908
+ if (spi >= 64) {
1909
+ verbose(env, "BUG spi %d\n", spi);
1910
+ WARN_ONCE(1, "verifier backtracking bug");
1911
+ return -EFAULT;
1912
+ }
1913
+ if (!(*stack_mask & (1ull << spi)))
1914
+ return 0;
1915
+ *stack_mask &= ~(1ull << spi);
1916
+ if (class == BPF_STX)
1917
+ *reg_mask |= sreg;
1918
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
1919
+ if (opcode == BPF_CALL) {
1920
+ if (insn->src_reg == BPF_PSEUDO_CALL)
1921
+ return -ENOTSUPP;
1922
+ /* regular helper call sets R0 */
1923
+ *reg_mask &= ~1;
1924
+ if (*reg_mask & 0x3f) {
1925
+ /* if backtracing was looking for registers R1-R5
1926
+ * they should have been found already.
1927
+ */
1928
+ verbose(env, "BUG regs %x\n", *reg_mask);
1929
+ WARN_ONCE(1, "verifier backtracking bug");
1930
+ return -EFAULT;
1931
+ }
1932
+ } else if (opcode == BPF_EXIT) {
1933
+ return -ENOTSUPP;
1934
+ } else if (BPF_SRC(insn->code) == BPF_X) {
1935
+ if (!(*reg_mask & (dreg | sreg)))
1936
+ return 0;
1937
+ /* dreg <cond> sreg
1938
+ * Both dreg and sreg need precision before
1939
+ * this insn. If only sreg was marked precise
1940
+ * before it would be equally necessary to
1941
+ * propagate it to dreg.
1942
+ */
1943
+ *reg_mask |= (sreg | dreg);
1944
+ /* else dreg <cond> K
1945
+ * Only dreg still needs precision before
1946
+ * this insn, so for the K-based conditional
1947
+ * there is nothing new to be marked.
1948
+ */
1949
+ }
1950
+ } else if (class == BPF_LD) {
1951
+ if (!(*reg_mask & dreg))
1952
+ return 0;
1953
+ *reg_mask &= ~dreg;
1954
+ /* It's ld_imm64 or ld_abs or ld_ind.
1955
+ * For ld_imm64 no further tracking of precision
1956
+ * into parent is necessary
1957
+ */
1958
+ if (mode == BPF_IND || mode == BPF_ABS)
1959
+ /* to be analyzed */
1960
+ return -ENOTSUPP;
1961
+ }
1962
+ return 0;
1963
+}
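As a stand-alone illustration of the mask bookkeeping above (a hedged sketch, not part of the patch): bit i of reg_mask means "ri must be precise before the instruction being walked", so a "dreg += sreg" keeps the destination's request and adds the source, while a "dreg = K" discharges the request entirely.

#include <stdint.h>
#include <stdio.h>

/* Toy model of two of the cases handled above. Bit i set in *reg_mask
 * means "ri must be precise before this instruction". Illustrative only.
 */
static void backtrack_mov_imm(uint32_t *reg_mask, int dst)
{
	*reg_mask &= ~(1u << dst);	/* dreg = K: value fully known here */
}

static void backtrack_alu_reg(uint32_t *reg_mask, int dst, int src)
{
	if (*reg_mask & (1u << dst))
		*reg_mask |= 1u << src;	/* dreg += sreg: both needed before */
}

int main(void)
{
	uint32_t reg_mask = 1u << 6;		/* backtracking wants r6 precise */

	backtrack_alu_reg(&reg_mask, 6, 7);	/* walked back over "r6 += r7" */
	backtrack_mov_imm(&reg_mask, 7);	/* walked back over "r7 = 5"    */
	printf("reg_mask = %#x\n", (unsigned int)reg_mask);	/* 0x40: only r6 still open */
	return 0;
}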
1964
+
1965
+/* the scalar precision tracking algorithm:
1966
+ * . at the start all registers have precise=false.
1967
+ * . scalar ranges are tracked as normal through alu and jmp insns.
1968
+ * . once precise value of the scalar register is used in:
1969
+ * . ptr + scalar alu
1970
+ * . if (scalar cond K|scalar)
1971
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
1972
+ * backtrack through the verifier states and mark all registers and
1973
+ * stack slots with spilled constants that these scalar registers
1974
+ * should be precise.
1975
+ * . during state pruning two registers (or spilled stack slots)
1976
+ * are equivalent if both are not precise.
1977
+ *
1978
+ * Note the verifier cannot simply walk register parentage chain,
1979
+ * since many different registers and stack slots could have been
1980
+ * used to compute a single precise scalar.
1981
+ *
1982
+ * The approach of starting with precise=true for all registers and then
1983
+ * backtrack to mark a register as not precise when the verifier detects
1984
+ * that program doesn't care about specific value (e.g., when helper
1985
+ * takes register as ARG_ANYTHING parameter) is not safe.
1986
+ *
1987
+ * It's ok to walk single parentage chain of the verifier states.
1988
+ * It's possible that this backtracking will go all the way till 1st insn.
1989
+ * All other branches will be explored for needing precision later.
1990
+ *
1991
+ * The backtracking needs to deal with cases like:
1992
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
1993
+ * r9 -= r8
1994
+ * r5 = r9
1995
+ * if r5 > 0x79f goto pc+7
1996
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
1997
+ * r5 += 1
1998
+ * ...
1999
+ * call bpf_perf_event_output#25
2000
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
2001
+ *
2002
+ * and this case:
2003
+ * r6 = 1
2004
+ * call foo // uses callee's r6 inside to compute r0
2005
+ * r0 += r6
2006
+ * if r0 == 0 goto
2007
+ *
2008
+ * to track the above, reg_mask/stack_mask need to be independent for each frame.
2009
+ *
2010
+ * Also if parent's curframe > frame where backtracking started,
2011
+ * the verifier needs to mark registers in both frames, otherwise callees
2012
+ * may incorrectly prune callers. This is similar to
2013
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
2014
+ *
2015
+ * For now backtracking falls back into conservative marking.
2016
+ */
2017
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
2018
+ struct bpf_verifier_state *st)
2019
+{
2020
+ struct bpf_func_state *func;
2021
+ struct bpf_reg_state *reg;
2022
+ int i, j;
2023
+
2024
+ /* big hammer: mark all scalars precise in this path.
2025
+ * pop_stack may still get !precise scalars.
2026
+ * We also skip current state and go straight to first parent state,
2027
+ * because precision markings in current non-checkpointed state are
2028
+ * not needed. See why in the comment in __mark_chain_precision below.
2029
+ */
2030
+ for (st = st->parent; st; st = st->parent) {
2031
+ for (i = 0; i <= st->curframe; i++) {
2032
+ func = st->frame[i];
2033
+ for (j = 0; j < BPF_REG_FP; j++) {
2034
+ reg = &func->regs[j];
2035
+ if (reg->type != SCALAR_VALUE)
2036
+ continue;
2037
+ reg->precise = true;
2038
+ }
2039
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
2040
+ if (!is_spilled_reg(&func->stack[j]))
2041
+ continue;
2042
+ reg = &func->stack[j].spilled_ptr;
2043
+ if (reg->type != SCALAR_VALUE)
2044
+ continue;
2045
+ reg->precise = true;
2046
+ }
2047
+ }
2048
+ }
2049
+}
2050
+
2051
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
2052
+{
2053
+ struct bpf_func_state *func;
2054
+ struct bpf_reg_state *reg;
2055
+ int i, j;
2056
+
2057
+ for (i = 0; i <= st->curframe; i++) {
2058
+ func = st->frame[i];
2059
+ for (j = 0; j < BPF_REG_FP; j++) {
2060
+ reg = &func->regs[j];
2061
+ if (reg->type != SCALAR_VALUE)
2062
+ continue;
2063
+ reg->precise = false;
2064
+ }
2065
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
2066
+ if (!is_spilled_reg(&func->stack[j]))
2067
+ continue;
2068
+ reg = &func->stack[j].spilled_ptr;
2069
+ if (reg->type != SCALAR_VALUE)
2070
+ continue;
2071
+ reg->precise = false;
2072
+ }
2073
+ }
2074
+}
2075
+
2076
+/*
2077
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
2078
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
2079
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
2080
+ * SCALARS, as well as any other registers and slots that contribute to
2081
+ * a tracked state of given registers/stack slots, depending on specific BPF
2082
+ * assembly instructions (see backtrack_insns() for exact instruction handling
2083
+ * logic). This backtracking relies on recorded jmp_history and is able to
2084
+ * traverse entire chain of parent states. This process ends only when all the
2085
+ * necessary registers/slots and their transitive dependencies are marked as
2086
+ * precise.
2087
+ *
2088
+ * One important and subtle aspect is that precise marks *do not matter* in
2089
+ * the currently verified state (current state). It is important to understand
2090
+ * why this is the case.
2091
+ *
2092
+ * First, note that current state is the state that is not yet "checkpointed",
2093
+ * i.e., it is not yet put into env->explored_states, and it has no children
2094
+ * states as well. It's ephemeral, and can end up either a) being discarded if
2095
+ * compatible explored state is found at some point or BPF_EXIT instruction is
2096
+ * reached or b) checkpointed and put into env->explored_states, branching out
2097
+ * into one or more children states.
2098
+ *
2099
+ * In the former case, precise markings in current state are completely
2100
+ * ignored by state comparison code (see regsafe() for details). Only
2101
+ * checkpointed ("old") state precise markings are important, and if old
2102
+ * state's register/slot is precise, regsafe() assumes current state's
2103
+ * register/slot as precise and checks value ranges exactly and precisely. If
2104
+ * states turn out to be compatible, current state's necessary precise
2105
+ * markings and any required parent states' precise markings are enforced
2106
+ * after the fact with propagate_precision() logic. But it's
2107
+ * important to realize that in this case, even after marking current state
2108
+ * registers/slots as precise, we immediately discard current state. So what
2109
+ * actually matters is any of the precise markings propagated into current
2110
+ * state's parent states, which are always checkpointed (due to b) case above).
2111
+ * As such, for scenario a) it doesn't matter if current state has precise
2112
+ * markings set or not.
2113
+ *
2114
+ * Now, for the scenario b), checkpointing and forking into child(ren)
2115
+ * state(s). Note that before current state gets to checkpointing step, any
2116
+ * processed instruction always assumes precise SCALAR register/slot
2117
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
2118
+ * verifier takes this opportunity enthusiastically. Similarly, when
2119
+ * register's value is used to calculate offset or memory address, exact
2120
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
2121
+ * what we mentioned above about state comparison ignoring precise markings
2122
+ * during state comparison, BPF verifier ignores and also assumes precise
2123
+ * markings *at will* during instruction verification process. But as verifier
2124
+ * assumes precision, it also propagates any precision dependencies across
2125
+ * parent states, which are not yet finalized, so can be further restricted
2126
+ * based on new knowledge gained from restrictions enforced by their children
2127
+ * states. This is so that once those parent states are finalized, i.e., when
2128
+ * they have no more active children state, state comparison logic in
2129
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
2130
+ * required for correctness.
2131
+ *
2132
+ * To build a bit more intuition, note also that once a state is checkpointed,
2133
+ * the path we took to get to that state is not important. This is crucial
2134
+ * property for state pruning. When state is checkpointed and finalized at
2135
+ * some instruction index, it can be correctly and safely used to "short
2136
+ * circuit" any *compatible* state that reaches exactly the same instruction
2137
+ * index. I.e., if we jumped to that instruction from a completely different
2138
+ * code path than original finalized state was derived from, it doesn't
2139
+ * matter, current state can be discarded because from that instruction
2140
+ * forward having a compatible state will ensure we will safely reach the
2141
+ * exit. States describe preconditions for further exploration, but completely
2142
+ * forget the history of how we got here.
2143
+ *
2144
+ * This also means that even if we needed precise SCALAR range to get to
2145
+ * finalized state, but from that point forward *that same* SCALAR register is
2146
+ * never used in a precise context (i.e., its precise value is not needed for
2147
+ * correctness), it's correct and safe to mark such register as "imprecise"
2148
+ * (i.e., precise marking set to false). This is what we rely on when we do
2149
+ * not set precise marking in current state. If no child state requires
2150
+ * precision for any given SCALAR register, it's safe to dictate that it can
2151
+ * be imprecise. If any child state does require this register to be precise,
2152
+ * we'll mark it precise later retroactively during precise markings
2153
+ * propagation from child state to parent states.
2154
+ *
2155
+ * Skipping precise marking setting in current state is a mild version of
2156
+ * relying on the above observation. But we can utilize this property even
2157
+ * more aggressively by proactively forgetting any precise marking in the
2158
+ * current state (which we inherited from the parent state), right before we
2159
+ * checkpoint it and branch off into new child state. This is done by
2160
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
2161
+ * finalized states which help in short circuiting more future states.
2162
+ */
2163
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
2164
+ int spi)
2165
+{
2166
+ struct bpf_verifier_state *st = env->cur_state;
2167
+ int first_idx = st->first_insn_idx;
2168
+ int last_idx = env->insn_idx;
2169
+ struct bpf_func_state *func;
2170
+ struct bpf_reg_state *reg;
2171
+ u32 reg_mask = regno >= 0 ? 1u << regno : 0;
2172
+ u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
2173
+ bool skip_first = true;
2174
+ bool new_marks = false;
2175
+ int i, err;
2176
+
2177
+ if (!env->bpf_capable)
2178
+ return 0;
2179
+
2180
+ /* Do sanity checks against current state of register and/or stack
2181
+ * slot, but don't set precise flag in current state, as precision
2182
+ * tracking in the current state is unnecessary.
2183
+ */
2184
+ func = st->frame[frame];
2185
+ if (regno >= 0) {
2186
+ reg = &func->regs[regno];
2187
+ if (reg->type != SCALAR_VALUE) {
2188
+ WARN_ONCE(1, "backtracing misuse");
2189
+ return -EFAULT;
2190
+ }
2191
+ new_marks = true;
2192
+ }
2193
+
2194
+ while (spi >= 0) {
2195
+ if (!is_spilled_reg(&func->stack[spi])) {
2196
+ stack_mask = 0;
2197
+ break;
2198
+ }
2199
+ reg = &func->stack[spi].spilled_ptr;
2200
+ if (reg->type != SCALAR_VALUE) {
2201
+ stack_mask = 0;
2202
+ break;
2203
+ }
2204
+ new_marks = true;
2205
+ break;
2206
+ }
2207
+
2208
+ if (!new_marks)
2209
+ return 0;
2210
+ if (!reg_mask && !stack_mask)
2211
+ return 0;
2212
+
2213
+ for (;;) {
2214
+ DECLARE_BITMAP(mask, 64);
2215
+ u32 history = st->jmp_history_cnt;
2216
+
2217
+ if (env->log.level & BPF_LOG_LEVEL)
2218
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
2219
+
2220
+ if (last_idx < 0) {
2221
+ /* we are at the entry into subprog, which
2222
+ * is expected for global funcs, but only if
2223
+ * requested precise registers are R1-R5
2224
+ * (which are global func's input arguments)
2225
+ */
2226
+ if (st->curframe == 0 &&
2227
+ st->frame[0]->subprogno > 0 &&
2228
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
2229
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
2230
+ bitmap_from_u64(mask, reg_mask);
2231
+ for_each_set_bit(i, mask, 32) {
2232
+ reg = &st->frame[0]->regs[i];
2233
+ if (reg->type != SCALAR_VALUE) {
2234
+ reg_mask &= ~(1u << i);
2235
+ continue;
2236
+ }
2237
+ reg->precise = true;
2238
+ }
2239
+ return 0;
2240
+ }
2241
+
2242
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
2243
+ st->frame[0]->subprogno, reg_mask, stack_mask);
2244
+ WARN_ONCE(1, "verifier backtracking bug");
2245
+ return -EFAULT;
2246
+ }
2247
+
2248
+ for (i = last_idx;;) {
2249
+ if (skip_first) {
2250
+ err = 0;
2251
+ skip_first = false;
2252
+ } else {
2253
+ err = backtrack_insn(env, i, &reg_mask, &stack_mask);
2254
+ }
2255
+ if (err == -ENOTSUPP) {
2256
+ mark_all_scalars_precise(env, st);
2257
+ return 0;
2258
+ } else if (err) {
2259
+ return err;
2260
+ }
2261
+ if (!reg_mask && !stack_mask)
2262
+ /* Found assignment(s) into tracked register in this state.
2263
+ * Since this state is already marked, just return.
2264
+ * Nothing to be tracked further in the parent state.
2265
+ */
2266
+ return 0;
2267
+ if (i == first_idx)
2268
+ break;
2269
+ i = get_prev_insn_idx(st, i, &history);
2270
+ if (i >= env->prog->len) {
2271
+ /* This can happen if backtracking reached insn 0
2272
+ * and there are still reg_mask or stack_mask
2273
+ * to backtrack.
2274
+ * It means the backtracking missed the spot where
2275
+ * a particular register was initialized with a constant.
2276
+ */
2277
+ verbose(env, "BUG backtracking idx %d\n", i);
2278
+ WARN_ONCE(1, "verifier backtracking bug");
2279
+ return -EFAULT;
2280
+ }
2281
+ }
2282
+ st = st->parent;
2283
+ if (!st)
2284
+ break;
2285
+
2286
+ new_marks = false;
2287
+ func = st->frame[frame];
2288
+ bitmap_from_u64(mask, reg_mask);
2289
+ for_each_set_bit(i, mask, 32) {
2290
+ reg = &func->regs[i];
2291
+ if (reg->type != SCALAR_VALUE) {
2292
+ reg_mask &= ~(1u << i);
2293
+ continue;
2294
+ }
2295
+ if (!reg->precise)
2296
+ new_marks = true;
2297
+ reg->precise = true;
2298
+ }
2299
+
2300
+ bitmap_from_u64(mask, stack_mask);
2301
+ for_each_set_bit(i, mask, 64) {
2302
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
2303
+ /* the sequence of instructions:
2304
+ * 2: (bf) r3 = r10
2305
+ * 3: (7b) *(u64 *)(r3 -8) = r0
2306
+ * 4: (79) r4 = *(u64 *)(r10 -8)
2307
+ * doesn't contain jmps. It's backtracked
2308
+ * as a single block.
2309
+ * During backtracking insn 3 is not recognized as
2310
+ * stack access, so at the end of backtracking
2311
+ * stack slot fp-8 is still marked in stack_mask.
2312
+ * However the parent state may not have accessed
2313
+ * fp-8 and it's "unallocated" stack space.
2314
+ * In such a case, fall back to conservative marking.
2315
+ */
2316
+ mark_all_scalars_precise(env, st);
2317
+ return 0;
2318
+ }
2319
+
2320
+ if (!is_spilled_reg(&func->stack[i])) {
2321
+ stack_mask &= ~(1ull << i);
2322
+ continue;
2323
+ }
2324
+ reg = &func->stack[i].spilled_ptr;
2325
+ if (reg->type != SCALAR_VALUE) {
2326
+ stack_mask &= ~(1ull << i);
2327
+ continue;
2328
+ }
2329
+ if (!reg->precise)
2330
+ new_marks = true;
2331
+ reg->precise = true;
2332
+ }
2333
+ if (env->log.level & BPF_LOG_LEVEL) {
2334
+ print_verifier_state(env, func);
2335
+ verbose(env, "parent %s regs=%x stack=%llx marks\n",
2336
+ new_marks ? "didn't have" : "already had",
2337
+ reg_mask, stack_mask);
2338
+ }
2339
+
2340
+ if (!reg_mask && !stack_mask)
2341
+ break;
2342
+ if (!new_marks)
2343
+ break;
2344
+
2345
+ last_idx = st->last_insn_idx;
2346
+ first_idx = st->first_insn_idx;
2347
+ }
2348
+ return 0;
2349
+}
2350
+
2351
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
2352
+{
2353
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
2354
+}
2355
+
2356
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
2357
+{
2358
+ return __mark_chain_precision(env, frame, regno, -1);
2359
+}
2360
+
2361
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
2362
+{
2363
+ return __mark_chain_precision(env, frame, -1, spi);
9412364 }
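For intuition, a hedged walkthrough in the notation the comments above already use (instruction indexes and registers are invented):

  3: r6 = 8
  4: *(u64 *)(r10 - 16) = r6
  ...
  9: r3 = *(u64 *)(r10 - 16)
 10: r2 += r3

Insn 4 spills r6, insn 9 fills r3 from the same slot, and at insn 10 r3 is added to a map value pointer, so its exact value matters. A call like mark_chain_precision(env, BPF_REG_3) then walks backwards: insn 9 moves the request from r3 in reg_mask to the fp-16 slot in stack_mask, insn 4 moves it from that slot to r6, and insn 3 assigns a constant to r6, emptying both masks and ending the walk. Registers and slots that are still pending when the walk crosses into a checkpointed parent state are marked precise=true there.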
9422365
9432366 static bool is_spillable_regtype(enum bpf_reg_type type)
....@@ -950,7 +2373,24 @@
9502373 case PTR_TO_PACKET:
9512374 case PTR_TO_PACKET_META:
9522375 case PTR_TO_PACKET_END:
2376
+ case PTR_TO_FLOW_KEYS:
9532377 case CONST_PTR_TO_MAP:
2378
+ case PTR_TO_SOCKET:
2379
+ case PTR_TO_SOCKET_OR_NULL:
2380
+ case PTR_TO_SOCK_COMMON:
2381
+ case PTR_TO_SOCK_COMMON_OR_NULL:
2382
+ case PTR_TO_TCP_SOCK:
2383
+ case PTR_TO_TCP_SOCK_OR_NULL:
2384
+ case PTR_TO_XDP_SOCK:
2385
+ case PTR_TO_BTF_ID:
2386
+ case PTR_TO_BTF_ID_OR_NULL:
2387
+ case PTR_TO_RDONLY_BUF:
2388
+ case PTR_TO_RDONLY_BUF_OR_NULL:
2389
+ case PTR_TO_RDWR_BUF:
2390
+ case PTR_TO_RDWR_BUF_OR_NULL:
2391
+ case PTR_TO_PERCPU_BTF_ID:
2392
+ case PTR_TO_MEM:
2393
+ case PTR_TO_MEM_OR_NULL:
9542394 return true;
9552395 default:
9562396 return false;
....@@ -968,31 +2408,80 @@
9682408 return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
9692409 }
9702410
2411
+static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
2412
+{
2413
+ return tnum_is_unknown(reg->var_off) &&
2414
+ reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
2415
+ reg->umin_value == 0 && reg->umax_value == U64_MAX &&
2416
+ reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
2417
+ reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
2418
+}
2419
+
2420
+static bool register_is_bounded(struct bpf_reg_state *reg)
2421
+{
2422
+ return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
2423
+}
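A minimal stand-alone sketch of what "completely unbounded" means above (the tnum/var_off test is omitted and the field names only mirror the kernel struct): a freshly loaded scalar has every 64- and 32-bit range at its widest, and tightening any one of them, e.g. by falling through an "if r1 > 100" check, makes register_is_bounded() true.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_reg {
	int64_t  smin, smax;
	uint64_t umin, umax;
	int32_t  s32_min, s32_max;
	uint32_t u32_min, u32_max;
};

static bool toy_is_unbounded(const struct toy_reg *r)
{
	return r->smin == INT64_MIN && r->smax == INT64_MAX &&
	       r->umin == 0 && r->umax == UINT64_MAX &&
	       r->s32_min == INT32_MIN && r->s32_max == INT32_MAX &&
	       r->u32_min == 0 && r->u32_max == UINT32_MAX;
}

int main(void)
{
	struct toy_reg r = {
		INT64_MIN, INT64_MAX, 0, UINT64_MAX,
		INT32_MIN, INT32_MAX, 0, UINT32_MAX,
	};

	printf("fresh scalar unbounded: %d\n", toy_is_unbounded(&r));
	r.umax = 100;	/* e.g. after falling through "if r1 > 100 goto ..." */
	printf("after the guard:        %d\n", toy_is_unbounded(&r));
	return 0;
}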
2424
+
2425
+static bool __is_pointer_value(bool allow_ptr_leaks,
2426
+ const struct bpf_reg_state *reg)
2427
+{
2428
+ if (allow_ptr_leaks)
2429
+ return false;
2430
+
2431
+ return reg->type != SCALAR_VALUE;
2432
+}
2433
+
2434
+/* Copy src state preserving dst->parent and dst->live fields */
2435
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
2436
+{
2437
+ struct bpf_reg_state *parent = dst->parent;
2438
+ enum bpf_reg_liveness live = dst->live;
2439
+
2440
+ *dst = *src;
2441
+ dst->parent = parent;
2442
+ dst->live = live;
2443
+}
2444
+
9712445 static void save_register_state(struct bpf_func_state *state,
972
- int spi, struct bpf_reg_state *reg)
2446
+ int spi, struct bpf_reg_state *reg,
2447
+ int size)
9732448 {
9742449 int i;
9752450
976
- state->stack[spi].spilled_ptr = *reg;
977
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
2451
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
2452
+ if (size == BPF_REG_SIZE)
2453
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
9782454
979
- for (i = 0; i < BPF_REG_SIZE; i++)
980
- state->stack[spi].slot_type[i] = STACK_SPILL;
2455
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
2456
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
2457
+
2458
+ /* size < 8 bytes spill */
2459
+ for (; i; i--)
2460
+ scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
9812461 }
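The byte-granular slot_type[] marking above can be visualized with a small stand-alone sketch (assumptions: BPF_REG_SIZE is 8, 'p'/'m' stand in for STACK_SPILL/STACK_MISC, and scrubbing is simplified to "set to MISC"): a 4-byte spill marks the top four slot_type[] entries as spilled and scrubs the rest.

#include <stdio.h>

#define TOY_REG_SIZE 8
enum { TOY_MISC = 'm', TOY_SPILL = 'p' };

static void toy_mark_spill(char slot_type[TOY_REG_SIZE], int size)
{
	int i;

	for (i = TOY_REG_SIZE; i > TOY_REG_SIZE - size; i--)
		slot_type[i - 1] = TOY_SPILL;
	for (; i; i--)
		slot_type[i - 1] = TOY_MISC;	/* simplified scrub */
}

int main(void)
{
	char slot[TOY_REG_SIZE + 1] = "........";

	toy_mark_spill(slot, 4);	/* e.g. *(u32 *)(r10 - 8) = w1 */
	printf("%s\n", slot);		/* prints "mmmmpppp" */
	return 0;
}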
9822462
983
-/* check_stack_read/write functions track spill/fill of registers,
2463
+static bool is_bpf_st_mem(struct bpf_insn *insn)
2464
+{
2465
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
2466
+}
2467
+
2468
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
9842469 * stack boundary and alignment are checked in check_mem_access()
9852470 */
986
-static int check_stack_write(struct bpf_verifier_env *env,
987
- struct bpf_func_state *state, /* func where register points to */
988
- int off, int size, int value_regno, int insn_idx)
2471
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
2472
+ /* stack frame we're writing to */
2473
+ struct bpf_func_state *state,
2474
+ int off, int size, int value_regno,
2475
+ int insn_idx)
9892476 {
9902477 struct bpf_func_state *cur; /* state of the current function */
9912478 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
2479
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
9922480 struct bpf_reg_state *reg = NULL;
2481
+ u32 dst_reg = insn->dst_reg;
9932482
9942483 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
995
- true);
2484
+ state->acquired_refs, true);
9962485 if (err)
9972486 return err;
9982487 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
....@@ -1008,11 +2497,13 @@
10082497 cur = env->cur_state->frame[env->cur_state->curframe];
10092498 if (value_regno >= 0)
10102499 reg = &cur->regs[value_regno];
1011
- if (!env->allow_ptr_leaks) {
2500
+ if (!env->bypass_spec_v4) {
10122501 bool sanitize = reg && is_spillable_regtype(reg->type);
10132502
10142503 for (i = 0; i < size; i++) {
1015
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
2504
+ u8 type = state->stack[spi].slot_type[i];
2505
+
2506
+ if (type != STACK_MISC && type != STACK_ZERO) {
10162507 sanitize = true;
10172508 break;
10182509 }
....@@ -1022,12 +2513,34 @@
10222513 env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
10232514 }
10242515
1025
- if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
1026
- !register_is_null(reg) && env->allow_ptr_leaks) {
1027
- save_register_state(state, spi, reg);
2516
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
2517
+ !register_is_null(reg) && env->bpf_capable) {
2518
+ if (dst_reg != BPF_REG_FP) {
2519
+ /* The backtracking logic can only recognize explicit
2520
+ * stack slot address like [fp - 8]. Other spill of
2521
+ * scalar via a different register has to be conservative.
2522
+ * Backtrack from here and mark all registers as precise
2523
+ * that contributed into 'reg' being a constant.
2524
+ */
2525
+ err = mark_chain_precision(env, value_regno);
2526
+ if (err)
2527
+ return err;
2528
+ }
2529
+ save_register_state(state, spi, reg, size);
2530
+ /* Break the relation on a narrowing spill. */
2531
+ if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
2532
+ state->stack[spi].spilled_ptr.id = 0;
2533
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
2534
+ insn->imm != 0 && env->bpf_capable) {
2535
+ struct bpf_reg_state fake_reg = {};
2536
+
2537
+ __mark_reg_known(&fake_reg, (u32)insn->imm);
2538
+ fake_reg.type = SCALAR_VALUE;
2539
+ save_register_state(state, spi, &fake_reg, size);
10282540 } else if (reg && is_spillable_regtype(reg->type)) {
10292541 /* register containing pointer is being spilled into stack */
10302542 if (size != BPF_REG_SIZE) {
2543
+ verbose_linfo(env, insn_idx, "; ");
10312544 verbose(env, "invalid size of register spill\n");
10322545 return -EACCES;
10332546 }
....@@ -1035,16 +2548,16 @@
10352548 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
10362549 return -EINVAL;
10372550 }
1038
- save_register_state(state, spi, reg);
2551
+ save_register_state(state, spi, reg, size);
10392552 } else {
10402553 u8 type = STACK_MISC;
10412554
10422555 /* regular write of data into stack destroys any spilled ptr */
10432556 state->stack[spi].spilled_ptr.type = NOT_INIT;
10442557 /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
1045
- if (state->stack[spi].slot_type[0] == STACK_SPILL)
2558
+ if (is_spilled_reg(&state->stack[spi]))
10462559 for (i = 0; i < BPF_REG_SIZE; i++)
1047
- state->stack[spi].slot_type[i] = STACK_MISC;
2560
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
10482561
10492562 /* only mark the slot as written if all 8 bytes were written
10502563 * otherwise read propagation may incorrectly stop too soon
....@@ -1058,8 +2571,14 @@
10582571 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
10592572
10602573 /* when we zero initialize stack slots mark them as such */
1061
- if (reg && register_is_null(reg))
2574
+ if ((reg && register_is_null(reg)) ||
2575
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
2576
+ /* backtracking doesn't work for STACK_ZERO yet. */
2577
+ err = mark_chain_precision(env, value_regno);
2578
+ if (err)
2579
+ return err;
10622580 type = STACK_ZERO;
2581
+ }
10632582
10642583 /* Mark slots affected by this stack write. */
10652584 for (i = 0; i < size; i++)
....@@ -1069,140 +2588,477 @@
10692588 return 0;
10702589 }
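A hedged example of the BPF_ST handling above (instruction indexes are invented):

  1: *(u64 *)(r10 - 8) = 42
  2: r1 = *(u64 *)(r10 - 8)
  3: if r1 != 42 goto +5

Because insn 1 has no source register, the is_bpf_st_mem() branch builds a temporary known scalar (fake_reg) holding 42 and spills that, so the fill at insn 2 restores a constant rather than STACK_MISC data and the comparison at insn 3 can be evaluated exactly. A store of 0 instead takes the STACK_ZERO path further down.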
10712590
1072
-static int check_stack_read(struct bpf_verifier_env *env,
1073
- struct bpf_func_state *reg_state /* func where register points to */,
1074
- int off, int size, int value_regno)
2591
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
2592
+ * known to contain a variable offset.
2593
+ * This function checks whether the write is permitted and conservatively
2594
+ * tracks the effects of the write, considering that each stack slot in the
2595
+ * dynamic range is potentially written to.
2596
+ *
2597
+ * 'off' includes 'regno->off'.
2598
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
2599
+ * the stack.
2600
+ *
2601
+ * Spilled pointers in range are not marked as written because we don't know
2602
+ * what's going to be actually written. This means that read propagation for
2603
+ * future reads cannot be terminated by this write.
2604
+ *
2605
+ * For privileged programs, uninitialized stack slots are considered
2606
+ * initialized by this write (even though we don't know exactly what offsets
2607
+ * are going to be written to). The idea is that we don't want the verifier to
2608
+ * reject future reads that access slots written to through variable offsets.
2609
+ */
2610
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
2611
+ /* func where register points to */
2612
+ struct bpf_func_state *state,
2613
+ int ptr_regno, int off, int size,
2614
+ int value_regno, int insn_idx)
2615
+{
2616
+ struct bpf_func_state *cur; /* state of the current function */
2617
+ int min_off, max_off;
2618
+ int i, err;
2619
+ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
2620
+ bool writing_zero = false;
2621
+ /* set if the fact that we're writing a zero is used to let any
2622
+ * stack slots remain STACK_ZERO
2623
+ */
2624
+ bool zero_used = false;
2625
+
2626
+ cur = env->cur_state->frame[env->cur_state->curframe];
2627
+ ptr_reg = &cur->regs[ptr_regno];
2628
+ min_off = ptr_reg->smin_value + off;
2629
+ max_off = ptr_reg->smax_value + off + size;
2630
+ if (value_regno >= 0)
2631
+ value_reg = &cur->regs[value_regno];
2632
+ if (value_reg && register_is_null(value_reg))
2633
+ writing_zero = true;
2634
+
2635
+ err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
2636
+ state->acquired_refs, true);
2637
+ if (err)
2638
+ return err;
2639
+
2640
+
2641
+ /* Variable offset writes destroy any spilled pointers in range. */
2642
+ for (i = min_off; i < max_off; i++) {
2643
+ u8 new_type, *stype;
2644
+ int slot, spi;
2645
+
2646
+ slot = -i - 1;
2647
+ spi = slot / BPF_REG_SIZE;
2648
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
2649
+
2650
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
2651
+ /* Reject the write if range we may write to has not
2652
+ * been initialized beforehand. If we didn't reject
2653
+ * here, the ptr status would be erased below (even
2654
+ * though not all slots are actually overwritten),
2655
+ * possibly opening the door to leaks.
2656
+ *
2657
+ * We do however catch STACK_INVALID case below, and
2658
+ * only allow reading possibly uninitialized memory
2659
+ * later for CAP_PERFMON, as the write may not happen to
2660
+ * that slot.
2661
+ */
2662
+ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
2663
+ insn_idx, i);
2664
+ return -EINVAL;
2665
+ }
2666
+
2667
+ /* Erase all spilled pointers. */
2668
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
2669
+
2670
+ /* Update the slot type. */
2671
+ new_type = STACK_MISC;
2672
+ if (writing_zero && *stype == STACK_ZERO) {
2673
+ new_type = STACK_ZERO;
2674
+ zero_used = true;
2675
+ }
2676
+ /* If the slot is STACK_INVALID, we check whether it's OK to
2677
+ * pretend that it will be initialized by this write. The slot
2678
+ * might not actually be written to, and so if we mark it as
2679
+ * initialized future reads might leak uninitialized memory.
2680
+ * For privileged programs, we will accept such reads to slots
2681
+ * that may or may not be written because, if we're reject
2682
+ * them, the error would be too confusing.
2683
+ */
2684
+ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
2685
+ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
2686
+ insn_idx, i);
2687
+ return -EINVAL;
2688
+ }
2689
+ *stype = new_type;
2690
+ }
2691
+ if (zero_used) {
2692
+ /* backtracking doesn't work for STACK_ZERO yet. */
2693
+ err = mark_chain_precision(env, value_regno);
2694
+ if (err)
2695
+ return err;
2696
+ }
2697
+ return 0;
2698
+}
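A hedged numeric example of the range computed above (register contents are invented): if the pointer register is fp plus a scalar known to lie in [-24, -16] and the instruction writes 8 bytes at offset 0, then min_off = -24 and max_off = -8, so every stack byte from fp-24 up to fp-9 is treated as potentially written. For a privileged program, any spilled pointer in that range is erased and the bytes become STACK_MISC (or stay STACK_ZERO when the value being written is zero); an unprivileged program is rejected as soon as one byte in the range is neither STACK_MISC nor STACK_ZERO, and even a privileged one is refused on STACK_INVALID bytes unless reading uninitialized stack is permitted (allow_uninit_stack).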
2699
+
2700
+/* When register 'dst_regno' is assigned some values from stack[min_off,
2701
+ * max_off), we set the register's type according to the types of the
2702
+ * respective stack slots. If all the stack values are known to be zeros, then
2703
+ * so is the destination reg. Otherwise, the register is considered to be
2704
+ * SCALAR. This function does not deal with register filling; the caller must
2705
+ * ensure that all spilled registers in the stack range have been marked as
2706
+ * read.
2707
+ */
2708
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
2709
+ /* func where src register points to */
2710
+ struct bpf_func_state *ptr_state,
2711
+ int min_off, int max_off, int dst_regno)
2712
+{
2713
+ struct bpf_verifier_state *vstate = env->cur_state;
2714
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
2715
+ int i, slot, spi;
2716
+ u8 *stype;
2717
+ int zeros = 0;
2718
+
2719
+ for (i = min_off; i < max_off; i++) {
2720
+ slot = -i - 1;
2721
+ spi = slot / BPF_REG_SIZE;
2722
+ stype = ptr_state->stack[spi].slot_type;
2723
+ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
2724
+ break;
2725
+ zeros++;
2726
+ }
2727
+ if (zeros == max_off - min_off) {
2728
+ /* any access_size read into register is zero extended,
2729
+ * so the whole register == const_zero
2730
+ */
2731
+ __mark_reg_const_zero(&state->regs[dst_regno]);
2732
+ /* backtracking doesn't support STACK_ZERO yet,
2733
+ * so mark it precise here, so that later
2734
+ * backtracking can stop here.
2735
+ * Backtracking may not need this if this register
2736
+ * doesn't participate in pointer adjustment.
2737
+ * Forward propagation of precise flag is not
2738
+ * necessary either. This mark is only to stop
2739
+ * backtracking. Any register that contributed
2740
+ * to const 0 was marked precise before spill.
2741
+ */
2742
+ state->regs[dst_regno].precise = true;
2743
+ } else {
2744
+ /* have read misc data from the stack */
2745
+ mark_reg_unknown(env, state->regs, dst_regno);
2746
+ }
2747
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
2748
+}
2749
+
2750
+/* Read the stack at 'off' and put the results into the register indicated by
2751
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
2752
+ * spilled reg.
2753
+ *
2754
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
2755
+ * register.
2756
+ *
2757
+ * The access is assumed to be within the current stack bounds.
2758
+ */
2759
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
2760
+ /* func where src register points to */
2761
+ struct bpf_func_state *reg_state,
2762
+ int off, int size, int dst_regno)
10752763 {
10762764 struct bpf_verifier_state *vstate = env->cur_state;
10772765 struct bpf_func_state *state = vstate->frame[vstate->curframe];
10782766 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
10792767 struct bpf_reg_state *reg;
1080
- u8 *stype;
2768
+ u8 *stype, type;
10812769
1082
- if (reg_state->allocated_stack <= slot) {
1083
- verbose(env, "invalid read from stack off %d+0 size %d\n",
1084
- off, size);
1085
- return -EACCES;
1086
- }
10872770 stype = reg_state->stack[spi].slot_type;
10882771 reg = &reg_state->stack[spi].spilled_ptr;
10892772
1090
- if (stype[0] == STACK_SPILL) {
1091
- if (size != BPF_REG_SIZE) {
2773
+ if (is_spilled_reg(&reg_state->stack[spi])) {
2774
+ u8 spill_size = 1;
2775
+
2776
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
2777
+ spill_size++;
2778
+
2779
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
10922780 if (reg->type != SCALAR_VALUE) {
2781
+ verbose_linfo(env, env->insn_idx, "; ");
10932782 verbose(env, "invalid size of register fill\n");
10942783 return -EACCES;
10952784 }
1096
- if (value_regno >= 0) {
1097
- mark_reg_unknown(env, state->regs, value_regno);
1098
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2785
+
2786
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2787
+ if (dst_regno < 0)
2788
+ return 0;
2789
+
2790
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
2791
+ /* The earlier check_reg_arg() has decided the
2792
+ * subreg_def for this insn. Save it first.
2793
+ */
2794
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
2795
+
2796
+ copy_register_state(&state->regs[dst_regno], reg);
2797
+ state->regs[dst_regno].subreg_def = subreg_def;
2798
+ } else {
2799
+ for (i = 0; i < size; i++) {
2800
+ type = stype[(slot - i) % BPF_REG_SIZE];
2801
+ if (type == STACK_SPILL)
2802
+ continue;
2803
+ if (type == STACK_MISC)
2804
+ continue;
2805
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
2806
+ off, i, size);
2807
+ return -EACCES;
2808
+ }
2809
+ mark_reg_unknown(env, state->regs, dst_regno);
10992810 }
1100
- mark_reg_read(env, reg, reg->parent);
2811
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
11012812 return 0;
11022813 }
1103
- for (i = 1; i < BPF_REG_SIZE; i++) {
1104
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
1105
- verbose(env, "corrupted spill memory\n");
1106
- return -EACCES;
1107
- }
1108
- }
11092814
1110
- if (value_regno >= 0) {
2815
+ if (dst_regno >= 0) {
11112816 /* restore register state from stack */
1112
- state->regs[value_regno] = *reg;
2817
+ copy_register_state(&state->regs[dst_regno], reg);
11132818 /* mark reg as written since spilled pointer state likely
11142819 * has its liveness marks cleared by is_state_visited()
11152820 * which resets stack/reg liveness for state transitions
11162821 */
1117
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2822
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
2823
+ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
2824
+ /* If dst_regno==-1, the caller is asking us whether
2825
+ * it is acceptable to use this value as a SCALAR_VALUE
2826
+ * (e.g. for XADD).
2827
+ * We must not allow unprivileged callers to do that
2828
+ * with spilled pointers.
2829
+ */
2830
+ verbose(env, "leaking pointer from stack off %d\n",
2831
+ off);
2832
+ return -EACCES;
11182833 }
1119
- mark_reg_read(env, reg, reg->parent);
2834
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
11202835 } else {
1121
- int zeros = 0;
1122
-
11232836 for (i = 0; i < size; i++) {
1124
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
2837
+ type = stype[(slot - i) % BPF_REG_SIZE];
2838
+ if (type == STACK_MISC)
11252839 continue;
1126
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
1127
- zeros++;
2840
+ if (type == STACK_ZERO)
11282841 continue;
1129
- }
11302842 verbose(env, "invalid read from stack off %d+%d size %d\n",
11312843 off, i, size);
11322844 return -EACCES;
11332845 }
1134
- mark_reg_read(env, reg, reg->parent);
1135
- if (value_regno >= 0) {
1136
- if (zeros == size) {
1137
- /* any size read into register is zero extended,
1138
- * so the whole register == const_zero
1139
- */
1140
- __mark_reg_const_zero(&state->regs[value_regno]);
1141
- } else {
1142
- /* have read misc data from the stack */
1143
- mark_reg_unknown(env, state->regs, value_regno);
1144
- }
1145
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1146
- }
2846
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2847
+ if (dst_regno >= 0)
2848
+ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
11472849 }
11482850 return 0;
11492851 }
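A hedged example of the relaxed fill rules above (instruction indexes are invented; r1 is assumed to hold a bounded scalar and the program to be bpf_capable, so insn 1 is tracked as a register spill):

  1: *(u64 *)(r10 - 8) = r1
  2: r2 = *(u32 *)(r10 - 8)

Because the spilled value is a scalar, the narrow read at insn 2 is accepted, but since size != spill_size the destination r2 is marked as an unknown scalar rather than inheriting r1's exact bounds; only a full-width, aligned fill such as "r2 = *(u64 *)(r10 - 8)" copies the spilled register state via copy_register_state().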
11502852
1151
-static int check_stack_access(struct bpf_verifier_env *env,
1152
- const struct bpf_reg_state *reg,
1153
- int off, int size)
2853
+enum stack_access_src {
2854
+ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
2855
+ ACCESS_HELPER = 2, /* the access is performed by a helper */
2856
+};
2857
+
2858
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
2859
+ int regno, int off, int access_size,
2860
+ bool zero_size_allowed,
2861
+ enum stack_access_src type,
2862
+ struct bpf_call_arg_meta *meta);
2863
+
2864
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
11542865 {
1155
- /* Stack accesses must be at a fixed offset, so that we
1156
- * can determine what type of data were returned. See
1157
- * check_stack_read().
2866
+ return cur_regs(env) + regno;
2867
+}
2868
+
2869
+/* Read the stack at 'ptr_regno + off' and put the result into the register
2870
+ * 'dst_regno'.
2871
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
2872
+ * but not its variable offset.
2873
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
2874
+ *
2875
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
2876
+ * filling registers (i.e. reads of spilled register cannot be detected when
2877
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
2878
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
2879
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
2880
+ * instead.
2881
+ */
2882
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
2883
+ int ptr_regno, int off, int size, int dst_regno)
2884
+{
2885
+ /* The state of the source register. */
2886
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2887
+ struct bpf_func_state *ptr_state = func(env, reg);
2888
+ int err;
2889
+ int min_off, max_off;
2890
+
2891
+ /* Note that we pass a NULL meta, so raw access will not be permitted.
11582892 */
1159
- if (!tnum_is_const(reg->var_off)) {
2893
+ err = check_stack_range_initialized(env, ptr_regno, off, size,
2894
+ false, ACCESS_DIRECT, NULL);
2895
+ if (err)
2896
+ return err;
2897
+
2898
+ min_off = reg->smin_value + off;
2899
+ max_off = reg->smax_value + off;
2900
+ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
2901
+ return 0;
2902
+}
2903
+
2904
+/* check_stack_read dispatches to check_stack_read_fixed_off or
2905
+ * check_stack_read_var_off.
2906
+ *
2907
+ * The caller must ensure that the offset falls within the allocated stack
2908
+ * bounds.
2909
+ *
2910
+ * 'dst_regno' is a register which will receive the value from the stack. It
2911
+ * can be -1, meaning that the read value is not going to a register.
2912
+ */
2913
+static int check_stack_read(struct bpf_verifier_env *env,
2914
+ int ptr_regno, int off, int size,
2915
+ int dst_regno)
2916
+{
2917
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2918
+ struct bpf_func_state *state = func(env, reg);
2919
+ int err;
2920
+ /* Some accesses are only permitted with a static offset. */
2921
+ bool var_off = !tnum_is_const(reg->var_off);
2922
+
2923
+ /* The offset is required to be static when reads don't go to a
2924
+ * register, in order to not leak pointers (see
2925
+ * check_stack_read_fixed_off).
2926
+ */
2927
+ if (dst_regno < 0 && var_off) {
11602928 char tn_buf[48];
11612929
11622930 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1163
- verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
2931
+ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
11642932 tn_buf, off, size);
11652933 return -EACCES;
11662934 }
1167
-
1168
- if (off >= 0 || off < -MAX_BPF_STACK) {
1169
- verbose(env, "invalid stack off=%d size=%d\n", off, size);
1170
- return -EACCES;
2935
+ /* Variable offset is prohibited for unprivileged mode for simplicity
2936
+ * since it requires corresponding support in Spectre masking for stack
2937
+ * ALU. See also retrieve_ptr_limit(). The check in
2938
+ * check_stack_access_for_ptr_arithmetic() called by
2939
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
2940
+ * with variable offsets, therefore no check is required here. Further,
2941
+ * just checking it here would be insufficient as speculative stack
2942
+ * writes could still lead to unsafe speculative behaviour.
2943
+ */
2944
+ if (!var_off) {
2945
+ off += reg->var_off.value;
2946
+ err = check_stack_read_fixed_off(env, state, off, size,
2947
+ dst_regno);
2948
+ } else {
2949
+ /* Variable offset stack reads need more conservative handling
2950
+ * than fixed offset ones. Note that dst_regno >= 0 on this
2951
+ * branch.
2952
+ */
2953
+ err = check_stack_read_var_off(env, ptr_regno, off, size,
2954
+ dst_regno);
11712955 }
1172
-
1173
- return 0;
2956
+ return err;
11742957 }
11752958
1176
-/* check read/write into map element returned by bpf_map_lookup_elem() */
1177
-static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
1178
- int size, bool zero_size_allowed)
2959
+
2960
+/* check_stack_write dispatches to check_stack_write_fixed_off or
2961
+ * check_stack_write_var_off.
2962
+ *
2963
+ * 'ptr_regno' is the register used as a pointer into the stack.
2964
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
2965
+ * 'value_regno' is the register whose value we're writing to the stack. It can
2966
+ * be -1, meaning that we're not writing from a register.
2967
+ *
2968
+ * The caller must ensure that the offset falls within the maximum stack size.
2969
+ */
2970
+static int check_stack_write(struct bpf_verifier_env *env,
2971
+ int ptr_regno, int off, int size,
2972
+ int value_regno, int insn_idx)
2973
+{
2974
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2975
+ struct bpf_func_state *state = func(env, reg);
2976
+ int err;
2977
+
2978
+ if (tnum_is_const(reg->var_off)) {
2979
+ off += reg->var_off.value;
2980
+ err = check_stack_write_fixed_off(env, state, off, size,
2981
+ value_regno, insn_idx);
2982
+ } else {
2983
+ /* Variable offset stack reads need more conservative handling
2984
+ * than fixed offset ones.
2985
+ */
2986
+ err = check_stack_write_var_off(env, state,
2987
+ ptr_regno, off, size,
2988
+ value_regno, insn_idx);
2989
+ }
2990
+ return err;
2991
+}
2992
+
2993
+static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
2994
+ int off, int size, enum bpf_access_type type)
11792995 {
11802996 struct bpf_reg_state *regs = cur_regs(env);
11812997 struct bpf_map *map = regs[regno].map_ptr;
2998
+ u32 cap = bpf_map_flags_to_cap(map);
11822999
1183
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
1184
- off + size > map->value_size) {
1185
- verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
3000
+ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
3001
+ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
11863002 map->value_size, off, size);
11873003 return -EACCES;
11883004 }
3005
+
3006
+ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
3007
+ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
3008
+ map->value_size, off, size);
3009
+ return -EACCES;
3010
+ }
3011
+
11893012 return 0;
11903013 }
11913014
1192
-/* check read/write into a map element with possible variable offset */
1193
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
1194
- int off, int size, bool zero_size_allowed)
3015
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
3016
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
3017
+ int off, int size, u32 mem_size,
3018
+ bool zero_size_allowed)
3019
+{
3020
+ bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
3021
+ struct bpf_reg_state *reg;
3022
+
3023
+ if (off >= 0 && size_ok && (u64)off + size <= mem_size)
3024
+ return 0;
3025
+
3026
+ reg = &cur_regs(env)[regno];
3027
+ switch (reg->type) {
3028
+ case PTR_TO_MAP_VALUE:
3029
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
3030
+ mem_size, off, size);
3031
+ break;
3032
+ case PTR_TO_PACKET:
3033
+ case PTR_TO_PACKET_META:
3034
+ case PTR_TO_PACKET_END:
3035
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
3036
+ off, size, regno, reg->id, off, mem_size);
3037
+ break;
3038
+ case PTR_TO_MEM:
3039
+ default:
3040
+ verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
3041
+ mem_size, off, size);
3042
+ }
3043
+
3044
+ return -EACCES;
3045
+}
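The bounds predicate above can be reproduced as a tiny stand-alone C sketch (names are invented; only the arithmetic mirrors the function): the u64 cast keeps a huge off from wrapping around during the addition.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* An access of 'size' bytes at 'off' fits a region of 'mem_size' bytes iff
 * off >= 0, the size is acceptable, and off + size does not run past the end.
 */
static bool toy_mem_access_ok(int off, int size, uint32_t mem_size,
			      bool zero_size_allowed)
{
	bool size_ok = size > 0 || (size == 0 && zero_size_allowed);

	return off >= 0 && size_ok && (uint64_t)off + size <= mem_size;
}

int main(void)
{
	/* a 48-byte map value, probed at both ends of the offset range */
	printf("%d\n", toy_mem_access_ok(40, 8, 48, false));	/* 1: fits exactly */
	printf("%d\n", toy_mem_access_ok(41, 8, 48, false));	/* 0: one byte past */
	printf("%d\n", toy_mem_access_ok(-4, 8, 48, false));	/* 0: negative off  */
	return 0;
}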
3046
+
3047
+/* check read/write into a memory region with possible variable offset */
3048
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
3049
+ int off, int size, u32 mem_size,
3050
+ bool zero_size_allowed)
11953051 {
11963052 struct bpf_verifier_state *vstate = env->cur_state;
11973053 struct bpf_func_state *state = vstate->frame[vstate->curframe];
11983054 struct bpf_reg_state *reg = &state->regs[regno];
11993055 int err;
12003056
1201
- /* We may have adjusted the register to this map value, so we
3057
+ /* We may have adjusted the register pointing to memory region, so we
12023058 * need to try adding each of min_value and max_value to off
12033059 * to make sure our theoretical access will be safe.
12043060 */
1205
- if (env->log.level)
3061
+ if (env->log.level & BPF_LOG_LEVEL)
12063062 print_verifier_state(env, state);
12073063
12083064 /* The minimum value is only important with signed
....@@ -1219,10 +3075,10 @@
12193075 regno);
12203076 return -EACCES;
12213077 }
1222
- err = __check_map_access(env, regno, reg->smin_value + off, size,
1223
- zero_size_allowed);
3078
+ err = __check_mem_access(env, regno, reg->smin_value + off, size,
3079
+ mem_size, zero_size_allowed);
12243080 if (err) {
1225
- verbose(env, "R%d min value is outside of the array range\n",
3081
+ verbose(env, "R%d min value is outside of the allowed memory range\n",
12263082 regno);
12273083 return err;
12283084 }
....@@ -1232,33 +3088,79 @@
12323088 * If reg->umax_value + off could overflow, treat that as unbounded too.
12333089 */
12343090 if (reg->umax_value >= BPF_MAX_VAR_OFF) {
1235
- verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
3091
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
12363092 regno);
12373093 return -EACCES;
12383094 }
1239
- err = __check_map_access(env, regno, reg->umax_value + off, size,
1240
- zero_size_allowed);
1241
- if (err)
1242
- verbose(env, "R%d max value is outside of the array range\n",
3095
+ err = __check_mem_access(env, regno, reg->umax_value + off, size,
3096
+ mem_size, zero_size_allowed);
3097
+ if (err) {
3098
+ verbose(env, "R%d max value is outside of the allowed memory range\n",
12433099 regno);
3100
+ return err;
3101
+ }
3102
+
3103
+ return 0;
3104
+}
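A hedged numeric example of the two-sided check above: for a 48-byte map value, an access *(u32 *)(reg + 16) where the register's variable part ranges over [0, 40] passes the minimum side (16 + 0 + 4 <= 48) but fails the maximum side (16 + 40 + 4 = 60 > 48), so the verifier reports that the max value is outside of the allowed memory range unless the program narrows the range first.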
3105
+
3106
+/* check read/write into a map element with possible variable offset */
3107
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
3108
+ int off, int size, bool zero_size_allowed)
3109
+{
3110
+ struct bpf_verifier_state *vstate = env->cur_state;
3111
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
3112
+ struct bpf_reg_state *reg = &state->regs[regno];
3113
+ struct bpf_map *map = reg->map_ptr;
3114
+ int err;
3115
+
3116
+ err = check_mem_region_access(env, regno, off, size, map->value_size,
3117
+ zero_size_allowed);
3118
+ if (err)
3119
+ return err;
3120
+
3121
+ if (map_value_has_spin_lock(map)) {
3122
+ u32 lock = map->spin_lock_off;
3123
+
3124
+ /* if any part of struct bpf_spin_lock can be touched by
3125
+ * load/store reject this program.
3126
+ * To check that [x1, x2) overlaps with [y1, y2)
3127
+ * it is sufficient to check x1 < y2 && y1 < x2.
3128
+ */
3129
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
3130
+ lock < reg->umax_value + off + size) {
3131
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
3132
+ return -EACCES;
3133
+ }
3134
+ }
12443135 return err;
12453136 }
12463137
12473138 #define MAX_PACKET_OFF 0xffff
12483139
3140
+static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
3141
+{
3142
+ return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
3143
+}
3144
+
12493145 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
12503146 const struct bpf_call_arg_meta *meta,
12513147 enum bpf_access_type t)
12523148 {
1253
- switch (env->prog->type) {
3149
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
3150
+
3151
+ switch (prog_type) {
3152
+ /* Program types only with direct read access go here! */
12543153 case BPF_PROG_TYPE_LWT_IN:
12553154 case BPF_PROG_TYPE_LWT_OUT:
12563155 case BPF_PROG_TYPE_LWT_SEG6LOCAL:
12573156 case BPF_PROG_TYPE_SK_REUSEPORT:
1258
- /* dst_input() and dst_output() can't write for now */
3157
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
3158
+ case BPF_PROG_TYPE_CGROUP_SKB:
12593159 if (t == BPF_WRITE)
12603160 return false;
1261
- /* fallthrough */
3161
+ fallthrough;
3162
+
3163
+ /* Program types with direct read + write access go here! */
12623164 case BPF_PROG_TYPE_SCHED_CLS:
12633165 case BPF_PROG_TYPE_SCHED_ACT:
12643166 case BPF_PROG_TYPE_XDP:
....@@ -1270,24 +3172,16 @@
12703172
12713173 env->seen_direct_write = true;
12723174 return true;
3175
+
3176
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3177
+ if (t == BPF_WRITE)
3178
+ env->seen_direct_write = true;
3179
+
3180
+ return true;
3181
+
12733182 default:
12743183 return false;
12753184 }
1276
-}
1277
-
1278
-static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
1279
- int off, int size, bool zero_size_allowed)
1280
-{
1281
- struct bpf_reg_state *regs = cur_regs(env);
1282
- struct bpf_reg_state *reg = &regs[regno];
1283
-
1284
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
1285
- (u64)off + size > reg->range) {
1286
- verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
1287
- off, size, regno, reg->id, reg->off, reg->range);
1288
- return -EACCES;
1289
- }
1290
- return 0;
12913185 }
12923186
12933187 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
....@@ -1310,20 +3204,36 @@
13103204 regno);
13113205 return -EACCES;
13123206 }
1313
- err = __check_packet_access(env, regno, off, size, zero_size_allowed);
3207
+
3208
+ err = reg->range < 0 ? -EINVAL :
3209
+ __check_mem_access(env, regno, off, size, reg->range,
3210
+ zero_size_allowed);
13143211 if (err) {
13153212 verbose(env, "R%d offset is outside of the packet\n", regno);
13163213 return err;
13173214 }
3215
+
3216
+ /* __check_mem_access has made sure "off + size - 1" is within u16.
3217
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
3218
+ * otherwise find_good_pkt_pointers would have refused to set range info
3219
+ * that __check_mem_access would have rejected this pkt access.
3220
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
3221
+ */
3222
+ env->prog->aux->max_pkt_offset =
3223
+ max_t(u32, env->prog->aux->max_pkt_offset,
3224
+ off + reg->umax_value + size - 1);
3225
+
13183226 return err;
13193227 }
13203228
13213229 /* check access to 'struct bpf_context' fields. Supports fixed offsets only */
13223230 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
1323
- enum bpf_access_type t, enum bpf_reg_type *reg_type)
3231
+ enum bpf_access_type t, enum bpf_reg_type *reg_type,
3232
+ u32 *btf_id)
13243233 {
13253234 struct bpf_insn_access_aux info = {
13263235 .reg_type = *reg_type,
3236
+ .log = &env->log,
13273237 };
13283238
13293239 if (env->ops->is_valid_access &&
....@@ -1337,7 +3247,10 @@
13373247 */
13383248 *reg_type = info.reg_type;
13393249
1340
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
3250
+ if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
3251
+ *btf_id = info.btf_id;
3252
+ else
3253
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
13413254 /* remember the offset of last byte accessed in ctx */
13423255 if (env->prog->aux->max_ctx_offset < off + size)
13433256 env->prog->aux->max_ctx_offset = off + size;
....@@ -1348,32 +3261,95 @@
13483261 return -EACCES;
13493262 }
13503263
1351
-static bool __is_pointer_value(bool allow_ptr_leaks,
1352
- const struct bpf_reg_state *reg)
3264
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
3265
+ int size)
13533266 {
1354
- if (allow_ptr_leaks)
1355
- return false;
3267
+ if (size < 0 || off < 0 ||
3268
+ (u64)off + size > sizeof(struct bpf_flow_keys)) {
3269
+ verbose(env, "invalid access to flow keys off=%d size=%d\n",
3270
+ off, size);
3271
+ return -EACCES;
3272
+ }
3273
+ return 0;
3274
+}
13563275
1357
- return reg->type != SCALAR_VALUE;
3276
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
3277
+ u32 regno, int off, int size,
3278
+ enum bpf_access_type t)
3279
+{
3280
+ struct bpf_reg_state *regs = cur_regs(env);
3281
+ struct bpf_reg_state *reg = &regs[regno];
3282
+ struct bpf_insn_access_aux info = {};
3283
+ bool valid;
3284
+
3285
+ if (reg->smin_value < 0) {
3286
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
3287
+ regno);
3288
+ return -EACCES;
3289
+ }
3290
+
3291
+ switch (reg->type) {
3292
+ case PTR_TO_SOCK_COMMON:
3293
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
3294
+ break;
3295
+ case PTR_TO_SOCKET:
3296
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
3297
+ break;
3298
+ case PTR_TO_TCP_SOCK:
3299
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
3300
+ break;
3301
+ case PTR_TO_XDP_SOCK:
3302
+ valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
3303
+ break;
3304
+ default:
3305
+ valid = false;
3306
+ }
3307
+
3308
+
3309
+ if (valid) {
3310
+ env->insn_aux_data[insn_idx].ctx_field_size =
3311
+ info.ctx_field_size;
3312
+ return 0;
3313
+ }
3314
+
3315
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
3316
+ regno, reg_type_str[reg->type], off, size);
3317
+
3318
+ return -EACCES;
13583319 }
13593320
13603321 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
13613322 {
1362
- return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
3323
+ return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
13633324 }
13643325
13653326 static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
13663327 {
1367
- const struct bpf_reg_state *reg = cur_regs(env) + regno;
3328
+ const struct bpf_reg_state *reg = reg_state(env, regno);
13683329
13693330 return reg->type == PTR_TO_CTX;
13703331 }
13713332
3333
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
3334
+{
3335
+ const struct bpf_reg_state *reg = reg_state(env, regno);
3336
+
3337
+ return type_is_sk_pointer(reg->type);
3338
+}
3339
+
13723340 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
13733341 {
1374
- const struct bpf_reg_state *reg = cur_regs(env) + regno;
3342
+ const struct bpf_reg_state *reg = reg_state(env, regno);
13753343
13763344 return type_is_pkt_pointer(reg->type);
3345
+}
3346
+
3347
+static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
3348
+{
3349
+ const struct bpf_reg_state *reg = reg_state(env, regno);
3350
+
3351
+ /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */
3352
+ return reg->type == PTR_TO_FLOW_KEYS;
13773353 }
13783354
13793355 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
....@@ -1449,6 +3425,9 @@
14493425 * right in front, treat it the very same way.
14503426 */
14513427 return check_pkt_ptr_alignment(env, reg, off, size, strict);
3428
+ case PTR_TO_FLOW_KEYS:
3429
+ pointer_desc = "flow keys ";
3430
+ break;
14523431 case PTR_TO_MAP_VALUE:
14533432 pointer_desc = "value ";
14543433 break;
....@@ -1457,11 +3436,23 @@
14573436 break;
14583437 case PTR_TO_STACK:
14593438 pointer_desc = "stack ";
1460
- /* The stack spill tracking logic in check_stack_write()
1461
- * and check_stack_read() relies on stack accesses being
3439
+ /* The stack spill tracking logic in check_stack_write_fixed_off()
3440
+ * and check_stack_read_fixed_off() relies on stack accesses being
14623441 * aligned.
14633442 */
14643443 strict = true;
3444
+ break;
3445
+ case PTR_TO_SOCKET:
3446
+ pointer_desc = "sock ";
3447
+ break;
3448
+ case PTR_TO_SOCK_COMMON:
3449
+ pointer_desc = "sock_common ";
3450
+ break;
3451
+ case PTR_TO_TCP_SOCK:
3452
+ pointer_desc = "tcp_sock ";
3453
+ break;
3454
+ case PTR_TO_XDP_SOCK:
3455
+ pointer_desc = "xdp_sock ";
14653456 break;
14663457 default:
14673458 break;
....@@ -1495,10 +3486,37 @@
14953486 int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
14963487 struct bpf_subprog_info *subprog = env->subprog_info;
14973488 struct bpf_insn *insn = env->prog->insnsi;
3489
+ bool tail_call_reachable = false;
14983490 int ret_insn[MAX_CALL_FRAMES];
14993491 int ret_prog[MAX_CALL_FRAMES];
3492
+ int j;
15003493
15013494 process_func:
3495
+ /* protect against potential stack overflow that might happen when
3496
+ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
3497
+ * depth for such case down to 256 so that the worst case scenario
3498
+ * would result in 8k stack size (32 which is tailcall limit * 256 =
3499
+ * 8k).
3500
+ *
3501
+ * To get the idea what might happen, see an example:
3502
+ * func1 -> sub rsp, 128
3503
+ * subfunc1 -> sub rsp, 256
3504
+ * tailcall1 -> add rsp, 256
3505
+ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
3506
+ * subfunc2 -> sub rsp, 64
3507
+ * subfunc22 -> sub rsp, 128
3508
+ * tailcall2 -> add rsp, 128
3509
+ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
3510
+ *
3511
+ * tailcall will unwind the current stack frame but it will not get rid
3512
+ * of caller's stack as shown on the example above.
3513
+ */
3514
+ if (idx && subprog[idx].has_tail_call && depth >= 256) {
3515
+ verbose(env,
3516
+ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
3517
+ depth);
3518
+ return -EACCES;
3519
+ }
15023520 /* round up to 32-bytes, since this is granularity
15033521 * of interpreter stack size
15043522 */
....@@ -1527,13 +3545,29 @@
15273545 i);
15283546 return -EFAULT;
15293547 }
3548
+
3549
+ if (subprog[idx].has_tail_call)
3550
+ tail_call_reachable = true;
3551
+
15303552 frame++;
15313553 if (frame >= MAX_CALL_FRAMES) {
1532
- WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
1533
- return -EFAULT;
3554
+ verbose(env, "the call stack of %d frames is too deep !\n",
3555
+ frame);
3556
+ return -E2BIG;
15343557 }
15353558 goto process_func;
15363559 }
3560
+ /* if tail call got detected across bpf2bpf calls then mark each of the
3561
+ * currently present subprog frames as tail call reachable subprogs;
3562
+ * this info will be utilized by JIT so that we will be preserving the
3563
+ * tail call counter throughout bpf2bpf calls combined with tailcalls
3564
+ */
3565
+ if (tail_call_reachable)
3566
+ for (j = 0; j < frame; j++)
3567
+ subprog[ret_prog[j]].tail_call_reachable = true;
3568
+ if (subprog[0].tail_call_reachable)
3569
+ env->prog->aux->tail_call_reachable = true;
3570
+
15373571 /* end of for() loop means the last insn of the 'subprog'
15383572 * was reached. Doesn't matter whether it was JA or EXIT
15393573 */
....@@ -1562,8 +3596,8 @@
15623596 }
15633597 #endif
15643598
1565
-static int check_ctx_reg(struct bpf_verifier_env *env,
1566
- const struct bpf_reg_state *reg, int regno)
3599
+int check_ctx_reg(struct bpf_verifier_env *env,
3600
+ const struct bpf_reg_state *reg, int regno)
15673601 {
15683602 /* Access to ctx or passing it to a helper is only allowed in
15693603 * its original, unmodified form.
....@@ -1584,6 +3618,72 @@
15843618 }
15853619
15863620 return 0;
3621
+}
3622
+
3623
+static int __check_buffer_access(struct bpf_verifier_env *env,
3624
+ const char *buf_info,
3625
+ const struct bpf_reg_state *reg,
3626
+ int regno, int off, int size)
3627
+{
3628
+ if (off < 0) {
3629
+ verbose(env,
3630
+ "R%d invalid %s buffer access: off=%d, size=%d\n",
3631
+ regno, buf_info, off, size);
3632
+ return -EACCES;
3633
+ }
3634
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
3635
+ char tn_buf[48];
3636
+
3637
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3638
+ verbose(env,
3639
+ "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
3640
+ regno, off, tn_buf);
3641
+ return -EACCES;
3642
+ }
3643
+
3644
+ return 0;
3645
+}
3646
+
3647
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
3648
+ const struct bpf_reg_state *reg,
3649
+ int regno, int off, int size)
3650
+{
3651
+ int err;
3652
+
3653
+ err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
3654
+ if (err)
3655
+ return err;
3656
+
3657
+ if (off + size > env->prog->aux->max_tp_access)
3658
+ env->prog->aux->max_tp_access = off + size;
3659
+
3660
+ return 0;
3661
+}
3662
+
3663
+static int check_buffer_access(struct bpf_verifier_env *env,
3664
+ const struct bpf_reg_state *reg,
3665
+ int regno, int off, int size,
3666
+ bool zero_size_allowed,
3667
+ const char *buf_info,
3668
+ u32 *max_access)
3669
+{
3670
+ int err;
3671
+
3672
+ err = __check_buffer_access(env, buf_info, reg, regno, off, size);
3673
+ if (err)
3674
+ return err;
3675
+
3676
+ if (off + size > *max_access)
3677
+ *max_access = off + size;
3678
+
3679
+ return 0;
3680
+}
3681
+
3682
+/* BPF architecture zero extends alu32 ops into 64-bit registers */
3683
+static void zext_32_to_64(struct bpf_reg_state *reg)
3684
+{
3685
+ reg->var_off = tnum_subreg(reg->var_off);
3686
+ __reg_assign_32_into_64(reg);
15873687 }
15883688
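zext_32_to_64() encodes the architectural rule that any 32-bit (alu32) BPF operation writes the low subregister and zero-extends the result into the full 64-bit register. A plain C illustration of the arithmetic the verifier is modelling (a standalone sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t r1 = 0xdeadbeef00000003ULL;

        /* An alu32 op such as "w1 += 1" operates on the low 32 bits and
         * zero-extends the result, so the upper half becomes 0.
         */
        uint32_t w1 = (uint32_t)r1 + 1;
        r1 = (uint64_t)w1;

        printf("r1 = %#llx\n", (unsigned long long)r1); /* prints 0x4 */
        return 0;
}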
15893689 /* truncate register to smaller size (in bytes)
....@@ -1607,6 +3707,255 @@
16073707 }
16083708 reg->smin_value = reg->umin_value;
16093709 reg->smax_value = reg->umax_value;
3710
+
3711
+ /* If size is smaller than 32bit register the 32bit register
3712
+ * values are also truncated so we push 64-bit bounds into
3713
+ * 32-bit bounds. Above were truncated < 32-bits already.
3714
+ */
3715
+ if (size >= 4)
3716
+ return;
3717
+ __reg_combine_64_into_32(reg);
3718
+}
3719
+
3720
+static bool bpf_map_is_rdonly(const struct bpf_map *map)
3721
+{
3722
+ /* A map is considered read-only if the following conditions are true:
3723
+ *
3724
+ * 1) BPF program side cannot change any of the map content. The
3725
+ * BPF_F_RDONLY_PROG flag is set throughout the lifetime of a map
3726
+ * and was set at map creation time.
3727
+ * 2) The map value(s) have been initialized from user space by a
3728
+ * loader and then "frozen", such that no new map update/delete
3729
+ * operations from syscall side are possible for the rest of
3730
+ * the map's lifetime from that point onwards.
3731
+ * 3) Any parallel/pending map update/delete operations from syscall
3732
+ * side have been completed. Only after that point, it's safe to
3733
+ * assume that map value(s) are immutable.
3734
+ */
3735
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
3736
+ READ_ONCE(map->frozen) &&
3737
+ !bpf_map_write_active(map);
3738
+}
3739
+
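From user space, the usual way to end up with a map that passes bpf_map_is_rdonly() is to create it with BPF_F_RDONLY_PROG, populate it, and then freeze it. A minimal libbpf sketch — map_fd is assumed to refer to such a map with u32 keys and u64 values; map creation and error reporting are omitted:

#include <bpf/bpf.h>

static int populate_and_freeze(int map_fd)
{
        __u32 key = 0;
        __u64 value = 42;
        int err;

        err = bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
        if (err)
                return err;

        /* BPF_MAP_FREEZE forbids further syscall-side writes, which is
         * condition 2) in the comment above; the verifier may then treat
         * loads from the map as known constants.
         */
        return bpf_map_freeze(map_fd);
}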
3740
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
3741
+{
3742
+ void *ptr;
3743
+ u64 addr;
3744
+ int err;
3745
+
3746
+ err = map->ops->map_direct_value_addr(map, &addr, off);
3747
+ if (err)
3748
+ return err;
3749
+ ptr = (void *)(long)addr + off;
3750
+
3751
+ switch (size) {
3752
+ case sizeof(u8):
3753
+ *val = (u64)*(u8 *)ptr;
3754
+ break;
3755
+ case sizeof(u16):
3756
+ *val = (u64)*(u16 *)ptr;
3757
+ break;
3758
+ case sizeof(u32):
3759
+ *val = (u64)*(u32 *)ptr;
3760
+ break;
3761
+ case sizeof(u64):
3762
+ *val = *(u64 *)ptr;
3763
+ break;
3764
+ default:
3765
+ return -EINVAL;
3766
+ }
3767
+ return 0;
3768
+}
3769
+
3770
+static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
3771
+ struct bpf_reg_state *regs,
3772
+ int regno, int off, int size,
3773
+ enum bpf_access_type atype,
3774
+ int value_regno)
3775
+{
3776
+ struct bpf_reg_state *reg = regs + regno;
3777
+ const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
3778
+ const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
3779
+ u32 btf_id;
3780
+ int ret;
3781
+
3782
+ if (off < 0) {
3783
+ verbose(env,
3784
+ "R%d is ptr_%s invalid negative access: off=%d\n",
3785
+ regno, tname, off);
3786
+ return -EACCES;
3787
+ }
3788
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
3789
+ char tn_buf[48];
3790
+
3791
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3792
+ verbose(env,
3793
+ "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
3794
+ regno, tname, off, tn_buf);
3795
+ return -EACCES;
3796
+ }
3797
+
3798
+ if (env->ops->btf_struct_access) {
3799
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
3800
+ atype, &btf_id);
3801
+ } else {
3802
+ if (atype != BPF_READ) {
3803
+ verbose(env, "only read is supported\n");
3804
+ return -EACCES;
3805
+ }
3806
+
3807
+ ret = btf_struct_access(&env->log, t, off, size, atype,
3808
+ &btf_id);
3809
+ }
3810
+
3811
+ if (ret < 0)
3812
+ return ret;
3813
+
3814
+ if (atype == BPF_READ && value_regno >= 0)
3815
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
3816
+
3817
+ return 0;
3818
+}
3819
+
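Loads that land in check_ptr_to_btf_access() come from BTF-typed pointers handed to tracing programs. A hedged sketch of such a program — it assumes a vmlinux.h generated for the running kernel, libbpf's BPF_PROG() macro, and the sched_switch tracepoint signature of this kernel generation; none of this is part of the patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("tp_btf/sched_switch")
int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
             struct task_struct *next)
{
        /* 'next' is PTR_TO_BTF_ID for struct task_struct; the load of
         * next->pid is range- and type-checked through btf_struct_access().
         */
        int pid = next->pid;

        bpf_printk("switching to %d", pid);
        return 0;
}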
3820
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
3821
+ struct bpf_reg_state *regs,
3822
+ int regno, int off, int size,
3823
+ enum bpf_access_type atype,
3824
+ int value_regno)
3825
+{
3826
+ struct bpf_reg_state *reg = regs + regno;
3827
+ struct bpf_map *map = reg->map_ptr;
3828
+ const struct btf_type *t;
3829
+ const char *tname;
3830
+ u32 btf_id;
3831
+ int ret;
3832
+
3833
+ if (!btf_vmlinux) {
3834
+ verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
3835
+ return -ENOTSUPP;
3836
+ }
3837
+
3838
+ if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
3839
+ verbose(env, "map_ptr access not supported for map type %d\n",
3840
+ map->map_type);
3841
+ return -ENOTSUPP;
3842
+ }
3843
+
3844
+ t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
3845
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
3846
+
3847
+ if (!env->allow_ptr_to_map_access) {
3848
+ verbose(env,
3849
+ "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
3850
+ tname);
3851
+ return -EPERM;
3852
+ }
3853
+
3854
+ if (off < 0) {
3855
+ verbose(env, "R%d is %s invalid negative access: off=%d\n",
3856
+ regno, tname, off);
3857
+ return -EACCES;
3858
+ }
3859
+
3860
+ if (atype != BPF_READ) {
3861
+ verbose(env, "only read from %s is supported\n", tname);
3862
+ return -EACCES;
3863
+ }
3864
+
3865
+ ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
3866
+ if (ret < 0)
3867
+ return ret;
3868
+
3869
+ if (value_regno >= 0)
3870
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
3871
+
3872
+ return 0;
3873
+}
3874
+
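The CONST_PTR_TO_MAP case above is what lets a sufficiently privileged (CAP_PERFMON) program introspect its own map definition, in the style of the map_ptr selftests. A hedged sketch — struct bpf_map comes from vmlinux.h, the map layout follows libbpf's BTF-defined map convention, and the attach point is arbitrary:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 16);
        __type(key, __u32);
        __type(value, __u64);
} counters SEC(".maps");

SEC("tp/syscalls/sys_enter_getpid")
int check_map_fields(void *ctx)
{
        struct bpf_map *map = (struct bpf_map *)&counters; /* CONST_PTR_TO_MAP */

        /* Read-only load validated by check_ptr_to_map_access() against
         * struct bpf_map's BTF; writes are rejected.
         */
        if (map->max_entries != 16)
                bpf_printk("unexpected max_entries");

        return 0;
}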
3875
+/* Check that the stack access at the given offset is within bounds. The
3876
+ * maximum valid offset is -1.
3877
+ *
3878
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
3879
+ * -state->allocated_stack for reads.
3880
+ */
3881
+static int check_stack_slot_within_bounds(int off,
3882
+ struct bpf_func_state *state,
3883
+ enum bpf_access_type t)
3884
+{
3885
+ int min_valid_off;
3886
+
3887
+ if (t == BPF_WRITE)
3888
+ min_valid_off = -MAX_BPF_STACK;
3889
+ else
3890
+ min_valid_off = -state->allocated_stack;
3891
+
3892
+ if (off < min_valid_off || off > -1)
3893
+ return -EACCES;
3894
+ return 0;
3895
+}
3896
+
3897
+/* Check that the stack access at 'regno + off' falls within the maximum stack
3898
+ * bounds.
3899
+ *
3900
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
3901
+ */
3902
+static int check_stack_access_within_bounds(
3903
+ struct bpf_verifier_env *env,
3904
+ int regno, int off, int access_size,
3905
+ enum stack_access_src src, enum bpf_access_type type)
3906
+{
3907
+ struct bpf_reg_state *regs = cur_regs(env);
3908
+ struct bpf_reg_state *reg = regs + regno;
3909
+ struct bpf_func_state *state = func(env, reg);
3910
+ int min_off, max_off;
3911
+ int err;
3912
+ char *err_extra;
3913
+
3914
+ if (src == ACCESS_HELPER)
3915
+ /* We don't know if helpers are reading or writing (or both). */
3916
+ err_extra = " indirect access to";
3917
+ else if (type == BPF_READ)
3918
+ err_extra = " read from";
3919
+ else
3920
+ err_extra = " write to";
3921
+
3922
+ if (tnum_is_const(reg->var_off)) {
3923
+ min_off = reg->var_off.value + off;
3924
+ if (access_size > 0)
3925
+ max_off = min_off + access_size - 1;
3926
+ else
3927
+ max_off = min_off;
3928
+ } else {
3929
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
3930
+ reg->smin_value <= -BPF_MAX_VAR_OFF) {
3931
+ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
3932
+ err_extra, regno);
3933
+ return -EACCES;
3934
+ }
3935
+ min_off = reg->smin_value + off;
3936
+ if (access_size > 0)
3937
+ max_off = reg->smax_value + off + access_size - 1;
3938
+ else
3939
+ max_off = min_off;
3940
+ }
3941
+
3942
+ err = check_stack_slot_within_bounds(min_off, state, type);
3943
+ if (!err)
3944
+ err = check_stack_slot_within_bounds(max_off, state, type);
3945
+
3946
+ if (err) {
3947
+ if (tnum_is_const(reg->var_off)) {
3948
+ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
3949
+ err_extra, regno, off, access_size);
3950
+ } else {
3951
+ char tn_buf[48];
3952
+
3953
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3954
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
3955
+ err_extra, regno, tn_buf, access_size);
3956
+ }
3957
+ }
3958
+ return err;
16103959 }
16113960
16123961 /* check whether memory at (regno + off) is accessible for t = (read | write)
....@@ -1642,13 +3991,44 @@
16423991 verbose(env, "R%d leaks addr into map\n", value_regno);
16433992 return -EACCES;
16443993 }
1645
-
3994
+ err = check_map_access_type(env, regno, off, size, t);
3995
+ if (err)
3996
+ return err;
16463997 err = check_map_access(env, regno, off, size, false);
3998
+ if (!err && t == BPF_READ && value_regno >= 0) {
3999
+ struct bpf_map *map = reg->map_ptr;
4000
+
4001
+ /* if map is read-only, track its contents as scalars */
4002
+ if (tnum_is_const(reg->var_off) &&
4003
+ bpf_map_is_rdonly(map) &&
4004
+ map->ops->map_direct_value_addr) {
4005
+ int map_off = off + reg->var_off.value;
4006
+ u64 val = 0;
4007
+
4008
+ err = bpf_map_direct_read(map, map_off, size,
4009
+ &val);
4010
+ if (err)
4011
+ return err;
4012
+
4013
+ regs[value_regno].type = SCALAR_VALUE;
4014
+ __mark_reg_known(&regs[value_regno], val);
4015
+ } else {
4016
+ mark_reg_unknown(env, regs, value_regno);
4017
+ }
4018
+ }
4019
+ } else if (reg->type == PTR_TO_MEM) {
4020
+ if (t == BPF_WRITE && value_regno >= 0 &&
4021
+ is_pointer_value(env, value_regno)) {
4022
+ verbose(env, "R%d leaks addr into mem\n", value_regno);
4023
+ return -EACCES;
4024
+ }
4025
+ err = check_mem_region_access(env, regno, off, size,
4026
+ reg->mem_size, false);
16474027 if (!err && t == BPF_READ && value_regno >= 0)
16484028 mark_reg_unknown(env, regs, value_regno);
1649
-
16504029 } else if (reg->type == PTR_TO_CTX) {
16514030 enum bpf_reg_type reg_type = SCALAR_VALUE;
4031
+ u32 btf_id = 0;
16524032
16534033 if (t == BPF_WRITE && value_regno >= 0 &&
16544034 is_pointer_value(env, value_regno)) {
....@@ -1660,23 +4040,37 @@
16604040 if (err < 0)
16614041 return err;
16624042
1663
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
4043
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
4044
+ if (err)
4045
+ verbose_linfo(env, insn_idx, "; ");
16644046 if (!err && t == BPF_READ && value_regno >= 0) {
16654047 /* ctx access returns either a scalar, or a
16664048 * PTR_TO_PACKET[_META,_END]. In the latter
16674049 * case, we know the offset is zero.
16684050 */
1669
- if (reg_type == SCALAR_VALUE)
4051
+ if (reg_type == SCALAR_VALUE) {
16704052 mark_reg_unknown(env, regs, value_regno);
1671
- else
4053
+ } else {
16724054 mark_reg_known_zero(env, regs,
16734055 value_regno);
4056
+ if (reg_type_may_be_null(reg_type))
4057
+ regs[value_regno].id = ++env->id_gen;
4058
+ /* A load of ctx field could have different
4059
+ * actual load size with the one encoded in the
4060
+ * insn. When the dst is PTR, it is for sure not
4061
+ * a sub-register.
4062
+ */
4063
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG;
4064
+ if (reg_type == PTR_TO_BTF_ID ||
4065
+ reg_type == PTR_TO_BTF_ID_OR_NULL)
4066
+ regs[value_regno].btf_id = btf_id;
4067
+ }
16744068 regs[value_regno].type = reg_type;
16754069 }
16764070
16774071 } else if (reg->type == PTR_TO_STACK) {
1678
- off += reg->var_off.value;
1679
- err = check_stack_access(env, reg, off, size);
4072
+ /* Basic bounds checks. */
4073
+ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
16804074 if (err)
16814075 return err;
16824076
....@@ -1685,12 +4079,12 @@
16854079 if (err)
16864080 return err;
16874081
1688
- if (t == BPF_WRITE)
1689
- err = check_stack_write(env, state, off, size,
1690
- value_regno, insn_idx);
1691
- else
1692
- err = check_stack_read(env, state, off, size,
4082
+ if (t == BPF_READ)
4083
+ err = check_stack_read(env, regno, off, size,
16934084 value_regno);
4085
+ else
4086
+ err = check_stack_write(env, regno, off, size,
4087
+ value_regno, insn_idx);
16944088 } else if (reg_is_pkt_pointer(reg)) {
16954089 if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
16964090 verbose(env, "cannot write into packet\n");
....@@ -1703,6 +4097,53 @@
17034097 return -EACCES;
17044098 }
17054099 err = check_packet_access(env, regno, off, size, false);
4100
+ if (!err && t == BPF_READ && value_regno >= 0)
4101
+ mark_reg_unknown(env, regs, value_regno);
4102
+ } else if (reg->type == PTR_TO_FLOW_KEYS) {
4103
+ if (t == BPF_WRITE && value_regno >= 0 &&
4104
+ is_pointer_value(env, value_regno)) {
4105
+ verbose(env, "R%d leaks addr into flow keys\n",
4106
+ value_regno);
4107
+ return -EACCES;
4108
+ }
4109
+
4110
+ err = check_flow_keys_access(env, off, size);
4111
+ if (!err && t == BPF_READ && value_regno >= 0)
4112
+ mark_reg_unknown(env, regs, value_regno);
4113
+ } else if (type_is_sk_pointer(reg->type)) {
4114
+ if (t == BPF_WRITE) {
4115
+ verbose(env, "R%d cannot write into %s\n",
4116
+ regno, reg_type_str[reg->type]);
4117
+ return -EACCES;
4118
+ }
4119
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
4120
+ if (!err && value_regno >= 0)
4121
+ mark_reg_unknown(env, regs, value_regno);
4122
+ } else if (reg->type == PTR_TO_TP_BUFFER) {
4123
+ err = check_tp_buffer_access(env, reg, regno, off, size);
4124
+ if (!err && t == BPF_READ && value_regno >= 0)
4125
+ mark_reg_unknown(env, regs, value_regno);
4126
+ } else if (reg->type == PTR_TO_BTF_ID) {
4127
+ err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
4128
+ value_regno);
4129
+ } else if (reg->type == CONST_PTR_TO_MAP) {
4130
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t,
4131
+ value_regno);
4132
+ } else if (reg->type == PTR_TO_RDONLY_BUF) {
4133
+ if (t == BPF_WRITE) {
4134
+ verbose(env, "R%d cannot write into %s\n",
4135
+ regno, reg_type_str[reg->type]);
4136
+ return -EACCES;
4137
+ }
4138
+ err = check_buffer_access(env, reg, regno, off, size, false,
4139
+ "rdonly",
4140
+ &env->prog->aux->max_rdonly_access);
4141
+ if (!err && value_regno >= 0)
4142
+ mark_reg_unknown(env, regs, value_regno);
4143
+ } else if (reg->type == PTR_TO_RDWR_BUF) {
4144
+ err = check_buffer_access(env, reg, regno, off, size, false,
4145
+ "rdwr",
4146
+ &env->prog->aux->max_rdwr_access);
17064147 if (!err && t == BPF_READ && value_regno >= 0)
17074148 mark_reg_unknown(env, regs, value_regno);
17084149 } else {
....@@ -1745,10 +4186,12 @@
17454186 }
17464187
17474188 if (is_ctx_reg(env, insn->dst_reg) ||
1748
- is_pkt_reg(env, insn->dst_reg)) {
4189
+ is_pkt_reg(env, insn->dst_reg) ||
4190
+ is_flow_key_reg(env, insn->dst_reg) ||
4191
+ is_sk_reg(env, insn->dst_reg)) {
17494192 verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
1750
- insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ?
1751
- "context" : "packet");
4193
+ insn->dst_reg,
4194
+ reg_type_str[reg_state(env, insn->dst_reg)->type]);
17524195 return -EACCES;
17534196 }
17544197
....@@ -1763,73 +4206,65 @@
17634206 BPF_SIZE(insn->code), BPF_WRITE, -1, true);
17644207 }
17654208
1766
-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
1767
- int off, int access_size,
1768
- bool zero_size_allowed)
1769
-{
1770
- struct bpf_reg_state *reg = cur_regs(env) + regno;
1771
-
1772
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
1773
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
1774
- if (tnum_is_const(reg->var_off)) {
1775
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
1776
- regno, off, access_size);
1777
- } else {
1778
- char tn_buf[48];
1779
-
1780
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1781
- verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
1782
- regno, tn_buf, access_size);
1783
- }
1784
- return -EACCES;
1785
- }
1786
- return 0;
1787
-}
1788
-
1789
-/* when register 'regno' is passed into function that will read 'access_size'
1790
- * bytes from that pointer, make sure that it's within stack boundary
1791
- * and all elements of stack are initialized.
1792
- * Unlike most pointer bounds-checking functions, this one doesn't take an
1793
- * 'off' argument, so it has to add in reg->off itself.
4209
+/* When register 'regno' is used to read the stack (either directly or through
4210
+ * a helper function) make sure that it's within stack boundary and, depending
4211
+ * on the access type, that all elements of the stack are initialized.
4212
+ *
4213
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
4214
+ *
4215
+ * All registers that have been spilled on the stack in the slots within the
4216
+ * read offsets are marked as read.
17944217 */
1795
-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1796
- int access_size, bool zero_size_allowed,
1797
- struct bpf_call_arg_meta *meta)
4218
+static int check_stack_range_initialized(
4219
+ struct bpf_verifier_env *env, int regno, int off,
4220
+ int access_size, bool zero_size_allowed,
4221
+ enum stack_access_src type, struct bpf_call_arg_meta *meta)
17984222 {
1799
- struct bpf_reg_state *reg = cur_regs(env) + regno;
4223
+ struct bpf_reg_state *reg = reg_state(env, regno);
18004224 struct bpf_func_state *state = func(env, reg);
18014225 int err, min_off, max_off, i, j, slot, spi;
4226
+ char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
4227
+ enum bpf_access_type bounds_check_type;
4228
+ /* Some accesses can write anything into the stack, others are
4229
+ * read-only.
4230
+ */
4231
+ bool clobber = false;
18024232
1803
- if (reg->type != PTR_TO_STACK) {
1804
- /* Allow zero-byte read from NULL, regardless of pointer type */
1805
- if (zero_size_allowed && access_size == 0 &&
1806
- register_is_null(reg))
1807
- return 0;
1808
-
1809
- verbose(env, "R%d type=%s expected=%s\n", regno,
1810
- reg_type_str[reg->type],
1811
- reg_type_str[PTR_TO_STACK]);
4233
+ if (access_size == 0 && !zero_size_allowed) {
4234
+ verbose(env, "invalid zero-sized read\n");
18124235 return -EACCES;
18134236 }
18144237
4238
+ if (type == ACCESS_HELPER) {
4239
+ /* The bounds checks for writes are more permissive than for
4240
+ * reads. However, if raw_mode is not set, we'll do extra
4241
+ * checks below.
4242
+ */
4243
+ bounds_check_type = BPF_WRITE;
4244
+ clobber = true;
4245
+ } else {
4246
+ bounds_check_type = BPF_READ;
4247
+ }
4248
+ err = check_stack_access_within_bounds(env, regno, off, access_size,
4249
+ type, bounds_check_type);
4250
+ if (err)
4251
+ return err;
4252
+
4253
+
18154254 if (tnum_is_const(reg->var_off)) {
1816
- min_off = max_off = reg->var_off.value + reg->off;
1817
- err = __check_stack_boundary(env, regno, min_off, access_size,
1818
- zero_size_allowed);
1819
- if (err)
1820
- return err;
4255
+ min_off = max_off = reg->var_off.value + off;
18214256 } else {
18224257 /* Variable offset is prohibited for unprivileged mode for
18234258 * simplicity since it requires corresponding support in
18244259 * Spectre masking for stack ALU.
18254260 * See also retrieve_ptr_limit().
18264261 */
1827
- if (!env->allow_ptr_leaks) {
4262
+ if (!env->bypass_spec_v1) {
18284263 char tn_buf[48];
18294264
18304265 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1831
- verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
1832
- regno, tn_buf);
4266
+ verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
4267
+ regno, err_extra, tn_buf);
18334268 return -EACCES;
18344269 }
18354270 /* Only initialized buffer on stack is allowed to be accessed
....@@ -1841,28 +4276,8 @@
18414276 if (meta && meta->raw_mode)
18424277 meta = NULL;
18434278
1844
- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
1845
- reg->smax_value <= -BPF_MAX_VAR_OFF) {
1846
- verbose(env, "R%d unbounded indirect variable offset stack access\n",
1847
- regno);
1848
- return -EACCES;
1849
- }
1850
- min_off = reg->smin_value + reg->off;
1851
- max_off = reg->smax_value + reg->off;
1852
- err = __check_stack_boundary(env, regno, min_off, access_size,
1853
- zero_size_allowed);
1854
- if (err) {
1855
- verbose(env, "R%d min value is outside of stack bound\n",
1856
- regno);
1857
- return err;
1858
- }
1859
- err = __check_stack_boundary(env, regno, max_off, access_size,
1860
- zero_size_allowed);
1861
- if (err) {
1862
- verbose(env, "R%d max value is outside of stack bound\n",
1863
- regno);
1864
- return err;
1865
- }
4279
+ min_off = reg->smin_value + off;
4280
+ max_off = reg->smax_value + off;
18664281 }
18674282
18684283 if (meta && meta->raw_mode) {
....@@ -1882,28 +4297,38 @@
18824297 if (*stype == STACK_MISC)
18834298 goto mark;
18844299 if (*stype == STACK_ZERO) {
1885
- /* helper can write anything into the stack */
1886
- *stype = STACK_MISC;
4300
+ if (clobber) {
4301
+ /* helper can write anything into the stack */
4302
+ *stype = STACK_MISC;
4303
+ }
18874304 goto mark;
18884305 }
1889
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
1890
- state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
1891
- __mark_reg_unknown(&state->stack[spi].spilled_ptr);
1892
- for (j = 0; j < BPF_REG_SIZE; j++)
1893
- state->stack[spi].slot_type[j] = STACK_MISC;
4306
+
4307
+ if (is_spilled_reg(&state->stack[spi]) &&
4308
+ state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
4309
+ goto mark;
4310
+
4311
+ if (is_spilled_reg(&state->stack[spi]) &&
4312
+ (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
4313
+ env->allow_ptr_leaks)) {
4314
+ if (clobber) {
4315
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
4316
+ for (j = 0; j < BPF_REG_SIZE; j++)
4317
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
4318
+ }
18944319 goto mark;
18954320 }
18964321
18974322 err:
18984323 if (tnum_is_const(reg->var_off)) {
1899
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
1900
- min_off, i - min_off, access_size);
4324
+ verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
4325
+ err_extra, regno, min_off, i - min_off, access_size);
19014326 } else {
19024327 char tn_buf[48];
19034328
19044329 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1905
- verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
1906
- tn_buf, i - min_off, access_size);
4330
+ verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
4331
+ err_extra, regno, tn_buf, i - min_off, access_size);
19074332 }
19084333 return -EACCES;
19094334 mark:
....@@ -1911,7 +4336,8 @@
19114336 * the whole slot to be marked as 'read'
19124337 */
19134338 mark_reg_read(env, &state->stack[spi].spilled_ptr,
1914
- state->stack[spi].spilled_ptr.parent);
4339
+ state->stack[spi].spilled_ptr.parent,
4340
+ REG_LIVE_READ64);
19154341 }
19164342 return update_stack_depth(env, state, min_off);
19174343 }
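The most common caller of this path is a helper that fills a buffer living on the BPF stack. A small sketch, not part of the patch (section name and helper usage follow libbpf conventions): bpf_get_current_comm() takes ARG_PTR_TO_UNINIT_MEM plus ARG_CONST_SIZE, so meta->raw_mode is set and only the bounds check applies to the still-uninitialized bytes.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_execve")
int log_comm(void *ctx)
{
        char comm[16];          /* stack slots [fp-16, fp) of this frame */

        /* check_helper_mem_access() -> check_stack_range_initialized():
         * raw_mode means the helper overwrites the buffer, so the slots
         * don't have to be initialized beforehand.
         */
        if (bpf_get_current_comm(comm, sizeof(comm)))
                return 0;

        bpf_printk("exec by %s", comm);
        return 0;
}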
....@@ -1928,12 +4354,125 @@
19284354 return check_packet_access(env, regno, reg->off, access_size,
19294355 zero_size_allowed);
19304356 case PTR_TO_MAP_VALUE:
4357
+ if (check_map_access_type(env, regno, reg->off, access_size,
4358
+ meta && meta->raw_mode ? BPF_WRITE :
4359
+ BPF_READ))
4360
+ return -EACCES;
19314361 return check_map_access(env, regno, reg->off, access_size,
19324362 zero_size_allowed);
1933
- default: /* scalar_value|ptr_to_stack or invalid ptr */
1934
- return check_stack_boundary(env, regno, access_size,
1935
- zero_size_allowed, meta);
4363
+ case PTR_TO_MEM:
4364
+ return check_mem_region_access(env, regno, reg->off,
4365
+ access_size, reg->mem_size,
4366
+ zero_size_allowed);
4367
+ case PTR_TO_RDONLY_BUF:
4368
+ if (meta && meta->raw_mode)
4369
+ return -EACCES;
4370
+ return check_buffer_access(env, reg, regno, reg->off,
4371
+ access_size, zero_size_allowed,
4372
+ "rdonly",
4373
+ &env->prog->aux->max_rdonly_access);
4374
+ case PTR_TO_RDWR_BUF:
4375
+ return check_buffer_access(env, reg, regno, reg->off,
4376
+ access_size, zero_size_allowed,
4377
+ "rdwr",
4378
+ &env->prog->aux->max_rdwr_access);
4379
+ case PTR_TO_STACK:
4380
+ return check_stack_range_initialized(
4381
+ env,
4382
+ regno, reg->off, access_size,
4383
+ zero_size_allowed, ACCESS_HELPER, meta);
4384
+ default: /* scalar_value or invalid ptr */
4385
+ /* Allow zero-byte read from NULL, regardless of pointer type */
4386
+ if (zero_size_allowed && access_size == 0 &&
4387
+ register_is_null(reg))
4388
+ return 0;
4389
+
4390
+ verbose(env, "R%d type=%s expected=%s\n", regno,
4391
+ reg_type_str[reg->type],
4392
+ reg_type_str[PTR_TO_STACK]);
4393
+ return -EACCES;
19364394 }
4395
+}
4396
+
4397
+/* Implementation details:
4398
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
4399
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
4400
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
4401
+ * value_or_null->value transition, since the verifier only cares about
4402
+ * the range of access to valid map value pointer and doesn't care about actual
4403
+ * address of the map element.
4404
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
4405
+ * reg->id > 0 after value_or_null->value transition. By doing so
4406
+ * two bpf_map_lookups will be considered two different pointers that
4407
+ * point to different bpf_spin_locks.
4408
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
4409
+ * dead-locks.
4410
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
4411
+ * reg_is_refcounted() logic. The verifier needs to remember only
4412
+ * one spin_lock instead of array of acquired_refs.
4413
+ * cur_state->active_spin_lock remembers which map value element got locked
4414
+ * and clears it after bpf_spin_unlock.
4415
+ */
4416
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
4417
+ bool is_lock)
4418
+{
4419
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
4420
+ struct bpf_verifier_state *cur = env->cur_state;
4421
+ bool is_const = tnum_is_const(reg->var_off);
4422
+ struct bpf_map *map = reg->map_ptr;
4423
+ u64 val = reg->var_off.value;
4424
+
4425
+ if (!is_const) {
4426
+ verbose(env,
4427
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
4428
+ regno);
4429
+ return -EINVAL;
4430
+ }
4431
+ if (!map->btf) {
4432
+ verbose(env,
4433
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
4434
+ map->name);
4435
+ return -EINVAL;
4436
+ }
4437
+ if (!map_value_has_spin_lock(map)) {
4438
+ if (map->spin_lock_off == -E2BIG)
4439
+ verbose(env,
4440
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
4441
+ map->name);
4442
+ else if (map->spin_lock_off == -ENOENT)
4443
+ verbose(env,
4444
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
4445
+ map->name);
4446
+ else
4447
+ verbose(env,
4448
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
4449
+ map->name);
4450
+ return -EINVAL;
4451
+ }
4452
+ if (map->spin_lock_off != val + reg->off) {
4453
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
4454
+ val + reg->off);
4455
+ return -EINVAL;
4456
+ }
4457
+ if (is_lock) {
4458
+ if (cur->active_spin_lock) {
4459
+ verbose(env,
4460
+ "Locking two bpf_spin_locks are not allowed\n");
4461
+ return -EINVAL;
4462
+ }
4463
+ cur->active_spin_lock = reg->id;
4464
+ } else {
4465
+ if (!cur->active_spin_lock) {
4466
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
4467
+ return -EINVAL;
4468
+ }
4469
+ if (cur->active_spin_lock != reg->id) {
4470
+ verbose(env, "bpf_spin_unlock of different lock\n");
4471
+ return -EINVAL;
4472
+ }
4473
+ cur->active_spin_lock = 0;
4474
+ }
4475
+ return 0;
19374476 }
19384477
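At the program level the bookkeeping above corresponds to the usual pattern: one struct bpf_spin_lock embedded in a map value, taken and released around a short critical section. A hedged sketch (BTF-defined array map so the spin_lock offset is known; section name per current libbpf; not part of this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct counter_val {
        struct bpf_spin_lock lock;      /* located via the map's BTF */
        __u64 packets;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, struct counter_val);
} counters SEC(".maps");

SEC("tc")
int count_pkt(struct __sk_buff *skb)
{
        __u32 key = 0;
        struct counter_val *val = bpf_map_lookup_elem(&counters, &key);

        if (!val)
                return 0;

        bpf_spin_lock(&val->lock);      /* process_spin_lock(.., is_lock=true) */
        val->packets++;
        bpf_spin_unlock(&val->lock);    /* must name the same lock (reg->id) */
        return 0;
}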
19394478 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
....@@ -1949,12 +4488,215 @@
19494488 type == ARG_CONST_SIZE_OR_ZERO;
19504489 }
19514490
1952
-static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
4491
+static bool arg_type_is_alloc_size(enum bpf_arg_type type)
4492
+{
4493
+ return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
4494
+}
4495
+
4496
+static bool arg_type_is_int_ptr(enum bpf_arg_type type)
4497
+{
4498
+ return type == ARG_PTR_TO_INT ||
4499
+ type == ARG_PTR_TO_LONG;
4500
+}
4501
+
4502
+static int int_ptr_type_to_size(enum bpf_arg_type type)
4503
+{
4504
+ if (type == ARG_PTR_TO_INT)
4505
+ return sizeof(u32);
4506
+ else if (type == ARG_PTR_TO_LONG)
4507
+ return sizeof(u64);
4508
+
4509
+ return -EINVAL;
4510
+}
4511
+
4512
+static int resolve_map_arg_type(struct bpf_verifier_env *env,
4513
+ const struct bpf_call_arg_meta *meta,
4514
+ enum bpf_arg_type *arg_type)
4515
+{
4516
+ if (!meta->map_ptr) {
4517
+ /* kernel subsystem misconfigured verifier */
4518
+ verbose(env, "invalid map_ptr to access map->type\n");
4519
+ return -EACCES;
4520
+ }
4521
+
4522
+ switch (meta->map_ptr->map_type) {
4523
+ case BPF_MAP_TYPE_SOCKMAP:
4524
+ case BPF_MAP_TYPE_SOCKHASH:
4525
+ if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
4526
+ *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
4527
+ } else {
4528
+ verbose(env, "invalid arg_type for sockmap/sockhash\n");
4529
+ return -EINVAL;
4530
+ }
4531
+ break;
4532
+
4533
+ default:
4534
+ break;
4535
+ }
4536
+ return 0;
4537
+}
4538
+
4539
+struct bpf_reg_types {
4540
+ const enum bpf_reg_type types[10];
4541
+ u32 *btf_id;
4542
+};
4543
+
4544
+static const struct bpf_reg_types map_key_value_types = {
4545
+ .types = {
4546
+ PTR_TO_STACK,
4547
+ PTR_TO_PACKET,
4548
+ PTR_TO_PACKET_META,
4549
+ PTR_TO_MAP_VALUE,
4550
+ },
4551
+};
4552
+
4553
+static const struct bpf_reg_types sock_types = {
4554
+ .types = {
4555
+ PTR_TO_SOCK_COMMON,
4556
+ PTR_TO_SOCKET,
4557
+ PTR_TO_TCP_SOCK,
4558
+ PTR_TO_XDP_SOCK,
4559
+ },
4560
+};
4561
+
4562
+#ifdef CONFIG_NET
4563
+static const struct bpf_reg_types btf_id_sock_common_types = {
4564
+ .types = {
4565
+ PTR_TO_SOCK_COMMON,
4566
+ PTR_TO_SOCKET,
4567
+ PTR_TO_TCP_SOCK,
4568
+ PTR_TO_XDP_SOCK,
4569
+ PTR_TO_BTF_ID,
4570
+ },
4571
+ .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
4572
+};
4573
+#endif
4574
+
4575
+static const struct bpf_reg_types mem_types = {
4576
+ .types = {
4577
+ PTR_TO_STACK,
4578
+ PTR_TO_PACKET,
4579
+ PTR_TO_PACKET_META,
4580
+ PTR_TO_MAP_VALUE,
4581
+ PTR_TO_MEM,
4582
+ PTR_TO_RDONLY_BUF,
4583
+ PTR_TO_RDWR_BUF,
4584
+ },
4585
+};
4586
+
4587
+static const struct bpf_reg_types int_ptr_types = {
4588
+ .types = {
4589
+ PTR_TO_STACK,
4590
+ PTR_TO_PACKET,
4591
+ PTR_TO_PACKET_META,
4592
+ PTR_TO_MAP_VALUE,
4593
+ },
4594
+};
4595
+
4596
+static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
4597
+static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
4598
+static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
4599
+static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
4600
+static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
4601
+static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
4602
+static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
4603
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
4604
+
4605
+static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
4606
+ [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
4607
+ [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
4608
+ [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
4609
+ [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
4610
+ [ARG_CONST_SIZE] = &scalar_types,
4611
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
4612
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
4613
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
4614
+ [ARG_PTR_TO_CTX] = &context_types,
4615
+ [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
4616
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
4617
+#ifdef CONFIG_NET
4618
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
4619
+#endif
4620
+ [ARG_PTR_TO_SOCKET] = &fullsock_types,
4621
+ [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
4622
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
4623
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
4624
+ [ARG_PTR_TO_MEM] = &mem_types,
4625
+ [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
4626
+ [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
4627
+ [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
4628
+ [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
4629
+ [ARG_PTR_TO_INT] = &int_ptr_types,
4630
+ [ARG_PTR_TO_LONG] = &int_ptr_types,
4631
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
4632
+};
4633
+
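As a concrete consequence of map_key_value_types, a lookup key may live on the BPF stack (or in packet data or another map value) and be passed to the helper directly. A minimal sketch, not part of the patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1024);
        __type(key, __u32);
        __type(value, __u64);
} flows SEC(".maps");

SEC("xdp")
int lookup_flow(struct xdp_md *ctx)
{
        __u32 key = 7;  /* PTR_TO_STACK, accepted for ARG_PTR_TO_MAP_KEY */
        __u64 *val = bpf_map_lookup_elem(&flows, &key);

        if (val)
                __sync_fetch_and_add(val, 1);
        return XDP_PASS;
}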
4634
+static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
19534635 enum bpf_arg_type arg_type,
1954
- struct bpf_call_arg_meta *meta)
4636
+ const u32 *arg_btf_id)
19554637 {
19564638 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
1957
- enum bpf_reg_type expected_type, type = reg->type;
4639
+ enum bpf_reg_type expected, type = reg->type;
4640
+ const struct bpf_reg_types *compatible;
4641
+ int i, j;
4642
+
4643
+ compatible = compatible_reg_types[arg_type];
4644
+ if (!compatible) {
4645
+ verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
4646
+ return -EFAULT;
4647
+ }
4648
+
4649
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
4650
+ expected = compatible->types[i];
4651
+ if (expected == NOT_INIT)
4652
+ break;
4653
+
4654
+ if (type == expected)
4655
+ goto found;
4656
+ }
4657
+
4658
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
4659
+ for (j = 0; j + 1 < i; j++)
4660
+ verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
4661
+ verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
4662
+ return -EACCES;
4663
+
4664
+found:
4665
+ if (type == PTR_TO_BTF_ID) {
4666
+ if (!arg_btf_id) {
4667
+ if (!compatible->btf_id) {
4668
+ verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
4669
+ return -EFAULT;
4670
+ }
4671
+ arg_btf_id = compatible->btf_id;
4672
+ }
4673
+
4674
+ if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id,
4675
+ *arg_btf_id)) {
4676
+ verbose(env, "R%d is of type %s but %s is expected\n",
4677
+ regno, kernel_type_name(reg->btf_id),
4678
+ kernel_type_name(*arg_btf_id));
4679
+ return -EACCES;
4680
+ }
4681
+
4682
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
4683
+ verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
4684
+ regno);
4685
+ return -EACCES;
4686
+ }
4687
+ }
4688
+
4689
+ return 0;
4690
+}
4691
+
4692
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
4693
+ struct bpf_call_arg_meta *meta,
4694
+ const struct bpf_func_proto *fn)
4695
+{
4696
+ u32 regno = BPF_REG_1 + arg;
4697
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
4698
+ enum bpf_arg_type arg_type = fn->arg_type[arg];
4699
+ enum bpf_reg_type type = reg->type;
19584700 int err = 0;
19594701
19604702 if (arg_type == ARG_DONTCARE)
....@@ -1979,45 +4721,39 @@
19794721 return -EACCES;
19804722 }
19814723
1982
- if (arg_type == ARG_PTR_TO_MAP_KEY ||
1983
- arg_type == ARG_PTR_TO_MAP_VALUE) {
1984
- expected_type = PTR_TO_STACK;
1985
- if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
1986
- type != expected_type)
1987
- goto err_type;
1988
- } else if (arg_type == ARG_CONST_SIZE ||
1989
- arg_type == ARG_CONST_SIZE_OR_ZERO) {
1990
- expected_type = SCALAR_VALUE;
1991
- if (type != expected_type)
1992
- goto err_type;
1993
- } else if (arg_type == ARG_CONST_MAP_PTR) {
1994
- expected_type = CONST_PTR_TO_MAP;
1995
- if (type != expected_type)
1996
- goto err_type;
1997
- } else if (arg_type == ARG_PTR_TO_CTX) {
1998
- expected_type = PTR_TO_CTX;
1999
- if (type != expected_type)
2000
- goto err_type;
4724
+ if (arg_type == ARG_PTR_TO_MAP_VALUE ||
4725
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
4726
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
4727
+ err = resolve_map_arg_type(env, meta, &arg_type);
4728
+ if (err)
4729
+ return err;
4730
+ }
4731
+
4732
+ if (register_is_null(reg) && arg_type_may_be_null(arg_type))
4733
+ /* A NULL register has a SCALAR_VALUE type, so skip
4734
+ * type checking.
4735
+ */
4736
+ goto skip_type_check;
4737
+
4738
+ err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
4739
+ if (err)
4740
+ return err;
4741
+
4742
+ if (type == PTR_TO_CTX) {
20014743 err = check_ctx_reg(env, reg, regno);
20024744 if (err < 0)
20034745 return err;
2004
- } else if (arg_type_is_mem_ptr(arg_type)) {
2005
- expected_type = PTR_TO_STACK;
2006
- /* One exception here. In case function allows for NULL to be
2007
- * passed in as argument, it's a SCALAR_VALUE type. Final test
2008
- * happens during stack boundary checking.
2009
- */
2010
- if (register_is_null(reg) &&
2011
- arg_type == ARG_PTR_TO_MEM_OR_NULL)
2012
- /* final test in check_stack_boundary() */;
2013
- else if (!type_is_pkt_pointer(type) &&
2014
- type != PTR_TO_MAP_VALUE &&
2015
- type != expected_type)
2016
- goto err_type;
2017
- meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
2018
- } else {
2019
- verbose(env, "unsupported arg_type %d\n", arg_type);
2020
- return -EFAULT;
4746
+ }
4747
+
4748
+skip_type_check:
4749
+ if (reg->ref_obj_id) {
4750
+ if (meta->ref_obj_id) {
4751
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
4752
+ regno, reg->ref_obj_id,
4753
+ meta->ref_obj_id);
4754
+ return -EFAULT;
4755
+ }
4756
+ meta->ref_obj_id = reg->ref_obj_id;
20214757 }
20224758
20234759 if (arg_type == ARG_CONST_MAP_PTR) {
....@@ -2040,7 +4776,10 @@
20404776 err = check_helper_mem_access(env, regno,
20414777 meta->map_ptr->key_size, false,
20424778 NULL);
2043
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
4779
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
4780
+ (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
4781
+ !register_is_null(reg)) ||
4782
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
20444783 /* bpf_map_xxx(..., map_ptr, ..., value) call:
20454784 * check [value, value + map->value_size) validity
20464785 */
....@@ -2049,14 +4788,42 @@
20494788 verbose(env, "invalid map_ptr to access map->value\n");
20504789 return -EACCES;
20514790 }
4791
+ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
20524792 err = check_helper_mem_access(env, regno,
20534793 meta->map_ptr->value_size, false,
2054
- NULL);
4794
+ meta);
4795
+ } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
4796
+ if (!reg->btf_id) {
4797
+ verbose(env, "Helper has invalid btf_id in R%d\n", regno);
4798
+ return -EACCES;
4799
+ }
4800
+ meta->ret_btf_id = reg->btf_id;
4801
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
4802
+ if (meta->func_id == BPF_FUNC_spin_lock) {
4803
+ if (process_spin_lock(env, regno, true))
4804
+ return -EACCES;
4805
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
4806
+ if (process_spin_lock(env, regno, false))
4807
+ return -EACCES;
4808
+ } else {
4809
+ verbose(env, "verifier internal error\n");
4810
+ return -EFAULT;
4811
+ }
4812
+ } else if (arg_type_is_mem_ptr(arg_type)) {
4813
+ /* The access to this pointer is only checked when we hit the
4814
+ * next is_mem_size argument below.
4815
+ */
4816
+ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
20554817 } else if (arg_type_is_mem_size(arg_type)) {
20564818 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
20574819
2058
- /* remember the mem_size which may be used later
2059
- * to refine return values.
4820
+ /* This is used to refine r0 return value bounds for helpers
4821
+ * that enforce this value as an upper bound on return values.
4822
+ * See do_refine_retval_range() for helpers that can refine
4823
+ * the return value. C type of helper is u32 so we pull register
4824
+ * bound from umax_value however, if negative verifier errors
4825
+ * out. Only upper bounds can be learned because retval is an
4826
+ * int type and negative retvals are allowed.
20604827 */
20614828 meta->msize_max_value = reg->umax_value;
20624829
....@@ -2093,13 +4860,62 @@
20934860 err = check_helper_mem_access(env, regno - 1,
20944861 reg->umax_value,
20954862 zero_size_allowed, meta);
4863
+ if (!err)
4864
+ err = mark_chain_precision(env, regno);
4865
+ } else if (arg_type_is_alloc_size(arg_type)) {
4866
+ if (!tnum_is_const(reg->var_off)) {
4867
+ verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
4868
+ regno);
4869
+ return -EACCES;
4870
+ }
4871
+ meta->mem_size = reg->var_off.value;
4872
+ } else if (arg_type_is_int_ptr(arg_type)) {
4873
+ int size = int_ptr_type_to_size(arg_type);
4874
+
4875
+ err = check_helper_mem_access(env, regno, size, false, meta);
4876
+ if (err)
4877
+ return err;
4878
+ err = check_ptr_alignment(env, reg, 0, size, true);
20964879 }
20974880
20984881 return err;
2099
-err_type:
2100
- verbose(env, "R%d type=%s expected=%s\n", regno,
2101
- reg_type_str[type], reg_type_str[expected_type]);
2102
- return -EACCES;
4882
+}
4883
+
4884
+static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
4885
+{
4886
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
4887
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
4888
+
4889
+ if (func_id != BPF_FUNC_map_update_elem)
4890
+ return false;
4891
+
4892
+ /* It's not possible to get access to a locked struct sock in these
4893
+ * contexts, so updating is safe.
4894
+ */
4895
+ switch (type) {
4896
+ case BPF_PROG_TYPE_TRACING:
4897
+ if (eatype == BPF_TRACE_ITER)
4898
+ return true;
4899
+ break;
4900
+ case BPF_PROG_TYPE_SOCKET_FILTER:
4901
+ case BPF_PROG_TYPE_SCHED_CLS:
4902
+ case BPF_PROG_TYPE_SCHED_ACT:
4903
+ case BPF_PROG_TYPE_XDP:
4904
+ case BPF_PROG_TYPE_SK_REUSEPORT:
4905
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
4906
+ case BPF_PROG_TYPE_SK_LOOKUP:
4907
+ return true;
4908
+ default:
4909
+ break;
4910
+ }
4911
+
4912
+ verbose(env, "cannot update sockmap in this context\n");
4913
+ return false;
4914
+}
4915
+
4916
+static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
4917
+{
4918
+ return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
21034919 }
21044920
21054921 static int check_map_func_compatibility(struct bpf_verifier_env *env,
....@@ -2117,7 +4933,15 @@
21174933 case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
21184934 if (func_id != BPF_FUNC_perf_event_read &&
21194935 func_id != BPF_FUNC_perf_event_output &&
2120
- func_id != BPF_FUNC_perf_event_read_value)
4936
+ func_id != BPF_FUNC_skb_output &&
4937
+ func_id != BPF_FUNC_perf_event_read_value &&
4938
+ func_id != BPF_FUNC_xdp_output)
4939
+ goto error;
4940
+ break;
4941
+ case BPF_MAP_TYPE_RINGBUF:
4942
+ if (func_id != BPF_FUNC_ringbuf_output &&
4943
+ func_id != BPF_FUNC_ringbuf_reserve &&
4944
+ func_id != BPF_FUNC_ringbuf_query)
21214945 goto error;
21224946 break;
21234947 case BPF_MAP_TYPE_STACK_TRACE:
....@@ -2130,23 +4954,26 @@
21304954 goto error;
21314955 break;
21324956 case BPF_MAP_TYPE_CGROUP_STORAGE:
4957
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
21334958 if (func_id != BPF_FUNC_get_local_storage)
21344959 goto error;
21354960 break;
2136
- /* devmap returns a pointer to a live net_device ifindex that we cannot
2137
- * allow to be modified from bpf side. So do not allow lookup elements
2138
- * for now.
2139
- */
21404961 case BPF_MAP_TYPE_DEVMAP:
2141
- if (func_id != BPF_FUNC_redirect_map)
4962
+ case BPF_MAP_TYPE_DEVMAP_HASH:
4963
+ if (func_id != BPF_FUNC_redirect_map &&
4964
+ func_id != BPF_FUNC_map_lookup_elem)
21424965 goto error;
21434966 break;
21444967 /* Restrict bpf side of cpumap and xskmap, open when use-cases
21454968 * appear.
21464969 */
21474970 case BPF_MAP_TYPE_CPUMAP:
2148
- case BPF_MAP_TYPE_XSKMAP:
21494971 if (func_id != BPF_FUNC_redirect_map)
4972
+ goto error;
4973
+ break;
4974
+ case BPF_MAP_TYPE_XSKMAP:
4975
+ if (func_id != BPF_FUNC_redirect_map &&
4976
+ func_id != BPF_FUNC_map_lookup_elem)
21504977 goto error;
21514978 break;
21524979 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
....@@ -2158,18 +4985,41 @@
21584985 if (func_id != BPF_FUNC_sk_redirect_map &&
21594986 func_id != BPF_FUNC_sock_map_update &&
21604987 func_id != BPF_FUNC_map_delete_elem &&
2161
- func_id != BPF_FUNC_msg_redirect_map)
4988
+ func_id != BPF_FUNC_msg_redirect_map &&
4989
+ func_id != BPF_FUNC_sk_select_reuseport &&
4990
+ func_id != BPF_FUNC_map_lookup_elem &&
4991
+ !may_update_sockmap(env, func_id))
21624992 goto error;
21634993 break;
21644994 case BPF_MAP_TYPE_SOCKHASH:
21654995 if (func_id != BPF_FUNC_sk_redirect_hash &&
21664996 func_id != BPF_FUNC_sock_hash_update &&
21674997 func_id != BPF_FUNC_map_delete_elem &&
2168
- func_id != BPF_FUNC_msg_redirect_hash)
4998
+ func_id != BPF_FUNC_msg_redirect_hash &&
4999
+ func_id != BPF_FUNC_sk_select_reuseport &&
5000
+ func_id != BPF_FUNC_map_lookup_elem &&
5001
+ !may_update_sockmap(env, func_id))
21695002 goto error;
21705003 break;
21715004 case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
21725005 if (func_id != BPF_FUNC_sk_select_reuseport)
5006
+ goto error;
5007
+ break;
5008
+ case BPF_MAP_TYPE_QUEUE:
5009
+ case BPF_MAP_TYPE_STACK:
5010
+ if (func_id != BPF_FUNC_map_peek_elem &&
5011
+ func_id != BPF_FUNC_map_pop_elem &&
5012
+ func_id != BPF_FUNC_map_push_elem)
5013
+ goto error;
5014
+ break;
5015
+ case BPF_MAP_TYPE_SK_STORAGE:
5016
+ if (func_id != BPF_FUNC_sk_storage_get &&
5017
+ func_id != BPF_FUNC_sk_storage_delete)
5018
+ goto error;
5019
+ break;
5020
+ case BPF_MAP_TYPE_INODE_STORAGE:
5021
+ if (func_id != BPF_FUNC_inode_storage_get &&
5022
+ func_id != BPF_FUNC_inode_storage_delete)
21735023 goto error;
21745024 break;
21755025 default:
....@@ -2181,15 +5031,23 @@
21815031 case BPF_FUNC_tail_call:
21825032 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
21835033 goto error;
2184
- if (env->subprog_cnt > 1) {
2185
- verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
5034
+ if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
5035
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
21865036 return -EINVAL;
21875037 }
21885038 break;
21895039 case BPF_FUNC_perf_event_read:
21905040 case BPF_FUNC_perf_event_output:
21915041 case BPF_FUNC_perf_event_read_value:
5042
+ case BPF_FUNC_skb_output:
5043
+ case BPF_FUNC_xdp_output:
21925044 if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
5045
+ goto error;
5046
+ break;
5047
+ case BPF_FUNC_ringbuf_output:
5048
+ case BPF_FUNC_ringbuf_reserve:
5049
+ case BPF_FUNC_ringbuf_query:
5050
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
21935051 goto error;
21945052 break;
21955053 case BPF_FUNC_get_stackid:
....@@ -2203,6 +5061,7 @@
22035061 break;
22045062 case BPF_FUNC_redirect_map:
22055063 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
5064
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
22065065 map->map_type != BPF_MAP_TYPE_CPUMAP &&
22075066 map->map_type != BPF_MAP_TYPE_XSKMAP)
22085067 goto error;
....@@ -2220,11 +5079,31 @@
22205079 goto error;
22215080 break;
22225081 case BPF_FUNC_get_local_storage:
2223
- if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
5082
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
5083
+ map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
22245084 goto error;
22255085 break;
22265086 case BPF_FUNC_sk_select_reuseport:
2227
- if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
5087
+ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
5088
+ map->map_type != BPF_MAP_TYPE_SOCKMAP &&
5089
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
5090
+ goto error;
5091
+ break;
5092
+ case BPF_FUNC_map_peek_elem:
5093
+ case BPF_FUNC_map_pop_elem:
5094
+ case BPF_FUNC_map_push_elem:
5095
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
5096
+ map->map_type != BPF_MAP_TYPE_STACK)
5097
+ goto error;
5098
+ break;
5099
+ case BPF_FUNC_sk_storage_get:
5100
+ case BPF_FUNC_sk_storage_delete:
5101
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
5102
+ goto error;
5103
+ break;
5104
+ case BPF_FUNC_inode_storage_get:
5105
+ case BPF_FUNC_inode_storage_delete:
5106
+ if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
22285107 goto error;
22295108 break;
22305109 default:
....@@ -2287,49 +5166,142 @@
22875166 return true;
22885167 }
22895168
2290
-static int check_func_proto(const struct bpf_func_proto *fn)
5169
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
5170
+{
5171
+ int count = 0;
5172
+
5173
+ if (arg_type_may_be_refcounted(fn->arg1_type))
5174
+ count++;
5175
+ if (arg_type_may_be_refcounted(fn->arg2_type))
5176
+ count++;
5177
+ if (arg_type_may_be_refcounted(fn->arg3_type))
5178
+ count++;
5179
+ if (arg_type_may_be_refcounted(fn->arg4_type))
5180
+ count++;
5181
+ if (arg_type_may_be_refcounted(fn->arg5_type))
5182
+ count++;
5183
+
5184
+ /* A reference acquiring function cannot acquire
5185
+ * another refcounted ptr.
5186
+ */
5187
+ if (may_be_acquire_function(func_id) && count)
5188
+ return false;
5189
+
5190
+ /* We only support one arg being unreferenced at the moment,
5191
+ * which is sufficient for the helper functions we have right now.
5192
+ */
5193
+ return count <= 1;
5194
+}
5195
+
5196
+static bool check_btf_id_ok(const struct bpf_func_proto *fn)
5197
+{
5198
+ int i;
5199
+
5200
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
5201
+ if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
5202
+ return false;
5203
+
5204
+ if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
5205
+ return false;
5206
+ }
5207
+
5208
+ return true;
5209
+}
5210
+
5211
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
22915212 {
22925213 return check_raw_mode_ok(fn) &&
2293
- check_arg_pair_ok(fn) ? 0 : -EINVAL;
5214
+ check_arg_pair_ok(fn) &&
5215
+ check_btf_id_ok(fn) &&
5216
+ check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
22945217 }
22955218
22965219 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
22975220 * are now invalid, so turn them into unknown SCALAR_VALUE.
22985221 */
2299
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
2300
- struct bpf_func_state *state)
2301
-{
2302
- struct bpf_reg_state *regs = state->regs, *reg;
2303
- int i;
2304
-
2305
- for (i = 0; i < MAX_BPF_REG; i++)
2306
- if (reg_is_pkt_pointer_any(&regs[i]))
2307
- mark_reg_unknown(env, regs, i);
2308
-
2309
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2310
- if (state->stack[i].slot_type[0] != STACK_SPILL)
2311
- continue;
2312
- reg = &state->stack[i].spilled_ptr;
2313
- if (reg_is_pkt_pointer_any(reg))
2314
- __mark_reg_unknown(reg);
2315
- }
2316
-}
2317
-
23185222 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
23195223 {
2320
- struct bpf_verifier_state *vstate = env->cur_state;
5224
+ struct bpf_func_state *state;
5225
+ struct bpf_reg_state *reg;
5226
+
5227
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
5228
+ if (reg_is_pkt_pointer_any(reg))
5229
+ __mark_reg_unknown(env, reg);
5230
+ }));
5231
+}
5232
+
5233
+enum {
5234
+ AT_PKT_END = -1,
5235
+ BEYOND_PKT_END = -2,
5236
+};
5237
+
5238
+static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
5239
+{
5240
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
5241
+ struct bpf_reg_state *reg = &state->regs[regn];
5242
+
5243
+ if (reg->type != PTR_TO_PACKET)
5244
+ /* PTR_TO_PACKET_META is not supported yet */
5245
+ return;
5246
+
5247
+ /* The 'reg' is pkt > pkt_end or pkt >= pkt_end.
5248
+ * How far beyond pkt_end it goes is unknown.
5249
+ * if (!range_open) it's the case of pkt >= pkt_end
5250
+ * if (range_open) it's the case of pkt > pkt_end
5251
+ * hence this pointer is at least 1 byte bigger than pkt_end
5252
+ */
5253
+ if (range_open)
5254
+ reg->range = BEYOND_PKT_END;
5255
+ else
5256
+ reg->range = AT_PKT_END;
5257
+}
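For context, the comparisons that end up annotated here come from the usual packet bounds checks in programs. The following is only an illustrative XDP-style sketch, not part of the patch; the SEC() macro and includes follow libbpf conventions and the program name is invented:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int pkt_end_example(struct xdp_md *ctx)
{
        void *data = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;

        /* "pkt > pkt_end" comparison: in the taken branch 'data + 14' lies
         * past the end of the packet; in the fall-through branch 14 bytes
         * starting at 'data' are provably accessible.
         */
        if (data + 14 > data_end)
                return XDP_DROP;

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";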
5258
+
5259
+/* The pointer with the specified id has released its reference to kernel
5260
+ * resources. Identify all copies of the same pointer and clear the reference.
5261
+ */
5262
+static int release_reference(struct bpf_verifier_env *env,
5263
+ int ref_obj_id)
5264
+{
5265
+ struct bpf_func_state *state;
5266
+ struct bpf_reg_state *reg;
5267
+ int err;
5268
+
5269
+ err = release_reference_state(cur_func(env), ref_obj_id);
5270
+ if (err)
5271
+ return err;
5272
+
5273
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
5274
+ if (reg->ref_obj_id == ref_obj_id) {
5275
+ if (!env->allow_ptr_leaks)
5276
+ __mark_reg_not_init(env, reg);
5277
+ else
5278
+ __mark_reg_unknown(env, reg);
5279
+ }
5280
+ }));
5281
+
5282
+ return 0;
5283
+}
5284
+
5285
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
5286
+ struct bpf_reg_state *regs)
5287
+{
23215288 int i;
23225289
2323
- for (i = 0; i <= vstate->curframe; i++)
2324
- __clear_all_pkt_pointers(env, vstate->frame[i]);
5290
+ /* after the call registers r0 - r5 were scratched */
5291
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
5292
+ mark_reg_not_init(env, regs, caller_saved[i]);
5293
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
5294
+ }
23255295 }
23265296
23275297 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
23285298 int *insn_idx)
23295299 {
23305300 struct bpf_verifier_state *state = env->cur_state;
5301
+ struct bpf_func_info_aux *func_info_aux;
23315302 struct bpf_func_state *caller, *callee;
2332
- int i, subprog, target_insn;
5303
+ int i, err, subprog, target_insn;
5304
+ bool is_global = false;
23335305
23345306 if (state->curframe + 1 >= MAX_CALL_FRAMES) {
23355307 verbose(env, "the call stack of %d frames is too deep\n",
....@@ -2352,6 +5324,33 @@
23525324 return -EFAULT;
23535325 }
23545326
5327
+ func_info_aux = env->prog->aux->func_info_aux;
5328
+ if (func_info_aux)
5329
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
5330
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
5331
+ if (err == -EFAULT)
5332
+ return err;
5333
+ if (is_global) {
5334
+ if (err) {
5335
+ verbose(env, "Caller passes invalid args into func#%d\n",
5336
+ subprog);
5337
+ return err;
5338
+ } else {
5339
+ if (env->log.level & BPF_LOG_LEVEL)
5340
+ verbose(env,
5341
+ "Func#%d is global and valid. Skipping.\n",
5342
+ subprog);
5343
+ clear_caller_saved_regs(env, caller->regs);
5344
+
5345
+ /* All global functions return a 64-bit SCALAR_VALUE */
5346
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
5347
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
5348
+
5349
+ /* continue with next insn after call */
5350
+ return 0;
5351
+ }
5352
+ }
5353
+
23555354 callee = kzalloc(sizeof(*callee), GFP_KERNEL);
23565355 if (!callee)
23575356 return -ENOMEM;
....@@ -2367,17 +5366,18 @@
23675366 state->curframe + 1 /* frameno within this callchain */,
23685367 subprog /* subprog number within this prog */);
23695368
5369
+ /* Transfer references to the callee */
5370
+ err = transfer_reference_state(callee, caller);
5371
+ if (err)
5372
+ return err;
5373
+
23705374 /* copy r1 - r5 args that callee can access. The copy includes parent
23715375 * pointers, which connects us up to the liveness chain
23725376 */
23735377 for (i = BPF_REG_1; i <= BPF_REG_5; i++)
23745378 callee->regs[i] = caller->regs[i];
23755379
2376
- /* after the call registers r0 - r5 were scratched */
2377
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
2378
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
2379
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
2380
- }
5380
+ clear_caller_saved_regs(env, caller->regs);
23815381
23825382 /* only increment it after check_reg_arg() finished */
23835383 state->curframe++;
....@@ -2385,7 +5385,7 @@
23855385 /* and go analyze first insn of the callee */
23865386 *insn_idx = target_insn;
23875387
2388
- if (env->log.level) {
5388
+ if (env->log.level & BPF_LOG_LEVEL) {
23895389 verbose(env, "caller:\n");
23905390 print_verifier_state(env, caller);
23915391 verbose(env, "callee:\n");
....@@ -2399,6 +5399,7 @@
23995399 struct bpf_verifier_state *state = env->cur_state;
24005400 struct bpf_func_state *caller, *callee;
24015401 struct bpf_reg_state *r0;
5402
+ int err;
24025403
24035404 callee = state->frame[state->curframe];
24045405 r0 = &callee->regs[BPF_REG_0];
....@@ -2418,8 +5419,13 @@
24185419 /* return to the caller whatever r0 had in the callee */
24195420 caller->regs[BPF_REG_0] = *r0;
24205421
5422
+ /* Transfer references to the caller */
5423
+ err = transfer_reference_state(caller, callee);
5424
+ if (err)
5425
+ return err;
5426
+
24215427 *insn_idx = callee->callsite + 1;
2422
- if (env->log.level) {
5428
+ if (env->log.level & BPF_LOG_LEVEL) {
24235429 verbose(env, "returning from callee:\n");
24245430 print_verifier_state(env, callee);
24255431 verbose(env, "to caller at %d:\n", *insn_idx);
....@@ -2431,44 +5437,24 @@
24315437 return 0;
24325438 }
24335439
2434
-static int do_refine_retval_range(struct bpf_verifier_env *env,
2435
- struct bpf_reg_state *regs, int ret_type,
2436
- int func_id, struct bpf_call_arg_meta *meta)
5440
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
5441
+ int func_id,
5442
+ struct bpf_call_arg_meta *meta)
24375443 {
24385444 struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
2439
- struct bpf_reg_state tmp_reg = *ret_reg;
2440
- bool ret;
24415445
24425446 if (ret_type != RET_INTEGER ||
24435447 (func_id != BPF_FUNC_get_stack &&
2444
- func_id != BPF_FUNC_probe_read_str))
2445
- return 0;
5448
+ func_id != BPF_FUNC_probe_read_str &&
5449
+ func_id != BPF_FUNC_probe_read_kernel_str &&
5450
+ func_id != BPF_FUNC_probe_read_user_str))
5451
+ return;
24465452
2447
- /* Error case where ret is in interval [S32MIN, -1]. */
2448
- ret_reg->smin_value = S32_MIN;
2449
- ret_reg->smax_value = -1;
2450
-
2451
- __reg_deduce_bounds(ret_reg);
2452
- __reg_bound_offset(ret_reg);
2453
- __update_reg_bounds(ret_reg);
2454
-
2455
- ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
2456
- if (!ret)
2457
- return -EFAULT;
2458
-
2459
- *ret_reg = tmp_reg;
2460
-
2461
- /* Success case where ret is in range [0, msize_max_value]. */
2462
- ret_reg->smin_value = 0;
24635453 ret_reg->smax_value = meta->msize_max_value;
2464
- ret_reg->umin_value = ret_reg->smin_value;
2465
- ret_reg->umax_value = ret_reg->smax_value;
2466
-
2467
- __reg_deduce_bounds(ret_reg);
2468
- __reg_bound_offset(ret_reg);
2469
- __update_reg_bounds(ret_reg);
2470
-
2471
- return 0;
5454
+ ret_reg->s32_max_value = meta->msize_max_value;
5455
+ ret_reg->smin_value = -MAX_ERRNO;
5456
+ ret_reg->s32_min_value = -MAX_ERRNO;
5457
+ reg_bounds_sync(ret_reg);
24725458 }
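The practical effect of the refined range is that the helper's return value can be trusted as a bounded length after a simple sign check. A minimal, hedged kprobe-style sketch in libbpf conventions (the attach point, program and buffer names are assumptions of the example):

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/do_nanosleep")
int get_stack_example(struct pt_regs *ctx)
{
        char buf[128];
        long n;

        n = bpf_get_stack(ctx, buf, sizeof(buf), 0);
        if (n < 0)
                return 0;

        /* At this point the verifier sees n in [0, sizeof(buf)], which is
         * what allows n to be used later as an offset or size relative to
         * buf without a separate upper-bound check.
         */
        return 0;
}

char _license[] SEC("license") = "GPL";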
24735459
24745460 static int
....@@ -2476,25 +5462,91 @@
24765462 int func_id, int insn_idx)
24775463 {
24785464 struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
5465
+ struct bpf_map *map = meta->map_ptr;
24795466
24805467 if (func_id != BPF_FUNC_tail_call &&
24815468 func_id != BPF_FUNC_map_lookup_elem &&
24825469 func_id != BPF_FUNC_map_update_elem &&
2483
- func_id != BPF_FUNC_map_delete_elem)
5470
+ func_id != BPF_FUNC_map_delete_elem &&
5471
+ func_id != BPF_FUNC_map_push_elem &&
5472
+ func_id != BPF_FUNC_map_pop_elem &&
5473
+ func_id != BPF_FUNC_map_peek_elem)
24845474 return 0;
24855475
2486
- if (meta->map_ptr == NULL) {
5476
+ if (map == NULL) {
24875477 verbose(env, "kernel subsystem misconfigured verifier\n");
24885478 return -EINVAL;
24895479 }
24905480
2491
- if (!BPF_MAP_PTR(aux->map_state))
5481
+ /* In case of read-only, some additional restrictions
5482
+ * need to be applied in order to prevent altering the
5483
+ * state of the map from program side.
5484
+ */
5485
+ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
5486
+ (func_id == BPF_FUNC_map_delete_elem ||
5487
+ func_id == BPF_FUNC_map_update_elem ||
5488
+ func_id == BPF_FUNC_map_push_elem ||
5489
+ func_id == BPF_FUNC_map_pop_elem)) {
5490
+ verbose(env, "write into map forbidden\n");
5491
+ return -EACCES;
5492
+ }
5493
+
5494
+ if (!BPF_MAP_PTR(aux->map_ptr_state))
24925495 bpf_map_ptr_store(aux, meta->map_ptr,
2493
- meta->map_ptr->unpriv_array);
2494
- else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
5496
+ !meta->map_ptr->bypass_spec_v1);
5497
+ else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
24955498 bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
2496
- meta->map_ptr->unpriv_array);
5499
+ !meta->map_ptr->bypass_spec_v1);
24975500 return 0;
5501
+}
5502
+
5503
+static int
5504
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
5505
+ int func_id, int insn_idx)
5506
+{
5507
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
5508
+ struct bpf_reg_state *regs = cur_regs(env), *reg;
5509
+ struct bpf_map *map = meta->map_ptr;
5510
+ u64 val, max;
5511
+ int err;
5512
+
5513
+ if (func_id != BPF_FUNC_tail_call)
5514
+ return 0;
5515
+ if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
5516
+ verbose(env, "kernel subsystem misconfigured verifier\n");
5517
+ return -EINVAL;
5518
+ }
5519
+
5520
+ reg = &regs[BPF_REG_3];
5521
+ val = reg->var_off.value;
5522
+ max = map->max_entries;
5523
+
5524
+ if (!(register_is_const(reg) && val < max)) {
5525
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
5526
+ return 0;
5527
+ }
5528
+
5529
+ err = mark_chain_precision(env, BPF_REG_3);
5530
+ if (err)
5531
+ return err;
5532
+ if (bpf_map_key_unseen(aux))
5533
+ bpf_map_key_store(aux, val);
5534
+ else if (!bpf_map_key_poisoned(aux) &&
5535
+ bpf_map_key_immediate(aux) != val)
5536
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
5537
+ return 0;
5538
+}
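The effect of this key tracking is easiest to see from the program side: only a constant, in-range index lets the verifier record the key (which later JITs may use to turn the tail call into a direct jump), while a data-dependent index poisons it and keeps the generic path. An illustrative sketch in libbpf style; the map layout, section name and program name are invented for the example:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 4);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

SEC("xdp")
int tail_call_example(struct xdp_md *ctx)
{
        /* Constant index below max_entries: the key can be recorded. */
        bpf_tail_call(ctx, &jmp_table, 1);

        /* Data-dependent index: the key is poisoned and the generic,
         * bounds-checked tail call stays in place.
         */
        bpf_tail_call(ctx, &jmp_table, ctx->ingress_ifindex & 3);

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";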
5539
+
5540
+static int check_reference_leak(struct bpf_verifier_env *env)
5541
+{
5542
+ struct bpf_func_state *state = cur_func(env);
5543
+ int i;
5544
+
5545
+ for (i = 0; i < state->acquired_refs; i++) {
5546
+ verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
5547
+ state->refs[i].id, state->refs[i].insn_idx);
5548
+ }
5549
+ return state->acquired_refs ? -EINVAL : 0;
24985550 }
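A program-side illustration of what check_reference_leak() and release_reference() enforce: every socket returned by an acquiring helper must be released on every path. This is a hedged sketch in libbpf style; the section name, tuple setup and includes are assumptions of the example, not taken from this file:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int sk_ref_example(struct __sk_buff *skb)
{
        struct bpf_sock_tuple tuple = {};
        struct bpf_sock *sk;

        tuple.ipv4.dport = bpf_htons(80);

        sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                               BPF_F_CURRENT_NETNS, 0);
        if (sk)
                bpf_sk_release(sk);     /* dropping this release makes the
                                         * verifier report an unreleased
                                         * reference and reject the program
                                         */
        return 0;
}

char _license[] SEC("license") = "GPL";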
24995551
25005552 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
....@@ -2526,6 +5578,11 @@
25265578 return -EINVAL;
25275579 }
25285580
5581
+ if (fn->allowed && !fn->allowed(env->prog)) {
5582
+ verbose(env, "helper call is not allowed in probe\n");
5583
+ return -EINVAL;
5584
+ }
5585
+
25295586 /* With LD_ABS/IND some JITs save/restore skb from r1. */
25305587 changes_data = bpf_helper_changes_pkt_data(fn->func);
25315588 if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
....@@ -2537,31 +5594,26 @@
25375594 memset(&meta, 0, sizeof(meta));
25385595 meta.pkt_access = fn->pkt_access;
25395596
2540
- err = check_func_proto(fn);
5597
+ err = check_func_proto(fn, func_id);
25415598 if (err) {
25425599 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
25435600 func_id_name(func_id), func_id);
25445601 return err;
25455602 }
25465603
5604
+ meta.func_id = func_id;
25475605 /* check args */
2548
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
2549
- if (err)
2550
- return err;
2551
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
2552
- if (err)
2553
- return err;
2554
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
2555
- if (err)
2556
- return err;
2557
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
2558
- if (err)
2559
- return err;
2560
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
5606
+ for (i = 0; i < 5; i++) {
5607
+ err = check_func_arg(env, i, &meta, fn);
5608
+ if (err)
5609
+ return err;
5610
+ }
5611
+
5612
+ err = record_func_map(env, &meta, func_id, insn_idx);
25615613 if (err)
25625614 return err;
25635615
2564
- err = record_func_map(env, &meta, func_id, insn_idx);
5616
+ err = record_func_key(env, &meta, func_id, insn_idx);
25655617 if (err)
25665618 return err;
25675619
....@@ -2573,6 +5625,21 @@
25735625 BPF_WRITE, -1, false);
25745626 if (err)
25755627 return err;
5628
+ }
5629
+
5630
+ if (func_id == BPF_FUNC_tail_call) {
5631
+ err = check_reference_leak(env);
5632
+ if (err) {
5633
+ verbose(env, "tail_call would lead to reference leak\n");
5634
+ return err;
5635
+ }
5636
+ } else if (is_release_function(func_id)) {
5637
+ err = release_reference(env, meta.ref_obj_id);
5638
+ if (err) {
5639
+ verbose(env, "func %s#%d reference has not been acquired before\n",
5640
+ func_id_name(func_id), func_id);
5641
+ return err;
5642
+ }
25765643 }
25775644
25785645 regs = cur_regs(env);
....@@ -2592,6 +5659,9 @@
25925659 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
25935660 }
25945661
5662
+ /* helper call returns 64-bit value. */
5663
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
5664
+
25955665 /* update return register (already marked as written above) */
25965666 if (fn->ret_type == RET_INTEGER) {
25975667 /* sets type to SCALAR_VALUE */
....@@ -2600,10 +5670,6 @@
26005670 regs[BPF_REG_0].type = NOT_INIT;
26015671 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
26025672 fn->ret_type == RET_PTR_TO_MAP_VALUE) {
2603
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
2604
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
2605
- else
2606
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
26075673 /* There is no offset yet applied, variable or fixed */
26085674 mark_reg_known_zero(env, regs, BPF_REG_0);
26095675 /* remember map_ptr, so that check_map_access()
....@@ -2616,22 +5682,99 @@
26165682 return -EINVAL;
26175683 }
26185684 regs[BPF_REG_0].map_ptr = meta.map_ptr;
2619
- regs[BPF_REG_0].id = ++env->id_gen;
5685
+ if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
5686
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
5687
+ if (map_value_has_spin_lock(meta.map_ptr))
5688
+ regs[BPF_REG_0].id = ++env->id_gen;
5689
+ } else {
5690
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
5691
+ }
5692
+ } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
5693
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5694
+ regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
5695
+ } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
5696
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5697
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
5698
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
5699
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5700
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
5701
+ } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
5702
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5703
+ regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
5704
+ regs[BPF_REG_0].mem_size = meta.mem_size;
5705
+ } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
5706
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
5707
+ const struct btf_type *t;
5708
+
5709
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5710
+ t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
5711
+ if (!btf_type_is_struct(t)) {
5712
+ u32 tsize;
5713
+ const struct btf_type *ret;
5714
+ const char *tname;
5715
+
5716
+ /* resolve the type size of ksym. */
5717
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
5718
+ if (IS_ERR(ret)) {
5719
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
5720
+ verbose(env, "unable to resolve the size of type '%s': %ld\n",
5721
+ tname, PTR_ERR(ret));
5722
+ return -EINVAL;
5723
+ }
5724
+ regs[BPF_REG_0].type =
5725
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
5726
+ PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
5727
+ regs[BPF_REG_0].mem_size = tsize;
5728
+ } else {
5729
+ regs[BPF_REG_0].type =
5730
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
5731
+ PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
5732
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
5733
+ }
5734
+ } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
5735
+ int ret_btf_id;
5736
+
5737
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5738
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL;
5739
+ ret_btf_id = *fn->ret_btf_id;
5740
+ if (ret_btf_id == 0) {
5741
+ verbose(env, "invalid return type %d of func %s#%d\n",
5742
+ fn->ret_type, func_id_name(func_id), func_id);
5743
+ return -EINVAL;
5744
+ }
5745
+ regs[BPF_REG_0].btf_id = ret_btf_id;
26205746 } else {
26215747 verbose(env, "unknown return type %d of func %s#%d\n",
26225748 fn->ret_type, func_id_name(func_id), func_id);
26235749 return -EINVAL;
26245750 }
26255751
2626
- err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
2627
- if (err)
2628
- return err;
5752
+ if (reg_type_may_be_null(regs[BPF_REG_0].type))
5753
+ regs[BPF_REG_0].id = ++env->id_gen;
5754
+
5755
+ if (is_ptr_cast_function(func_id)) {
5756
+ /* For release_reference() */
5757
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
5758
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
5759
+ int id = acquire_reference_state(env, insn_idx);
5760
+
5761
+ if (id < 0)
5762
+ return id;
5763
+ /* For mark_ptr_or_null_reg() */
5764
+ regs[BPF_REG_0].id = id;
5765
+ /* For release_reference() */
5766
+ regs[BPF_REG_0].ref_obj_id = id;
5767
+ }
5768
+
5769
+ do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
26295770
26305771 err = check_map_func_compatibility(env, meta.map_ptr, func_id);
26315772 if (err)
26325773 return err;
26335774
2634
- if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
5775
+ if ((func_id == BPF_FUNC_get_stack ||
5776
+ func_id == BPF_FUNC_get_task_stack) &&
5777
+ !env->prog->has_callchain_buf) {
26355778 const char *err_str;
26365779
26375780 #ifdef CONFIG_PERF_EVENTS
....@@ -2649,6 +5792,9 @@
26495792 env->prog->has_callchain_buf = true;
26505793 }
26515794
5795
+ if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
5796
+ env->prog->call_get_stack = true;
5797
+
26525798 if (changes_data)
26535799 clear_all_pkt_pointers(env);
26545800 return 0;
....@@ -2664,10 +5810,30 @@
26645810 return res < a;
26655811 }
26665812
5813
+static bool signed_add32_overflows(s32 a, s32 b)
5814
+{
5815
+ /* Do the add in u32, where overflow is well-defined */
5816
+ s32 res = (s32)((u32)a + (u32)b);
5817
+
5818
+ if (b < 0)
5819
+ return res > a;
5820
+ return res < a;
5821
+}
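The overflow checks above can be sanity-checked by replaying the same idiom in ordinary userspace C: do the add where wrap-around is well-defined (unsigned), then see whether the result moved in the wrong direction. This is only an illustrative sketch, not verifier code; stdint types stand in for the kernel's s32/u32, and converting an out-of-range unsigned value back to a signed type is implementation-defined in ISO C but behaves as two's complement on the compilers the kernel supports:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the verifier idiom used by signed_add_overflows()/
 * signed_add32_overflows().
 */
static int add32_overflows(int32_t a, int32_t b)
{
        int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);

        if (b < 0)
                return res > a; /* adding a negative value must not increase a */
        return res < a;         /* adding a non-negative value must not decrease a */
}

int main(void)
{
        printf("%d\n", add32_overflows(INT32_MAX, 1)); /* 1: overflows */
        printf("%d\n", add32_overflows(-5, 3));        /* 0: fine */
        return 0;
}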
5822
+
26675823 static bool signed_sub_overflows(s64 a, s64 b)
26685824 {
26695825 /* Do the sub in u64, where overflow is well-defined */
26705826 s64 res = (s64)((u64)a - (u64)b);
5827
+
5828
+ if (b < 0)
5829
+ return res < a;
5830
+ return res > a;
5831
+}
5832
+
5833
+static bool signed_sub32_overflows(s32 a, s32 b)
5834
+{
5835
+ /* Do the sub in u32, where overflow is well-defined */
5836
+ s32 res = (s32)((u32)a - (u32)b);
26715837
26725838 if (b < 0)
26735839 return res < a;
....@@ -2756,7 +5922,7 @@
27565922 static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
27575923 const struct bpf_insn *insn)
27585924 {
2759
- return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
5925
+ return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
27605926 }
27615927
27625928 static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
....@@ -2905,7 +6071,7 @@
29056071 */
29066072 if (!ptr_is_dst_reg) {
29076073 tmp = *dst_reg;
2908
- *dst_reg = *ptr_reg;
6074
+ copy_register_state(dst_reg, ptr_reg);
29096075 }
29106076 ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
29116077 env->insn_idx);
....@@ -2924,7 +6090,7 @@
29246090 * rewrite/sanitize them.
29256091 */
29266092 if (!vstate->speculative)
2927
- env->insn_aux_data[env->insn_idx].seen = true;
6093
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
29286094 }
29296095
29306096 static int sanitize_err(struct bpf_verifier_env *env,
....@@ -2966,6 +6132,40 @@
29666132 return -EACCES;
29676133 }
29686134
6135
+/* check that stack access falls within stack limits and that 'reg' doesn't
6136
+ * have a variable offset.
6137
+ *
6138
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
6139
+ * requires corresponding support in Spectre masking for stack ALU. See also
6140
+ * retrieve_ptr_limit().
6141
+ *
6143
+ * 'off' includes 'reg->off'.
6144
+ */
6145
+static int check_stack_access_for_ptr_arithmetic(
6146
+ struct bpf_verifier_env *env,
6147
+ int regno,
6148
+ const struct bpf_reg_state *reg,
6149
+ int off)
6150
+{
6151
+ if (!tnum_is_const(reg->var_off)) {
6152
+ char tn_buf[48];
6153
+
6154
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
6155
+ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
6156
+ regno, tn_buf, off);
6157
+ return -EACCES;
6158
+ }
6159
+
6160
+ if (off >= 0 || off < -MAX_BPF_STACK) {
6161
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
6162
+ "prohibited for !root; off=%d\n", regno, off);
6163
+ return -EACCES;
6164
+ }
6165
+
6166
+ return 0;
6167
+}
6168
+
29696169 static int sanitize_check_bounds(struct bpf_verifier_env *env,
29706170 const struct bpf_insn *insn,
29716171 const struct bpf_reg_state *dst_reg)
....@@ -2975,17 +6175,14 @@
29756175 /* For unprivileged we require that resulting offset must be in bounds
29766176 * in order to be able to sanitize access later on.
29776177 */
2978
- if (env->allow_ptr_leaks)
6178
+ if (env->bypass_spec_v1)
29796179 return 0;
29806180
29816181 switch (dst_reg->type) {
29826182 case PTR_TO_STACK:
2983
- if (check_stack_access(env, dst_reg, dst_reg->off +
2984
- dst_reg->var_off.value, 1)) {
2985
- verbose(env, "R%d stack pointer arithmetic goes out of range, "
2986
- "prohibited for !root\n", dst);
6183
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
6184
+ dst_reg->off + dst_reg->var_off.value))
29876185 return -EACCES;
2988
- }
29896186 break;
29906187 case PTR_TO_MAP_VALUE:
29916188 if (check_map_access(env, dst, dst_reg->off, 1, false)) {
....@@ -3031,32 +6228,46 @@
30316228 /* Taint dst register if offset had invalid bounds derived from
30326229 * e.g. dead branches.
30336230 */
3034
- __mark_reg_unknown(dst_reg);
6231
+ __mark_reg_unknown(env, dst_reg);
30356232 return 0;
30366233 }
30376234
30386235 if (BPF_CLASS(insn->code) != BPF_ALU64) {
30396236 /* 32-bit ALU ops on pointers produce (meaningless) scalars */
6237
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
6238
+ __mark_reg_unknown(env, dst_reg);
6239
+ return 0;
6240
+ }
6241
+
30406242 verbose(env,
30416243 "R%d 32-bit pointer arithmetic prohibited\n",
30426244 dst);
30436245 return -EACCES;
30446246 }
30456247
3046
- if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
3047
- verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
3048
- dst);
6248
+ switch (ptr_reg->type) {
6249
+ case PTR_TO_MAP_VALUE_OR_NULL:
6250
+ verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
6251
+ dst, reg_type_str[ptr_reg->type]);
30496252 return -EACCES;
3050
- }
3051
- if (ptr_reg->type == CONST_PTR_TO_MAP) {
3052
- verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
3053
- dst);
6253
+ case CONST_PTR_TO_MAP:
6254
+ /* smin_val represents the known value */
6255
+ if (known && smin_val == 0 && opcode == BPF_ADD)
6256
+ break;
6257
+ fallthrough;
6258
+ case PTR_TO_PACKET_END:
6259
+ case PTR_TO_SOCKET:
6260
+ case PTR_TO_SOCK_COMMON:
6261
+ case PTR_TO_TCP_SOCK:
6262
+ case PTR_TO_XDP_SOCK:
6263
+reject:
6264
+ verbose(env, "R%d pointer arithmetic on %s prohibited\n",
6265
+ dst, reg_type_str[ptr_reg->type]);
30546266 return -EACCES;
3055
- }
3056
- if (ptr_reg->type == PTR_TO_PACKET_END) {
3057
- verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
3058
- dst);
3059
- return -EACCES;
6267
+ default:
6268
+ if (reg_type_may_be_null(ptr_reg->type))
6269
+ goto reject;
6270
+ break;
30606271 }
30616272
30626273 /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
....@@ -3068,6 +6279,9 @@
30686279 if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
30696280 !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
30706281 return -EINVAL;
6282
+
6283
+ /* pointer types do not carry 32-bit bounds at the moment. */
6284
+ __mark_reg32_unbounded(dst_reg);
30716285
30726286 if (sanitize_needed(opcode)) {
30736287 ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
....@@ -3203,11 +6417,7 @@
32036417
32046418 if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
32056419 return -EINVAL;
3206
-
3207
- __update_reg_bounds(dst_reg);
3208
- __reg_deduce_bounds(dst_reg);
3209
- __reg_bound_offset(dst_reg);
3210
-
6420
+ reg_bounds_sync(dst_reg);
32116421 if (sanitize_check_bounds(env, insn, dst_reg) < 0)
32126422 return -EACCES;
32136423 if (sanitize_needed(opcode)) {
....@@ -3218,6 +6428,573 @@
32186428 }
32196429
32206430 return 0;
6431
+}
6432
+
6433
+static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
6434
+ struct bpf_reg_state *src_reg)
6435
+{
6436
+ s32 smin_val = src_reg->s32_min_value;
6437
+ s32 smax_val = src_reg->s32_max_value;
6438
+ u32 umin_val = src_reg->u32_min_value;
6439
+ u32 umax_val = src_reg->u32_max_value;
6440
+
6441
+ if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
6442
+ signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
6443
+ dst_reg->s32_min_value = S32_MIN;
6444
+ dst_reg->s32_max_value = S32_MAX;
6445
+ } else {
6446
+ dst_reg->s32_min_value += smin_val;
6447
+ dst_reg->s32_max_value += smax_val;
6448
+ }
6449
+ if (dst_reg->u32_min_value + umin_val < umin_val ||
6450
+ dst_reg->u32_max_value + umax_val < umax_val) {
6451
+ dst_reg->u32_min_value = 0;
6452
+ dst_reg->u32_max_value = U32_MAX;
6453
+ } else {
6454
+ dst_reg->u32_min_value += umin_val;
6455
+ dst_reg->u32_max_value += umax_val;
6456
+ }
6457
+}
6458
+
6459
+static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
6460
+ struct bpf_reg_state *src_reg)
6461
+{
6462
+ s64 smin_val = src_reg->smin_value;
6463
+ s64 smax_val = src_reg->smax_value;
6464
+ u64 umin_val = src_reg->umin_value;
6465
+ u64 umax_val = src_reg->umax_value;
6466
+
6467
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
6468
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
6469
+ dst_reg->smin_value = S64_MIN;
6470
+ dst_reg->smax_value = S64_MAX;
6471
+ } else {
6472
+ dst_reg->smin_value += smin_val;
6473
+ dst_reg->smax_value += smax_val;
6474
+ }
6475
+ if (dst_reg->umin_value + umin_val < umin_val ||
6476
+ dst_reg->umax_value + umax_val < umax_val) {
6477
+ dst_reg->umin_value = 0;
6478
+ dst_reg->umax_value = U64_MAX;
6479
+ } else {
6480
+ dst_reg->umin_value += umin_val;
6481
+ dst_reg->umax_value += umax_val;
6482
+ }
6483
+}
6484
+
6485
+static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
6486
+ struct bpf_reg_state *src_reg)
6487
+{
6488
+ s32 smin_val = src_reg->s32_min_value;
6489
+ s32 smax_val = src_reg->s32_max_value;
6490
+ u32 umin_val = src_reg->u32_min_value;
6491
+ u32 umax_val = src_reg->u32_max_value;
6492
+
6493
+ if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
6494
+ signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
6495
+ /* Overflow possible, we know nothing */
6496
+ dst_reg->s32_min_value = S32_MIN;
6497
+ dst_reg->s32_max_value = S32_MAX;
6498
+ } else {
6499
+ dst_reg->s32_min_value -= smax_val;
6500
+ dst_reg->s32_max_value -= smin_val;
6501
+ }
6502
+ if (dst_reg->u32_min_value < umax_val) {
6503
+ /* Overflow possible, we know nothing */
6504
+ dst_reg->u32_min_value = 0;
6505
+ dst_reg->u32_max_value = U32_MAX;
6506
+ } else {
6507
+ /* Cannot overflow (as long as bounds are consistent) */
6508
+ dst_reg->u32_min_value -= umax_val;
6509
+ dst_reg->u32_max_value -= umin_val;
6510
+ }
6511
+}
6512
+
6513
+static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
6514
+ struct bpf_reg_state *src_reg)
6515
+{
6516
+ s64 smin_val = src_reg->smin_value;
6517
+ s64 smax_val = src_reg->smax_value;
6518
+ u64 umin_val = src_reg->umin_value;
6519
+ u64 umax_val = src_reg->umax_value;
6520
+
6521
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
6522
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
6523
+ /* Overflow possible, we know nothing */
6524
+ dst_reg->smin_value = S64_MIN;
6525
+ dst_reg->smax_value = S64_MAX;
6526
+ } else {
6527
+ dst_reg->smin_value -= smax_val;
6528
+ dst_reg->smax_value -= smin_val;
6529
+ }
6530
+ if (dst_reg->umin_value < umax_val) {
6531
+ /* Overflow possible, we know nothing */
6532
+ dst_reg->umin_value = 0;
6533
+ dst_reg->umax_value = U64_MAX;
6534
+ } else {
6535
+ /* Cannot overflow (as long as bounds are consistent) */
6536
+ dst_reg->umin_value -= umax_val;
6537
+ dst_reg->umax_value -= umin_val;
6538
+ }
6539
+}
6540
+
6541
+static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
6542
+ struct bpf_reg_state *src_reg)
6543
+{
6544
+ s32 smin_val = src_reg->s32_min_value;
6545
+ u32 umin_val = src_reg->u32_min_value;
6546
+ u32 umax_val = src_reg->u32_max_value;
6547
+
6548
+ if (smin_val < 0 || dst_reg->s32_min_value < 0) {
6549
+ /* Ain't nobody got time to multiply that sign */
6550
+ __mark_reg32_unbounded(dst_reg);
6551
+ return;
6552
+ }
6553
+ /* Both values are positive, so we can work with unsigned and
6554
+ * copy the result to signed (unless it exceeds S32_MAX).
6555
+ */
6556
+ if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
6557
+ /* Potential overflow, we know nothing */
6558
+ __mark_reg32_unbounded(dst_reg);
6559
+ return;
6560
+ }
6561
+ dst_reg->u32_min_value *= umin_val;
6562
+ dst_reg->u32_max_value *= umax_val;
6563
+ if (dst_reg->u32_max_value > S32_MAX) {
6564
+ /* Overflow possible, we know nothing */
6565
+ dst_reg->s32_min_value = S32_MIN;
6566
+ dst_reg->s32_max_value = S32_MAX;
6567
+ } else {
6568
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6569
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6570
+ }
6571
+}
6572
+
6573
+static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
6574
+ struct bpf_reg_state *src_reg)
6575
+{
6576
+ s64 smin_val = src_reg->smin_value;
6577
+ u64 umin_val = src_reg->umin_value;
6578
+ u64 umax_val = src_reg->umax_value;
6579
+
6580
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
6581
+ /* Ain't nobody got time to multiply that sign */
6582
+ __mark_reg64_unbounded(dst_reg);
6583
+ return;
6584
+ }
6585
+ /* Both values are positive, so we can work with unsigned and
6586
+ * copy the result to signed (unless it exceeds S64_MAX).
6587
+ */
6588
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
6589
+ /* Potential overflow, we know nothing */
6590
+ __mark_reg64_unbounded(dst_reg);
6591
+ return;
6592
+ }
6593
+ dst_reg->umin_value *= umin_val;
6594
+ dst_reg->umax_value *= umax_val;
6595
+ if (dst_reg->umax_value > S64_MAX) {
6596
+ /* Overflow possible, we know nothing */
6597
+ dst_reg->smin_value = S64_MIN;
6598
+ dst_reg->smax_value = S64_MAX;
6599
+ } else {
6600
+ dst_reg->smin_value = dst_reg->umin_value;
6601
+ dst_reg->smax_value = dst_reg->umax_value;
6602
+ }
6603
+}
6604
+
6605
+static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
6606
+ struct bpf_reg_state *src_reg)
6607
+{
6608
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6609
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6610
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6611
+ s32 smin_val = src_reg->s32_min_value;
6612
+ u32 umax_val = src_reg->u32_max_value;
6613
+
6614
+ if (src_known && dst_known) {
6615
+ __mark_reg32_known(dst_reg, var32_off.value);
6616
+ return;
6617
+ }
6618
+
6619
+ /* We get our minimum from the var_off, since that's inherently
6620
+ * bitwise. Our maximum is the minimum of the operands' maxima.
6621
+ */
6622
+ dst_reg->u32_min_value = var32_off.value;
6623
+ dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
6624
+ if (dst_reg->s32_min_value < 0 || smin_val < 0) {
6625
+ /* Lose signed bounds when ANDing negative numbers,
6626
+ * ain't nobody got time for that.
6627
+ */
6628
+ dst_reg->s32_min_value = S32_MIN;
6629
+ dst_reg->s32_max_value = S32_MAX;
6630
+ } else {
6631
+ /* ANDing two positives gives a positive, so safe to
6632
+ * cast result into s64.
6633
+ */
6634
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6635
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6636
+ }
6637
+}
6638
+
6639
+static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
6640
+ struct bpf_reg_state *src_reg)
6641
+{
6642
+ bool src_known = tnum_is_const(src_reg->var_off);
6643
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6644
+ s64 smin_val = src_reg->smin_value;
6645
+ u64 umax_val = src_reg->umax_value;
6646
+
6647
+ if (src_known && dst_known) {
6648
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6649
+ return;
6650
+ }
6651
+
6652
+ /* We get our minimum from the var_off, since that's inherently
6653
+ * bitwise. Our maximum is the minimum of the operands' maxima.
6654
+ */
6655
+ dst_reg->umin_value = dst_reg->var_off.value;
6656
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
6657
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
6658
+ /* Lose signed bounds when ANDing negative numbers,
6659
+ * ain't nobody got time for that.
6660
+ */
6661
+ dst_reg->smin_value = S64_MIN;
6662
+ dst_reg->smax_value = S64_MAX;
6663
+ } else {
6664
+ /* ANDing two positives gives a positive, so safe to
6665
+ * cast result into s64.
6666
+ */
6667
+ dst_reg->smin_value = dst_reg->umin_value;
6668
+ dst_reg->smax_value = dst_reg->umax_value;
6669
+ }
6670
+ /* We may learn something more from the var_off */
6671
+ __update_reg_bounds(dst_reg);
6672
+}
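The bound relied on above ("our maximum is the minimum of the operands' maxima") can be checked outside the kernel with a small brute-force loop. This is only an illustrative userspace sketch with made-up limits, not verifier code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t xmax = 200, ymax = 100;
        uint32_t worst = 0;

        /* For x <= xmax and y <= ymax, x & y never exceeds min(xmax, ymax). */
        for (uint32_t x = 0; x <= xmax; x++)
                for (uint32_t y = 0; y <= ymax; y++)
                        if ((x & y) > worst)
                                worst = x & y;

        printf("max(x & y) = %u, bound = %u\n", worst,
               xmax < ymax ? xmax : ymax);
        return 0;
}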
6673
+
6674
+static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
6675
+ struct bpf_reg_state *src_reg)
6676
+{
6677
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6678
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6679
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6680
+ s32 smin_val = src_reg->s32_min_value;
6681
+ u32 umin_val = src_reg->u32_min_value;
6682
+
6683
+ if (src_known && dst_known) {
6684
+ __mark_reg32_known(dst_reg, var32_off.value);
6685
+ return;
6686
+ }
6687
+
6688
+ /* We get our maximum from the var_off, and our minimum is the
6689
+ * maximum of the operands' minima
6690
+ */
6691
+ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
6692
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
6693
+ if (dst_reg->s32_min_value < 0 || smin_val < 0) {
6694
+ /* Lose signed bounds when ORing negative numbers,
6695
+ * ain't nobody got time for that.
6696
+ */
6697
+ dst_reg->s32_min_value = S32_MIN;
6698
+ dst_reg->s32_max_value = S32_MAX;
6699
+ } else {
6700
+ /* ORing two positives gives a positive, so safe to
6701
+ * cast result into s64.
6702
+ */
6703
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6704
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6705
+ }
6706
+}
6707
+
6708
+static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
6709
+ struct bpf_reg_state *src_reg)
6710
+{
6711
+ bool src_known = tnum_is_const(src_reg->var_off);
6712
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6713
+ s64 smin_val = src_reg->smin_value;
6714
+ u64 umin_val = src_reg->umin_value;
6715
+
6716
+ if (src_known && dst_known) {
6717
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6718
+ return;
6719
+ }
6720
+
6721
+ /* We get our maximum from the var_off, and our minimum is the
6722
+ * maximum of the operands' minima
6723
+ */
6724
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
6725
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
6726
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
6727
+ /* Lose signed bounds when ORing negative numbers,
6728
+ * ain't nobody got time for that.
6729
+ */
6730
+ dst_reg->smin_value = S64_MIN;
6731
+ dst_reg->smax_value = S64_MAX;
6732
+ } else {
6733
+ /* ORing two positives gives a positive, so safe to
6734
+ * cast result into s64.
6735
+ */
6736
+ dst_reg->smin_value = dst_reg->umin_value;
6737
+ dst_reg->smax_value = dst_reg->umax_value;
6738
+ }
6739
+ /* We may learn something more from the var_off */
6740
+ __update_reg_bounds(dst_reg);
6741
+}
6742
+
6743
+static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
6744
+ struct bpf_reg_state *src_reg)
6745
+{
6746
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6747
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6748
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6749
+ s32 smin_val = src_reg->s32_min_value;
6750
+
6751
+ if (src_known && dst_known) {
6752
+ __mark_reg32_known(dst_reg, var32_off.value);
6753
+ return;
6754
+ }
6755
+
6756
+ /* We get both minimum and maximum from the var32_off. */
6757
+ dst_reg->u32_min_value = var32_off.value;
6758
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
6759
+
6760
+ if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
6761
+ /* XORing two positive sign numbers gives a positive,
6762
+ * so safe to cast u32 result into s32.
6763
+ */
6764
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6765
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6766
+ } else {
6767
+ dst_reg->s32_min_value = S32_MIN;
6768
+ dst_reg->s32_max_value = S32_MAX;
6769
+ }
6770
+}
6771
+
6772
+static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
6773
+ struct bpf_reg_state *src_reg)
6774
+{
6775
+ bool src_known = tnum_is_const(src_reg->var_off);
6776
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6777
+ s64 smin_val = src_reg->smin_value;
6778
+
6779
+ if (src_known && dst_known) {
6780
+ /* dst_reg->var_off.value has been updated earlier */
6781
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6782
+ return;
6783
+ }
6784
+
6785
+ /* We get both minimum and maximum from the var_off. */
6786
+ dst_reg->umin_value = dst_reg->var_off.value;
6787
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
6788
+
6789
+ if (dst_reg->smin_value >= 0 && smin_val >= 0) {
6790
+ /* XORing two positive sign numbers gives a positive,
6791
+ * so safe to cast u64 result into s64.
6792
+ */
6793
+ dst_reg->smin_value = dst_reg->umin_value;
6794
+ dst_reg->smax_value = dst_reg->umax_value;
6795
+ } else {
6796
+ dst_reg->smin_value = S64_MIN;
6797
+ dst_reg->smax_value = S64_MAX;
6798
+ }
6799
+
6800
+ __update_reg_bounds(dst_reg);
6801
+}
6802
+
6803
+static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
6804
+ u64 umin_val, u64 umax_val)
6805
+{
6806
+ /* We lose all sign bit information (except what we can pick
6807
+ * up from var_off)
6808
+ */
6809
+ dst_reg->s32_min_value = S32_MIN;
6810
+ dst_reg->s32_max_value = S32_MAX;
6811
+ /* If we might shift our top bit out, then we know nothing */
6812
+ if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
6813
+ dst_reg->u32_min_value = 0;
6814
+ dst_reg->u32_max_value = U32_MAX;
6815
+ } else {
6816
+ dst_reg->u32_min_value <<= umin_val;
6817
+ dst_reg->u32_max_value <<= umax_val;
6818
+ }
6819
+}
6820
+
6821
+static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
6822
+ struct bpf_reg_state *src_reg)
6823
+{
6824
+ u32 umax_val = src_reg->u32_max_value;
6825
+ u32 umin_val = src_reg->u32_min_value;
6826
+ /* u32 alu operation will zext upper bits */
6827
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
6828
+
6829
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
6830
+ dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
6831
+ /* Not required, but to be careful, mark reg64 bounds as unknown so
6832
+ * that we are forced to pick them up from tnum and zext later and
6833
+ * if some path skips this step we are still safe.
6834
+ */
6835
+ __mark_reg64_unbounded(dst_reg);
6836
+ __update_reg32_bounds(dst_reg);
6837
+}
6838
+
6839
+static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
6840
+ u64 umin_val, u64 umax_val)
6841
+{
6842
+ /* Special case <<32 because it is a common compiler pattern to sign
6843
+ * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
6844
+ * positive we know this shift will also be positive so we can track
6845
+ * bounds correctly. Otherwise we lose all sign bit information except
6846
+ * what we can pick up from var_off. Perhaps we can generalize this
6847
+ * later to shifts of any length.
6848
+ */
6849
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
6850
+ dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
6851
+ else
6852
+ dst_reg->smax_value = S64_MAX;
6853
+
6854
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
6855
+ dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
6856
+ else
6857
+ dst_reg->smin_value = S64_MIN;
6858
+
6859
+ /* If we might shift our top bit out, then we know nothing */
6860
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
6861
+ dst_reg->umin_value = 0;
6862
+ dst_reg->umax_value = U64_MAX;
6863
+ } else {
6864
+ dst_reg->umin_value <<= umin_val;
6865
+ dst_reg->umax_value <<= umax_val;
6866
+ }
6867
+}
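The "<<32 then s>>32" pattern that the special case above targets is the standard way compilers sign-extend a 32-bit subregister into 64 bits. A userspace sketch of the same arithmetic (arithmetic right shift of a negative signed value is implementation-defined in ISO C but is what the kernel's supported compilers produce):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t reg = 0x00000000fffffff6ULL;   /* lower 32 bits hold -10 */

        /* BPF_LSH by 32 followed by BPF_ARSH by 32 sign-extends the subreg. */
        int64_t extended = (int64_t)(reg << 32) >> 32;

        printf("%lld\n", (long long)extended);  /* prints -10 */
        return 0;
}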
6868
+
6869
+static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
6870
+ struct bpf_reg_state *src_reg)
6871
+{
6872
+ u64 umax_val = src_reg->umax_value;
6873
+ u64 umin_val = src_reg->umin_value;
6874
+
6875
+ /* scalar64 calc uses 32bit unshifted bounds so must be called first */
6876
+ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
6877
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
6878
+
6879
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
6880
+ /* We may learn something more from the var_off */
6881
+ __update_reg_bounds(dst_reg);
6882
+}
6883
+
6884
+static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
6885
+ struct bpf_reg_state *src_reg)
6886
+{
6887
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
6888
+ u32 umax_val = src_reg->u32_max_value;
6889
+ u32 umin_val = src_reg->u32_min_value;
6890
+
6891
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
6892
+ * be negative, then either:
6893
+ * 1) src_reg might be zero, so the sign bit of the result is
6894
+ * unknown, so we lose our signed bounds
6895
+ * 2) it's known negative, thus the unsigned bounds capture the
6896
+ * signed bounds
6897
+ * 3) the signed bounds cross zero, so they tell us nothing
6898
+ * about the result
6899
+ * If the value in dst_reg is known nonnegative, then again the
6900
+ * unsigned bounds capture the signed bounds.
6901
+ * Thus, in all cases it suffices to blow away our signed bounds
6902
+ * and rely on inferring new ones from the unsigned bounds and
6903
+ * var_off of the result.
6904
+ */
6905
+ dst_reg->s32_min_value = S32_MIN;
6906
+ dst_reg->s32_max_value = S32_MAX;
6907
+
6908
+ dst_reg->var_off = tnum_rshift(subreg, umin_val);
6909
+ dst_reg->u32_min_value >>= umax_val;
6910
+ dst_reg->u32_max_value >>= umin_val;
6911
+
6912
+ __mark_reg64_unbounded(dst_reg);
6913
+ __update_reg32_bounds(dst_reg);
6914
+}
6915
+
6916
+static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
6917
+ struct bpf_reg_state *src_reg)
6918
+{
6919
+ u64 umax_val = src_reg->umax_value;
6920
+ u64 umin_val = src_reg->umin_value;
6921
+
6922
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
6923
+ * be negative, then either:
6924
+ * 1) src_reg might be zero, so the sign bit of the result is
6925
+ * unknown, so we lose our signed bounds
6926
+ * 2) it's known negative, thus the unsigned bounds capture the
6927
+ * signed bounds
6928
+ * 3) the signed bounds cross zero, so they tell us nothing
6929
+ * about the result
6930
+ * If the value in dst_reg is known nonnegative, then again the
6931
+ * unsigned bounds capture the signed bounds.
6932
+ * Thus, in all cases it suffices to blow away our signed bounds
6933
+ * and rely on inferring new ones from the unsigned bounds and
6934
+ * var_off of the result.
6935
+ */
6936
+ dst_reg->smin_value = S64_MIN;
6937
+ dst_reg->smax_value = S64_MAX;
6938
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
6939
+ dst_reg->umin_value >>= umax_val;
6940
+ dst_reg->umax_value >>= umin_val;
6941
+
6942
+ /* It's not easy to operate on alu32 bounds here because it depends
6943
+ * on bits being shifted in. Take easy way out and mark unbounded
6944
+ * so we can recalculate later from tnum.
6945
+ */
6946
+ __mark_reg32_unbounded(dst_reg);
6947
+ __update_reg_bounds(dst_reg);
6948
+}
6949
+
6950
+static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
6951
+ struct bpf_reg_state *src_reg)
6952
+{
6953
+ u64 umin_val = src_reg->u32_min_value;
6954
+
6955
+ /* Upon reaching here, src_known is true and
6956
+ * umax_val is equal to umin_val.
6957
+ */
6958
+ dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
6959
+ dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
6960
+
6961
+ dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
6962
+
6963
+ /* blow away the dst_reg umin_value/umax_value and rely on
6964
+ * dst_reg var_off to refine the result.
6965
+ */
6966
+ dst_reg->u32_min_value = 0;
6967
+ dst_reg->u32_max_value = U32_MAX;
6968
+
6969
+ __mark_reg64_unbounded(dst_reg);
6970
+ __update_reg32_bounds(dst_reg);
6971
+}
6972
+
6973
+static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
6974
+ struct bpf_reg_state *src_reg)
6975
+{
6976
+ u64 umin_val = src_reg->umin_value;
6977
+
6978
+ /* Upon reaching here, src_known is true and umax_val is equal
6979
+ * to umin_val.
6980
+ */
6981
+ dst_reg->smin_value >>= umin_val;
6982
+ dst_reg->smax_value >>= umin_val;
6983
+
6984
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
6985
+
6986
+ /* blow away the dst_reg umin_value/umax_value and rely on
6987
+ * dst_reg var_off to refine the result.
6988
+ */
6989
+ dst_reg->umin_value = 0;
6990
+ dst_reg->umax_value = U64_MAX;
6991
+
6992
+ /* It's not easy to operate on alu32 bounds here because it depends
6993
+ * on bits being shifted in from upper 32-bits. Take easy way out
6994
+ * and mark unbounded so we can recalculate later from tnum.
6995
+ */
6996
+ __mark_reg32_unbounded(dst_reg);
6997
+ __update_reg_bounds(dst_reg);
32216998 }
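The distinction the RSH and ARSH helpers above preserve is simply logical versus arithmetic right shift. A userspace sketch (again relying on the usual arithmetic-shift behaviour of the kernel's compilers for signed values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t  s = -16;
        uint64_t u = (uint64_t)s;

        /* BPF_ARSH keeps the sign, so signed min/max can simply be shifted;
         * BPF_RSH shifts in zero bits, so signed bounds are discarded.
         */
        printf("arsh: %lld\n", (long long)(s >> 2));            /* -4 */
        printf("rsh:  0x%llx\n", (unsigned long long)(u >> 2)); /* 0x3ffffffffffffffc */
        return 0;
}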
32226999
32237000 /* WARNING: This function does calculations on 64-bit values, but the actual
....@@ -3231,40 +7008,52 @@
32317008 {
32327009 struct bpf_reg_state *regs = cur_regs(env);
32337010 u8 opcode = BPF_OP(insn->code);
3234
- bool src_known, dst_known;
7011
+ bool src_known;
32357012 s64 smin_val, smax_val;
32367013 u64 umin_val, umax_val;
7014
+ s32 s32_min_val, s32_max_val;
7015
+ u32 u32_min_val, u32_max_val;
32377016 u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
7017
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
32387018 int ret;
3239
-
3240
- if (insn_bitness == 32) {
3241
- /* Relevant for 32-bit RSH: Information can propagate towards
3242
- * LSB, so it isn't sufficient to only truncate the output to
3243
- * 32 bits.
3244
- */
3245
- coerce_reg_to_size(dst_reg, 4);
3246
- coerce_reg_to_size(&src_reg, 4);
3247
- }
32487019
32497020 smin_val = src_reg.smin_value;
32507021 smax_val = src_reg.smax_value;
32517022 umin_val = src_reg.umin_value;
32527023 umax_val = src_reg.umax_value;
3253
- src_known = tnum_is_const(src_reg.var_off);
3254
- dst_known = tnum_is_const(dst_reg->var_off);
32557024
3256
- if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
3257
- smin_val > smax_val || umin_val > umax_val) {
3258
- /* Taint dst register if offset had invalid bounds derived from
3259
- * e.g. dead branches.
3260
- */
3261
- __mark_reg_unknown(dst_reg);
3262
- return 0;
7025
+ s32_min_val = src_reg.s32_min_value;
7026
+ s32_max_val = src_reg.s32_max_value;
7027
+ u32_min_val = src_reg.u32_min_value;
7028
+ u32_max_val = src_reg.u32_max_value;
7029
+
7030
+ if (alu32) {
7031
+ src_known = tnum_subreg_is_const(src_reg.var_off);
7032
+ if ((src_known &&
7033
+ (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
7034
+ s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
7035
+ /* Taint dst register if offset had invalid bounds
7036
+ * derived from e.g. dead branches.
7037
+ */
7038
+ __mark_reg_unknown(env, dst_reg);
7039
+ return 0;
7040
+ }
7041
+ } else {
7042
+ src_known = tnum_is_const(src_reg.var_off);
7043
+ if ((src_known &&
7044
+ (smin_val != smax_val || umin_val != umax_val)) ||
7045
+ smin_val > smax_val || umin_val > umax_val) {
7046
+ /* Taint dst register if offset had invalid bounds
7047
+ * derived from e.g. dead branches.
7048
+ */
7049
+ __mark_reg_unknown(env, dst_reg);
7050
+ return 0;
7051
+ }
32637052 }
32647053
32657054 if (!src_known &&
32667055 opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
3267
- __mark_reg_unknown(dst_reg);
7056
+ __mark_reg_unknown(env, dst_reg);
32687057 return 0;
32697058 }
32707059
....@@ -3274,132 +7063,50 @@
32747063 return sanitize_err(env, insn, ret, NULL, NULL);
32757064 }
32767065
7066
+ /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
7067
+ * There are two classes of instructions: The first class we track both
7068
+ * alu32 and alu64 sign/unsigned bounds independently; this provides the
7069
+ * greatest amount of precision when alu operations are mixed with jmp32
7070
+ * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_AND,
7071
+ * and BPF_OR. This is possible because these ops have fairly easy to
7072
+ * understand and calculate behavior in both 32-bit and 64-bit alu ops.
7073
+ * See alu32 verifier tests for examples. The second class of
7074
+ * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however, is not so easy
7075
+ * with regards to tracking sign/unsigned bounds because the bits may
7076
+ * cross subreg boundaries in the alu64 case. When this happens we mark
7077
+ * the reg unbounded in the subreg bound space and use the resulting
7078
+ * tnum to calculate an approximation of the sign/unsigned bounds.
7079
+ */
32777080 switch (opcode) {
32787081 case BPF_ADD:
3279
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
3280
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
3281
- dst_reg->smin_value = S64_MIN;
3282
- dst_reg->smax_value = S64_MAX;
3283
- } else {
3284
- dst_reg->smin_value += smin_val;
3285
- dst_reg->smax_value += smax_val;
3286
- }
3287
- if (dst_reg->umin_value + umin_val < umin_val ||
3288
- dst_reg->umax_value + umax_val < umax_val) {
3289
- dst_reg->umin_value = 0;
3290
- dst_reg->umax_value = U64_MAX;
3291
- } else {
3292
- dst_reg->umin_value += umin_val;
3293
- dst_reg->umax_value += umax_val;
3294
- }
7082
+ scalar32_min_max_add(dst_reg, &src_reg);
7083
+ scalar_min_max_add(dst_reg, &src_reg);
32957084 dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
32967085 break;
32977086 case BPF_SUB:
3298
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
3299
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
3300
- /* Overflow possible, we know nothing */
3301
- dst_reg->smin_value = S64_MIN;
3302
- dst_reg->smax_value = S64_MAX;
3303
- } else {
3304
- dst_reg->smin_value -= smax_val;
3305
- dst_reg->smax_value -= smin_val;
3306
- }
3307
- if (dst_reg->umin_value < umax_val) {
3308
- /* Overflow possible, we know nothing */
3309
- dst_reg->umin_value = 0;
3310
- dst_reg->umax_value = U64_MAX;
3311
- } else {
3312
- /* Cannot overflow (as long as bounds are consistent) */
3313
- dst_reg->umin_value -= umax_val;
3314
- dst_reg->umax_value -= umin_val;
3315
- }
7087
+ scalar32_min_max_sub(dst_reg, &src_reg);
7088
+ scalar_min_max_sub(dst_reg, &src_reg);
33167089 dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
33177090 break;
33187091 case BPF_MUL:
33197092 dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
3320
- if (smin_val < 0 || dst_reg->smin_value < 0) {
3321
- /* Ain't nobody got time to multiply that sign */
3322
- __mark_reg_unbounded(dst_reg);
3323
- __update_reg_bounds(dst_reg);
3324
- break;
3325
- }
3326
- /* Both values are positive, so we can work with unsigned and
3327
- * copy the result to signed (unless it exceeds S64_MAX).
3328
- */
3329
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
3330
- /* Potential overflow, we know nothing */
3331
- __mark_reg_unbounded(dst_reg);
3332
- /* (except what we can learn from the var_off) */
3333
- __update_reg_bounds(dst_reg);
3334
- break;
3335
- }
3336
- dst_reg->umin_value *= umin_val;
3337
- dst_reg->umax_value *= umax_val;
3338
- if (dst_reg->umax_value > S64_MAX) {
3339
- /* Overflow possible, we know nothing */
3340
- dst_reg->smin_value = S64_MIN;
3341
- dst_reg->smax_value = S64_MAX;
3342
- } else {
3343
- dst_reg->smin_value = dst_reg->umin_value;
3344
- dst_reg->smax_value = dst_reg->umax_value;
3345
- }
7093
+ scalar32_min_max_mul(dst_reg, &src_reg);
7094
+ scalar_min_max_mul(dst_reg, &src_reg);
33467095 break;
33477096 case BPF_AND:
3348
- if (src_known && dst_known) {
3349
- __mark_reg_known(dst_reg, dst_reg->var_off.value &
3350
- src_reg.var_off.value);
3351
- break;
3352
- }
3353
- /* We get our minimum from the var_off, since that's inherently
3354
- * bitwise. Our maximum is the minimum of the operands' maxima.
3355
- */
33567097 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
3357
- dst_reg->umin_value = dst_reg->var_off.value;
3358
- dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
3359
- if (dst_reg->smin_value < 0 || smin_val < 0) {
3360
- /* Lose signed bounds when ANDing negative numbers,
3361
- * ain't nobody got time for that.
3362
- */
3363
- dst_reg->smin_value = S64_MIN;
3364
- dst_reg->smax_value = S64_MAX;
3365
- } else {
3366
- /* ANDing two positives gives a positive, so safe to
3367
- * cast result into s64.
3368
- */
3369
- dst_reg->smin_value = dst_reg->umin_value;
3370
- dst_reg->smax_value = dst_reg->umax_value;
3371
- }
3372
- /* We may learn something more from the var_off */
3373
- __update_reg_bounds(dst_reg);
7098
+ scalar32_min_max_and(dst_reg, &src_reg);
7099
+ scalar_min_max_and(dst_reg, &src_reg);
33747100 break;
33757101 case BPF_OR:
3376
- if (src_known && dst_known) {
3377
- __mark_reg_known(dst_reg, dst_reg->var_off.value |
3378
- src_reg.var_off.value);
3379
- break;
3380
- }
3381
- /* We get our maximum from the var_off, and our minimum is the
3382
- * maximum of the operands' minima
3383
- */
33847102 dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
3385
- dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
3386
- dst_reg->umax_value = dst_reg->var_off.value |
3387
- dst_reg->var_off.mask;
3388
- if (dst_reg->smin_value < 0 || smin_val < 0) {
3389
- /* Lose signed bounds when ORing negative numbers,
3390
- * ain't nobody got time for that.
3391
- */
3392
- dst_reg->smin_value = S64_MIN;
3393
- dst_reg->smax_value = S64_MAX;
3394
- } else {
3395
- /* ORing two positives gives a positive, so safe to
3396
- * cast result into s64.
3397
- */
3398
- dst_reg->smin_value = dst_reg->umin_value;
3399
- dst_reg->smax_value = dst_reg->umax_value;
3400
- }
3401
- /* We may learn something more from the var_off */
3402
- __update_reg_bounds(dst_reg);
7103
+ scalar32_min_max_or(dst_reg, &src_reg);
7104
+ scalar_min_max_or(dst_reg, &src_reg);
7105
+ break;
7106
+ case BPF_XOR:
7107
+ dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
7108
+ scalar32_min_max_xor(dst_reg, &src_reg);
7109
+ scalar_min_max_xor(dst_reg, &src_reg);
34037110 break;
34047111 case BPF_LSH:
34057112 if (umax_val >= insn_bitness) {
....@@ -3409,22 +7116,10 @@
34097116 mark_reg_unknown(env, regs, insn->dst_reg);
34107117 break;
34117118 }
3412
- /* We lose all sign bit information (except what we can pick
3413
- * up from var_off)
3414
- */
3415
- dst_reg->smin_value = S64_MIN;
3416
- dst_reg->smax_value = S64_MAX;
3417
- /* If we might shift our top bit out, then we know nothing */
3418
- if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
3419
- dst_reg->umin_value = 0;
3420
- dst_reg->umax_value = U64_MAX;
3421
- } else {
3422
- dst_reg->umin_value <<= umin_val;
3423
- dst_reg->umax_value <<= umax_val;
3424
- }
3425
- dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
3426
- /* We may learn something more from the var_off */
3427
- __update_reg_bounds(dst_reg);
7119
+ if (alu32)
7120
+ scalar32_min_max_lsh(dst_reg, &src_reg);
7121
+ else
7122
+ scalar_min_max_lsh(dst_reg, &src_reg);
34287123 break;
34297124 case BPF_RSH:
34307125 if (umax_val >= insn_bitness) {
....@@ -3434,27 +7129,10 @@
34347129 mark_reg_unknown(env, regs, insn->dst_reg);
34357130 break;
34367131 }
3437
- /* BPF_RSH is an unsigned shift. If the value in dst_reg might
3438
- * be negative, then either:
3439
- * 1) src_reg might be zero, so the sign bit of the result is
3440
- * unknown, so we lose our signed bounds
3441
- * 2) it's known negative, thus the unsigned bounds capture the
3442
- * signed bounds
3443
- * 3) the signed bounds cross zero, so they tell us nothing
3444
- * about the result
3445
- * If the value in dst_reg is known nonnegative, then again the
3446
-		 * unsigned bounds capture the signed bounds.
3447
- * Thus, in all cases it suffices to blow away our signed bounds
3448
- * and rely on inferring new ones from the unsigned bounds and
3449
- * var_off of the result.
3450
- */
3451
- dst_reg->smin_value = S64_MIN;
3452
- dst_reg->smax_value = S64_MAX;
3453
- dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
3454
- dst_reg->umin_value >>= umax_val;
3455
- dst_reg->umax_value >>= umin_val;
3456
- /* We may learn something more from the var_off */
3457
- __update_reg_bounds(dst_reg);
7132
+ if (alu32)
7133
+ scalar32_min_max_rsh(dst_reg, &src_reg);
7134
+ else
7135
+ scalar_min_max_rsh(dst_reg, &src_reg);
34587136 break;
34597137 case BPF_ARSH:
34607138 if (umax_val >= insn_bitness) {
....@@ -3464,40 +7142,20 @@
34647142 mark_reg_unknown(env, regs, insn->dst_reg);
34657143 break;
34667144 }
3467
-
3468
- /* Upon reaching here, src_known is true and
3469
- * umax_val is equal to umin_val.
3470
- */
3471
- if (insn_bitness == 32) {
3472
- dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
3473
- dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
3474
- } else {
3475
- dst_reg->smin_value >>= umin_val;
3476
- dst_reg->smax_value >>= umin_val;
3477
- }
3478
-
3479
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
3480
- insn_bitness);
3481
-
3482
- /* blow away the dst_reg umin_value/umax_value and rely on
3483
- * dst_reg var_off to refine the result.
3484
- */
3485
- dst_reg->umin_value = 0;
3486
- dst_reg->umax_value = U64_MAX;
3487
- __update_reg_bounds(dst_reg);
7145
+ if (alu32)
7146
+ scalar32_min_max_arsh(dst_reg, &src_reg);
7147
+ else
7148
+ scalar_min_max_arsh(dst_reg, &src_reg);
34887149 break;
34897150 default:
34907151 mark_reg_unknown(env, regs, insn->dst_reg);
34917152 break;
34927153 }
34937154
3494
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
3495
- /* 32-bit ALU ops are (32,32)->32 */
3496
- coerce_reg_to_size(dst_reg, 4);
3497
- }
3498
-
3499
- __reg_deduce_bounds(dst_reg);
3500
- __reg_bound_offset(dst_reg);
7155
+ /* ALU32 ops are zero extended into 64bit register */
7156
+ if (alu32)
7157
+ zext_32_to_64(dst_reg);
7158
+ reg_bounds_sync(dst_reg);
35017159 return 0;
35027160 }
35037161
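
The open-coded BPF_AND/BPF_OR bounds math removed above is replaced by the scalar{32,}_min_max_*() helpers, which all lean on the tnum (tracked number) encoding of known and unknown bits. The standalone sketch below is a simplified model of that encoding: the struct and helper names are local stand-ins, the and/or combination rules are intended to mirror kernel/bpf/tnum.c, and the derived [umin, umax] range follows the same value / value|mask reasoning the verifier uses.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of struct tnum: value holds bits known to be 1,
 * mask holds bits whose value is unknown.
 */
struct tn { uint64_t value, mask; };

static struct tn tn_and(struct tn a, struct tn b)
{
	uint64_t alpha = a.value | a.mask;	/* bits that may be 1 in a */
	uint64_t beta  = b.value | b.mask;	/* bits that may be 1 in b */
	uint64_t v     = a.value & b.value;	/* bits known 1 in both */

	return (struct tn){ v, alpha & beta & ~v };
}

static struct tn tn_or(struct tn a, struct tn b)
{
	uint64_t v  = a.value | b.value;	/* known 1 in either */
	uint64_t mu = a.mask | b.mask;		/* unknown in either */

	return (struct tn){ v, mu & ~v };
}

int main(void)
{
	/* r1 has an unknown low nibble, r2 is the constant 0x0c */
	struct tn r1 = { 0x0, 0xf }, r2 = { 0xc, 0x0 };
	struct tn res_and = tn_and(r1, r2), res_or = tn_or(r1, r2);

	/* umin comes from the known bits, umax from known | unknown */
	printf("AND -> value %#llx mask %#llx, range [%llu, %llu]\n",
	       (unsigned long long)res_and.value,
	       (unsigned long long)res_and.mask,
	       (unsigned long long)res_and.value,
	       (unsigned long long)(res_and.value | res_and.mask));
	printf("OR  -> value %#llx mask %#llx, range [%llu, %llu]\n",
	       (unsigned long long)res_or.value,
	       (unsigned long long)res_or.mask,
	       (unsigned long long)res_or.value,
	       (unsigned long long)(res_or.value | res_or.mask));
	return 0;
}
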
....@@ -3512,11 +7170,17 @@
35127170 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
35137171 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
35147172 u8 opcode = BPF_OP(insn->code);
7173
+ int err;
35157174
35167175 dst_reg = &regs[insn->dst_reg];
35177176 src_reg = NULL;
35187177 if (dst_reg->type != SCALAR_VALUE)
35197178 ptr_reg = dst_reg;
7179
+ else
7180
+ /* Make sure ID is cleared otherwise dst_reg min/max could be
7181
+ * incorrectly propagated into other registers by find_equal_scalars()
7182
+ */
7183
+ dst_reg->id = 0;
35207184 if (BPF_SRC(insn->code) == BPF_X) {
35217185 src_reg = &regs[insn->src_reg];
35227186 if (src_reg->type != SCALAR_VALUE) {
....@@ -3538,13 +7202,24 @@
35387202 * This is legal, but we have to reverse our
35397203 * src/dest handling in computing the range
35407204 */
7205
+ err = mark_chain_precision(env, insn->dst_reg);
7206
+ if (err)
7207
+ return err;
35417208 return adjust_ptr_min_max_vals(env, insn,
35427209 src_reg, dst_reg);
35437210 }
35447211 } else if (ptr_reg) {
35457212 /* pointer += scalar */
7213
+ err = mark_chain_precision(env, insn->src_reg);
7214
+ if (err)
7215
+ return err;
35467216 return adjust_ptr_min_max_vals(env, insn,
35477217 dst_reg, src_reg);
7218
+ } else if (dst_reg->precise) {
7219
+ /* if dst_reg is precise, src_reg should be precise as well */
7220
+ err = mark_chain_precision(env, insn->src_reg);
7221
+ if (err)
7222
+ return err;
35487223 }
35497224 } else {
35507225 /* Pretend the src is a reg with a known value, since we only
....@@ -3644,8 +7319,15 @@
36447319 /* case: R1 = R2
36457320 * copy register state to dest reg
36467321 */
3647
- *dst_reg = *src_reg;
7322
+ if (src_reg->type == SCALAR_VALUE && !src_reg->id)
7323
+ /* Assign src and dst registers the same ID
7324
+ * that will be used by find_equal_scalars()
7325
+ * to propagate min/max range.
7326
+ */
7327
+ src_reg->id = ++env->id_gen;
7328
+ copy_register_state(dst_reg, src_reg);
36487329 dst_reg->live |= REG_LIVE_WRITTEN;
7330
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
36497331 } else {
36507332 /* R1 = (u32) R2 */
36517333 if (is_pointer_value(env, insn->src_reg)) {
....@@ -3654,13 +7336,20 @@
36547336 insn->src_reg);
36557337 return -EACCES;
36567338 } else if (src_reg->type == SCALAR_VALUE) {
3657
- *dst_reg = *src_reg;
7339
+ copy_register_state(dst_reg, src_reg);
7340
+ /* Make sure ID is cleared otherwise
7341
+ * dst_reg min/max could be incorrectly
7342
+ * propagated into src_reg by find_equal_scalars()
7343
+ */
7344
+ dst_reg->id = 0;
36587345 dst_reg->live |= REG_LIVE_WRITTEN;
7346
+ dst_reg->subreg_def = env->insn_idx + 1;
36597347 } else {
36607348 mark_reg_unknown(env, regs,
36617349 insn->dst_reg);
36627350 }
3663
- coerce_reg_to_size(dst_reg, 4);
7351
+ zext_32_to_64(dst_reg);
7352
+ reg_bounds_sync(dst_reg);
36647353 }
36657354 } else {
36667355 /* case: R = imm
....@@ -3711,11 +7400,6 @@
37117400 return -EINVAL;
37127401 }
37137402
3714
- if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
3715
- verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
3716
- return -EINVAL;
3717
- }
3718
-
37197403 if ((opcode == BPF_LSH || opcode == BPF_RSH ||
37207404 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
37217405 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
....@@ -3742,10 +7426,9 @@
37427426 enum bpf_reg_type type,
37437427 bool range_right_open)
37447428 {
3745
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
3746
- struct bpf_reg_state *regs = state->regs, *reg;
3747
- u16 new_range;
3748
- int i, j;
7429
+ struct bpf_func_state *state;
7430
+ struct bpf_reg_state *reg;
7431
+ int new_range;
37497432
37507433 if (dst_reg->off < 0 ||
37517434 (dst_reg->off == 0 && range_right_open))
....@@ -3810,33 +7493,90 @@
38107493 * the range won't allow anything.
38117494 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
38127495 */
3813
- for (i = 0; i < MAX_BPF_REG; i++)
3814
- if (regs[i].type == type && regs[i].id == dst_reg->id)
7496
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
7497
+ if (reg->type == type && reg->id == dst_reg->id)
38157498 /* keep the maximum range already checked */
3816
- regs[i].range = max(regs[i].range, new_range);
3817
-
3818
- for (j = 0; j <= vstate->curframe; j++) {
3819
- state = vstate->frame[j];
3820
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
3821
- if (state->stack[i].slot_type[0] != STACK_SPILL)
3822
- continue;
3823
- reg = &state->stack[i].spilled_ptr;
3824
- if (reg->type == type && reg->id == dst_reg->id)
3825
- reg->range = max(reg->range, new_range);
3826
- }
3827
- }
7499
+ reg->range = max(reg->range, new_range);
7500
+ }));
38287501 }
38297502
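
find_good_pkt_pointers() is what rewards the usual packet bounds check: once one copy of a packet pointer has been compared against pkt_end, every register and spilled slot sharing the same id gets its ->range widened. The XDP snippet below shows the program-side idiom this serves; it assumes a stock libbpf/clang BPF build (bpf_helpers.h, bpf_endian.h) and is a generic illustration, not code from this change.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int drop_non_ip(struct xdp_md *ctx)
{
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* This comparison is what lets the verifier widen the range of
	 * every copy of the packet pointer (find_good_pkt_pointers()),
	 * so the eth->h_proto load below is provably in bounds.
	 */
	if ((void *)(eth + 1) > data_end)
		return XDP_PASS;

	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
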
3830
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
3831
- * and return:
3832
- * 1 - branch will be taken and "goto target" will be executed
3833
- * 0 - branch will not be taken and fall-through to next insn
3834
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
3835
- */
3836
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
7503
+static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
38377504 {
3838
- if (__is_pointer_value(false, reg))
3839
- return -1;
7505
+ struct tnum subreg = tnum_subreg(reg->var_off);
7506
+ s32 sval = (s32)val;
7507
+
7508
+ switch (opcode) {
7509
+ case BPF_JEQ:
7510
+ if (tnum_is_const(subreg))
7511
+ return !!tnum_equals_const(subreg, val);
7512
+ break;
7513
+ case BPF_JNE:
7514
+ if (tnum_is_const(subreg))
7515
+ return !tnum_equals_const(subreg, val);
7516
+ break;
7517
+ case BPF_JSET:
7518
+ if ((~subreg.mask & subreg.value) & val)
7519
+ return 1;
7520
+ if (!((subreg.mask | subreg.value) & val))
7521
+ return 0;
7522
+ break;
7523
+ case BPF_JGT:
7524
+ if (reg->u32_min_value > val)
7525
+ return 1;
7526
+ else if (reg->u32_max_value <= val)
7527
+ return 0;
7528
+ break;
7529
+ case BPF_JSGT:
7530
+ if (reg->s32_min_value > sval)
7531
+ return 1;
7532
+ else if (reg->s32_max_value <= sval)
7533
+ return 0;
7534
+ break;
7535
+ case BPF_JLT:
7536
+ if (reg->u32_max_value < val)
7537
+ return 1;
7538
+ else if (reg->u32_min_value >= val)
7539
+ return 0;
7540
+ break;
7541
+ case BPF_JSLT:
7542
+ if (reg->s32_max_value < sval)
7543
+ return 1;
7544
+ else if (reg->s32_min_value >= sval)
7545
+ return 0;
7546
+ break;
7547
+ case BPF_JGE:
7548
+ if (reg->u32_min_value >= val)
7549
+ return 1;
7550
+ else if (reg->u32_max_value < val)
7551
+ return 0;
7552
+ break;
7553
+ case BPF_JSGE:
7554
+ if (reg->s32_min_value >= sval)
7555
+ return 1;
7556
+ else if (reg->s32_max_value < sval)
7557
+ return 0;
7558
+ break;
7559
+ case BPF_JLE:
7560
+ if (reg->u32_max_value <= val)
7561
+ return 1;
7562
+ else if (reg->u32_min_value > val)
7563
+ return 0;
7564
+ break;
7565
+ case BPF_JSLE:
7566
+ if (reg->s32_max_value <= sval)
7567
+ return 1;
7568
+ else if (reg->s32_min_value > sval)
7569
+ return 0;
7570
+ break;
7571
+ }
7572
+
7573
+ return -1;
7574
+}
7575
+
7576
+
7577
+static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
7578
+{
7579
+ s64 sval = (s64)val;
38407580
38417581 switch (opcode) {
38427582 case BPF_JEQ:
....@@ -3847,6 +7587,12 @@
38477587 if (tnum_is_const(reg->var_off))
38487588 return !tnum_equals_const(reg->var_off, val);
38497589 break;
7590
+ case BPF_JSET:
7591
+ if ((~reg->var_off.mask & reg->var_off.value) & val)
7592
+ return 1;
7593
+ if (!((reg->var_off.mask | reg->var_off.value) & val))
7594
+ return 0;
7595
+ break;
38507596 case BPF_JGT:
38517597 if (reg->umin_value > val)
38527598 return 1;
....@@ -3854,9 +7600,9 @@
38547600 return 0;
38557601 break;
38567602 case BPF_JSGT:
3857
- if (reg->smin_value > (s64)val)
7603
+ if (reg->smin_value > sval)
38587604 return 1;
3859
- else if (reg->smax_value < (s64)val)
7605
+ else if (reg->smax_value <= sval)
38607606 return 0;
38617607 break;
38627608 case BPF_JLT:
....@@ -3866,9 +7612,9 @@
38667612 return 0;
38677613 break;
38687614 case BPF_JSLT:
3869
- if (reg->smax_value < (s64)val)
7615
+ if (reg->smax_value < sval)
38707616 return 1;
3871
- else if (reg->smin_value >= (s64)val)
7617
+ else if (reg->smin_value >= sval)
38727618 return 0;
38737619 break;
38747620 case BPF_JGE:
....@@ -3878,9 +7624,9 @@
38787624 return 0;
38797625 break;
38807626 case BPF_JSGE:
3881
- if (reg->smin_value >= (s64)val)
7627
+ if (reg->smin_value >= sval)
38827628 return 1;
3883
- else if (reg->smax_value < (s64)val)
7629
+ else if (reg->smax_value < sval)
38847630 return 0;
38857631 break;
38867632 case BPF_JLE:
....@@ -3890,13 +7636,109 @@
38907636 return 0;
38917637 break;
38927638 case BPF_JSLE:
3893
- if (reg->smax_value <= (s64)val)
7639
+ if (reg->smax_value <= sval)
38947640 return 1;
3895
- else if (reg->smin_value > (s64)val)
7641
+ else if (reg->smin_value > sval)
38967642 return 0;
38977643 break;
38987644 }
38997645
7646
+ return -1;
7647
+}
7648
+
7649
+/* compute branch direction of the expression "if (reg opcode val) goto target;"
7650
+ * and return:
7651
+ * 1 - branch will be taken and "goto target" will be executed
7652
+ * 0 - branch will not be taken and fall-through to next insn
7653
+ * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
7654
+ * range [0,10]
7655
+ */
7656
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
7657
+ bool is_jmp32)
7658
+{
7659
+ if (__is_pointer_value(false, reg)) {
7660
+ if (!reg_type_not_null(reg->type))
7661
+ return -1;
7662
+
7663
+		/* If the pointer is valid, tests against zero will fail, so
7664
+		 * we can use this to decide the branch direction.
7665
+ */
7666
+ if (val != 0)
7667
+ return -1;
7668
+
7669
+ switch (opcode) {
7670
+ case BPF_JEQ:
7671
+ return 0;
7672
+ case BPF_JNE:
7673
+ return 1;
7674
+ default:
7675
+ return -1;
7676
+ }
7677
+ }
7678
+
7679
+ if (is_jmp32)
7680
+ return is_branch32_taken(reg, val, opcode);
7681
+ return is_branch64_taken(reg, val, opcode);
7682
+}
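
is_branch_taken() is now split into 32- and 64-bit variants, but both answer the same question: if the tracked value window of dst_reg lies entirely on one side of the constant, the branch outcome is known and the dead path never has to be explored. A stripped-down userspace model of the unsigned BPF_JGT case, using the same 1 / 0 / -1 convention, is sketched below; the struct is a local stand-in, not the verifier's bpf_reg_state.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t umin, umax; };	/* stand-in for register bounds */

/* 1: branch always taken, 0: never taken, -1: unknown (explore both) */
static int jgt_taken(struct range r, uint64_t val)
{
	if (r.umin > val)
		return 1;
	if (r.umax <= val)
		return 0;
	return -1;
}

int main(void)
{
	struct range r = { 3, 10 };	/* verifier knows r is in [3, 10] */

	printf("if (r > 2)  -> %d\n", jgt_taken(r, 2));		/* 1 */
	printf("if (r > 10) -> %d\n", jgt_taken(r, 10));	/* 0 */
	printf("if (r > 5)  -> %d\n", jgt_taken(r, 5));		/* -1 */
	return 0;
}
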
7683
+
7684
+static int flip_opcode(u32 opcode)
7685
+{
7686
+ /* How can we transform "a <op> b" into "b <op> a"? */
7687
+ static const u8 opcode_flip[16] = {
7688
+ /* these stay the same */
7689
+ [BPF_JEQ >> 4] = BPF_JEQ,
7690
+ [BPF_JNE >> 4] = BPF_JNE,
7691
+ [BPF_JSET >> 4] = BPF_JSET,
7692
+ /* these swap "lesser" and "greater" (L and G in the opcodes) */
7693
+ [BPF_JGE >> 4] = BPF_JLE,
7694
+ [BPF_JGT >> 4] = BPF_JLT,
7695
+ [BPF_JLE >> 4] = BPF_JGE,
7696
+ [BPF_JLT >> 4] = BPF_JGT,
7697
+ [BPF_JSGE >> 4] = BPF_JSLE,
7698
+ [BPF_JSGT >> 4] = BPF_JSLT,
7699
+ [BPF_JSLE >> 4] = BPF_JSGE,
7700
+ [BPF_JSLT >> 4] = BPF_JSGT
7701
+ };
7702
+ return opcode_flip[opcode >> 4];
7703
+}
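
flip_opcode() lets the verifier rewrite "a <op> b" as "b <op'> a" so the dst-side bounds logic can be reused when the constant sits on the left or when the packet-end pointer is the destination. The standalone check below exercises the same table idea; the opcode values are copied from the eBPF instruction encoding only so the sketch is self-contained.

#include <assert.h>
#include <stdio.h>

/* Conditional-jump opcodes (upper nibble of the insn op field),
 * duplicated here to keep the sketch standalone.
 */
enum {
	JEQ = 0x10, JGT = 0x20, JGE = 0x30, JSET = 0x40, JNE = 0x50,
	JSGT = 0x60, JSGE = 0x70, JLT = 0xa0, JLE = 0xb0,
	JSLT = 0xc0, JSLE = 0xd0,
};

static int flip(unsigned int opcode)
{
	/* "a <op> b" becomes "b <flip(op)> a" */
	static const unsigned char opcode_flip[16] = {
		[JEQ >> 4] = JEQ, [JNE >> 4] = JNE, [JSET >> 4] = JSET,
		[JGE >> 4] = JLE, [JGT >> 4] = JLT,
		[JLE >> 4] = JGE, [JLT >> 4] = JGT,
		[JSGE >> 4] = JSLE, [JSGT >> 4] = JSLT,
		[JSLE >> 4] = JSGE, [JSLT >> 4] = JSGT,
	};
	return opcode_flip[opcode >> 4];
}

int main(void)
{
	const unsigned int ops[] = { JEQ, JNE, JSET, JGE, JGT, JLE, JLT,
				     JSGE, JSGT, JSLE, JSLT };

	for (unsigned int i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
		assert(flip(flip(ops[i])) == ops[i]);	/* flip is an involution */
	printf("opcode flip table is symmetric\n");
	return 0;
}
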
7704
+
7705
+static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
7706
+ struct bpf_reg_state *src_reg,
7707
+ u8 opcode)
7708
+{
7709
+ struct bpf_reg_state *pkt;
7710
+
7711
+ if (src_reg->type == PTR_TO_PACKET_END) {
7712
+ pkt = dst_reg;
7713
+ } else if (dst_reg->type == PTR_TO_PACKET_END) {
7714
+ pkt = src_reg;
7715
+ opcode = flip_opcode(opcode);
7716
+ } else {
7717
+ return -1;
7718
+ }
7719
+
7720
+ if (pkt->range >= 0)
7721
+ return -1;
7722
+
7723
+ switch (opcode) {
7724
+ case BPF_JLE:
7725
+ /* pkt <= pkt_end */
7726
+ fallthrough;
7727
+ case BPF_JGT:
7728
+ /* pkt > pkt_end */
7729
+ if (pkt->range == BEYOND_PKT_END)
7730
+			/* pkt has at least one extra byte beyond pkt_end */
7731
+ return opcode == BPF_JGT;
7732
+ break;
7733
+ case BPF_JLT:
7734
+ /* pkt < pkt_end */
7735
+ fallthrough;
7736
+ case BPF_JGE:
7737
+ /* pkt >= pkt_end */
7738
+ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
7739
+ return opcode == BPF_JGE;
7740
+ break;
7741
+ }
39007742 return -1;
39017743 }
39027744
....@@ -3906,9 +7748,17 @@
39067748 * In JEQ/JNE cases we also adjust the var_off values.
39077749 */
39087750 static void reg_set_min_max(struct bpf_reg_state *true_reg,
3909
- struct bpf_reg_state *false_reg, u64 val,
3910
- u8 opcode)
7751
+ struct bpf_reg_state *false_reg,
7752
+ u64 val, u32 val32,
7753
+ u8 opcode, bool is_jmp32)
39117754 {
7755
+ struct tnum false_32off = tnum_subreg(false_reg->var_off);
7756
+ struct tnum false_64off = false_reg->var_off;
7757
+ struct tnum true_32off = tnum_subreg(true_reg->var_off);
7758
+ struct tnum true_64off = true_reg->var_off;
7759
+ s64 sval = (s64)val;
7760
+ s32 sval32 = (s32)val32;
7761
+
39127762 /* If the dst_reg is a pointer, we can't learn anything about its
39137763 * variable offset from the compare (unless src_reg were a pointer into
39147764 * the same object, but we don't bother with that.
....@@ -3919,137 +7769,155 @@
39197769 return;
39207770
39217771 switch (opcode) {
7772
+ /* JEQ/JNE comparison doesn't change the register equivalence.
7773
+ *
7774
+ * r1 = r2;
7775
+ * if (r1 == 42) goto label;
7776
+ * ...
7777
+ * label: // here both r1 and r2 are known to be 42.
7778
+ *
7779
+	 * Hence when marking a register as known, preserve its ID.
7780
+ */
39227781 case BPF_JEQ:
3923
- /* If this is false then we know nothing Jon Snow, but if it is
3924
- * true then we know for sure.
3925
- */
3926
- __mark_reg_known(true_reg, val);
7782
+ if (is_jmp32) {
7783
+ __mark_reg32_known(true_reg, val32);
7784
+ true_32off = tnum_subreg(true_reg->var_off);
7785
+ } else {
7786
+ ___mark_reg_known(true_reg, val);
7787
+ true_64off = true_reg->var_off;
7788
+ }
39277789 break;
39287790 case BPF_JNE:
3929
- /* If this is true we know nothing Jon Snow, but if it is false
3930
- * we know the value for sure;
3931
- */
3932
- __mark_reg_known(false_reg, val);
7791
+ if (is_jmp32) {
7792
+ __mark_reg32_known(false_reg, val32);
7793
+ false_32off = tnum_subreg(false_reg->var_off);
7794
+ } else {
7795
+ ___mark_reg_known(false_reg, val);
7796
+ false_64off = false_reg->var_off;
7797
+ }
39337798 break;
3934
- case BPF_JGT:
3935
- false_reg->umax_value = min(false_reg->umax_value, val);
3936
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
3937
- break;
3938
- case BPF_JSGT:
3939
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
3940
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
3941
- break;
3942
- case BPF_JLT:
3943
- false_reg->umin_value = max(false_reg->umin_value, val);
3944
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
3945
- break;
3946
- case BPF_JSLT:
3947
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
3948
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
7799
+ case BPF_JSET:
7800
+ if (is_jmp32) {
7801
+ false_32off = tnum_and(false_32off, tnum_const(~val32));
7802
+ if (is_power_of_2(val32))
7803
+ true_32off = tnum_or(true_32off,
7804
+ tnum_const(val32));
7805
+ } else {
7806
+ false_64off = tnum_and(false_64off, tnum_const(~val));
7807
+ if (is_power_of_2(val))
7808
+ true_64off = tnum_or(true_64off,
7809
+ tnum_const(val));
7810
+ }
39497811 break;
39507812 case BPF_JGE:
3951
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
3952
- true_reg->umin_value = max(true_reg->umin_value, val);
3953
- break;
3954
- case BPF_JSGE:
3955
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
3956
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
3957
- break;
3958
- case BPF_JLE:
3959
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
3960
- true_reg->umax_value = min(true_reg->umax_value, val);
3961
- break;
3962
- case BPF_JSLE:
3963
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
3964
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
3965
- break;
3966
- default:
7813
+ case BPF_JGT:
7814
+ {
7815
+ if (is_jmp32) {
7816
+ u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1;
7817
+ u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
7818
+
7819
+ false_reg->u32_max_value = min(false_reg->u32_max_value,
7820
+ false_umax);
7821
+ true_reg->u32_min_value = max(true_reg->u32_min_value,
7822
+ true_umin);
7823
+ } else {
7824
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
7825
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
7826
+
7827
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
7828
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
7829
+ }
39677830 break;
39687831 }
7832
+ case BPF_JSGE:
7833
+ case BPF_JSGT:
7834
+ {
7835
+ if (is_jmp32) {
7836
+ s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1;
7837
+ s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
39697838
3970
- __reg_deduce_bounds(false_reg);
3971
- __reg_deduce_bounds(true_reg);
3972
- /* We might have learned some bits from the bounds. */
3973
- __reg_bound_offset(false_reg);
3974
- __reg_bound_offset(true_reg);
3975
- /* Intersecting with the old var_off might have improved our bounds
3976
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
3977
- * then new var_off is (0; 0x7f...fc) which improves our umax.
3978
- */
3979
- __update_reg_bounds(false_reg);
3980
- __update_reg_bounds(true_reg);
7839
+ false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
7840
+ true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
7841
+ } else {
7842
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
7843
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
7844
+
7845
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
7846
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
7847
+ }
7848
+ break;
7849
+ }
7850
+ case BPF_JLE:
7851
+ case BPF_JLT:
7852
+ {
7853
+ if (is_jmp32) {
7854
+ u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1;
7855
+ u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
7856
+
7857
+ false_reg->u32_min_value = max(false_reg->u32_min_value,
7858
+ false_umin);
7859
+ true_reg->u32_max_value = min(true_reg->u32_max_value,
7860
+ true_umax);
7861
+ } else {
7862
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
7863
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
7864
+
7865
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
7866
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
7867
+ }
7868
+ break;
7869
+ }
7870
+ case BPF_JSLE:
7871
+ case BPF_JSLT:
7872
+ {
7873
+ if (is_jmp32) {
7874
+ s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1;
7875
+ s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
7876
+
7877
+ false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
7878
+ true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
7879
+ } else {
7880
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
7881
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
7882
+
7883
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
7884
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
7885
+ }
7886
+ break;
7887
+ }
7888
+ default:
7889
+ return;
7890
+ }
7891
+
7892
+ if (is_jmp32) {
7893
+ false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
7894
+ tnum_subreg(false_32off));
7895
+ true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
7896
+ tnum_subreg(true_32off));
7897
+ __reg_combine_32_into_64(false_reg);
7898
+ __reg_combine_32_into_64(true_reg);
7899
+ } else {
7900
+ false_reg->var_off = false_64off;
7901
+ true_reg->var_off = true_64off;
7902
+ __reg_combine_64_into_32(false_reg);
7903
+ __reg_combine_64_into_32(true_reg);
7904
+ }
39817905 }
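
reg_set_min_max() now carries the 64-bit and 32-bit (subregister) bounds side by side and narrows whichever set the is_jmp32 flag selects, but the narrowing itself is the same interval arithmetic as before. The userspace model below covers just the unsigned JGE/JGT arm; the plain structs and the taken/not-taken split are local simplifications of the true_reg/false_reg pair.

#include <stdint.h>
#include <stdio.h>

struct bounds { uint64_t umin, umax; };

/* Model of the BPF_JGE/BPF_JGT arm: "if (reg >= / > val) goto ..."
 * splits one register state into a taken and a not-taken copy.
 */
static void split_jge_jgt(struct bounds reg, uint64_t val, int is_jgt,
			  struct bounds *taken, struct bounds *not_taken)
{
	uint64_t false_umax = is_jgt ? val : val - 1;
	uint64_t true_umin  = is_jgt ? val + 1 : val;

	*taken = *not_taken = reg;
	if (not_taken->umax > false_umax)
		not_taken->umax = false_umax;
	if (taken->umin < true_umin)
		taken->umin = true_umin;
}

int main(void)
{
	struct bounds reg = { 0, 100 }, t, f;

	split_jge_jgt(reg, 64, /*is_jgt=*/0, &t, &f);
	printf("if (r >= 64): taken [%llu,%llu], fallthrough [%llu,%llu]\n",
	       (unsigned long long)t.umin, (unsigned long long)t.umax,
	       (unsigned long long)f.umin, (unsigned long long)f.umax);
	return 0;
}
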
39827906
39837907 /* Same as above, but for the case that dst_reg holds a constant and src_reg is
39847908 * the variable reg.
39857909 */
39867910 static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
3987
- struct bpf_reg_state *false_reg, u64 val,
3988
- u8 opcode)
7911
+ struct bpf_reg_state *false_reg,
7912
+ u64 val, u32 val32,
7913
+ u8 opcode, bool is_jmp32)
39897914 {
3990
- if (__is_pointer_value(false, false_reg))
3991
- return;
3992
-
3993
- switch (opcode) {
3994
- case BPF_JEQ:
3995
- /* If this is false then we know nothing Jon Snow, but if it is
3996
- * true then we know for sure.
3997
- */
3998
- __mark_reg_known(true_reg, val);
3999
- break;
4000
- case BPF_JNE:
4001
- /* If this is true we know nothing Jon Snow, but if it is false
4002
- * we know the value for sure;
4003
- */
4004
- __mark_reg_known(false_reg, val);
4005
- break;
4006
- case BPF_JGT:
4007
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
4008
- false_reg->umin_value = max(false_reg->umin_value, val);
4009
- break;
4010
- case BPF_JSGT:
4011
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
4012
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
4013
- break;
4014
- case BPF_JLT:
4015
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
4016
- false_reg->umax_value = min(false_reg->umax_value, val);
4017
- break;
4018
- case BPF_JSLT:
4019
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
4020
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
4021
- break;
4022
- case BPF_JGE:
4023
- true_reg->umax_value = min(true_reg->umax_value, val);
4024
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
4025
- break;
4026
- case BPF_JSGE:
4027
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
4028
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
4029
- break;
4030
- case BPF_JLE:
4031
- true_reg->umin_value = max(true_reg->umin_value, val);
4032
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
4033
- break;
4034
- case BPF_JSLE:
4035
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
4036
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
4037
- break;
4038
- default:
4039
- break;
4040
- }
4041
-
4042
- __reg_deduce_bounds(false_reg);
4043
- __reg_deduce_bounds(true_reg);
4044
- /* We might have learned some bits from the bounds. */
4045
- __reg_bound_offset(false_reg);
4046
- __reg_bound_offset(true_reg);
4047
- /* Intersecting with the old var_off might have improved our bounds
4048
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
4049
- * then new var_off is (0; 0x7f...fc) which improves our umax.
7915
+ opcode = flip_opcode(opcode);
7916
+ /* This uses zero as "not present in table"; luckily the zero opcode,
7917
+ * BPF_JA, can't get here.
40507918 */
4051
- __update_reg_bounds(false_reg);
4052
- __update_reg_bounds(true_reg);
7919
+ if (opcode)
7920
+ reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
40537921 }
40547922
40557923 /* Regs are known to be equal, so intersect their min/max/var_off */
....@@ -4066,21 +7934,8 @@
40667934 dst_reg->smax_value);
40677935 src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
40687936 dst_reg->var_off);
4069
- /* We might have learned new bounds from the var_off. */
4070
- __update_reg_bounds(src_reg);
4071
- __update_reg_bounds(dst_reg);
4072
- /* We might have learned something about the sign bit. */
4073
- __reg_deduce_bounds(src_reg);
4074
- __reg_deduce_bounds(dst_reg);
4075
- /* We might have learned some bits from the bounds. */
4076
- __reg_bound_offset(src_reg);
4077
- __reg_bound_offset(dst_reg);
4078
- /* Intersecting with the old var_off might have improved our bounds
4079
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
4080
- * then new var_off is (0; 0x7f...fc) which improves our umax.
4081
- */
4082
- __update_reg_bounds(src_reg);
4083
- __update_reg_bounds(dst_reg);
7937
+ reg_bounds_sync(src_reg);
7938
+ reg_bounds_sync(dst_reg);
40847939 }
40857940
40867941 static void reg_combine_min_max(struct bpf_reg_state *true_src,
....@@ -4099,60 +7954,93 @@
40997954 }
41007955 }
41017956
4102
-static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
4103
- bool is_null)
7957
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
7958
+ struct bpf_reg_state *reg, u32 id,
7959
+ bool is_null)
41047960 {
4105
- struct bpf_reg_state *reg = &regs[regno];
4106
-
4107
- if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
4108
- /* Old offset (both fixed and variable parts) should
4109
- * have been known-zero, because we don't allow pointer
4110
- * arithmetic on pointers that might be NULL.
4111
- */
7961
+ if (reg_type_may_be_null(reg->type) && reg->id == id &&
7962
+ !WARN_ON_ONCE(!reg->id)) {
41127963 if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
41137964 !tnum_equals_const(reg->var_off, 0) ||
41147965 reg->off)) {
4115
- __mark_reg_known_zero(reg);
4116
- reg->off = 0;
7966
+ /* Old offset (both fixed and variable parts) should
7967
+ * have been known-zero, because we don't allow pointer
7968
+ * arithmetic on pointers that might be NULL. If we
7969
+ * see this happening, don't convert the register.
7970
+ */
7971
+ return;
41177972 }
41187973 if (is_null) {
41197974 reg->type = SCALAR_VALUE;
4120
- } else if (reg->map_ptr->inner_map_meta) {
4121
- reg->type = CONST_PTR_TO_MAP;
4122
- reg->map_ptr = reg->map_ptr->inner_map_meta;
4123
- } else {
4124
- reg->type = PTR_TO_MAP_VALUE;
7975
+ } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
7976
+ const struct bpf_map *map = reg->map_ptr;
7977
+
7978
+ if (map->inner_map_meta) {
7979
+ reg->type = CONST_PTR_TO_MAP;
7980
+ reg->map_ptr = map->inner_map_meta;
7981
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
7982
+ reg->type = PTR_TO_XDP_SOCK;
7983
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
7984
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
7985
+ reg->type = PTR_TO_SOCKET;
7986
+ } else {
7987
+ reg->type = PTR_TO_MAP_VALUE;
7988
+ }
7989
+ } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
7990
+ reg->type = PTR_TO_SOCKET;
7991
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
7992
+ reg->type = PTR_TO_SOCK_COMMON;
7993
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
7994
+ reg->type = PTR_TO_TCP_SOCK;
7995
+ } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
7996
+ reg->type = PTR_TO_BTF_ID;
7997
+ } else if (reg->type == PTR_TO_MEM_OR_NULL) {
7998
+ reg->type = PTR_TO_MEM;
7999
+ } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
8000
+ reg->type = PTR_TO_RDONLY_BUF;
8001
+ } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
8002
+ reg->type = PTR_TO_RDWR_BUF;
41258003 }
4126
- /* We don't need id from this point onwards anymore, thus we
4127
- * should better reset it, so that state pruning has chances
4128
- * to take effect.
4129
- */
4130
- reg->id = 0;
8004
+ if (is_null) {
8005
+ /* We don't need id and ref_obj_id from this point
8006
+ * onwards anymore, thus we should better reset it,
8007
+ * so that state pruning has chances to take effect.
8008
+ */
8009
+ reg->id = 0;
8010
+ reg->ref_obj_id = 0;
8011
+ } else if (!reg_may_point_to_spin_lock(reg)) {
8012
+ /* For not-NULL ptr, reg->ref_obj_id will be reset
8013
+ * in release_reference().
8014
+ *
8015
+ * reg->id is still used by spin_lock ptr. Other
8016
+ * than spin_lock ptr type, reg->id can be reset.
8017
+ */
8018
+ reg->id = 0;
8019
+ }
41318020 }
41328021 }
41338022
41348023 /* The logic is similar to find_good_pkt_pointers(), both could eventually
41358024 * be folded together at some point.
41368025 */
4137
-static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
4138
- bool is_null)
8026
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
8027
+ bool is_null)
41398028 {
41408029 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4141
- struct bpf_reg_state *regs = state->regs;
8030
+ struct bpf_reg_state *regs = state->regs, *reg;
8031
+ u32 ref_obj_id = regs[regno].ref_obj_id;
41428032 u32 id = regs[regno].id;
4143
- int i, j;
41448033
4145
- for (i = 0; i < MAX_BPF_REG; i++)
4146
- mark_map_reg(regs, i, id, is_null);
8034
+ if (ref_obj_id && ref_obj_id == id && is_null)
8035
+ /* regs[regno] is in the " == NULL" branch.
8036
+ * No one could have freed the reference state before
8037
+ * doing the NULL check.
8038
+ */
8039
+ WARN_ON_ONCE(release_reference_state(state, id));
41478040
4148
- for (j = 0; j <= vstate->curframe; j++) {
4149
- state = vstate->frame[j];
4150
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
4151
- if (state->stack[i].slot_type[0] != STACK_SPILL)
4152
- continue;
4153
- mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
4154
- }
4155
- }
8041
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
8042
+ mark_ptr_or_null_reg(state, reg, id, is_null);
8043
+ }));
41568044 }
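
mark_ptr_or_null_regs() is what converts the *_OR_NULL pointer flavours into their checked forms once the program branches on NULL, and the map-lookup idiom below is the canonical pattern it serves. The snippet assumes the usual libbpf headers and a BTF-style map definition; it is a generic example, not code taken from this diff.

// SPDX-License-Identifier: GPL-2.0
/* bpf_map_lookup_elem() returns PTR_TO_MAP_VALUE_OR_NULL; only after the
 * "if (!val)" branch does the surviving path see PTR_TO_MAP_VALUE, which
 * is why the dereference below is accepted.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} counters SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int count_execve(void *ctx)
{
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&counters, &key);

	if (!val)			/* NULL branch: pointer becomes a scalar */
		return 0;
	__sync_fetch_and_add(val, 1);	/* non-NULL branch: safe to dereference */
	return 0;
}

char _license[] SEC("license") = "GPL";
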
41578045
41588046 static bool try_match_pkt_pointers(const struct bpf_insn *insn,
....@@ -4164,6 +8052,10 @@
41648052 if (BPF_SRC(insn->code) != BPF_X)
41658053 return false;
41668054
8055
+ /* Pointers are always 64-bit. */
8056
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
8057
+ return false;
8058
+
41678059 switch (BPF_OP(insn->code)) {
41688060 case BPF_JGT:
41698061 if ((dst_reg->type == PTR_TO_PACKET &&
....@@ -4173,6 +8065,7 @@
41738065 /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
41748066 find_good_pkt_pointers(this_branch, dst_reg,
41758067 dst_reg->type, false);
8068
+ mark_pkt_end(other_branch, insn->dst_reg, true);
41768069 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
41778070 src_reg->type == PTR_TO_PACKET) ||
41788071 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4180,6 +8073,7 @@
41808073 /* pkt_end > pkt_data', pkt_data > pkt_meta' */
41818074 find_good_pkt_pointers(other_branch, src_reg,
41828075 src_reg->type, true);
8076
+ mark_pkt_end(this_branch, insn->src_reg, false);
41838077 } else {
41848078 return false;
41858079 }
....@@ -4192,6 +8086,7 @@
41928086 /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
41938087 find_good_pkt_pointers(other_branch, dst_reg,
41948088 dst_reg->type, true);
8089
+ mark_pkt_end(this_branch, insn->dst_reg, false);
41958090 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
41968091 src_reg->type == PTR_TO_PACKET) ||
41978092 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4199,6 +8094,7 @@
41998094 /* pkt_end < pkt_data', pkt_data > pkt_meta' */
42008095 find_good_pkt_pointers(this_branch, src_reg,
42018096 src_reg->type, false);
8097
+ mark_pkt_end(other_branch, insn->src_reg, true);
42028098 } else {
42038099 return false;
42048100 }
....@@ -4211,6 +8107,7 @@
42118107 /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
42128108 find_good_pkt_pointers(this_branch, dst_reg,
42138109 dst_reg->type, true);
8110
+ mark_pkt_end(other_branch, insn->dst_reg, false);
42148111 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
42158112 src_reg->type == PTR_TO_PACKET) ||
42168113 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4218,6 +8115,7 @@
42188115 /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
42198116 find_good_pkt_pointers(other_branch, src_reg,
42208117 src_reg->type, false);
8118
+ mark_pkt_end(this_branch, insn->src_reg, true);
42218119 } else {
42228120 return false;
42238121 }
....@@ -4230,6 +8128,7 @@
42308128 /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
42318129 find_good_pkt_pointers(other_branch, dst_reg,
42328130 dst_reg->type, false);
8131
+ mark_pkt_end(this_branch, insn->dst_reg, true);
42338132 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
42348133 src_reg->type == PTR_TO_PACKET) ||
42358134 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4237,6 +8136,7 @@
42378136 /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
42388137 find_good_pkt_pointers(this_branch, src_reg,
42398138 src_reg->type, true);
8139
+ mark_pkt_end(other_branch, insn->src_reg, false);
42408140 } else {
42418141 return false;
42428142 }
....@@ -4248,6 +8148,18 @@
42488148 return true;
42498149 }
42508150
8151
+static void find_equal_scalars(struct bpf_verifier_state *vstate,
8152
+ struct bpf_reg_state *known_reg)
8153
+{
8154
+ struct bpf_func_state *state;
8155
+ struct bpf_reg_state *reg;
8156
+
8157
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
8158
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
8159
+ copy_register_state(reg, known_reg);
8160
+ }));
8161
+}
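
find_equal_scalars() is new in this hunk: when one scalar register was copied from another the two share an id, and bounds learned for either of them after a conditional are copied to every register carrying that id. The toy model below shows the sweep over a flat register file; the struct, field names and fixed size are local simplifications of the per-frame and per-stack-slot walk the kernel does.

#include <stdint.h>
#include <stdio.h>

struct reg { int id; uint64_t umin, umax; };

/* Copy the newly learned bounds of *known to every register sharing its id,
 * the way find_equal_scalars() walks all frames and stack slots.
 */
static void propagate(struct reg *regs, int nregs, const struct reg *known)
{
	for (int i = 0; i < nregs; i++)
		if (regs[i].id && regs[i].id == known->id)
			regs[i] = *known;
}

int main(void)
{
	/* r1 = r2: both get id 1 with unknown bounds */
	struct reg regs[3] = {
		{ .id = 1, .umin = 0, .umax = UINT64_MAX },	/* r1 */
		{ .id = 1, .umin = 0, .umax = UINT64_MAX },	/* r2 */
		{ .id = 0, .umin = 0, .umax = UINT64_MAX },	/* r3, unrelated */
	};
	/* "if (r1 < 16)" taken branch: r1 is now [0, 15] ... */
	struct reg known = { .id = 1, .umin = 0, .umax = 15 };

	propagate(regs, 3, &known);
	/* ... and so is r2, while r3 is untouched */
	for (int i = 0; i < 3; i++)
		printf("r%d: [%llu, %llu]\n", i + 1,
		       (unsigned long long)regs[i].umin,
		       (unsigned long long)regs[i].umax);
	return 0;
}
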
8162
+
42518163 static int check_cond_jmp_op(struct bpf_verifier_env *env,
42528164 struct bpf_insn *insn, int *insn_idx)
42538165 {
....@@ -4256,17 +8168,19 @@
42568168 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
42578169 struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
42588170 u8 opcode = BPF_OP(insn->code);
8171
+ bool is_jmp32;
42598172 int pred = -1;
42608173 int err;
42618174
4262
- if (opcode > BPF_JSLE) {
4263
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
8175
+ /* Only conditional jumps are expected to reach here. */
8176
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
8177
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
42648178 return -EINVAL;
42658179 }
42668180
42678181 if (BPF_SRC(insn->code) == BPF_X) {
42688182 if (insn->imm != 0) {
4269
- verbose(env, "BPF_JMP uses reserved fields\n");
8183
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
42708184 return -EINVAL;
42718185 }
42728186
....@@ -4283,7 +8197,7 @@
42838197 src_reg = &regs[insn->src_reg];
42848198 } else {
42858199 if (insn->src_reg != BPF_REG_0) {
4286
- verbose(env, "BPF_JMP uses reserved fields\n");
8200
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
42878201 return -EINVAL;
42888202 }
42898203 }
....@@ -4294,20 +8208,47 @@
42948208 return err;
42958209
42968210 dst_reg = &regs[insn->dst_reg];
8211
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
42978212
4298
- if (BPF_SRC(insn->code) == BPF_K)
4299
- pred = is_branch_taken(dst_reg, insn->imm, opcode);
4300
- else if (src_reg->type == SCALAR_VALUE &&
4301
- tnum_is_const(src_reg->var_off))
4302
- pred = is_branch_taken(dst_reg, src_reg->var_off.value,
4303
- opcode);
8213
+ if (BPF_SRC(insn->code) == BPF_K) {
8214
+ pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
8215
+ } else if (src_reg->type == SCALAR_VALUE &&
8216
+ is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
8217
+ pred = is_branch_taken(dst_reg,
8218
+ tnum_subreg(src_reg->var_off).value,
8219
+ opcode,
8220
+ is_jmp32);
8221
+ } else if (src_reg->type == SCALAR_VALUE &&
8222
+ !is_jmp32 && tnum_is_const(src_reg->var_off)) {
8223
+ pred = is_branch_taken(dst_reg,
8224
+ src_reg->var_off.value,
8225
+ opcode,
8226
+ is_jmp32);
8227
+ } else if (reg_is_pkt_pointer_any(dst_reg) &&
8228
+ reg_is_pkt_pointer_any(src_reg) &&
8229
+ !is_jmp32) {
8230
+ pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode);
8231
+ }
8232
+
8233
+ if (pred >= 0) {
8234
+ /* If we get here with a dst_reg pointer type it is because
8235
+ * above is_branch_taken() special cased the 0 comparison.
8236
+ */
8237
+ if (!__is_pointer_value(false, dst_reg))
8238
+ err = mark_chain_precision(env, insn->dst_reg);
8239
+ if (BPF_SRC(insn->code) == BPF_X && !err &&
8240
+ !__is_pointer_value(false, src_reg))
8241
+ err = mark_chain_precision(env, insn->src_reg);
8242
+ if (err)
8243
+ return err;
8244
+ }
43048245
43058246 if (pred == 1) {
43068247 /* Only follow the goto, ignore fall-through. If needed, push
43078248 * the fall-through branch for simulation under speculative
43088249 * execution.
43098250 */
4310
- if (!env->allow_ptr_leaks &&
8251
+ if (!env->bypass_spec_v1 &&
43118252 !sanitize_speculative_path(env, insn, *insn_idx + 1,
43128253 *insn_idx))
43138254 return -EFAULT;
....@@ -4318,7 +8259,7 @@
43188259 * program will go. If needed, push the goto branch for
43198260 * simulation under speculative execution.
43208261 */
4321
- if (!env->allow_ptr_leaks &&
8262
+ if (!env->bypass_spec_v1 &&
43228263 !sanitize_speculative_path(env, insn,
43238264 *insn_idx + insn->off + 1,
43248265 *insn_idx))
....@@ -4340,37 +8281,65 @@
43408281 * comparable.
43418282 */
43428283 if (BPF_SRC(insn->code) == BPF_X) {
8284
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
8285
+
43438286 if (dst_reg->type == SCALAR_VALUE &&
4344
- regs[insn->src_reg].type == SCALAR_VALUE) {
4345
- if (tnum_is_const(regs[insn->src_reg].var_off))
8287
+ src_reg->type == SCALAR_VALUE) {
8288
+ if (tnum_is_const(src_reg->var_off) ||
8289
+ (is_jmp32 &&
8290
+ tnum_is_const(tnum_subreg(src_reg->var_off))))
43468291 reg_set_min_max(&other_branch_regs[insn->dst_reg],
4347
- dst_reg, regs[insn->src_reg].var_off.value,
4348
- opcode);
4349
- else if (tnum_is_const(dst_reg->var_off))
8292
+ dst_reg,
8293
+ src_reg->var_off.value,
8294
+ tnum_subreg(src_reg->var_off).value,
8295
+ opcode, is_jmp32);
8296
+ else if (tnum_is_const(dst_reg->var_off) ||
8297
+ (is_jmp32 &&
8298
+ tnum_is_const(tnum_subreg(dst_reg->var_off))))
43508299 reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
4351
- &regs[insn->src_reg],
4352
- dst_reg->var_off.value, opcode);
4353
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
8300
+ src_reg,
8301
+ dst_reg->var_off.value,
8302
+ tnum_subreg(dst_reg->var_off).value,
8303
+ opcode, is_jmp32);
8304
+ else if (!is_jmp32 &&
8305
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
43548306 /* Comparing for equality, we can combine knowledge */
43558307 reg_combine_min_max(&other_branch_regs[insn->src_reg],
43568308 &other_branch_regs[insn->dst_reg],
4357
- &regs[insn->src_reg],
4358
- &regs[insn->dst_reg], opcode);
8309
+ src_reg, dst_reg, opcode);
8310
+ if (src_reg->id &&
8311
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
8312
+ find_equal_scalars(this_branch, src_reg);
8313
+ find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
8314
+ }
8315
+
43598316 }
43608317 } else if (dst_reg->type == SCALAR_VALUE) {
43618318 reg_set_min_max(&other_branch_regs[insn->dst_reg],
4362
- dst_reg, insn->imm, opcode);
8319
+ dst_reg, insn->imm, (u32)insn->imm,
8320
+ opcode, is_jmp32);
43638321 }
43648322
4365
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
4366
- if (BPF_SRC(insn->code) == BPF_K &&
8323
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
8324
+ !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
8325
+ find_equal_scalars(this_branch, dst_reg);
8326
+ find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
8327
+ }
8328
+
8329
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
8330
+	 * NOTE: these optimizations below are related to pointer comparisons,
8331
+ * which will never be JMP32.
8332
+ */
8333
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
43678334 insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
4368
- dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
4369
- /* Mark all identical map registers in each branch as either
8335
+ reg_type_may_be_null(dst_reg->type)) {
8336
+ /* Mark all identical registers in each branch as either
43708337 * safe or unknown depending R == 0 or R != 0 conditional.
43718338 */
4372
- mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
4373
- mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
8339
+ mark_ptr_or_null_regs(this_branch, insn->dst_reg,
8340
+ opcode == BPF_JNE);
8341
+ mark_ptr_or_null_regs(other_branch, insn->dst_reg,
8342
+ opcode == BPF_JEQ);
43748343 } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
43758344 this_branch, other_branch) &&
43768345 is_pointer_value(env, insn->dst_reg)) {
....@@ -4378,23 +8347,18 @@
43788347 insn->dst_reg);
43798348 return -EACCES;
43808349 }
4381
- if (env->log.level)
8350
+ if (env->log.level & BPF_LOG_LEVEL)
43828351 print_verifier_state(env, this_branch->frame[this_branch->curframe]);
43838352 return 0;
4384
-}
4385
-
4386
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
4387
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
4388
-{
4389
- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
4390
-
4391
- return (struct bpf_map *) (unsigned long) imm64;
43928353 }
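
The is_jmp32 handling threaded through check_cond_jmp_op() exists because a BPF_JMP32 comparison only inspects the low 32 bits, so only the subregister bounds may legitimately be narrowed. The ordinary C snippet below is a reminder of why deducing 64-bit bounds from a 32-bit test would be unsound; it is an illustration, not verifier code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r1 = 0x100000005ULL;	/* low 32 bits are 5 */

	/* 32-bit compare (BPF_JMP32 semantics): taken, yet it says
	 * nothing about the full 64-bit value ...
	 */
	if ((uint32_t)r1 < 10)
		printf("w1 < 10 taken, yet r1 = %#llx\n",
		       (unsigned long long)r1);

	/* ... so bounding the 64-bit register from that test would be
	 * unsound; a full-width BPF_JMP compare is needed for that.
	 */
	if (r1 < 10)
		printf("this branch is never taken\n");
	return 0;
}
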
43938354
43948355 /* verify BPF_LD_IMM64 instruction */
43958356 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
43968357 {
8358
+ struct bpf_insn_aux_data *aux = cur_aux(env);
43978359 struct bpf_reg_state *regs = cur_regs(env);
8360
+ struct bpf_reg_state *dst_reg;
8361
+ struct bpf_map *map;
43988362 int err;
43998363
44008364 if (BPF_SIZE(insn->code) != BPF_DW) {
....@@ -4410,19 +8374,50 @@
44108374 if (err)
44118375 return err;
44128376
8377
+ dst_reg = &regs[insn->dst_reg];
44138378 if (insn->src_reg == 0) {
44148379 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
44158380
4416
- regs[insn->dst_reg].type = SCALAR_VALUE;
8381
+ dst_reg->type = SCALAR_VALUE;
44178382 __mark_reg_known(&regs[insn->dst_reg], imm);
44188383 return 0;
44198384 }
44208385
4421
- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
4422
- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
8386
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
8387
+ mark_reg_known_zero(env, regs, insn->dst_reg);
44238388
4424
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
4425
- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
8389
+ dst_reg->type = aux->btf_var.reg_type;
8390
+ switch (dst_reg->type) {
8391
+ case PTR_TO_MEM:
8392
+ dst_reg->mem_size = aux->btf_var.mem_size;
8393
+ break;
8394
+ case PTR_TO_BTF_ID:
8395
+ case PTR_TO_PERCPU_BTF_ID:
8396
+ dst_reg->btf_id = aux->btf_var.btf_id;
8397
+ break;
8398
+ default:
8399
+ verbose(env, "bpf verifier is misconfigured\n");
8400
+ return -EFAULT;
8401
+ }
8402
+ return 0;
8403
+ }
8404
+
8405
+ map = env->used_maps[aux->map_index];
8406
+ mark_reg_known_zero(env, regs, insn->dst_reg);
8407
+ dst_reg->map_ptr = map;
8408
+
8409
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
8410
+ dst_reg->type = PTR_TO_MAP_VALUE;
8411
+ dst_reg->off = aux->map_off;
8412
+ if (map_value_has_spin_lock(map))
8413
+ dst_reg->id = ++env->id_gen;
8414
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
8415
+ dst_reg->type = CONST_PTR_TO_MAP;
8416
+ } else {
8417
+ verbose(env, "bpf verifier is misconfigured\n");
8418
+ return -EINVAL;
8419
+ }
8420
+
44268421 return 0;
44278422 }
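
check_ld_imm() now distinguishes a plain 64-bit immediate from the BPF_PSEUDO_MAP_FD, BPF_PSEUDO_MAP_VALUE and BPF_PSEUDO_BTF_ID forms. For the plain case the constant is split across the imm fields of the two-slot instruction, and the reassembly is just the expression modelled below; the struct is a stand-in for the relevant part of struct bpf_insn.

#include <stdint.h>
#include <stdio.h>

/* The two imm fields of a BPF_LD_IMM64 pair (insn and insn + 1). */
struct half { int32_t imm; };

static uint64_t ld_imm64(const struct half *insn)
{
	/* same reassembly as check_ld_imm(): low word from insn[0],
	 * high word from insn[1], both taken as unsigned 32-bit values
	 */
	return ((uint64_t)(insn + 1)->imm << 32) | (uint32_t)insn->imm;
}

int main(void)
{
	struct half pair[2] = { { (int32_t)0xdeadbeef }, { 0x12345678 } };

	printf("imm64 = %#llx\n", (unsigned long long)ld_imm64(pair));
	return 0;
}
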
44288423
....@@ -4460,25 +8455,13 @@
44608455 u8 mode = BPF_MODE(insn->code);
44618456 int i, err;
44628457
4463
- if (!may_access_skb(env->prog->type)) {
8458
+ if (!may_access_skb(resolve_prog_type(env->prog))) {
44648459 verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
44658460 return -EINVAL;
44668461 }
44678462
44688463 if (!env->ops->gen_ld_abs) {
44698464 verbose(env, "bpf verifier is misconfigured\n");
4470
- return -EINVAL;
4471
- }
4472
-
4473
- if (env->subprog_cnt > 1) {
4474
- /* when program has LD_ABS insn JITs and interpreter assume
4475
- * that r1 == ctx == skb which is not the case for callees
4476
- * that can have arbitrary arguments. It's problematic
4477
- * for main prog as well since JITs would need to analyze
4478
- * all functions in order to make proper register save/restore
4479
- * decisions in the main prog. Hence disallow LD_ABS with calls
4480
- */
4481
- verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
44828465 return -EINVAL;
44838466 }
44848467
....@@ -4493,6 +8476,21 @@
44938476 err = check_reg_arg(env, ctx_reg, SRC_OP);
44948477 if (err)
44958478 return err;
8479
+
8480
+ /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
8481
+ * gen_ld_abs() may terminate the program at runtime, leading to
8482
+ * reference leak.
8483
+ */
8484
+ err = check_reference_leak(env);
8485
+ if (err) {
8486
+ verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
8487
+ return err;
8488
+ }
8489
+
8490
+ if (env->cur_state->active_spin_lock) {
8491
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
8492
+ return -EINVAL;
8493
+ }
44968494
44978495 if (regs[ctx_reg].type != PTR_TO_CTX) {
44988496 verbose(env,
....@@ -4522,29 +8520,106 @@
45228520 * Already marked as written above.
45238521 */
45248522 mark_reg_unknown(env, regs, BPF_REG_0);
8523
+	/* ld_abs loads up to 32 bits of skb data. */
8524
+ regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
45258525 return 0;
45268526 }
45278527
45288528 static int check_return_code(struct bpf_verifier_env *env)
45298529 {
8530
+ struct tnum enforce_attach_type_range = tnum_unknown;
8531
+ const struct bpf_prog *prog = env->prog;
45308532 struct bpf_reg_state *reg;
45318533 struct tnum range = tnum_range(0, 1);
8534
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
8535
+ int err;
8536
+ const bool is_subprog = env->cur_state->frame[0]->subprogno;
45328537
4533
- switch (env->prog->type) {
8538
+ /* LSM and struct_ops func-ptr's return type could be "void" */
8539
+ if (!is_subprog &&
8540
+ (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
8541
+ prog_type == BPF_PROG_TYPE_LSM) &&
8542
+ !prog->aux->attach_func_proto->type)
8543
+ return 0;
8544
+
8545
+	/* eBPF calling convention is such that R0 is used
8546
+ * to return the value from eBPF program.
8547
+ * Make sure that it's readable at this time
8548
+ * of bpf_exit, which means that program wrote
8549
+ * something into it earlier
8550
+ */
8551
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
8552
+ if (err)
8553
+ return err;
8554
+
8555
+ if (is_pointer_value(env, BPF_REG_0)) {
8556
+ verbose(env, "R0 leaks addr as return value\n");
8557
+ return -EACCES;
8558
+ }
8559
+
8560
+ reg = cur_regs(env) + BPF_REG_0;
8561
+ if (is_subprog) {
8562
+ if (reg->type != SCALAR_VALUE) {
8563
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
8564
+ reg_type_str[reg->type]);
8565
+ return -EINVAL;
8566
+ }
8567
+ return 0;
8568
+ }
8569
+
8570
+ switch (prog_type) {
45348571 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
45358572 if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
4536
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
8573
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
8574
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
8575
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
8576
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
8577
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
45378578 range = tnum_range(1, 1);
8579
+ break;
45388580 case BPF_PROG_TYPE_CGROUP_SKB:
8581
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
8582
+ range = tnum_range(0, 3);
8583
+ enforce_attach_type_range = tnum_range(2, 3);
8584
+ }
8585
+ break;
45398586 case BPF_PROG_TYPE_CGROUP_SOCK:
45408587 case BPF_PROG_TYPE_SOCK_OPS:
45418588 case BPF_PROG_TYPE_CGROUP_DEVICE:
8589
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
8590
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
45428591 break;
8592
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
8593
+ if (!env->prog->aux->attach_btf_id)
8594
+ return 0;
8595
+ range = tnum_const(0);
8596
+ break;
8597
+ case BPF_PROG_TYPE_TRACING:
8598
+ switch (env->prog->expected_attach_type) {
8599
+ case BPF_TRACE_FENTRY:
8600
+ case BPF_TRACE_FEXIT:
8601
+ range = tnum_const(0);
8602
+ break;
8603
+ case BPF_TRACE_RAW_TP:
8604
+ case BPF_MODIFY_RETURN:
8605
+ return 0;
8606
+ case BPF_TRACE_ITER:
8607
+ break;
8608
+ default:
8609
+ return -ENOTSUPP;
8610
+ }
8611
+ break;
8612
+ case BPF_PROG_TYPE_SK_LOOKUP:
8613
+ range = tnum_range(SK_DROP, SK_PASS);
8614
+ break;
8615
+ case BPF_PROG_TYPE_EXT:
8616
+ /* freplace program can return anything as its return value
8617
+ * depends on the to-be-replaced kernel func or bpf program.
8618
+ */
45438619 default:
45448620 return 0;
45458621 }
45468622
4547
- reg = cur_regs(env) + BPF_REG_0;
45488623 if (reg->type != SCALAR_VALUE) {
45498624 verbose(env, "At program exit the register R0 is not a known value (%s)\n",
45508625 reg_type_str[reg->type]);
....@@ -4565,6 +8640,10 @@
45658640 verbose(env, " should have been in %s\n", tn_buf);
45668641 return -EINVAL;
45678642 }
8643
+
8644
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
8645
+ tnum_in(enforce_attach_type_range, reg->var_off))
8646
+ env->prog->enforce_expected_attach_type = 1;
45688647 return 0;
45698648 }
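
check_return_code() now enforces a per-program-type range on R0 at BPF_EXIT, e.g. tnum_range(0, 1) for most cgroup attach points. The trivially conforming cgroup_skb program below is shown only as a reference point for what that range means in source form; it assumes the standard libbpf headers and is not part of this change.

// SPDX-License-Identifier: GPL-2.0
/* cgroup_skb programs must return 0 (drop) or 1 (allow); a return value the
 * verifier cannot prove to be in that range is rejected at load time by
 * check_return_code().
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/ingress")
int allow_all(struct __sk_buff *skb)
{
	return 1;	/* provably inside tnum_range(0, 1) */
}

char _license[] SEC("license") = "GPL";
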
45708649
....@@ -4608,19 +8687,37 @@
46088687 BRANCH = 2,
46098688 };
46108689
4611
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
8690
+static u32 state_htab_size(struct bpf_verifier_env *env)
8691
+{
8692
+ return env->prog->len;
8693
+}
46128694
4613
-static int *insn_stack; /* stack of insns to process */
4614
-static int cur_stack; /* current stack index */
4615
-static int *insn_state;
8695
+static struct bpf_verifier_state_list **explored_state(
8696
+ struct bpf_verifier_env *env,
8697
+ int idx)
8698
+{
8699
+ struct bpf_verifier_state *cur = env->cur_state;
8700
+ struct bpf_func_state *state = cur->frame[cur->curframe];
8701
+
8702
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
8703
+}
8704
+
8705
+static void init_explored_state(struct bpf_verifier_env *env, int idx)
8706
+{
8707
+ env->insn_aux_data[idx].prune_point = true;
8708
+}
46168709
46178710 /* t, w, e - match pseudo-code above:
46188711 * t - index of current instruction
46198712 * w - next instruction
46208713 * e - edge
46218714 */
4622
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
8715
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
8716
+ bool loop_ok)
46238717 {
8718
+ int *insn_stack = env->cfg.insn_stack;
8719
+ int *insn_state = env->cfg.insn_state;
8720
+
46248721 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
46258722 return 0;
46268723
....@@ -4628,23 +8725,28 @@
46288725 return 0;
46298726
46308727 if (w < 0 || w >= env->prog->len) {
8728
+ verbose_linfo(env, t, "%d: ", t);
46318729 verbose(env, "jump out of range from insn %d to %d\n", t, w);
46328730 return -EINVAL;
46338731 }
46348732
46358733 if (e == BRANCH)
46368734 /* mark branch target for state pruning */
4637
- env->explored_states[w] = STATE_LIST_MARK;
8735
+ init_explored_state(env, w);
46388736
46398737 if (insn_state[w] == 0) {
46408738 /* tree-edge */
46418739 insn_state[t] = DISCOVERED | e;
46428740 insn_state[w] = DISCOVERED;
4643
- if (cur_stack >= env->prog->len)
8741
+ if (env->cfg.cur_stack >= env->prog->len)
46448742 return -E2BIG;
4645
- insn_stack[cur_stack++] = w;
8743
+ insn_stack[env->cfg.cur_stack++] = w;
46468744 return 1;
46478745 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
8746
+ if (loop_ok && env->bpf_capable)
8747
+ return 0;
8748
+ verbose_linfo(env, t, "%d: ", t);
8749
+ verbose_linfo(env, w, "%d: ", w);
46488750 verbose(env, "back-edge from insn %d to %d\n", t, w);
46498751 return -EINVAL;
46508752 } else if (insn_state[w] == EXPLORED) {
....@@ -4664,48 +8766,47 @@
46648766 {
46658767 struct bpf_insn *insns = env->prog->insnsi;
46668768 int insn_cnt = env->prog->len;
8769
+ int *insn_stack, *insn_state;
46678770 int ret = 0;
46688771 int i, t;
46698772
4670
- ret = check_subprogs(env);
4671
- if (ret < 0)
4672
- return ret;
4673
-
4674
- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
8773
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
46758774 if (!insn_state)
46768775 return -ENOMEM;
46778776
4678
- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
8777
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
46798778 if (!insn_stack) {
4680
- kfree(insn_state);
8779
+ kvfree(insn_state);
46818780 return -ENOMEM;
46828781 }
46838782
46848783 insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
46858784 insn_stack[0] = 0; /* 0 is the first instruction */
4686
- cur_stack = 1;
8785
+ env->cfg.cur_stack = 1;
46878786
46888787 peek_stack:
4689
- if (cur_stack == 0)
8788
+ if (env->cfg.cur_stack == 0)
46908789 goto check_state;
4691
- t = insn_stack[cur_stack - 1];
8790
+ t = insn_stack[env->cfg.cur_stack - 1];
46928791
4693
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
8792
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
8793
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
46948794 u8 opcode = BPF_OP(insns[t].code);
46958795
46968796 if (opcode == BPF_EXIT) {
46978797 goto mark_explored;
46988798 } else if (opcode == BPF_CALL) {
4699
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8799
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
47008800 if (ret == 1)
47018801 goto peek_stack;
47028802 else if (ret < 0)
47038803 goto err_free;
47048804 if (t + 1 < insn_cnt)
4705
- env->explored_states[t + 1] = STATE_LIST_MARK;
8805
+ init_explored_state(env, t + 1);
47068806 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
4707
- env->explored_states[t] = STATE_LIST_MARK;
4708
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
8807
+ init_explored_state(env, t);
8808
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
8809
+ env, false);
47098810 if (ret == 1)
47108811 goto peek_stack;
47118812 else if (ret < 0)
....@@ -4718,26 +8819,31 @@
47188819 }
47198820 /* unconditional jump with single edge */
47208821 ret = push_insn(t, t + insns[t].off + 1,
4721
- FALLTHROUGH, env);
8822
+ FALLTHROUGH, env, true);
47228823 if (ret == 1)
47238824 goto peek_stack;
47248825 else if (ret < 0)
47258826 goto err_free;
8827
+ /* unconditional jmp is not a good pruning point,
8828
+ * but it's marked, since backtracking needs
8829
+ * to record jmp history in is_state_visited().
8830
+ */
8831
+ init_explored_state(env, t + insns[t].off + 1);
47268832 /* tell verifier to check for equivalent states
47278833 * after every call and jump
47288834 */
47298835 if (t + 1 < insn_cnt)
4730
- env->explored_states[t + 1] = STATE_LIST_MARK;
8836
+ init_explored_state(env, t + 1);
47318837 } else {
47328838 /* conditional jump with two edges */
4733
- env->explored_states[t] = STATE_LIST_MARK;
4734
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8839
+ init_explored_state(env, t);
8840
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
47358841 if (ret == 1)
47368842 goto peek_stack;
47378843 else if (ret < 0)
47388844 goto err_free;
47398845
4740
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
8846
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
47418847 if (ret == 1)
47428848 goto peek_stack;
47438849 else if (ret < 0)
....@@ -4747,7 +8853,7 @@
47478853 /* all other non-branch instructions with single
47488854 * fall-through edge
47498855 */
4750
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8856
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
47518857 if (ret == 1)
47528858 goto peek_stack;
47538859 else if (ret < 0)
....@@ -4756,7 +8862,7 @@
47568862
47578863 mark_explored:
47588864 insn_state[t] = EXPLORED;
4759
- if (cur_stack-- <= 0) {
8865
+ if (env->cfg.cur_stack-- <= 0) {
47608866 verbose(env, "pop stack internal bug\n");
47618867 ret = -EFAULT;
47628868 goto err_free;
....@@ -4774,9 +8880,329 @@
47748880 ret = 0; /* cfg looks good */
47758881
47768882 err_free:
4777
- kfree(insn_state);
4778
- kfree(insn_stack);
8883
+ kvfree(insn_state);
8884
+ kvfree(insn_stack);
8885
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
47798886 return ret;
8887
+}
8888
+
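
The loop above is the tail of check_cfg(): an explicit-stack DFS over the instructions, where the extra true/false argument now passed to push_insn() says whether an edge that reaches a still-DISCOVERED instruction may be tolerated as a bounded loop instead of being rejected as a back-edge. A toy standalone model of just the back-edge detection (node count, edge-list representation and names are illustrative, not the verifier's):

#include <stdio.h>

/* Toy model of the iterative DFS used by check_cfg(): instructions are
 * nodes, fall-through/jump targets are edges, and an edge that leads back
 * to a node still in DISCOVERED state (i.e. still on the DFS stack) is a
 * back-edge, meaning the CFG contains a loop. The real verifier then either
 * allows it (bounded-loop handling for capable callers) or rejects it.
 */
enum { UNVISITED, DISCOVERED, EXPLORED };
#define MAX_NODES 64

static int has_back_edge(int (*edges)[2], int nedges)
{
	int state[MAX_NODES] = { [0] = DISCOVERED };
	int stack[MAX_NODES], top = 0, i;

	stack[top++] = 0;			/* node 0 is the entry */
	while (top) {
		int t = stack[top - 1], pushed = 0;

		for (i = 0; i < nedges; i++) {
			if (edges[i][0] != t)
				continue;
			if (state[edges[i][1]] == DISCOVERED)
				return 1;	/* back-edge: loop detected */
			if (state[edges[i][1]] == UNVISITED) {
				state[edges[i][1]] = DISCOVERED;
				stack[top++] = edges[i][1];
				pushed = 1;
				break;
			}
		}
		if (!pushed) {			/* all successors done */
			state[t] = EXPLORED;
			top--;
		}
	}
	return 0;
}

int main(void)
{
	int looping[][2]  = { {0, 1}, {1, 0} };	/* insn 1 jumps back to 0 */
	int straight[][2] = { {0, 1}, {1, 2} };	/* plain fall-through code */

	printf("%d %d\n", has_back_edge(looping, 2), has_back_edge(straight, 2));
	return 0;
}
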
8889
+static int check_abnormal_return(struct bpf_verifier_env *env)
8890
+{
8891
+ int i;
8892
+
8893
+ for (i = 1; i < env->subprog_cnt; i++) {
8894
+ if (env->subprog_info[i].has_ld_abs) {
8895
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
8896
+ return -EINVAL;
8897
+ }
8898
+ if (env->subprog_info[i].has_tail_call) {
8899
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
8900
+ return -EINVAL;
8901
+ }
8902
+ }
8903
+ return 0;
8904
+}
8905
+
8906
+/* The minimum supported BTF func info size */
8907
+#define MIN_BPF_FUNCINFO_SIZE 8
8908
+#define MAX_FUNCINFO_REC_SIZE 252
8909
+
8910
+static int check_btf_func(struct bpf_verifier_env *env,
8911
+ const union bpf_attr *attr,
8912
+ union bpf_attr __user *uattr)
8913
+{
8914
+ const struct btf_type *type, *func_proto, *ret_type;
8915
+ u32 i, nfuncs, urec_size, min_size;
8916
+ u32 krec_size = sizeof(struct bpf_func_info);
8917
+ struct bpf_func_info *krecord;
8918
+ struct bpf_func_info_aux *info_aux = NULL;
8919
+ struct bpf_prog *prog;
8920
+ const struct btf *btf;
8921
+ void __user *urecord;
8922
+ u32 prev_offset = 0;
8923
+ bool scalar_return;
8924
+ int ret = -ENOMEM;
8925
+
8926
+ nfuncs = attr->func_info_cnt;
8927
+ if (!nfuncs) {
8928
+ if (check_abnormal_return(env))
8929
+ return -EINVAL;
8930
+ return 0;
8931
+ }
8932
+
8933
+ if (nfuncs != env->subprog_cnt) {
8934
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
8935
+ return -EINVAL;
8936
+ }
8937
+
8938
+ urec_size = attr->func_info_rec_size;
8939
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
8940
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
8941
+ urec_size % sizeof(u32)) {
8942
+ verbose(env, "invalid func info rec size %u\n", urec_size);
8943
+ return -EINVAL;
8944
+ }
8945
+
8946
+ prog = env->prog;
8947
+ btf = prog->aux->btf;
8948
+
8949
+ urecord = u64_to_user_ptr(attr->func_info);
8950
+ min_size = min_t(u32, krec_size, urec_size);
8951
+
8952
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
8953
+ if (!krecord)
8954
+ return -ENOMEM;
8955
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
8956
+ if (!info_aux)
8957
+ goto err_free;
8958
+
8959
+ for (i = 0; i < nfuncs; i++) {
8960
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
8961
+ if (ret) {
8962
+ if (ret == -E2BIG) {
8963
+ verbose(env, "nonzero tailing record in func info");
8964
+ /* set the size kernel expects so loader can zero
8965
+ * out the rest of the record.
8966
+ */
8967
+ if (put_user(min_size, &uattr->func_info_rec_size))
8968
+ ret = -EFAULT;
8969
+ }
8970
+ goto err_free;
8971
+ }
8972
+
8973
+ if (copy_from_user(&krecord[i], urecord, min_size)) {
8974
+ ret = -EFAULT;
8975
+ goto err_free;
8976
+ }
8977
+
8978
+ /* check insn_off */
8979
+ ret = -EINVAL;
8980
+ if (i == 0) {
8981
+ if (krecord[i].insn_off) {
8982
+ verbose(env,
8983
+ "nonzero insn_off %u for the first func info record",
8984
+ krecord[i].insn_off);
8985
+ goto err_free;
8986
+ }
8987
+ } else if (krecord[i].insn_off <= prev_offset) {
8988
+ verbose(env,
8989
+ "same or smaller insn offset (%u) than previous func info record (%u)",
8990
+ krecord[i].insn_off, prev_offset);
8991
+ goto err_free;
8992
+ }
8993
+
8994
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
8995
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
8996
+ goto err_free;
8997
+ }
8998
+
8999
+ /* check type_id */
9000
+ type = btf_type_by_id(btf, krecord[i].type_id);
9001
+ if (!type || !btf_type_is_func(type)) {
9002
+ verbose(env, "invalid type id %d in func info",
9003
+ krecord[i].type_id);
9004
+ goto err_free;
9005
+ }
9006
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
9007
+
9008
+ func_proto = btf_type_by_id(btf, type->type);
9009
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
9010
+ /* btf_func_check() already verified it during BTF load */
9011
+ goto err_free;
9012
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
9013
+ scalar_return =
9014
+ btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
9015
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
9016
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
9017
+ goto err_free;
9018
+ }
9019
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
9020
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
9021
+ goto err_free;
9022
+ }
9023
+
9024
+ prev_offset = krecord[i].insn_off;
9025
+ urecord += urec_size;
9026
+ }
9027
+
9028
+ prog->aux->func_info = krecord;
9029
+ prog->aux->func_info_cnt = nfuncs;
9030
+ prog->aux->func_info_aux = info_aux;
9031
+ return 0;
9032
+
9033
+err_free:
9034
+ kvfree(krecord);
9035
+ kfree(info_aux);
9036
+ return ret;
9037
+}
9038
+
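
check_btf_func() stays compatible with both older and newer loaders by copying only min(krec_size, urec_size) bytes per record and, via bpf_check_uarg_tail_zero(), insisting that any bytes beyond what this kernel understands are zero. A small userspace-style model of that rule (the helper below is illustrative, not a kernel API):

#include <stddef.h>
#include <stdio.h>

/* Illustrative model: a record supplied by userspace is acceptable if it is
 * no larger than what the kernel knows about, or if every byte past the
 * kernel-known part is zero (so fields unknown to this kernel are unused).
 */
static int record_tail_is_zero(const unsigned char *rec, size_t kernel_size,
			       size_t user_size)
{
	size_t i;

	if (user_size <= kernel_size)
		return 1;			/* older or matching loader: fine */
	for (i = kernel_size; i < user_size; i++)
		if (rec[i])
			return 0;		/* unknown field is set: reject */
	return 1;
}

int main(void)
{
	unsigned char ok[12]  = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0 };
	unsigned char bad[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 9 };

	printf("%d %d\n", record_tail_is_zero(ok, 8, 12),
	       record_tail_is_zero(bad, 8, 12));	/* prints 1 0 */
	return 0;
}
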
9039
+static void adjust_btf_func(struct bpf_verifier_env *env)
9040
+{
9041
+ struct bpf_prog_aux *aux = env->prog->aux;
9042
+ int i;
9043
+
9044
+ if (!aux->func_info)
9045
+ return;
9046
+
9047
+ for (i = 0; i < env->subprog_cnt; i++)
9048
+ aux->func_info[i].insn_off = env->subprog_info[i].start;
9049
+}
9050
+
9051
+#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
9052
+ sizeof(((struct bpf_line_info *)(0))->line_col))
9053
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
9054
+
9055
+static int check_btf_line(struct bpf_verifier_env *env,
9056
+ const union bpf_attr *attr,
9057
+ union bpf_attr __user *uattr)
9058
+{
9059
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
9060
+ struct bpf_subprog_info *sub;
9061
+ struct bpf_line_info *linfo;
9062
+ struct bpf_prog *prog;
9063
+ const struct btf *btf;
9064
+ void __user *ulinfo;
9065
+ int err;
9066
+
9067
+ nr_linfo = attr->line_info_cnt;
9068
+ if (!nr_linfo)
9069
+ return 0;
9070
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
9071
+ return -EINVAL;
9072
+
9073
+ rec_size = attr->line_info_rec_size;
9074
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
9075
+ rec_size > MAX_LINEINFO_REC_SIZE ||
9076
+ rec_size & (sizeof(u32) - 1))
9077
+ return -EINVAL;
9078
+
9079
+ /* Need to zero it in case userspace passes in
9080
+ * a smaller bpf_line_info object.
9081
+ */
9082
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
9083
+ GFP_KERNEL | __GFP_NOWARN);
9084
+ if (!linfo)
9085
+ return -ENOMEM;
9086
+
9087
+ prog = env->prog;
9088
+ btf = prog->aux->btf;
9089
+
9090
+ s = 0;
9091
+ sub = env->subprog_info;
9092
+ ulinfo = u64_to_user_ptr(attr->line_info);
9093
+ expected_size = sizeof(struct bpf_line_info);
9094
+ ncopy = min_t(u32, expected_size, rec_size);
9095
+ for (i = 0; i < nr_linfo; i++) {
9096
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
9097
+ if (err) {
9098
+ if (err == -E2BIG) {
9099
+ verbose(env, "nonzero tailing record in line_info");
9100
+ if (put_user(expected_size,
9101
+ &uattr->line_info_rec_size))
9102
+ err = -EFAULT;
9103
+ }
9104
+ goto err_free;
9105
+ }
9106
+
9107
+ if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
9108
+ err = -EFAULT;
9109
+ goto err_free;
9110
+ }
9111
+
9112
+ /*
9113
+ * Check insn_off to ensure
9114
+ * 1) strictly increasing AND
9115
+ * 2) bounded by prog->len
9116
+ *
9117
+ * The linfo[0].insn_off == 0 check logically falls into
9118
+ * the later "missing bpf_line_info for func..." case
9119
+ * because the first linfo[0].insn_off must be the
9120
+ * first sub also and the first sub must have
9121
+ * subprog_info[0].start == 0.
9122
+ */
9123
+ if ((i && linfo[i].insn_off <= prev_offset) ||
9124
+ linfo[i].insn_off >= prog->len) {
9125
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
9126
+ i, linfo[i].insn_off, prev_offset,
9127
+ prog->len);
9128
+ err = -EINVAL;
9129
+ goto err_free;
9130
+ }
9131
+
9132
+ if (!prog->insnsi[linfo[i].insn_off].code) {
9133
+ verbose(env,
9134
+ "Invalid insn code at line_info[%u].insn_off\n",
9135
+ i);
9136
+ err = -EINVAL;
9137
+ goto err_free;
9138
+ }
9139
+
9140
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
9141
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
9142
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
9143
+ err = -EINVAL;
9144
+ goto err_free;
9145
+ }
9146
+
9147
+ if (s != env->subprog_cnt) {
9148
+ if (linfo[i].insn_off == sub[s].start) {
9149
+ sub[s].linfo_idx = i;
9150
+ s++;
9151
+ } else if (sub[s].start < linfo[i].insn_off) {
9152
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
9153
+ err = -EINVAL;
9154
+ goto err_free;
9155
+ }
9156
+ }
9157
+
9158
+ prev_offset = linfo[i].insn_off;
9159
+ ulinfo += rec_size;
9160
+ }
9161
+
9162
+ if (s != env->subprog_cnt) {
9163
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
9164
+ env->subprog_cnt - s, s);
9165
+ err = -EINVAL;
9166
+ goto err_free;
9167
+ }
9168
+
9169
+ prog->aux->linfo = linfo;
9170
+ prog->aux->nr_linfo = nr_linfo;
9171
+
9172
+ return 0;
9173
+
9174
+err_free:
9175
+ kvfree(linfo);
9176
+ return err;
9177
+}
9178
+
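
The checks in check_btf_line() reduce to a few ordering rules on insn_off: it must stay inside the program, be strictly increasing, and every subprog start must be covered by a record. A compact sketch of the first two rules (function and parameter names are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: insn_off values of line-info records must stay below
 * prog_len and be strictly increasing from one record to the next.
 */
static bool line_offsets_ok(const unsigned int *insn_off, unsigned int n,
			    unsigned int prog_len)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (insn_off[i] >= prog_len)
			return false;		/* out of range */
		if (i && insn_off[i] <= insn_off[i - 1])
			return false;		/* not strictly increasing */
	}
	return true;
}

int main(void)
{
	unsigned int good[] = { 0, 3, 7 };
	unsigned int bad[]  = { 0, 7, 7 };	/* duplicate offset */

	printf("%d %d\n", line_offsets_ok(good, 3, 16),
	       line_offsets_ok(bad, 3, 16));	/* prints 1 0 */
	return 0;
}
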
9179
+static int check_btf_info(struct bpf_verifier_env *env,
9180
+ const union bpf_attr *attr,
9181
+ union bpf_attr __user *uattr)
9182
+{
9183
+ struct btf *btf;
9184
+ int err;
9185
+
9186
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
9187
+ if (check_abnormal_return(env))
9188
+ return -EINVAL;
9189
+ return 0;
9190
+ }
9191
+
9192
+ btf = btf_get_by_fd(attr->prog_btf_fd);
9193
+ if (IS_ERR(btf))
9194
+ return PTR_ERR(btf);
9195
+ env->prog->aux->btf = btf;
9196
+
9197
+ err = check_btf_func(env, attr, uattr);
9198
+ if (err)
9199
+ return err;
9200
+
9201
+ err = check_btf_line(env, attr, uattr);
9202
+ if (err)
9203
+ return err;
9204
+
9205
+ return 0;
47809206 }
47819207
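
check_btf_info() consumes the BTF-related attributes the loader passed to BPF_PROG_LOAD. A hedged sketch of the userspace side, showing only the BTF pieces of the request (values are illustrative; real loaders such as libbpf derive them from the object's .BTF/.BTF.ext sections and also fill in insns, license, prog_type and friends):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: prog_btf_fd refers to BTF previously loaded with BPF_BTF_LOAD;
 * func_info carries one record per subprog, line_info one record per
 * covered instruction range, each with its record size declared.
 */
static int prog_load_with_btf(union bpf_attr *attr, int btf_fd,
			      const struct bpf_func_info *func_info,
			      __u32 func_cnt,
			      const struct bpf_line_info *line_info,
			      __u32 line_cnt)
{
	attr->prog_btf_fd = btf_fd;

	attr->func_info = (__u64)(unsigned long)func_info;
	attr->func_info_cnt = func_cnt;
	attr->func_info_rec_size = sizeof(struct bpf_func_info);

	attr->line_info = (__u64)(unsigned long)line_info;
	attr->line_info_cnt = line_cnt;
	attr->line_info_rec_size = sizeof(struct bpf_line_info);

	return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(*attr));
}
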
47829208 /* check %cur's range satisfies %old's */
....@@ -4786,7 +9212,11 @@
47869212 return old->umin_value <= cur->umin_value &&
47879213 old->umax_value >= cur->umax_value &&
47889214 old->smin_value <= cur->smin_value &&
4789
- old->smax_value >= cur->smax_value;
9215
+ old->smax_value >= cur->smax_value &&
9216
+ old->u32_min_value <= cur->u32_min_value &&
9217
+ old->u32_max_value >= cur->u32_max_value &&
9218
+ old->s32_min_value <= cur->s32_min_value &&
9219
+ old->s32_max_value >= cur->s32_max_value;
47909220 }
47919221
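
range_within() now also has to hold for the separately tracked 32-bit sub-ranges. A toy illustration of the enclosure idea using only the 64-bit bounds (the struct below is a simplified stand-in for bpf_reg_state, not the real thing):

#include <stdbool.h>
#include <stdio.h>

struct bounds {		/* simplified stand-in for the scalar bounds in bpf_reg_state */
	long long smin, smax;
	unsigned long long umin, umax;
};

/* old is "safe" for cur only if every value cur may hold was already
 * covered when old was verified, i.e. old's range encloses cur's.
 */
static bool range_encloses(const struct bounds *old, const struct bounds *cur)
{
	return old->umin <= cur->umin && old->umax >= cur->umax &&
	       old->smin <= cur->smin && old->smax >= cur->smax;
}

int main(void)
{
	struct bounds old = { .smin = 0, .smax = 100, .umin = 0, .umax = 100 };
	struct bounds cur = { .smin = 5, .smax = 10,  .umin = 5, .umax = 10  };

	printf("prune ok: %d\n", range_encloses(&old, &cur));	/* prints 1 */
	return 0;
}
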
47929222 /* If in the old state two registers had the same id, then they need to have
....@@ -4816,6 +9246,102 @@
48169246 /* We ran out of idmap slots, which should be impossible */
48179247 WARN_ON_ONCE(1);
48189248 return false;
9249
+}
9250
+
9251
+static void clean_func_state(struct bpf_verifier_env *env,
9252
+ struct bpf_func_state *st)
9253
+{
9254
+ enum bpf_reg_liveness live;
9255
+ int i, j;
9256
+
9257
+ for (i = 0; i < BPF_REG_FP; i++) {
9258
+ live = st->regs[i].live;
9259
+ /* liveness must not touch this register anymore */
9260
+ st->regs[i].live |= REG_LIVE_DONE;
9261
+ if (!(live & REG_LIVE_READ))
9262
+ /* since the register is unused, clear its state
9263
+ * to make further comparison simpler
9264
+ */
9265
+ __mark_reg_not_init(env, &st->regs[i]);
9266
+ }
9267
+
9268
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
9269
+ live = st->stack[i].spilled_ptr.live;
9270
+ /* liveness must not touch this stack slot anymore */
9271
+ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
9272
+ if (!(live & REG_LIVE_READ)) {
9273
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
9274
+ for (j = 0; j < BPF_REG_SIZE; j++)
9275
+ st->stack[i].slot_type[j] = STACK_INVALID;
9276
+ }
9277
+ }
9278
+}
9279
+
9280
+static void clean_verifier_state(struct bpf_verifier_env *env,
9281
+ struct bpf_verifier_state *st)
9282
+{
9283
+ int i;
9284
+
9285
+ if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
9286
+ /* all regs in this state in all frames were already marked */
9287
+ return;
9288
+
9289
+ for (i = 0; i <= st->curframe; i++)
9290
+ clean_func_state(env, st->frame[i]);
9291
+}
9292
+
9293
+/* the parentage chains form a tree.
9294
+ * the verifier states are added to state lists at given insn and
9295
+ * pushed into state stack for future exploration.
9296
+ * when the verifier reaches bpf_exit insn some of the verifier states
9297
+ * stored in the state lists have their final liveness state already,
9298
+ * but a lot of states will get revised from liveness point of view when
9299
+ * the verifier explores other branches.
9300
+ * Example:
9301
+ * 1: r0 = 1
9302
+ * 2: if r1 == 100 goto pc+1
9303
+ * 3: r0 = 2
9304
+ * 4: exit
9305
+ * when the verifier reaches exit insn the register r0 in the state list of
9306
+ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
9307
+ * of insn 2 and goes exploring further. At the insn 4 it will walk the
9308
+ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
9309
+ *
9310
+ * Since the verifier pushes the branch states as it sees them while exploring
9311
+ * the program, the condition of walking the branch instruction for the second
9312
+ * time means that all states below this branch were already explored and
9313
+ * their final liveness marks are already propagated.
9314
+ * Hence when the verifier completes the search of state list in is_state_visited()
9315
+ * we can call this clean_live_states() function to mark all liveness states
9316
+ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
9317
+ * will not be used.
9318
+ * This function also clears the registers and stack for states that !READ
9319
+ * to simplify state merging.
9320
+ *
9321
+ * Important note here that walking the same branch instruction in the callee
9322
+ * doesn't mean that the states are DONE. The verifier has to compare
9323
+ * the callsites
9324
+ */
9325
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
9326
+ struct bpf_verifier_state *cur)
9327
+{
9328
+ struct bpf_verifier_state_list *sl;
9329
+ int i;
9330
+
9331
+ sl = *explored_state(env, insn);
9332
+ while (sl) {
9333
+ if (sl->state.branches)
9334
+ goto next;
9335
+ if (sl->state.insn_idx != insn ||
9336
+ sl->state.curframe != cur->curframe)
9337
+ goto next;
9338
+ for (i = 0; i <= cur->curframe; i++)
9339
+ if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
9340
+ goto next;
9341
+ clean_verifier_state(env, &sl->state);
9342
+next:
9343
+ sl = sl->next;
9344
+ }
48199345 }
48209346
48219347 /* Returns true if (rold safe implies rcur safe) */
....@@ -4849,6 +9375,8 @@
48499375 if (env->explore_alu_limits)
48509376 return false;
48519377 if (rcur->type == SCALAR_VALUE) {
9378
+ if (!rold->precise)
9379
+ return true;
48529380 /* new val must satisfy old val knowledge */
48539381 return range_within(rold, rcur) &&
48549382 tnum_in(rold->var_off, rcur->var_off);
....@@ -4865,8 +9393,11 @@
48659393 case PTR_TO_MAP_VALUE:
48669394 /* If the new min/max/var_off satisfy the old ones and
48679395 * everything else matches, we are OK.
4868
- * We don't care about the 'id' value, because nothing
4869
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
9396
+ * 'id' is not compared, since it's only used for maps with
9397
+ * bpf_spin_lock inside map element and in such cases if
9398
+ * the rest of the prog is valid for one map element then
9399
+ * it's valid for all map elements regardless of the key
9400
+ * used in bpf_map_lookup()
48709401 */
48719402 return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
48729403 range_within(rold, rcur) &&
....@@ -4911,6 +9442,14 @@
49119442 case PTR_TO_CTX:
49129443 case CONST_PTR_TO_MAP:
49139444 case PTR_TO_PACKET_END:
9445
+ case PTR_TO_FLOW_KEYS:
9446
+ case PTR_TO_SOCKET:
9447
+ case PTR_TO_SOCKET_OR_NULL:
9448
+ case PTR_TO_SOCK_COMMON:
9449
+ case PTR_TO_SOCK_COMMON_OR_NULL:
9450
+ case PTR_TO_TCP_SOCK:
9451
+ case PTR_TO_TCP_SOCK_OR_NULL:
9452
+ case PTR_TO_XDP_SOCK:
49149453 /* Only valid matches are exact, which memcmp() above
49159454 * would have accepted
49169455 */
....@@ -4929,12 +9468,6 @@
49299468 {
49309469 int i, spi;
49319470
4932
- /* if explored stack has more populated slots than current stack
4933
- * such stacks are not equivalent
4934
- */
4935
- if (old->allocated_stack > cur->allocated_stack)
4936
- return false;
4937
-
49389471 /* walk slots of the explored stack and ignore any additional
49399472 * slots in the current stack, since explored(safe) state
49409473 * didn't use them
....@@ -4942,12 +9475,21 @@
49429475 for (i = 0; i < old->allocated_stack; i++) {
49439476 spi = i / BPF_REG_SIZE;
49449477
4945
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
9478
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
9479
+ i += BPF_REG_SIZE - 1;
49469480 /* explored state didn't use this */
49479481 continue;
9482
+ }
49489483
49499484 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
49509485 continue;
9486
+
9487
+ /* explored stack has more populated slots than current stack
9488
+ * and these slots were used
9489
+ */
9490
+ if (i >= cur->allocated_stack)
9491
+ return false;
9492
+
49519493 /* if old state was safe with misc data in the stack
49529494 * it will be safe with zero-initialized stack.
49539495 * The opposite is not true
....@@ -4958,14 +9500,14 @@
49589500 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
49599501 cur->stack[spi].slot_type[i % BPF_REG_SIZE])
49609502 /* Ex: old explored (safe) state has STACK_SPILL in
4961
- * this stack slot, but current has has STACK_MISC ->
9503
+ * this stack slot, but current has STACK_MISC ->
49629504 * this verifier states are not equivalent,
49639505 * return false to continue verification of this path
49649506 */
49659507 return false;
4966
- if (i % BPF_REG_SIZE)
9508
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
49679509 continue;
4968
- if (old->stack[spi].slot_type[0] != STACK_SPILL)
9510
+ if (!is_spilled_reg(&old->stack[spi]))
49699511 continue;
49709512 if (!regsafe(env, &old->stack[spi].spilled_ptr,
49719513 &cur->stack[spi].spilled_ptr, idmap))
....@@ -4982,6 +9524,14 @@
49829524 return false;
49839525 }
49849526 return true;
9527
+}
9528
+
9529
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
9530
+{
9531
+ if (old->acquired_refs != cur->acquired_refs)
9532
+ return false;
9533
+ return !memcmp(old->refs, cur->refs,
9534
+ sizeof(*old->refs) * old->acquired_refs);
49859535 }
49869536
49879537 /* compare two verifier states
....@@ -5024,6 +9574,9 @@
50249574 if (!stacksafe(env, old, cur, env->idmap_scratch))
50259575 return false;
50269576
9577
+ if (!refsafe(old, cur))
9578
+ return false;
9579
+
50279580 return true;
50289581 }
50299582
....@@ -5042,6 +9595,9 @@
50429595 if (old->speculative && !cur->speculative)
50439596 return false;
50449597
9598
+ if (old->active_spin_lock != cur->active_spin_lock)
9599
+ return false;
9600
+
50459601 /* for states to be equal callsites have to be the same
50469602 * and all frame states need to be equivalent
50479603 */
....@@ -5052,6 +9608,35 @@
50529608 return false;
50539609 }
50549610 return true;
9611
+}
9612
+
9613
+/* Return 0 if no propagation happened. Return negative error code if error
9614
+ * happened. Otherwise, return the propagated bit.
9615
+ */
9616
+static int propagate_liveness_reg(struct bpf_verifier_env *env,
9617
+ struct bpf_reg_state *reg,
9618
+ struct bpf_reg_state *parent_reg)
9619
+{
9620
+ u8 parent_flag = parent_reg->live & REG_LIVE_READ;
9621
+ u8 flag = reg->live & REG_LIVE_READ;
9622
+ int err;
9623
+
9624
+ /* By the time we get here, the read flags of PARENT_REG and REG can be any
9625
+ * of REG_LIVE_READ64, REG_LIVE_READ32 or REG_LIVE_NONE. There is no need
9626
+ * for propagation if PARENT_REG already has the strongest mark, REG_LIVE_READ64.
9627
+ */
9628
+ if (parent_flag == REG_LIVE_READ64 ||
9629
+ /* Or if there is no read flag from REG. */
9630
+ !flag ||
9631
+ /* Or if the read flag from REG is the same as PARENT_REG. */
9632
+ parent_flag == flag)
9633
+ return 0;
9634
+
9635
+ err = mark_reg_read(env, reg, parent_reg, flag);
9636
+ if (err)
9637
+ return err;
9638
+
9639
+ return flag;
50559640 }
50569641
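
propagate_liveness_reg() only forwards a read mark when it would actually tell the parent something new. A simplified standalone model of that decision, treating the marks as a three-level value instead of the kernel's REG_LIVE_* bit flags (purely illustrative):

#include <stdio.h>

/* Simplified model: 0 = no read, 1 = 32-bit read, 2 = 64-bit read.
 * Nothing is propagated when the parent already carries the strongest
 * mark, when the child read nothing, or when both marks are identical;
 * otherwise the child's mark is handed up the parentage chain.
 */
static int mark_to_propagate(int parent, int child)
{
	if (parent == 2)		/* parent already has the 64-bit mark */
		return 0;
	if (!child)			/* child never read the register */
		return 0;
	if (parent == child)		/* nothing new to record */
		return 0;
	return child;
}

int main(void)
{
	printf("%d\n", mark_to_propagate(1, 2));	/* 2: upgrade to 64-bit */
	printf("%d\n", mark_to_propagate(2, 1));	/* 0: already strongest */
	printf("%d\n", mark_to_propagate(0, 1));	/* 1: first read seen */
	return 0;
}
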
50579642 /* A write screens off any subsequent reads; but write marks come from the
....@@ -5065,8 +9650,9 @@
50659650 const struct bpf_verifier_state *vstate,
50669651 struct bpf_verifier_state *vparent)
50679652 {
5068
- int i, frame, err = 0;
9653
+ struct bpf_reg_state *state_reg, *parent_reg;
50699654 struct bpf_func_state *state, *parent;
9655
+ int i, frame, err = 0;
50709656
50719657 if (vparent->curframe != vstate->curframe) {
50729658 WARN(1, "propagate_live: parent frame %d current frame %d\n",
....@@ -5075,50 +9661,160 @@
50759661 }
50769662 /* Propagate read liveness of registers... */
50779663 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
5078
- /* We don't need to worry about FP liveness because it's read-only */
5079
- for (i = 0; i < BPF_REG_FP; i++) {
5080
- if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
5081
- continue;
5082
- if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
5083
- err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
5084
- &vparent->frame[vstate->curframe]->regs[i]);
5085
- if (err)
9664
+ for (frame = 0; frame <= vstate->curframe; frame++) {
9665
+ parent = vparent->frame[frame];
9666
+ state = vstate->frame[frame];
9667
+ parent_reg = parent->regs;
9668
+ state_reg = state->regs;
9669
+ /* We don't need to worry about FP liveness, it's read-only */
9670
+ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
9671
+ err = propagate_liveness_reg(env, &state_reg[i],
9672
+ &parent_reg[i]);
9673
+ if (err < 0)
9674
+ return err;
9675
+ if (err == REG_LIVE_READ64)
9676
+ mark_insn_zext(env, &parent_reg[i]);
9677
+ }
9678
+
9679
+ /* Propagate stack slots. */
9680
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
9681
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
9682
+ parent_reg = &parent->stack[i].spilled_ptr;
9683
+ state_reg = &state->stack[i].spilled_ptr;
9684
+ err = propagate_liveness_reg(env, state_reg,
9685
+ parent_reg);
9686
+ if (err < 0)
50869687 return err;
50879688 }
50889689 }
9690
+ return 0;
9691
+}
50899692
5090
- /* ... and stack slots */
5091
- for (frame = 0; frame <= vstate->curframe; frame++) {
5092
- state = vstate->frame[frame];
5093
- parent = vparent->frame[frame];
5094
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
5095
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
5096
- if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
9693
+/* find precise scalars in the previous equivalent state and
9694
+ * propagate them into the current state
9695
+ */
9696
+static int propagate_precision(struct bpf_verifier_env *env,
9697
+ const struct bpf_verifier_state *old)
9698
+{
9699
+ struct bpf_reg_state *state_reg;
9700
+ struct bpf_func_state *state;
9701
+ int i, err = 0, fr;
9702
+
9703
+ for (fr = old->curframe; fr >= 0; fr--) {
9704
+ state = old->frame[fr];
9705
+ state_reg = state->regs;
9706
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
9707
+ if (state_reg->type != SCALAR_VALUE ||
9708
+ !state_reg->precise ||
9709
+ !(state_reg->live & REG_LIVE_READ))
50979710 continue;
5098
- if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
5099
- mark_reg_read(env, &state->stack[i].spilled_ptr,
5100
- &parent->stack[i].spilled_ptr);
9711
+ if (env->log.level & BPF_LOG_LEVEL2)
9712
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
9713
+ err = mark_chain_precision_frame(env, fr, i);
9714
+ if (err < 0)
9715
+ return err;
9716
+ }
9717
+
9718
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
9719
+ if (!is_spilled_reg(&state->stack[i]))
9720
+ continue;
9721
+ state_reg = &state->stack[i].spilled_ptr;
9722
+ if (state_reg->type != SCALAR_VALUE ||
9723
+ !state_reg->precise ||
9724
+ !(state_reg->live & REG_LIVE_READ))
9725
+ continue;
9726
+ if (env->log.level & BPF_LOG_LEVEL2)
9727
+ verbose(env, "frame %d: propagating fp%d\n",
9728
+ fr, (-i - 1) * BPF_REG_SIZE);
9729
+ err = mark_chain_precision_stack_frame(env, fr, i);
9730
+ if (err < 0)
9731
+ return err;
51019732 }
51029733 }
5103
- return err;
9734
+ return 0;
51049735 }
9736
+
9737
+static bool states_maybe_looping(struct bpf_verifier_state *old,
9738
+ struct bpf_verifier_state *cur)
9739
+{
9740
+ struct bpf_func_state *fold, *fcur;
9741
+ int i, fr = cur->curframe;
9742
+
9743
+ if (old->curframe != fr)
9744
+ return false;
9745
+
9746
+ fold = old->frame[fr];
9747
+ fcur = cur->frame[fr];
9748
+ for (i = 0; i < MAX_BPF_REG; i++)
9749
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
9750
+ offsetof(struct bpf_reg_state, parent)))
9751
+ return false;
9752
+ return true;
9753
+}
9754
+
51059755
51069756 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
51079757 {
51089758 struct bpf_verifier_state_list *new_sl;
5109
- struct bpf_verifier_state_list *sl;
9759
+ struct bpf_verifier_state_list *sl, **pprev;
51109760 struct bpf_verifier_state *cur = env->cur_state, *new;
51119761 int i, j, err, states_cnt = 0;
9762
+ bool add_new_state = env->test_state_freq ? true : false;
51129763
5113
- sl = env->explored_states[insn_idx];
5114
- if (!sl)
9764
+ cur->last_insn_idx = env->prev_insn_idx;
9765
+ if (!env->insn_aux_data[insn_idx].prune_point)
51159766 /* this 'insn_idx' instruction wasn't marked, so we will not
51169767 * be doing state search here
51179768 */
51189769 return 0;
51199770
5120
- while (sl != STATE_LIST_MARK) {
9771
+ /* bpf progs typically have pruning point every 4 instructions
9772
+ * http://vger.kernel.org/bpfconf2019.html#session-1
9773
+ * Do not add new state for future pruning if the verifier hasn't seen
9774
+ * at least 2 jumps and at least 8 instructions.
9775
+ * This heuristic helps decrease the 'total_states' and 'peak_states' metrics.
9776
+ * In tests that amounts to up to a 50% reduction in total verifier
9777
+ * memory consumption and a 20% verifier time speedup.
9778
+ */
9779
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
9780
+ env->insn_processed - env->prev_insn_processed >= 8)
9781
+ add_new_state = true;
9782
+
9783
+ pprev = explored_state(env, insn_idx);
9784
+ sl = *pprev;
9785
+
9786
+ clean_live_states(env, insn_idx, cur);
9787
+
9788
+ while (sl) {
9789
+ states_cnt++;
9790
+ if (sl->state.insn_idx != insn_idx)
9791
+ goto next;
9792
+ if (sl->state.branches) {
9793
+ if (states_maybe_looping(&sl->state, cur) &&
9794
+ states_equal(env, &sl->state, cur)) {
9795
+ verbose_linfo(env, insn_idx, "; ");
9796
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
9797
+ return -EINVAL;
9798
+ }
9799
+ /* if the verifier is processing a loop, avoid adding new state
9800
+ * too often, since different loop iterations have distinct
9801
+ * states and may not help future pruning.
9802
+ * This threshold shouldn't be too low to make sure that
9803
+ * a loop with large bound will be rejected quickly.
9804
+ * The most abusive loop will be:
9805
+ * r1 += 1
9806
+ * if r1 < 1000000 goto pc-2
9807
+ * 1M insn_processed limit / 100 == 10k peak states.
9808
+ * This threshold shouldn't be too high either, since states
9809
+ * at the end of the loop are likely to be useful in pruning.
9810
+ */
9811
+ if (env->jmps_processed - env->prev_jmps_processed < 20 &&
9812
+ env->insn_processed - env->prev_insn_processed < 100)
9813
+ add_new_state = false;
9814
+ goto miss;
9815
+ }
51219816 if (states_equal(env, &sl->state, cur)) {
9817
+ sl->hit_cnt++;
51229818 /* reached equivalent register/stack state,
51239819 * prune the search.
51249820 * Registers read by the continuation are read by us.
....@@ -5130,27 +9826,91 @@
51309826 * this state and will pop a new one.
51319827 */
51329828 err = propagate_liveness(env, &sl->state, cur);
9829
+
9830
+ /* if previous state reached the exit with precision and
9831
+ * current state is equivalent to it (except precision marks)
9832
+ * the precision needs to be propagated back in
9833
+ * the current state.
9834
+ */
9835
+ err = err ? : push_jmp_history(env, cur);
9836
+ err = err ? : propagate_precision(env, &sl->state);
51339837 if (err)
51349838 return err;
51359839 return 1;
51369840 }
5137
- sl = sl->next;
5138
- states_cnt++;
9841
+miss:
9842
+ /* when a new state is not going to be added, do not increase the miss count.
9843
+ * Otherwise several loop iterations will remove the state
9844
+ * recorded earlier. The goal of these heuristics is to have
9845
+ * states from some iterations of the loop (some in the beginning
9846
+ * and some at the end) to help pruning.
9847
+ */
9848
+ if (add_new_state)
9849
+ sl->miss_cnt++;
9850
+ /* heuristic to determine whether this state is beneficial
9851
+ * to keep checking from state equivalence point of view.
9852
+ * Higher numbers increase max_states_per_insn and verification time,
9853
+ * but do not meaningfully decrease insn_processed.
9854
+ */
9855
+ if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
9856
+ /* the state is unlikely to be useful. Remove it to
9857
+ * speed up verification
9858
+ */
9859
+ *pprev = sl->next;
9860
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
9861
+ u32 br = sl->state.branches;
9862
+
9863
+ WARN_ONCE(br,
9864
+ "BUG live_done but branches_to_explore %d\n",
9865
+ br);
9866
+ free_verifier_state(&sl->state, false);
9867
+ kfree(sl);
9868
+ env->peak_states--;
9869
+ } else {
9870
+ /* cannot free this state, since parentage chain may
9871
+ * walk it later. Add it for free_list instead to
9872
+ * be freed at the end of verification
9873
+ */
9874
+ sl->next = env->free_list;
9875
+ env->free_list = sl;
9876
+ }
9877
+ sl = *pprev;
9878
+ continue;
9879
+ }
9880
+next:
9881
+ pprev = &sl->next;
9882
+ sl = *pprev;
51399883 }
51409884
5141
- if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
5142
- return 0;
9885
+ if (env->max_states_per_insn < states_cnt)
9886
+ env->max_states_per_insn = states_cnt;
51439887
5144
- /* there were no equivalent states, remember current one.
5145
- * technically the current state is not proven to be safe yet,
9888
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
9889
+ return push_jmp_history(env, cur);
9890
+
9891
+ if (!add_new_state)
9892
+ return push_jmp_history(env, cur);
9893
+
9894
+ /* There were no equivalent states, remember the current one.
9895
+ * Technically the current state is not proven to be safe yet,
51469896 * but it will either reach outer most bpf_exit (which means it's safe)
5147
- * or it will be rejected. Since there are no loops, we won't be
9897
+ * or it will be rejected. When there are no loops the verifier won't be
51489898 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
5149
- * again on the way to bpf_exit
9899
+ * again on the way to bpf_exit.
9900
+ * When looping the sl->state.branches will be > 0 and this state
9901
+ * will not be considered for equivalence until branches == 0.
51509902 */
51519903 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
51529904 if (!new_sl)
51539905 return -ENOMEM;
9906
+ env->total_states++;
9907
+ env->peak_states++;
9908
+ env->prev_jmps_processed = env->jmps_processed;
9909
+ env->prev_insn_processed = env->insn_processed;
9910
+
9911
+ /* forget precise markings we inherited, see __mark_chain_precision */
9912
+ if (env->bpf_capable)
9913
+ mark_all_scalars_imprecise(env, cur);
51549914
51559915 /* add new state to the head of linked list */
51569916 new = &new_sl->state;
....@@ -5160,19 +9920,34 @@
51609920 kfree(new_sl);
51619921 return err;
51629922 }
5163
- new_sl->next = env->explored_states[insn_idx];
5164
- env->explored_states[insn_idx] = new_sl;
5165
- /* connect new state to parentage chain */
5166
- for (i = 0; i < BPF_REG_FP; i++)
5167
- cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
9923
+ new->insn_idx = insn_idx;
9924
+ WARN_ONCE(new->branches != 1,
9925
+ "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
9926
+
9927
+ cur->parent = new;
9928
+ cur->first_insn_idx = insn_idx;
9929
+ clear_jmp_history(cur);
9930
+ new_sl->next = *explored_state(env, insn_idx);
9931
+ *explored_state(env, insn_idx) = new_sl;
9932
+ /* connect new state to parentage chain. Current frame needs all
9933
+ * registers connected. Only r6 - r9 of the callers are alive (pushed
9934
+ * to the stack implicitly by JITs) so in callers' frames connect just
9935
+ * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
9936
+ * the state of the call instruction (with WRITTEN set), and r0 comes
9937
+ * from callee with its full parentage chain, anyway.
9938
+ */
51689939 /* clear write marks in current state: the writes we did are not writes
51699940 * our child did, so they don't screen off its reads from us.
51709941 * (There are no read marks in current state, because reads always mark
51719942 * their parent and current state never has children yet. Only
51729943 * explored_states can get read marks.)
51739944 */
5174
- for (i = 0; i < BPF_REG_FP; i++)
5175
- cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
9945
+ for (j = 0; j <= cur->curframe; j++) {
9946
+ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
9947
+ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
9948
+ for (i = 0; i < BPF_REG_FP; i++)
9949
+ cur->frame[j]->regs[i].live = REG_LIVE_NONE;
9950
+ }
51769951
51779952 /* all stack frames are accessible from callee, clear them all */
51789953 for (j = 0; j <= cur->curframe; j++) {
....@@ -5188,36 +9963,60 @@
51889963 return 0;
51899964 }
51909965
9966
+/* Return true if it's OK to have the same insn return a different type. */
9967
+static bool reg_type_mismatch_ok(enum bpf_reg_type type)
9968
+{
9969
+ switch (type) {
9970
+ case PTR_TO_CTX:
9971
+ case PTR_TO_SOCKET:
9972
+ case PTR_TO_SOCKET_OR_NULL:
9973
+ case PTR_TO_SOCK_COMMON:
9974
+ case PTR_TO_SOCK_COMMON_OR_NULL:
9975
+ case PTR_TO_TCP_SOCK:
9976
+ case PTR_TO_TCP_SOCK_OR_NULL:
9977
+ case PTR_TO_XDP_SOCK:
9978
+ case PTR_TO_BTF_ID:
9979
+ case PTR_TO_BTF_ID_OR_NULL:
9980
+ return false;
9981
+ default:
9982
+ return true;
9983
+ }
9984
+}
9985
+
9986
+/* If an instruction was previously used with particular pointer types, then we
9987
+ * need to be careful to avoid cases such as the below, where it may be ok
9988
+ * for one branch accessing the pointer, but not ok for the other branch:
9989
+ *
9990
+ * R1 = sock_ptr
9991
+ * goto X;
9992
+ * ...
9993
+ * R1 = some_other_valid_ptr;
9994
+ * goto X;
9995
+ * ...
9996
+ * R2 = *(u32 *)(R1 + 0);
9997
+ */
9998
+static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
9999
+{
10000
+ return src != prev && (!reg_type_mismatch_ok(src) ||
10001
+ !reg_type_mismatch_ok(prev));
10002
+}
10003
+
519110004 static int do_check(struct bpf_verifier_env *env)
519210005 {
5193
- struct bpf_verifier_state *state;
10006
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
10007
+ struct bpf_verifier_state *state = env->cur_state;
519410008 struct bpf_insn *insns = env->prog->insnsi;
519510009 struct bpf_reg_state *regs;
5196
- int insn_cnt = env->prog->len, i;
5197
- int insn_processed = 0;
10010
+ int insn_cnt = env->prog->len;
519810011 bool do_print_state = false;
5199
-
5200
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
5201
- if (!state)
5202
- return -ENOMEM;
5203
- state->curframe = 0;
5204
- state->speculative = false;
5205
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
5206
- if (!state->frame[0]) {
5207
- kfree(state);
5208
- return -ENOMEM;
5209
- }
5210
- env->cur_state = state;
5211
- init_func_state(env, state->frame[0],
5212
- BPF_MAIN_FUNC /* callsite */,
5213
- 0 /* frameno */,
5214
- 0 /* subprogno, zero == main subprog */);
10012
+ int prev_insn_idx = -1;
521510013
521610014 for (;;) {
521710015 struct bpf_insn *insn;
521810016 u8 class;
521910017 int err;
522010018
10019
+ env->prev_insn_idx = prev_insn_idx;
522110020 if (env->insn_idx >= insn_cnt) {
522210021 verbose(env, "invalid insn idx %d insn_cnt %d\n",
522310022 env->insn_idx, insn_cnt);
....@@ -5227,10 +10026,10 @@
522710026 insn = &insns[env->insn_idx];
522810027 class = BPF_CLASS(insn->code);
522910028
5230
- if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
10029
+ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
523110030 verbose(env,
523210031 "BPF program is too large. Processed %d insn\n",
5233
- insn_processed);
10032
+ env->insn_processed);
523410033 return -E2BIG;
523510034 }
523610035
....@@ -5239,7 +10038,7 @@
523910038 return err;
524010039 if (err == 1) {
524110040 /* found equivalent state, can prune the search */
5242
- if (env->log.level) {
10041
+ if (env->log.level & BPF_LOG_LEVEL) {
524310042 if (do_print_state)
524410043 verbose(env, "\nfrom %d to %d%s: safe\n",
524510044 env->prev_insn_idx, env->insn_idx,
....@@ -5257,8 +10056,9 @@
525710056 if (need_resched())
525810057 cond_resched();
525910058
5260
- if (env->log.level > 1 || (env->log.level && do_print_state)) {
5261
- if (env->log.level > 1)
10059
+ if (env->log.level & BPF_LOG_LEVEL2 ||
10060
+ (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
10061
+ if (env->log.level & BPF_LOG_LEVEL2)
526210062 verbose(env, "%d:", env->insn_idx);
526310063 else
526410064 verbose(env, "\nfrom %d to %d%s:",
....@@ -5269,12 +10069,13 @@
526910069 do_print_state = false;
527010070 }
527110071
5272
- if (env->log.level) {
10072
+ if (env->log.level & BPF_LOG_LEVEL) {
527310073 const struct bpf_insn_cbs cbs = {
527410074 .cb_print = verbose,
527510075 .private_data = env,
527610076 };
527710077
10078
+ verbose_linfo(env, env->insn_idx, "; ");
527810079 verbose(env, "%d: ", env->insn_idx);
527910080 print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
528010081 }
....@@ -5288,6 +10089,7 @@
528810089
528910090 regs = cur_regs(env);
529010091 sanitize_mark_insn_seen(env);
10092
+ prev_insn_idx = env->insn_idx;
529110093
529210094 if (class == BPF_ALU || class == BPF_ALU64) {
529310095 err = check_alu_op(env, insn);
....@@ -5328,9 +10130,7 @@
532810130 */
532910131 *prev_src_type = src_reg_type;
533010132
5331
- } else if (src_reg_type != *prev_src_type &&
5332
- (src_reg_type == PTR_TO_CTX ||
5333
- *prev_src_type == PTR_TO_CTX)) {
10133
+ } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
533410134 /* Abuser program is trying to use the same insn
533510135 * dst_reg = *(u32*) (src_reg + off)
533610136 * with different pointer types:
....@@ -5375,9 +10175,7 @@
537510175
537610176 if (*prev_dst_type == NOT_INIT) {
537710177 *prev_dst_type = dst_reg_type;
5378
- } else if (dst_reg_type != *prev_dst_type &&
5379
- (dst_reg_type == PTR_TO_CTX ||
5380
- *prev_dst_type == PTR_TO_CTX)) {
10178
+ } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
538110179 verbose(env, "same insn cannot be used with different pointers\n");
538210180 return -EINVAL;
538310181 }
....@@ -5394,8 +10192,9 @@
539410192 return err;
539510193
539610194 if (is_ctx_reg(env, insn->dst_reg)) {
5397
- verbose(env, "BPF_ST stores into R%d context is not allowed\n",
5398
- insn->dst_reg);
10195
+ verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
10196
+ insn->dst_reg,
10197
+ reg_type_str[reg_state(env, insn->dst_reg)->type]);
539910198 return -EACCES;
540010199 }
540110200
....@@ -5406,19 +10205,27 @@
540610205 if (err)
540710206 return err;
540810207
5409
- } else if (class == BPF_JMP) {
10208
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
541010209 u8 opcode = BPF_OP(insn->code);
541110210
10211
+ env->jmps_processed++;
541210212 if (opcode == BPF_CALL) {
541310213 if (BPF_SRC(insn->code) != BPF_K ||
541410214 insn->off != 0 ||
541510215 (insn->src_reg != BPF_REG_0 &&
541610216 insn->src_reg != BPF_PSEUDO_CALL) ||
5417
- insn->dst_reg != BPF_REG_0) {
10217
+ insn->dst_reg != BPF_REG_0 ||
10218
+ class == BPF_JMP32) {
541810219 verbose(env, "BPF_CALL uses reserved fields\n");
541910220 return -EINVAL;
542010221 }
542110222
10223
+ if (env->cur_state->active_spin_lock &&
10224
+ (insn->src_reg == BPF_PSEUDO_CALL ||
10225
+ insn->imm != BPF_FUNC_spin_unlock)) {
10226
+ verbose(env, "function calls are not allowed while holding a lock\n");
10227
+ return -EINVAL;
10228
+ }
542210229 if (insn->src_reg == BPF_PSEUDO_CALL)
542310230 err = check_func_call(env, insn, &env->insn_idx);
542410231 else
....@@ -5430,7 +10237,8 @@
543010237 if (BPF_SRC(insn->code) != BPF_K ||
543110238 insn->imm != 0 ||
543210239 insn->src_reg != BPF_REG_0 ||
5433
- insn->dst_reg != BPF_REG_0) {
10240
+ insn->dst_reg != BPF_REG_0 ||
10241
+ class == BPF_JMP32) {
543410242 verbose(env, "BPF_JA uses reserved fields\n");
543510243 return -EINVAL;
543610244 }
....@@ -5442,14 +10250,19 @@
544210250 if (BPF_SRC(insn->code) != BPF_K ||
544310251 insn->imm != 0 ||
544410252 insn->src_reg != BPF_REG_0 ||
5445
- insn->dst_reg != BPF_REG_0) {
10253
+ insn->dst_reg != BPF_REG_0 ||
10254
+ class == BPF_JMP32) {
544610255 verbose(env, "BPF_EXIT uses reserved fields\n");
10256
+ return -EINVAL;
10257
+ }
10258
+
10259
+ if (env->cur_state->active_spin_lock) {
10260
+ verbose(env, "bpf_spin_unlock is missing\n");
544710261 return -EINVAL;
544810262 }
544910263
545010264 if (state->curframe) {
545110265 /* exit from nested function */
5452
- env->prev_insn_idx = env->insn_idx;
545310266 err = prepare_func_exit(env, &env->insn_idx);
545410267 if (err)
545510268 return err;
....@@ -5457,27 +10270,17 @@
545710270 continue;
545810271 }
545910272
5460
- /* eBPF calling convetion is such that R0 is used
5461
- * to return the value from eBPF program.
5462
- * Make sure that it's readable at this time
5463
- * of bpf_exit, which means that program wrote
5464
- * something into it earlier
5465
- */
5466
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
10273
+ err = check_reference_leak(env);
546710274 if (err)
546810275 return err;
5469
-
5470
- if (is_pointer_value(env, BPF_REG_0)) {
5471
- verbose(env, "R0 leaks addr as return value\n");
5472
- return -EACCES;
5473
- }
547410276
547510277 err = check_return_code(env);
547610278 if (err)
547710279 return err;
547810280 process_bpf_exit:
5479
- err = pop_stack(env, &env->prev_insn_idx,
5480
- &env->insn_idx);
10281
+ update_branch_counts(env, env->cur_state);
10282
+ err = pop_stack(env, &prev_insn_idx,
10283
+ &env->insn_idx, pop_log);
548110284 if (err < 0) {
548210285 if (err != -ENOENT)
548310286 return err;
....@@ -5518,17 +10321,93 @@
551810321 env->insn_idx++;
551910322 }
552010323
5521
- verbose(env, "processed %d insns (limit %d), stack depth ",
5522
- insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
5523
- for (i = 0; i < env->subprog_cnt; i++) {
5524
- u32 depth = env->subprog_info[i].stack_depth;
10324
+ return 0;
10325
+}
552510326
5526
- verbose(env, "%d", depth);
5527
- if (i + 1 < env->subprog_cnt)
5528
- verbose(env, "+");
10327
+/* replace pseudo btf_id with kernel symbol address */
10328
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
10329
+ struct bpf_insn *insn,
10330
+ struct bpf_insn_aux_data *aux)
10331
+{
10332
+ const struct btf_var_secinfo *vsi;
10333
+ const struct btf_type *datasec;
10334
+ const struct btf_type *t;
10335
+ const char *sym_name;
10336
+ bool percpu = false;
10337
+ u32 type, id = insn->imm;
10338
+ s32 datasec_id;
10339
+ u64 addr;
10340
+ int i;
10341
+
10342
+ if (!btf_vmlinux) {
10343
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
10344
+ return -EINVAL;
552910345 }
5530
- verbose(env, "\n");
5531
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
10346
+
10347
+ if (insn[1].imm != 0) {
10348
+ verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
10349
+ return -EINVAL;
10350
+ }
10351
+
10352
+ t = btf_type_by_id(btf_vmlinux, id);
10353
+ if (!t) {
10354
+ verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
10355
+ return -ENOENT;
10356
+ }
10357
+
10358
+ if (!btf_type_is_var(t)) {
10359
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
10360
+ id);
10361
+ return -EINVAL;
10362
+ }
10363
+
10364
+ sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
10365
+ addr = kallsyms_lookup_name(sym_name);
10366
+ if (!addr) {
10367
+ verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
10368
+ sym_name);
10369
+ return -ENOENT;
10370
+ }
10371
+
10372
+ datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
10373
+ BTF_KIND_DATASEC);
10374
+ if (datasec_id > 0) {
10375
+ datasec = btf_type_by_id(btf_vmlinux, datasec_id);
10376
+ for_each_vsi(i, datasec, vsi) {
10377
+ if (vsi->type == id) {
10378
+ percpu = true;
10379
+ break;
10380
+ }
10381
+ }
10382
+ }
10383
+
10384
+ insn[0].imm = (u32)addr;
10385
+ insn[1].imm = addr >> 32;
10386
+
10387
+ type = t->type;
10388
+ t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
10389
+ if (percpu) {
10390
+ aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
10391
+ aux->btf_var.btf_id = type;
10392
+ } else if (!btf_type_is_struct(t)) {
10393
+ const struct btf_type *ret;
10394
+ const char *tname;
10395
+ u32 tsize;
10396
+
10397
+ /* resolve the type size of ksym. */
10398
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
10399
+ if (IS_ERR(ret)) {
10400
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
10401
+ verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
10402
+ tname, PTR_ERR(ret));
10403
+ return -EINVAL;
10404
+ }
10405
+ aux->btf_var.reg_type = PTR_TO_MEM;
10406
+ aux->btf_var.mem_size = tsize;
10407
+ } else {
10408
+ aux->btf_var.reg_type = PTR_TO_BTF_ID;
10409
+ aux->btf_var.btf_id = type;
10410
+ }
553210411 return 0;
553310412 }
553410413
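
On the program side, check_pseudo_btf_id() is what backs BTF-typed kernel symbols ("ksyms") as emitted by libbpf for __ksym externs. A hedged sketch (assumes a vmlinux.h generated from the running kernel's BTF and an SMP kernel for the cpu field; the symbol choice mirrors the selftests and is only an illustration):

/* prog.c - built with: clang -O2 -g -target bpf -c prog.c */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* libbpf turns this extern into a ld_imm64 with src_reg == BPF_PSEUDO_BTF_ID;
 * the verifier resolves it to the kernel address of 'runqueues' and, since it
 * lives in .data..percpu, types it as PTR_TO_PERCPU_BTF_ID.
 */
extern const struct rq runqueues __ksym;

SEC("raw_tp/sys_enter")
int dump_this_rq(const void *ctx)
{
	struct rq *rq = bpf_this_cpu_ptr(&runqueues);

	/* rq is now a PTR_TO_BTF_ID for this CPU's runqueue */
	bpf_printk("cpu %d nr_running %u", rq->cpu, rq->nr_running);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
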
....@@ -5540,26 +10419,69 @@
554010419 !(map->map_flags & BPF_F_NO_PREALLOC);
554110420 }
554210421
10422
+static bool is_tracing_prog_type(enum bpf_prog_type type)
10423
+{
10424
+ switch (type) {
10425
+ case BPF_PROG_TYPE_KPROBE:
10426
+ case BPF_PROG_TYPE_TRACEPOINT:
10427
+ case BPF_PROG_TYPE_PERF_EVENT:
10428
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
10429
+ return true;
10430
+ default:
10431
+ return false;
10432
+ }
10433
+}
10434
+
10435
+static bool is_preallocated_map(struct bpf_map *map)
10436
+{
10437
+ if (!check_map_prealloc(map))
10438
+ return false;
10439
+ if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
10440
+ return false;
10441
+ return true;
10442
+}
10443
+
554310444 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
554410445 struct bpf_map *map,
554510446 struct bpf_prog *prog)
554610447
554710448 {
5548
- /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
5549
- * preallocated hash maps, since doing memory allocation
5550
- * in overflow_handler can crash depending on where nmi got
5551
- * triggered.
10449
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
10450
+ /*
10451
+ * Validate that trace type programs use preallocated hash maps.
10452
+ *
10453
+ * For programs attached to PERF events this is mandatory as the
10454
+ * perf NMI can hit any arbitrary code sequence.
10455
+ *
10456
+ * All other trace types using preallocated hash maps are unsafe as
10457
+ * well because tracepoint or kprobes can be inside locked regions
10458
+ * of the memory allocator or at a place where a recursion into the
10459
+ * memory allocator would see inconsistent state.
10460
+ *
10461
+ * On RT enabled kernels run-time allocation of all trace type
10462
+ * programs is strictly prohibited due to lock type constraints. On
10463
+ * !RT kernels it is allowed for backwards compatibility reasons for
10464
+ * now, but warnings are emitted so developers are made aware of
10465
+ * the unsafety and can fix their programs before this is enforced.
555210466 */
5553
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
5554
- if (!check_map_prealloc(map)) {
10467
+ if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
10468
+ if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
555510469 verbose(env, "perf_event programs can only use preallocated hash map\n");
555610470 return -EINVAL;
555710471 }
5558
- if (map->inner_map_meta &&
5559
- !check_map_prealloc(map->inner_map_meta)) {
5560
- verbose(env, "perf_event programs can only use preallocated inner hash map\n");
10472
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
10473
+ verbose(env, "trace type programs can only use preallocated hash map\n");
556110474 return -EINVAL;
556210475 }
10476
+ WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
10477
+ verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
10478
+ }
10479
+
10480
+ if ((is_tracing_prog_type(prog_type) ||
10481
+ prog_type == BPF_PROG_TYPE_SOCKET_FILTER) &&
10482
+ map_value_has_spin_lock(map)) {
10483
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
10484
+ return -EINVAL;
556310485 }
556410486
556510487 if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
....@@ -5568,13 +10490,45 @@
556810490 return -EINVAL;
556910491 }
557010492
10493
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
10494
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
10495
+ return -EINVAL;
10496
+ }
10497
+
10498
+ if (prog->aux->sleepable)
10499
+ switch (map->map_type) {
10500
+ case BPF_MAP_TYPE_HASH:
10501
+ case BPF_MAP_TYPE_LRU_HASH:
10502
+ case BPF_MAP_TYPE_ARRAY:
10503
+ if (!is_preallocated_map(map)) {
10504
+ verbose(env,
10505
+ "Sleepable programs can only use preallocated hash maps\n");
10506
+ return -EINVAL;
10507
+ }
10508
+ break;
10509
+ default:
10510
+ verbose(env,
10511
+ "Sleepable programs can only use array and hash maps\n");
10512
+ return -EINVAL;
10513
+ }
10514
+
557110515 return 0;
557210516 }
557310517
5574
-/* look for pseudo eBPF instructions that access map FDs and
5575
- * replace them with actual map pointers
10518
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
10519
+{
10520
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
10521
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
10522
+}
10523
+
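
resolve_pseudo_ldimm64() below consumes the two-instruction ld_imm64 form that loaders emit to refer to a map. A hedged sketch of that encoding from the userspace side (the helper is illustrative; libbpf normally performs this relocation for you):

#include <linux/bpf.h>

/* Illustrative only: build the ld_imm64 pair that refers to a map by fd.
 * src_reg == BPF_PSEUDO_MAP_FD means insn[0].imm is a map fd that the
 * verifier replaces with the map pointer; with BPF_PSEUDO_MAP_VALUE the
 * second half additionally carries an offset for direct value access.
 */
static void emit_ld_map_fd(struct bpf_insn insn[2], __u8 dst_reg, int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = dst_reg,
		.src_reg = BPF_PSEUDO_MAP_FD,
		.imm     = map_fd,
	};
	/* second half of the 64-bit immediate: all fields zero for a map fd */
	insn[1] = (struct bpf_insn) { .code = 0 };
}
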
10524
+/* find and rewrite pseudo imm in ld_imm64 instructions:
10525
+ *
10526
+ * 1. if it accesses map FD, replace it with actual map pointer.
10527
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
10528
+ *
10529
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
557610530 */
5577
-static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
10531
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
557810532 {
557910533 struct bpf_insn *insn = env->prog->insnsi;
558010534 int insn_cnt = env->prog->len;
....@@ -5599,8 +10553,10 @@
559910553 }
560010554
560110555 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
10556
+ struct bpf_insn_aux_data *aux;
560210557 struct bpf_map *map;
560310558 struct fd f;
10559
+ u64 addr;
560410560
560510561 if (i == insn_cnt - 1 || insn[1].code != 0 ||
560610562 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
....@@ -5609,21 +10565,35 @@
560910565 return -EINVAL;
561010566 }
561110567
5612
- if (insn->src_reg == 0)
10568
+ if (insn[0].src_reg == 0)
561310569 /* valid generic load 64-bit imm */
561410570 goto next_insn;
561510571
5616
- if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
10572
+ if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
10573
+ aux = &env->insn_aux_data[i];
10574
+ err = check_pseudo_btf_id(env, insn, aux);
10575
+ if (err)
10576
+ return err;
10577
+ goto next_insn;
10578
+ }
10579
+
10580
+ /* In final convert_pseudo_ld_imm64() step, this is
10581
+ * converted into regular 64-bit imm load insn.
10582
+ */
10583
+ if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
10584
+ insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
10585
+ (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
10586
+ insn[1].imm != 0)) {
561710587 verbose(env,
561810588 "unrecognized bpf_ld_imm64 insn\n");
561910589 return -EINVAL;
562010590 }
562110591
5622
- f = fdget(insn->imm);
10592
+ f = fdget(insn[0].imm);
562310593 map = __bpf_map_get(f);
562410594 if (IS_ERR(map)) {
562510595 verbose(env, "fd %d is not pointing to valid bpf_map\n",
5626
- insn->imm);
10596
+ insn[0].imm);
562710597 return PTR_ERR(map);
562810598 }
562910599
....@@ -5633,16 +10603,47 @@
563310603 return err;
563410604 }
563510605
5636
- /* store map pointer inside BPF_LD_IMM64 instruction */
5637
- insn[0].imm = (u32) (unsigned long) map;
5638
- insn[1].imm = ((u64) (unsigned long) map) >> 32;
10606
+ aux = &env->insn_aux_data[i];
10607
+ if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
10608
+ addr = (unsigned long)map;
10609
+ } else {
10610
+ u32 off = insn[1].imm;
10611
+
10612
+ if (off >= BPF_MAX_VAR_OFF) {
10613
+ verbose(env, "direct value offset of %u is not allowed\n", off);
10614
+ fdput(f);
10615
+ return -EINVAL;
10616
+ }
10617
+
10618
+ if (!map->ops->map_direct_value_addr) {
10619
+ verbose(env, "no direct value access support for this map type\n");
10620
+ fdput(f);
10621
+ return -EINVAL;
10622
+ }
10623
+
10624
+ err = map->ops->map_direct_value_addr(map, &addr, off);
10625
+ if (err) {
10626
+ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
10627
+ map->value_size, off);
10628
+ fdput(f);
10629
+ return err;
10630
+ }
10631
+
10632
+ aux->map_off = off;
10633
+ addr += off;
10634
+ }
10635
+
10636
+ insn[0].imm = (u32)addr;
10637
+ insn[1].imm = addr >> 32;
563910638
564010639 /* check whether we recorded this map already */
5641
- for (j = 0; j < env->used_map_cnt; j++)
10640
+ for (j = 0; j < env->used_map_cnt; j++) {
564210641 if (env->used_maps[j] == map) {
10642
+ aux->map_index = j;
564310643 fdput(f);
564410644 goto next_insn;
564510645 }
10646
+ }
564610647
564710648 if (env->used_map_cnt >= MAX_USED_MAPS) {
564810649 fdput(f);
....@@ -5654,17 +10655,14 @@
565410655 * will be used by the valid program until it's unloaded
565510656 * and all maps are released in free_used_maps()
565610657 */
5657
- map = bpf_map_inc(map, false);
5658
- if (IS_ERR(map)) {
5659
- fdput(f);
5660
- return PTR_ERR(map);
5661
- }
10658
+ bpf_map_inc(map);
10659
+
10660
+ aux->map_index = env->used_map_cnt;
566210661 env->used_maps[env->used_map_cnt++] = map;
566310662
5664
- if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
5665
- bpf_cgroup_storage_assign(env->prog, map)) {
5666
- verbose(env,
5667
- "only one cgroup storage is allowed\n");
10663
+ if (bpf_map_is_cgroup_storage(map) &&
10664
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
10665
+ verbose(env, "only one cgroup storage of each type is allowed\n");
566810666 fdput(f);
566910667 return -EBUSY;
567010668 }
....@@ -5693,14 +10691,8 @@
569310691 /* drop refcnt of maps used by the rejected program */
569410692 static void release_maps(struct bpf_verifier_env *env)
569510693 {
5696
- int i;
5697
-
5698
- if (env->prog->aux->cgroup_storage)
5699
- bpf_cgroup_storage_release(env->prog,
5700
- env->prog->aux->cgroup_storage);
5701
-
5702
- for (i = 0; i < env->used_map_cnt; i++)
5703
- bpf_map_put(env->used_maps[i]);
10694
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
10695
+ env->used_map_cnt);
570410696 }
570510697
570610698 /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
....@@ -5719,29 +10711,36 @@
571910711 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
572010712 * [0, off) and [off, end) to new locations, so the patched range stays zero
572110713 */
5722
-static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
5723
- u32 off, u32 cnt)
10714
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
10715
+ struct bpf_insn_aux_data *new_data,
10716
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
572410717 {
5725
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
5726
- bool old_seen = old_data[off].seen;
10718
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
10719
+ struct bpf_insn *insn = new_prog->insnsi;
10720
+ u32 old_seen = old_data[off].seen;
10721
+ u32 prog_len;
572710722 int i;
572810723
10724
+ /* aux info at OFF always needs adjustment, no matter fast path
10725
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
10726
+ * original insn at old prog.
10727
+ */
10728
+ old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
10729
+
572910730 if (cnt == 1)
5730
- return 0;
5731
- new_data = vzalloc(array_size(prog_len,
5732
- sizeof(struct bpf_insn_aux_data)));
5733
- if (!new_data)
5734
- return -ENOMEM;
10731
+ return;
10732
+ prog_len = new_prog->len;
10733
+
573510734 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
573610735 memcpy(new_data + off + cnt - 1, old_data + off,
573710736 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
573810737 for (i = off; i < off + cnt - 1; i++) {
573910738 /* Expand insni[off]'s seen count to the patched range. */
574010739 new_data[i].seen = old_seen;
10740
+ new_data[i].zext_dst = insn_has_def32(env, insn + i);
574110741 }
574210742 env->insn_aux_data = new_data;
574310743 vfree(old_data);
5744
- return 0;
574510744 }
574610745
574710746 static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
....@@ -5758,18 +10757,193 @@
575810757 }
575910758 }
576010759
10760
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
10761
+{
10762
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
10763
+ int i, sz = prog->aux->size_poke_tab;
10764
+ struct bpf_jit_poke_descriptor *desc;
10765
+
10766
+ for (i = 0; i < sz; i++) {
10767
+ desc = &tab[i];
10768
+ if (desc->insn_idx <= off)
10769
+ continue;
10770
+ desc->insn_idx += len - 1;
10771
+ }
10772
+}
10773
+
576110774 static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
576210775 const struct bpf_insn *patch, u32 len)
576310776 {
576410777 struct bpf_prog *new_prog;
10778
+ struct bpf_insn_aux_data *new_data = NULL;
10779
+
10780
+ if (len > 1) {
10781
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
10782
+ sizeof(struct bpf_insn_aux_data)));
10783
+ if (!new_data)
10784
+ return NULL;
10785
+ }
576510786
576610787 new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
5767
- if (!new_prog)
10788
+ if (IS_ERR(new_prog)) {
10789
+ if (PTR_ERR(new_prog) == -ERANGE)
10790
+ verbose(env,
10791
+ "insn %d cannot be patched due to 16-bit range\n",
10792
+ env->insn_aux_data[off].orig_idx);
10793
+ vfree(new_data);
576810794 return NULL;
5769
- if (adjust_insn_aux_data(env, new_prog->len, off, len))
5770
- return NULL;
10795
+ }
10796
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
577110797 adjust_subprog_starts(env, off, len);
10798
+ adjust_poke_descs(new_prog, off, len);
577210799 return new_prog;
10800
+}
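Patching one instruction into a len-instruction sequence shifts every recorded index after the patch point by len - 1, which is all adjust_subprog_starts() and adjust_poke_descs() above have to do. A self-contained sketch of that arithmetic (adjust_idx() is an illustrative helper, not a kernel function):

#include <assert.h>
#include <stdio.h>

/* Replacing the single insn at 'off' with 'len' insns shifts every recorded
 * index strictly greater than 'off' by (len - 1).  Indices at or before
 * 'off' still point at the same (or the first patched) insn.
 */
static unsigned int adjust_idx(unsigned int idx, unsigned int off, unsigned int len)
{
	return idx <= off ? idx : idx + len - 1;
}

int main(void)
{
	/* one insn at index 3 patched into a 4-insn sequence */
	unsigned int off = 3, len = 4;

	assert(adjust_idx(2, off, len) == 2);	/* before the patch: unchanged   */
	assert(adjust_idx(3, off, len) == 3);	/* the patched slot itself       */
	assert(adjust_idx(4, off, len) == 7);	/* after the patch: shifted by 3 */
	printf("index 5 after patching -> %u\n", adjust_idx(5, off, len));
	return 0;
}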
10801
+
10802
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
10803
+ u32 off, u32 cnt)
10804
+{
10805
+ int i, j;
10806
+
10807
+ /* find first prog starting at or after off (first to remove) */
10808
+ for (i = 0; i < env->subprog_cnt; i++)
10809
+ if (env->subprog_info[i].start >= off)
10810
+ break;
10811
+ /* find first prog starting at or after off + cnt (first to stay) */
10812
+ for (j = i; j < env->subprog_cnt; j++)
10813
+ if (env->subprog_info[j].start >= off + cnt)
10814
+ break;
10815
+ /* if j doesn't start exactly at off + cnt, we are just removing
10816
+ * the front of previous prog
10817
+ */
10818
+ if (env->subprog_info[j].start != off + cnt)
10819
+ j--;
10820
+
10821
+ if (j > i) {
10822
+ struct bpf_prog_aux *aux = env->prog->aux;
10823
+ int move;
10824
+
10825
+ /* move fake 'exit' subprog as well */
10826
+ move = env->subprog_cnt + 1 - j;
10827
+
10828
+ memmove(env->subprog_info + i,
10829
+ env->subprog_info + j,
10830
+ sizeof(*env->subprog_info) * move);
10831
+ env->subprog_cnt -= j - i;
10832
+
10833
+ /* remove func_info */
10834
+ if (aux->func_info) {
10835
+ move = aux->func_info_cnt - j;
10836
+
10837
+ memmove(aux->func_info + i,
10838
+ aux->func_info + j,
10839
+ sizeof(*aux->func_info) * move);
10840
+ aux->func_info_cnt -= j - i;
10841
+ /* func_info->insn_off is set after all code rewrites,
10842
+ * in adjust_btf_func() - no need to adjust
10843
+ */
10844
+ }
10845
+ } else {
10846
+ /* convert i from "first prog to remove" to "first to adjust" */
10847
+ if (env->subprog_info[i].start == off)
10848
+ i++;
10849
+ }
10850
+
10851
+ /* update fake 'exit' subprog as well */
10852
+ for (; i <= env->subprog_cnt; i++)
10853
+ env->subprog_info[i].start -= cnt;
10854
+
10855
+ return 0;
10856
+}
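As a rough picture of the start fixup above when the window [off, off + cnt) is deleted: starts before the window stay put, a start inside the window collapses onto off (only the subprog's front was removed), and starts at or after the window move left by cnt. A toy model of just that arithmetic, ignoring the func_info bookkeeping and the removal of fully-deleted subprogs (adjust_start() is illustrative):

#include <stdio.h>

static unsigned int adjust_start(unsigned int start, unsigned int off, unsigned int cnt)
{
	if (start < off)
		return start;		/* untouched by the removal          */
	if (start < off + cnt)
		return off;		/* front of this subprog was removed */
	return start - cnt;		/* everything after shifts left      */
}

int main(void)
{
	unsigned int starts[] = { 0, 5, 12, 20 };	/* hypothetical subprog starts */
	unsigned int off = 10, cnt = 4;			/* remove insns [10, 14)       */

	for (unsigned int i = 0; i < 4; i++)
		printf("start %u -> %u\n", starts[i], adjust_start(starts[i], off, cnt));
	return 0;
}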
10857
+
10858
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
10859
+ u32 cnt)
10860
+{
10861
+ struct bpf_prog *prog = env->prog;
10862
+ u32 i, l_off, l_cnt, nr_linfo;
10863
+ struct bpf_line_info *linfo;
10864
+
10865
+ nr_linfo = prog->aux->nr_linfo;
10866
+ if (!nr_linfo)
10867
+ return 0;
10868
+
10869
+ linfo = prog->aux->linfo;
10870
+
10871
+ /* find first line info to remove, count lines to be removed */
10872
+ for (i = 0; i < nr_linfo; i++)
10873
+ if (linfo[i].insn_off >= off)
10874
+ break;
10875
+
10876
+ l_off = i;
10877
+ l_cnt = 0;
10878
+ for (; i < nr_linfo; i++)
10879
+ if (linfo[i].insn_off < off + cnt)
10880
+ l_cnt++;
10881
+ else
10882
+ break;
10883
+
10884
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
10885
+ * last removed linfo. prog is already modified, so prog->len == off
10886
+ * means no live instructions after (tail of the program was removed).
10887
+ */
10888
+ if (prog->len != off && l_cnt &&
10889
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
10890
+ l_cnt--;
10891
+ linfo[--i].insn_off = off + cnt;
10892
+ }
10893
+
10894
+ /* remove the line info which refer to the removed instructions */
10895
+ if (l_cnt) {
10896
+ memmove(linfo + l_off, linfo + i,
10897
+ sizeof(*linfo) * (nr_linfo - i));
10898
+
10899
+ prog->aux->nr_linfo -= l_cnt;
10900
+ nr_linfo = prog->aux->nr_linfo;
10901
+ }
10902
+
10903
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
10904
+ for (i = l_off; i < nr_linfo; i++)
10905
+ linfo[i].insn_off -= cnt;
10906
+
10907
+ /* fix up all subprogs (incl. 'exit') which start >= off */
10908
+ for (i = 0; i <= env->subprog_cnt; i++)
10909
+ if (env->subprog_info[i].linfo_idx > l_off) {
10910
+ /* program may have started in the removed region but
10911
+ * may not be fully removed
10912
+ */
10913
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
10914
+ env->subprog_info[i].linfo_idx -= l_cnt;
10915
+ else
10916
+ env->subprog_info[i].linfo_idx = l_off;
10917
+ }
10918
+
10919
+ return 0;
10920
+}
10921
+
10922
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
10923
+{
10924
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
10925
+ unsigned int orig_prog_len = env->prog->len;
10926
+ int err;
10927
+
10928
+ if (bpf_prog_is_dev_bound(env->prog->aux))
10929
+ bpf_prog_offload_remove_insns(env, off, cnt);
10930
+
10931
+ err = bpf_remove_insns(env->prog, off, cnt);
10932
+ if (err)
10933
+ return err;
10934
+
10935
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
10936
+ if (err)
10937
+ return err;
10938
+
10939
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
10940
+ if (err)
10941
+ return err;
10942
+
10943
+ memmove(aux_data + off, aux_data + off + cnt,
10944
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
10945
+
10946
+ return 0;
577310947 }
577410948
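verifier_remove_insns() keeps all the per-insn bookkeeping in step: notify the offload driver, drop the instructions, fix up subprog starts and line info, and finally close the gap in insn_aux_data with one memmove. A user-space sketch of that final splice (the int array stands in for the aux_data entries):

#include <stdio.h>
#include <string.h>

int main(void)
{
	int aux[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };	/* stand-in for insn_aux_data */
	unsigned int len = 8, off = 2, cnt = 3;		/* drop slots 2, 3 and 4      */

	/* same shape as the memmove at the end of verifier_remove_insns() */
	memmove(aux + off, aux + off + cnt, sizeof(*aux) * (len - off - cnt));
	len -= cnt;

	for (unsigned int i = 0; i < len; i++)
		printf("%d ", aux[i]);			/* prints: 0 1 5 6 7 */
	printf("\n");
	return 0;
}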
577510949 /* The verifier does more data flow analysis than llvm and will not
....@@ -5795,11 +10969,177 @@
579510969 if (aux_data[i].seen)
579610970 continue;
579710971 memcpy(insn + i, &trap, sizeof(trap));
10972
+ aux_data[i].zext_dst = false;
579810973 }
579910974 }
580010975
5801
-/* convert load instructions that access fields of 'struct __sk_buff'
5802
- * into sequence of instructions that access fields of 'struct sk_buff'
10976
+static bool insn_is_cond_jump(u8 code)
+{
+	u8 op;
+
+	if (BPF_CLASS(code) == BPF_JMP32)
+		return true;
+
+	if (BPF_CLASS(code) != BPF_JMP)
+		return false;
+
+	op = BPF_OP(code);
+	return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
10989
+
10990
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
10991
+{
10992
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
10993
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
10994
+ struct bpf_insn *insn = env->prog->insnsi;
10995
+ const int insn_cnt = env->prog->len;
10996
+ int i;
10997
+
10998
+ for (i = 0; i < insn_cnt; i++, insn++) {
10999
+ if (!insn_is_cond_jump(insn->code))
11000
+ continue;
11001
+
11002
+ if (!aux_data[i + 1].seen)
11003
+ ja.off = insn->off;
11004
+ else if (!aux_data[i + 1 + insn->off].seen)
11005
+ ja.off = 0;
11006
+ else
11007
+ continue;
11008
+
11009
+ if (bpf_prog_is_dev_bound(env->prog->aux))
11010
+ bpf_prog_offload_replace_insn(env, i, &ja);
11011
+
11012
+ memcpy(insn, &ja, sizeof(ja));
11013
+ }
11014
+}
11015
+
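The ja.off selection in opt_hard_wire_dead_code_branches() picks whichever successor the verifier actually visited. A compact model of that decision (hard_wire_off() is an illustrative name; the seen flags correspond to aux_data[].seen):

#include <stdbool.h>
#include <stdio.h>

/* A conditional jump at index i either falls through to i + 1 or jumps to
 * i + 1 + off.  If the verifier never reached one of those successors, the
 * insn can become an unconditional BPF_JA toward the live one.  Returns the
 * JA offset to use, or -1 when both successors are live and the insn stays.
 */
static int hard_wire_off(bool fallthrough_seen, bool target_seen, int jmp_off)
{
	if (!fallthrough_seen)
		return jmp_off;	/* always take the branch              */
	if (!target_seen)
		return 0;	/* always fall through (JA +0, a nop)  */
	return -1;		/* both live: keep the conditional jump */
}

int main(void)
{
	printf("dead fallthrough -> ja.off = %d\n", hard_wire_off(false, true, 5));
	printf("dead jump target -> ja.off = %d\n", hard_wire_off(true, false, 5));
	printf("both successors  -> keep   = %d\n", hard_wire_off(true, true, 5));
	return 0;
}

The JA +0 case is deliberately left behind as a nop; opt_remove_nops() below picks those up in a second pass.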
11016
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
11017
+{
11018
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
11019
+ int insn_cnt = env->prog->len;
11020
+ int i, err;
11021
+
11022
+ for (i = 0; i < insn_cnt; i++) {
11023
+ int j;
11024
+
11025
+ j = 0;
11026
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
11027
+ j++;
11028
+ if (!j)
11029
+ continue;
11030
+
11031
+ err = verifier_remove_insns(env, i, j);
11032
+ if (err)
11033
+ return err;
11034
+ insn_cnt = env->prog->len;
11035
+ }
11036
+
11037
+ return 0;
11038
+}
11039
+
11040
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+	const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		if (memcmp(&insn[i], &ja, sizeof(ja)))
+			continue;
+
+		err = verifier_remove_insns(env, i, 1);
+		if (err)
+			return err;
+		insn_cnt--;
+		i--;
+	}
+
+	return 0;
+}
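opt_remove_nops() treats any instruction that is bit-for-bit identical to BPF_JMP_IMM(BPF_JA, 0, 0, 0) as removable. A stand-alone illustration of that memcmp() test (struct toy_insn and TOY_JA are stand-ins, not kernel definitions):

#include <stdio.h>
#include <string.h>

struct toy_insn {
	unsigned char code;
	unsigned char regs;
	short off;
	int imm;
};

#define TOY_JA 0x05	/* stands in for BPF_JMP | BPF_JA */

int main(void)
{
	const struct toy_insn nop = { .code = TOY_JA, .off = 0 };
	struct toy_insn prog[] = {
		{ .code = TOY_JA, .off = 0 },	/* removable nop        */
		{ .code = TOY_JA, .off = 3 },	/* real jump, not a nop */
	};

	for (unsigned int i = 0; i < 2; i++)
		printf("insn %u is %sa nop\n", i,
		       memcmp(&prog[i], &nop, sizeof(nop)) ? "not " : "");
	return 0;
}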
11060
+
11061
+static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
11062
+ const union bpf_attr *attr)
11063
+{
11064
+ struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
11065
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
11066
+ int i, patch_len, delta = 0, len = env->prog->len;
11067
+ struct bpf_insn *insns = env->prog->insnsi;
11068
+ struct bpf_prog *new_prog;
11069
+ bool rnd_hi32;
11070
+
11071
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
11072
+ zext_patch[1] = BPF_ZEXT_REG(0);
11073
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
11074
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
11075
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
11076
+ for (i = 0; i < len; i++) {
11077
+ int adj_idx = i + delta;
11078
+ struct bpf_insn insn;
11079
+
11080
+ insn = insns[adj_idx];
11081
+ if (!aux[adj_idx].zext_dst) {
11082
+ u8 code, class;
11083
+ u32 imm_rnd;
11084
+
11085
+ if (!rnd_hi32)
11086
+ continue;
11087
+
11088
+ code = insn.code;
11089
+ class = BPF_CLASS(code);
11090
+ if (insn_no_def(&insn))
11091
+ continue;
11092
+
11093
+ /* NOTE: arg "reg" (the fourth one) is only used for
11094
+ * BPF_STX which has been ruled out in above
11095
+ * check, it is safe to pass NULL here.
11096
+ */
11097
+ if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
11098
+ if (class == BPF_LD &&
11099
+ BPF_MODE(code) == BPF_IMM)
11100
+ i++;
11101
+ continue;
11102
+ }
11103
+
11104
+ /* ctx load could be transformed into wider load. */
11105
+ if (class == BPF_LDX &&
11106
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
11107
+ continue;
11108
+
11109
+ imm_rnd = get_random_int();
11110
+ rnd_hi32_patch[0] = insn;
11111
+ rnd_hi32_patch[1].imm = imm_rnd;
11112
+ rnd_hi32_patch[3].dst_reg = insn.dst_reg;
11113
+ patch = rnd_hi32_patch;
11114
+ patch_len = 4;
11115
+ goto apply_patch_buffer;
11116
+ }
11117
+
11118
+ if (!bpf_jit_needs_zext())
11119
+ continue;
11120
+
11121
+ zext_patch[0] = insn;
11122
+ zext_patch[1].dst_reg = insn.dst_reg;
11123
+ zext_patch[1].src_reg = insn.dst_reg;
11124
+ patch = zext_patch;
11125
+ patch_len = 2;
11126
+apply_patch_buffer:
11127
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
11128
+ if (!new_prog)
11129
+ return -ENOMEM;
11130
+ env->prog = new_prog;
11131
+ insns = new_prog->insnsi;
11132
+ aux = env->insn_aux_data;
11133
+ delta += patch_len - 1;
11134
+ }
11135
+
11136
+ return 0;
11137
+}
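For readers skimming the two patch buffers built above: zext_patch forces the destination back to its low 32 bits when the JIT asks for explicit zero-extension, while the BPF_F_TEST_RND_HI32 variant deliberately dirties the upper half so latent assumptions about it being zero surface during testing. A rough user-space model of both effects (zext32() and poison_hi32() are illustrative names; the sample values are made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t zext32(uint64_t dst)
{
	return (uint32_t)dst;			/* what BPF_ZEXT_REG guarantees */
}

static uint64_t poison_hi32(uint64_t dst, uint32_t rnd)
{
	/* MOV ax, rnd; LSH ax, 32; OR dst, ax -- the rnd_hi32 patch */
	return dst | ((uint64_t)rnd << 32);
}

int main(void)
{
	uint64_t dst = 0x00000000deadbeefULL;	/* result of some 32-bit ALU op */

	printf("zext    : 0x%016" PRIx64 "\n", zext32(dst));
	printf("rnd hi32: 0x%016" PRIx64 "\n", poison_hi32(dst, 0x12345678u));
	return 0;
}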
11138
+
11139
+/* convert load instructions that access fields of a context type into a
11140
+ * sequence of instructions that access fields of the underlying structure:
11141
+ * struct __sk_buff -> struct sk_buff
11142
+ * struct bpf_sock_ops -> struct sock
580311143 */
580411144 static int convert_ctx_accesses(struct bpf_verifier_env *env)
580511145 {
....@@ -5812,7 +11152,11 @@
581211152 enum bpf_access_type type;
581311153 bool is_narrower_load;
581411154
5815
- if (ops->gen_prologue) {
11155
+ if (ops->gen_prologue || env->seen_direct_write) {
11156
+ if (!ops->gen_prologue) {
11157
+ verbose(env, "bpf verifier is misconfigured\n");
11158
+ return -EINVAL;
11159
+ }
581611160 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
581711161 env->prog);
581811162 if (cnt >= ARRAY_SIZE(insn_buf)) {
....@@ -5828,12 +11172,13 @@
582811172 }
582911173 }
583011174
5831
- if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
11175
+ if (bpf_prog_is_dev_bound(env->prog->aux))
583211176 return 0;
583311177
583411178 insn = env->prog->insnsi + delta;
583511179
583611180 for (i = 0; i < insn_cnt; i++, insn++) {
11181
+ bpf_convert_ctx_access_t convert_ctx_access;
583711182 bool ctx_access;
583811183
583911184 if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
....@@ -5877,8 +11222,35 @@
587711222 if (!ctx_access)
587811223 continue;
587911224
5880
- if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
11225
+ switch (env->insn_aux_data[i + delta].ptr_type) {
11226
+ case PTR_TO_CTX:
11227
+ if (!ops->convert_ctx_access)
11228
+ continue;
11229
+ convert_ctx_access = ops->convert_ctx_access;
11230
+ break;
11231
+ case PTR_TO_SOCKET:
11232
+ case PTR_TO_SOCK_COMMON:
11233
+ convert_ctx_access = bpf_sock_convert_ctx_access;
11234
+ break;
11235
+ case PTR_TO_TCP_SOCK:
11236
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
11237
+ break;
11238
+ case PTR_TO_XDP_SOCK:
11239
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
11240
+ break;
11241
+ case PTR_TO_BTF_ID:
11242
+ if (type == BPF_READ) {
11243
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
11244
+ BPF_SIZE((insn)->code);
11245
+ env->prog->aux->num_exentries++;
11246
+ } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
11247
+ verbose(env, "Writes through BTF pointers are not allowed\n");
11248
+ return -EINVAL;
11249
+ }
588111250 continue;
11251
+ default:
11252
+ continue;
11253
+ }
588211254
588311255 ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
588411256 size = BPF_LDST_BYTES(insn);
....@@ -5910,8 +11282,8 @@
591011282 }
591111283
591211284 target_size = 0;
5913
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
5914
- &target_size);
11285
+ cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
11286
+ &target_size);
591511287 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
591611288 (ctx_field_size && !target_size)) {
591711289 verbose(env, "bpf verifier is misconfigured\n");
....@@ -5919,8 +11291,12 @@
591911291 }
592011292
592111293 if (is_narrower_load && size < target_size) {
5922
- u8 shift = (off & (size_default - 1)) * 8;
5923
-
11294
+ u8 shift = bpf_ctx_narrow_access_offset(
11295
+ off, size, size_default) * 8;
11296
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
11297
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
11298
+ return -EINVAL;
11299
+ }
592411300 if (ctx_field_size <= 4) {
592511301 if (shift)
592611302 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
....@@ -5933,7 +11309,7 @@
593311309 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
593411310 insn->dst_reg,
593511311 shift);
5936
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
11312
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
593711313 (1ULL << size * 8) - 1);
593811314 }
593911315 }
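The narrow-load rewrite above always loads the full context field and then shifts and masks the result down to the requested width. A little-endian user-space approximation of that arithmetic (narrow_load() is illustrative; bpf_ctx_narrow_access_offset() also covers the big-endian layout, which this sketch does not):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t narrow_load(uint64_t field, unsigned int off_in_field,
			    unsigned int size)
{
	unsigned int shift = off_in_field * 8;		/* little-endian only */
	uint64_t mask = (1ULL << (size * 8)) - 1;	/* size is 1, 2 or 4  */

	return (field >> shift) & mask;
}

int main(void)
{
	uint64_t field = 0x1122334455667788ULL;	/* full 8-byte ctx field */

	assert(narrow_load(field, 0, 4) == 0x55667788ULL);
	assert(narrow_load(field, 4, 2) == 0x3344ULL);
	printf("byte at offset 6: 0x%02llx\n",
	       (unsigned long long)narrow_load(field, 6, 1));
	return 0;
}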
....@@ -5956,9 +11332,10 @@
595611332 {
595711333 struct bpf_prog *prog = env->prog, **func, *tmp;
595811334 int i, j, subprog_start, subprog_end = 0, len, subprog;
11335
+ struct bpf_map *map_ptr;
595911336 struct bpf_insn *insn;
596011337 void *old_bpf_func;
5961
- int err = -ENOMEM;
11338
+ int err, num_exentries;
596211339
596311340 if (env->subprog_cnt <= 1)
596411341 return 0;
....@@ -5989,6 +11366,11 @@
598911366 insn->imm = 1;
599011367 }
599111368
11369
+ err = bpf_prog_alloc_jited_linfo(prog);
11370
+ if (err)
11371
+ goto out_undo_insn;
11372
+
11373
+ err = -ENOMEM;
599211374 func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
599311375 if (!func)
599411376 goto out_undo_insn;
....@@ -5998,7 +11380,12 @@
599811380 subprog_end = env->subprog_info[i + 1].start;
599911381
600011382 len = subprog_end - subprog_start;
6001
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
11383
+ /* BPF_PROG_RUN doesn't call subprogs directly,
11384
+ * hence main prog stats include the runtime of subprogs.
11385
+ * subprogs don't have IDs and not reachable via prog_get_next_id
11386
+ * func[i]->aux->stats will never be accessed and stays NULL
11387
+ */
11388
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
600211389 if (!func[i])
600311390 goto out_free;
600411391 memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
....@@ -6008,12 +11395,53 @@
600811395 if (bpf_prog_calc_tag(func[i]))
600911396 goto out_free;
601011397 func[i]->is_func = 1;
6011
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
6012
- * Long term would need debug info to populate names
6013
- */
11398
+ func[i]->aux->func_idx = i;
11399
+ /* the btf and func_info will be freed only at prog->aux */
11400
+ func[i]->aux->btf = prog->aux->btf;
11401
+ func[i]->aux->func_info = prog->aux->func_info;
11402
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
11403
+
11404
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
11405
+ u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
11406
+ int ret;
11407
+
11408
+ if (!(insn_idx >= subprog_start &&
11409
+ insn_idx <= subprog_end))
11410
+ continue;
11411
+
11412
+ ret = bpf_jit_add_poke_descriptor(func[i],
11413
+ &prog->aux->poke_tab[j]);
11414
+ if (ret < 0) {
11415
+ verbose(env, "adding tail call poke descriptor failed\n");
11416
+ goto out_free;
11417
+ }
11418
+
11419
+ func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
11420
+
11421
+ map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
11422
+ ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
11423
+ if (ret < 0) {
11424
+ verbose(env, "tracking tail call prog failed\n");
11425
+ goto out_free;
11426
+ }
11427
+ }
11428
+
601411429 func[i]->aux->name[0] = 'F';
601511430 func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
601611431 func[i]->jit_requested = 1;
11432
+ func[i]->aux->linfo = prog->aux->linfo;
11433
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
11434
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
11435
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
11436
+ num_exentries = 0;
11437
+ insn = func[i]->insnsi;
11438
+ for (j = 0; j < func[i]->len; j++, insn++) {
11439
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
11440
+ BPF_MODE(insn->code) == BPF_PROBE_MEM)
11441
+ num_exentries++;
11442
+ }
11443
+ func[i]->aux->num_exentries = num_exentries;
11444
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
601711445 func[i] = bpf_int_jit_compile(func[i]);
601811446 if (!func[i]->jited) {
601911447 err = -ENOTSUPP;
....@@ -6021,6 +11449,19 @@
602111449 }
602211450 cond_resched();
602311451 }
11452
+
11453
+ /* Untrack main program's aux structs so that during map_poke_run()
11454
+ * we will not stumble upon the unfilled poke descriptors; each
11455
+ * of the main program's poke descs got distributed across subprogs
11456
+ * and got tracked onto map, so we are sure that none of them will
11457
+ * be missed after the operation below
11458
+ */
11459
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
11460
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
11461
+
11462
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
11463
+ }
11464
+
602411465 /* at this point all bpf functions were successfully JITed
602511466 * now populate all bpf_calls with correct addresses and
602611467 * run last pass of JIT
....@@ -6032,9 +11473,8 @@
603211473 insn->src_reg != BPF_PSEUDO_CALL)
603311474 continue;
603411475 subprog = insn->off;
6035
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
6036
- func[subprog]->bpf_func -
6037
- __bpf_call_base;
11476
+ insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
11477
+ __bpf_call_base;
603811478 }
603911479
604011480 /* we use the aux data to keep a list of the start addresses
....@@ -6087,11 +11527,19 @@
608711527 prog->bpf_func = func[0]->bpf_func;
608811528 prog->aux->func = func;
608911529 prog->aux->func_cnt = env->subprog_cnt;
11530
+ bpf_prog_free_unused_jited_linfo(prog);
609011531 return 0;
609111532 out_free:
6092
- for (i = 0; i < env->subprog_cnt; i++)
6093
- if (func[i])
6094
- bpf_jit_free(func[i]);
11533
+ for (i = 0; i < env->subprog_cnt; i++) {
11534
+ if (!func[i])
11535
+ continue;
11536
+
11537
+ for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
11538
+ map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
11539
+ map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
11540
+ }
11541
+ bpf_jit_free(func[i]);
11542
+ }
609511543 kfree(func);
609611544 out_undo_insn:
609711545 /* cleanup main prog to be interpreted */
....@@ -6103,6 +11551,7 @@
610311551 insn->off = 0;
610411552 insn->imm = env->insn_aux_data[i].call_imm;
610511553 }
11554
+ bpf_prog_free_jited_linfo(prog);
610611555 return err;
610711556 }
610811557
....@@ -6113,10 +11562,10 @@
611311562 struct bpf_insn *insn = prog->insnsi;
611411563 int i, depth;
611511564 #endif
6116
- int err;
11565
+ int err = 0;
611711566
6118
- err = 0;
6119
- if (env->prog->jit_requested) {
11567
+ if (env->prog->jit_requested &&
11568
+ !bpf_prog_is_dev_bound(env->prog->aux)) {
612011569 err = jit_subprogs(env);
612111570 if (err == 0)
612211571 return 0;
....@@ -6124,6 +11573,13 @@
612411573 return err;
612511574 }
612611575 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
11576
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
11577
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
11578
+ * have to be rejected, since interpreter doesn't support them yet.
11579
+ */
11580
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
11581
+ return -EINVAL;
11582
+ }
612711583 for (i = 0; i < prog->len; i++, insn++) {
612811584 if (insn->code != (BPF_JMP | BPF_CALL) ||
612911585 insn->src_reg != BPF_PSEUDO_CALL)
....@@ -6146,6 +11602,7 @@
614611602 static int fixup_bpf_calls(struct bpf_verifier_env *env)
614711603 {
614811604 struct bpf_prog *prog = env->prog;
11605
+ bool expect_blinding = bpf_jit_blinding_enabled(prog);
614911606 struct bpf_insn *insn = prog->insnsi;
615011607 const struct bpf_func_proto *fn;
615111608 const int insn_cnt = prog->len;
....@@ -6154,7 +11611,7 @@
615411611 struct bpf_insn insn_buf[16];
615511612 struct bpf_prog *new_prog;
615611613 struct bpf_map *map_ptr;
6157
- int i, cnt, delta = 0;
11614
+ int i, ret, cnt, delta = 0;
615811615
615911616 for (i = 0; i < insn_cnt; i++, insn++) {
616011617 if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
....@@ -6162,31 +11619,30 @@
 		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
 			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-			struct bpf_insn mask_and_div[] = {
-				BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg),
+			bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+			struct bpf_insn *patchlet;
+			struct bpf_insn chk_and_div[] = {
 				/* [R,W]x div 0 -> 0 */
-				BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 2),
-				BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX),
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JNE | BPF_K, insn->src_reg,
+					     0, 2, 0),
+				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
 				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-				BPF_ALU_REG(BPF_CLASS(insn->code), BPF_XOR, insn->dst_reg, insn->dst_reg),
+				*insn,
 			};
-			struct bpf_insn mask_and_mod[] = {
-				BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg),
-				BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1 + (is64 ? 0 : 1)),
-				BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX),
+			struct bpf_insn chk_and_mod[] = {
+				/* [R,W]x mod 0 -> [R,W]x */
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JEQ | BPF_K, insn->src_reg,
+					     0, 1 + (is64 ? 0 : 1), 0),
+				*insn,
 				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
 				BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
 			};
-			struct bpf_insn *patchlet;
 
-			if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
-			    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-				patchlet = mask_and_div;
-				cnt = ARRAY_SIZE(mask_and_div);
-			} else {
-				patchlet = mask_and_mod;
-				cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 2 : 0);
-			}
+			patchlet = isdiv ? chk_and_div : chk_and_mod;
+			cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+			      ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
 
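In case it helps to see the end behaviour the two patchlets encode: eBPF defines x / 0 as 0 and x % 0 as x (with the 32-bit variants truncating the destination), exactly as the comments above spell out. A small user-space model of those semantics (bpf_div64() and bpf_mod32() are illustrative names, not kernel helpers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bpf_div64(uint64_t dst, uint64_t src)
{
	return src ? dst / src : 0;		/* [R,W]x div 0 -> 0 */
}

static uint64_t bpf_mod32(uint64_t dst, uint64_t src)
{
	uint32_t d = (uint32_t)dst, s = (uint32_t)src;

	return s ? d % s : d;	/* BPF_MOV32_REG truncates dst when src == 0 */
}

int main(void)
{
	assert(bpf_div64(42, 0) == 0);
	assert(bpf_div64(42, 5) == 8);
	assert(bpf_mod32(0x100000007ULL, 0) == 7);	/* upper half dropped */
	assert(bpf_mod32(42, 5) == 2);
	printf("div/mod-by-zero model OK\n");
	return 0;
}

The "- (is64 ? 2 : 0)" adjustment above simply drops the trailing JA/MOV32 pair, since a 64-bit mod by zero leaves the destination untouched and needs no truncation.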
619111647 new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
619211648 if (!new_prog)
....@@ -6288,7 +11744,9 @@
628811744 * the program array.
628911745 */
629011746 prog->cb_access = 1;
6291
- env->prog->aux->stack_depth = MAX_BPF_STACK;
11747
+ if (!allow_tail_call_in_subprogs(env))
11748
+ prog->aux->stack_depth = MAX_BPF_STACK;
11749
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
629211750
 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpreter for every normal
....@@ -6299,6 +11757,28 @@
629911757 insn->code = BPF_JMP | BPF_TAIL_CALL;
630011758
630111759 aux = &env->insn_aux_data[i + delta];
11760
+ if (env->bpf_capable && !expect_blinding &&
11761
+ prog->jit_requested &&
11762
+ !bpf_map_key_poisoned(aux) &&
11763
+ !bpf_map_ptr_poisoned(aux) &&
11764
+ !bpf_map_ptr_unpriv(aux)) {
11765
+ struct bpf_jit_poke_descriptor desc = {
11766
+ .reason = BPF_POKE_REASON_TAIL_CALL,
11767
+ .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
11768
+ .tail_call.key = bpf_map_key_immediate(aux),
11769
+ .insn_idx = i + delta,
11770
+ };
11771
+
11772
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
11773
+ if (ret < 0) {
11774
+ verbose(env, "adding tail call poke descriptor failed\n");
11775
+ return ret;
11776
+ }
11777
+
11778
+ insn->imm = ret + 1;
11779
+ continue;
11780
+ }
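A sketch of the imm convention used in the branch above for direct tail calls: the poke descriptor's slot number is stored plus one, so that imm == 0 keeps meaning "no descriptor, take the indirect path". struct toy_poke_desc, add_poke() and MAX_POKES below are illustrative stand-ins, not kernel definitions:

#include <assert.h>
#include <stdio.h>

struct toy_poke_desc {
	unsigned int insn_idx;
	unsigned int key;
};

#define MAX_POKES 4

static struct toy_poke_desc poke_tab[MAX_POKES];
static unsigned int poke_cnt;

/* returns the new slot index, mirroring bpf_jit_add_poke_descriptor() */
static int add_poke(struct toy_poke_desc desc)
{
	if (poke_cnt == MAX_POKES)
		return -1;
	poke_tab[poke_cnt] = desc;
	return (int)poke_cnt++;
}

int main(void)
{
	struct toy_poke_desc desc = { .insn_idx = 17, .key = 3 };
	int ret = add_poke(desc);
	int imm;

	assert(ret == 0);
	imm = ret + 1;				/* what fixup_bpf_calls() stores */
	assert(poke_tab[imm - 1].key == 3);	/* what the JIT reads back later */
	printf("insn->imm = %d -> poke slot %d\n", imm, imm - 1);
	return 0;
}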
11781
+
630211782 if (!bpf_map_ptr_unpriv(aux))
630311783 continue;
630411784
....@@ -6313,7 +11793,7 @@
631311793 return -EINVAL;
631411794 }
631511795
6316
- map_ptr = BPF_MAP_PTR(aux->map_state);
11796
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
631711797 insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
631811798 map_ptr->max_entries, 2);
631911799 insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
....@@ -6339,17 +11819,22 @@
633911819 if (prog->jit_requested && BITS_PER_LONG == 64 &&
634011820 (insn->imm == BPF_FUNC_map_lookup_elem ||
634111821 insn->imm == BPF_FUNC_map_update_elem ||
6342
- insn->imm == BPF_FUNC_map_delete_elem)) {
11822
+ insn->imm == BPF_FUNC_map_delete_elem ||
11823
+ insn->imm == BPF_FUNC_map_push_elem ||
11824
+ insn->imm == BPF_FUNC_map_pop_elem ||
11825
+ insn->imm == BPF_FUNC_map_peek_elem)) {
634311826 aux = &env->insn_aux_data[i + delta];
634411827 if (bpf_map_ptr_poisoned(aux))
634511828 goto patch_call_imm;
634611829
6347
- map_ptr = BPF_MAP_PTR(aux->map_state);
11830
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
634811831 ops = map_ptr->ops;
634911832 if (insn->imm == BPF_FUNC_map_lookup_elem &&
635011833 ops->map_gen_lookup) {
635111834 cnt = ops->map_gen_lookup(map_ptr, insn_buf);
6352
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
11835
+ if (cnt == -EOPNOTSUPP)
11836
+ goto patch_map_ops_generic;
11837
+ if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
635311838 verbose(env, "bpf verifier is misconfigured\n");
635411839 return -EINVAL;
635511840 }
....@@ -6372,6 +11857,14 @@
637211857 BUILD_BUG_ON(!__same_type(ops->map_update_elem,
637311858 (int (*)(struct bpf_map *map, void *key, void *value,
637411859 u64 flags))NULL));
11860
+ BUILD_BUG_ON(!__same_type(ops->map_push_elem,
11861
+ (int (*)(struct bpf_map *map, void *value,
11862
+ u64 flags))NULL));
11863
+ BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
11864
+ (int (*)(struct bpf_map *map, void *value))NULL));
11865
+ BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
11866
+ (int (*)(struct bpf_map *map, void *value))NULL));
11867
+patch_map_ops_generic:
637511868 switch (insn->imm) {
637611869 case BPF_FUNC_map_lookup_elem:
637711870 insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
....@@ -6385,9 +11878,45 @@
638511878 insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
638611879 __bpf_call_base;
638711880 continue;
11881
+ case BPF_FUNC_map_push_elem:
11882
+ insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
11883
+ __bpf_call_base;
11884
+ continue;
11885
+ case BPF_FUNC_map_pop_elem:
11886
+ insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
11887
+ __bpf_call_base;
11888
+ continue;
11889
+ case BPF_FUNC_map_peek_elem:
11890
+ insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
11891
+ __bpf_call_base;
11892
+ continue;
638811893 }
638911894
639011895 goto patch_call_imm;
11896
+ }
11897
+
11898
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
11899
+ insn->imm == BPF_FUNC_jiffies64) {
11900
+ struct bpf_insn ld_jiffies_addr[2] = {
11901
+ BPF_LD_IMM64(BPF_REG_0,
11902
+ (unsigned long)&jiffies),
11903
+ };
11904
+
11905
+ insn_buf[0] = ld_jiffies_addr[0];
11906
+ insn_buf[1] = ld_jiffies_addr[1];
11907
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
11908
+ BPF_REG_0, 0);
11909
+ cnt = 3;
11910
+
11911
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
11912
+ cnt);
11913
+ if (!new_prog)
11914
+ return -ENOMEM;
11915
+
11916
+ delta += cnt - 1;
11917
+ env->prog = prog = new_prog;
11918
+ insn = new_prog->insnsi + i + delta;
11919
+ continue;
639111920 }
639211921
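The three-instruction buffer above replaces the bpf_jiffies64() helper call with an address load plus an 8-byte read. A user-space analogue of that transformation (the local jiffies variable and helper_jiffies64() are stand-ins for the kernel's):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static volatile uint64_t jiffies = 4294937296ULL;	/* arbitrary sample value */

static uint64_t helper_jiffies64(void)			/* the old, call-based path */
{
	return jiffies;
}

int main(void)
{
	const volatile uint64_t *p = &jiffies;		/* BPF_LD_IMM64(R0, &jiffies)     */
	uint64_t inlined = *p;				/* BPF_LDX_MEM(BPF_DW, R0, R0, 0) */

	printf("helper : %" PRIu64 "\n", helper_jiffies64());
	printf("inlined: %" PRIu64 "\n", inlined);
	return 0;
}

Skipping the call removes the per-invocation overhead for a helper that only reads one 64-bit counter, which is the point of this rewrite.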
639311922 patch_call_imm:
....@@ -6404,6 +11933,23 @@
640411933 insn->imm = fn->func - __bpf_call_base;
640511934 }
640611935
11936
+ /* Since poke tab is now finalized, publish aux to tracker. */
11937
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
11938
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
11939
+ if (!map_ptr->ops->map_poke_track ||
11940
+ !map_ptr->ops->map_poke_untrack ||
11941
+ !map_ptr->ops->map_poke_run) {
11942
+ verbose(env, "bpf verifier is misconfigured\n");
11943
+ return -EINVAL;
11944
+ }
11945
+
11946
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
11947
+ if (ret < 0) {
11948
+ verbose(env, "tracking tail call prog failed\n");
11949
+ return ret;
11950
+ }
11951
+ }
11952
+
640711953 return 0;
640811954 }
640911955
....@@ -6412,29 +11958,605 @@
641211958 struct bpf_verifier_state_list *sl, *sln;
641311959 int i;
641411960
11961
+ sl = env->free_list;
11962
+ while (sl) {
11963
+ sln = sl->next;
11964
+ free_verifier_state(&sl->state, false);
11965
+ kfree(sl);
11966
+ sl = sln;
11967
+ }
11968
+ env->free_list = NULL;
11969
+
641511970 if (!env->explored_states)
641611971 return;
641711972
6418
- for (i = 0; i < env->prog->len; i++) {
11973
+ for (i = 0; i < state_htab_size(env); i++) {
641911974 sl = env->explored_states[i];
642011975
6421
- if (sl)
6422
- while (sl != STATE_LIST_MARK) {
6423
- sln = sl->next;
6424
- free_verifier_state(&sl->state, false);
6425
- kfree(sl);
6426
- sl = sln;
6427
- }
11976
+ while (sl) {
11977
+ sln = sl->next;
11978
+ free_verifier_state(&sl->state, false);
11979
+ kfree(sl);
11980
+ sl = sln;
11981
+ }
11982
+ env->explored_states[i] = NULL;
642811983 }
6429
-
6430
- kfree(env->explored_states);
643111984 }
643211985
6433
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
11986
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
643411987 {
11988
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
11989
+ struct bpf_verifier_state *state;
11990
+ struct bpf_reg_state *regs;
11991
+ int ret, i;
11992
+
11993
+ env->prev_linfo = NULL;
11994
+ env->pass_cnt++;
11995
+
11996
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
11997
+ if (!state)
11998
+ return -ENOMEM;
11999
+ state->curframe = 0;
12000
+ state->speculative = false;
12001
+ state->branches = 1;
12002
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
12003
+ if (!state->frame[0]) {
12004
+ kfree(state);
12005
+ return -ENOMEM;
12006
+ }
12007
+ env->cur_state = state;
12008
+ init_func_state(env, state->frame[0],
12009
+ BPF_MAIN_FUNC /* callsite */,
12010
+ 0 /* frameno */,
12011
+ subprog);
12012
+
12013
+ state->first_insn_idx = env->subprog_info[subprog].start;
12014
+ state->last_insn_idx = -1;
12015
+
12016
+ regs = state->frame[state->curframe]->regs;
12017
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
12018
+ ret = btf_prepare_func_args(env, subprog, regs);
12019
+ if (ret)
12020
+ goto out;
12021
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
12022
+ if (regs[i].type == PTR_TO_CTX)
12023
+ mark_reg_known_zero(env, regs, i);
12024
+ else if (regs[i].type == SCALAR_VALUE)
12025
+ mark_reg_unknown(env, regs, i);
12026
+ }
12027
+ } else {
12028
+ /* 1st arg to a function */
12029
+ regs[BPF_REG_1].type = PTR_TO_CTX;
12030
+ mark_reg_known_zero(env, regs, BPF_REG_1);
12031
+ ret = btf_check_func_arg_match(env, subprog, regs);
12032
+ if (ret == -EFAULT)
12033
+ /* unlikely verifier bug. abort.
12034
+ * ret == 0 and ret < 0 are sadly acceptable for
12035
+ * main() function due to backward compatibility.
12036
+ * Like socket filter program may be written as:
12037
+ * int bpf_prog(struct pt_regs *ctx)
12038
+ * and never dereference that ctx in the program.
12039
+ * 'struct pt_regs' is a type mismatch for socket
12040
+ * filter that should be using 'struct __sk_buff'.
12041
+ */
12042
+ goto out;
12043
+ }
12044
+
12045
+ ret = do_check(env);
12046
+out:
12047
+ /* check for NULL is necessary, since cur_state can be freed inside
12048
+ * do_check() under memory pressure.
12049
+ */
12050
+ if (env->cur_state) {
12051
+ free_verifier_state(env->cur_state, true);
12052
+ env->cur_state = NULL;
12053
+ }
12054
+ while (!pop_stack(env, NULL, NULL, false));
12055
+ if (!ret && pop_log)
12056
+ bpf_vlog_reset(&env->log, 0);
12057
+ free_states(env);
12058
+ return ret;
12059
+}
12060
+
12061
+/* Verify all global functions in a BPF program one by one based on their BTF.
12062
+ * All global functions must pass verification. Otherwise the whole program is rejected.
12063
+ * Consider:
12064
+ * int bar(int);
12065
+ * int foo(int f)
12066
+ * {
12067
+ * return bar(f);
12068
+ * }
12069
+ * int bar(int b)
12070
+ * {
12071
+ * ...
12072
+ * }
12073
+ * foo() will be verified first for R1=any_scalar_value. During verification it
12074
+ * will be assumed that bar() already verified successfully and call to bar()
12075
+ * from foo() will be checked for type match only. Later bar() will be verified
12076
+ * independently to check that it's safe for R1=any_scalar_value.
12077
+ */
12078
+static int do_check_subprogs(struct bpf_verifier_env *env)
12079
+{
12080
+ struct bpf_prog_aux *aux = env->prog->aux;
12081
+ int i, ret;
12082
+
12083
+ if (!aux->func_info)
12084
+ return 0;
12085
+
12086
+ for (i = 1; i < env->subprog_cnt; i++) {
12087
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
12088
+ continue;
12089
+ env->insn_idx = env->subprog_info[i].start;
12090
+ WARN_ON_ONCE(env->insn_idx == 0);
12091
+ ret = do_check_common(env, i);
12092
+ if (ret) {
12093
+ return ret;
12094
+ } else if (env->log.level & BPF_LOG_LEVEL) {
12095
+ verbose(env,
12096
+ "Func#%d is safe for any args that match its prototype\n",
12097
+ i);
12098
+ }
12099
+ }
12100
+ return 0;
12101
+}
12102
+
12103
+static int do_check_main(struct bpf_verifier_env *env)
12104
+{
12105
+ int ret;
12106
+
12107
+ env->insn_idx = 0;
12108
+ ret = do_check_common(env, 0);
12109
+ if (!ret)
12110
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
12111
+ return ret;
12112
+}
12113
+
12114
+
12115
+static void print_verification_stats(struct bpf_verifier_env *env)
12116
+{
12117
+ int i;
12118
+
12119
+ if (env->log.level & BPF_LOG_STATS) {
12120
+ verbose(env, "verification time %lld usec\n",
12121
+ div_u64(env->verification_time, 1000));
12122
+ verbose(env, "stack depth ");
12123
+ for (i = 0; i < env->subprog_cnt; i++) {
12124
+ u32 depth = env->subprog_info[i].stack_depth;
12125
+
12126
+ verbose(env, "%d", depth);
12127
+ if (i + 1 < env->subprog_cnt)
12128
+ verbose(env, "+");
12129
+ }
12130
+ verbose(env, "\n");
12131
+ }
12132
+ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
12133
+ "total_states %d peak_states %d mark_read %d\n",
12134
+ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
12135
+ env->max_states_per_insn, env->total_states,
12136
+ env->peak_states, env->longest_mark_read_walk);
12137
+}
12138
+
12139
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
12140
+{
12141
+ const struct btf_type *t, *func_proto;
12142
+ const struct bpf_struct_ops *st_ops;
12143
+ const struct btf_member *member;
12144
+ struct bpf_prog *prog = env->prog;
12145
+ u32 btf_id, member_idx;
12146
+ const char *mname;
12147
+
12148
+ if (!prog->gpl_compatible) {
12149
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
12150
+ return -EINVAL;
12151
+ }
12152
+
12153
+ btf_id = prog->aux->attach_btf_id;
12154
+ st_ops = bpf_struct_ops_find(btf_id);
12155
+ if (!st_ops) {
12156
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
12157
+ btf_id);
12158
+ return -ENOTSUPP;
12159
+ }
12160
+
12161
+ t = st_ops->type;
12162
+ member_idx = prog->expected_attach_type;
12163
+ if (member_idx >= btf_type_vlen(t)) {
12164
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
12165
+ member_idx, st_ops->name);
12166
+ return -EINVAL;
12167
+ }
12168
+
12169
+ member = &btf_type_member(t)[member_idx];
12170
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
12171
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
12172
+ NULL);
12173
+ if (!func_proto) {
12174
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
12175
+ mname, member_idx, st_ops->name);
12176
+ return -EINVAL;
12177
+ }
12178
+
12179
+ if (st_ops->check_member) {
12180
+ int err = st_ops->check_member(t, member);
12181
+
12182
+ if (err) {
12183
+ verbose(env, "attach to unsupported member %s of struct %s\n",
12184
+ mname, st_ops->name);
12185
+ return err;
12186
+ }
12187
+ }
12188
+
12189
+ prog->aux->attach_func_proto = func_proto;
12190
+ prog->aux->attach_func_name = mname;
12191
+ env->ops = st_ops->verifier_ops;
12192
+
12193
+ return 0;
12194
+}
12195
+#define SECURITY_PREFIX "security_"
12196
+
12197
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
12198
+{
12199
+ if (within_error_injection_list(addr) ||
12200
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
12201
+ return 0;
12202
+
12203
+ return -EINVAL;
12204
+}
12205
+
12206
+/* non exhaustive list of sleepable bpf_lsm_*() functions */
12207
+BTF_SET_START(btf_sleepable_lsm_hooks)
12208
+#ifdef CONFIG_BPF_LSM
12209
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
12210
+#else
12211
+BTF_ID_UNUSED
12212
+#endif
12213
+BTF_SET_END(btf_sleepable_lsm_hooks)
12214
+
12215
+static int check_sleepable_lsm_hook(u32 btf_id)
12216
+{
12217
+ return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
12218
+}
12219
+
12220
+/* list of non-sleepable functions that are otherwise on
12221
+ * ALLOW_ERROR_INJECTION list
12222
+ */
12223
+BTF_SET_START(btf_non_sleepable_error_inject)
12224
+/* Three functions below can be called from sleepable and non-sleepable context.
12225
+ * Assume non-sleepable from bpf safety point of view.
12226
+ */
12227
+BTF_ID(func, __add_to_page_cache_locked)
12228
+BTF_ID(func, should_fail_alloc_page)
12229
+BTF_ID(func, should_failslab)
12230
+BTF_SET_END(btf_non_sleepable_error_inject)
12231
+
12232
+static int check_non_sleepable_error_inject(u32 btf_id)
12233
+{
12234
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
12235
+}
12236
+
12237
+int bpf_check_attach_target(struct bpf_verifier_log *log,
12238
+ const struct bpf_prog *prog,
12239
+ const struct bpf_prog *tgt_prog,
12240
+ u32 btf_id,
12241
+ struct bpf_attach_target_info *tgt_info)
12242
+{
12243
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
12244
+ const char prefix[] = "btf_trace_";
12245
+ int ret = 0, subprog = -1, i;
12246
+ const struct btf_type *t;
12247
+ bool conservative = true;
12248
+ const char *tname;
12249
+ struct btf *btf;
12250
+ long addr = 0;
12251
+
12252
+ if (!btf_id) {
12253
+ bpf_log(log, "Tracing programs must provide btf_id\n");
12254
+ return -EINVAL;
12255
+ }
12256
+ btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux;
12257
+ if (!btf) {
12258
+ bpf_log(log,
12259
+ "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
12260
+ return -EINVAL;
12261
+ }
12262
+ t = btf_type_by_id(btf, btf_id);
12263
+ if (!t) {
12264
+ bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
12265
+ return -EINVAL;
12266
+ }
12267
+ tname = btf_name_by_offset(btf, t->name_off);
12268
+ if (!tname) {
12269
+ bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
12270
+ return -EINVAL;
12271
+ }
12272
+ if (tgt_prog) {
12273
+ struct bpf_prog_aux *aux = tgt_prog->aux;
12274
+
12275
+ for (i = 0; i < aux->func_info_cnt; i++)
12276
+ if (aux->func_info[i].type_id == btf_id) {
12277
+ subprog = i;
12278
+ break;
12279
+ }
12280
+ if (subprog == -1) {
12281
+ bpf_log(log, "Subprog %s doesn't exist\n", tname);
12282
+ return -EINVAL;
12283
+ }
12284
+ conservative = aux->func_info_aux[subprog].unreliable;
12285
+ if (prog_extension) {
12286
+ if (conservative) {
12287
+ bpf_log(log,
12288
+ "Cannot replace static functions\n");
12289
+ return -EINVAL;
12290
+ }
12291
+ if (!prog->jit_requested) {
12292
+ bpf_log(log,
12293
+ "Extension programs should be JITed\n");
12294
+ return -EINVAL;
12295
+ }
12296
+ }
12297
+ if (!tgt_prog->jited) {
12298
+ bpf_log(log, "Can attach to only JITed progs\n");
12299
+ return -EINVAL;
12300
+ }
12301
+ if (tgt_prog->type == prog->type) {
12302
+ /* Cannot fentry/fexit another fentry/fexit program.
12303
+ * Cannot attach program extension to another extension.
12304
+ * It's ok to attach fentry/fexit to extension program.
12305
+ */
12306
+ bpf_log(log, "Cannot recursively attach\n");
12307
+ return -EINVAL;
12308
+ }
12309
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
12310
+ prog_extension &&
12311
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
12312
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
12313
+ /* Program extensions can extend all program types
12314
+ * except fentry/fexit. The reason is the following.
12315
+ * The fentry/fexit programs are used for performance
12316
+ * analysis, stats and can be attached to any program
12317
+ * type except themselves. When extension program is
12318
+ * replacing XDP function it is necessary to allow
12319
+ * performance analysis of all functions. Both original
12320
+ * XDP program and its program extension. Hence
12321
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
12322
+ * allowed. If extending of fentry/fexit was allowed it
12323
+ * would be possible to create long call chain
12324
+ * fentry->extension->fentry->extension beyond
12325
+ * reasonable stack size. Hence extending fentry is not
12326
+ * allowed.
12327
+ */
12328
+ bpf_log(log, "Cannot extend fentry/fexit\n");
12329
+ return -EINVAL;
12330
+ }
12331
+ } else {
12332
+ if (prog_extension) {
12333
+ bpf_log(log, "Cannot replace kernel functions\n");
12334
+ return -EINVAL;
12335
+ }
12336
+ }
12337
+
12338
+ switch (prog->expected_attach_type) {
12339
+ case BPF_TRACE_RAW_TP:
12340
+ if (tgt_prog) {
12341
+ bpf_log(log,
12342
+ "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
12343
+ return -EINVAL;
12344
+ }
12345
+ if (!btf_type_is_typedef(t)) {
12346
+ bpf_log(log, "attach_btf_id %u is not a typedef\n",
12347
+ btf_id);
12348
+ return -EINVAL;
12349
+ }
12350
+ if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
12351
+ bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
12352
+ btf_id, tname);
12353
+ return -EINVAL;
12354
+ }
12355
+ tname += sizeof(prefix) - 1;
12356
+ t = btf_type_by_id(btf, t->type);
12357
+ if (!btf_type_is_ptr(t))
12358
+ /* should never happen in valid vmlinux build */
12359
+ return -EINVAL;
12360
+ t = btf_type_by_id(btf, t->type);
12361
+ if (!btf_type_is_func_proto(t))
12362
+ /* should never happen in valid vmlinux build */
12363
+ return -EINVAL;
12364
+
12365
+ break;
12366
+ case BPF_TRACE_ITER:
12367
+ if (!btf_type_is_func(t)) {
12368
+ bpf_log(log, "attach_btf_id %u is not a function\n",
12369
+ btf_id);
12370
+ return -EINVAL;
12371
+ }
12372
+ t = btf_type_by_id(btf, t->type);
12373
+ if (!btf_type_is_func_proto(t))
12374
+ return -EINVAL;
12375
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
12376
+ if (ret)
12377
+ return ret;
12378
+ break;
12379
+ default:
12380
+ if (!prog_extension)
12381
+ return -EINVAL;
12382
+ fallthrough;
12383
+ case BPF_MODIFY_RETURN:
12384
+ case BPF_LSM_MAC:
12385
+ case BPF_TRACE_FENTRY:
12386
+ case BPF_TRACE_FEXIT:
12387
+ if (!btf_type_is_func(t)) {
12388
+ bpf_log(log, "attach_btf_id %u is not a function\n",
12389
+ btf_id);
12390
+ return -EINVAL;
12391
+ }
12392
+ if (prog_extension &&
12393
+ btf_check_type_match(log, prog, btf, t))
12394
+ return -EINVAL;
12395
+ t = btf_type_by_id(btf, t->type);
12396
+ if (!btf_type_is_func_proto(t))
12397
+ return -EINVAL;
12398
+
12399
+ if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
12400
+ (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
12401
+ prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
12402
+ return -EINVAL;
12403
+
12404
+ if (tgt_prog && conservative)
12405
+ t = NULL;
12406
+
12407
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
12408
+ if (ret < 0)
12409
+ return ret;
12410
+
12411
+ if (tgt_prog) {
12412
+ if (subprog == 0)
12413
+ addr = (long) tgt_prog->bpf_func;
12414
+ else
12415
+ addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
12416
+ } else {
12417
+ addr = kallsyms_lookup_name(tname);
12418
+ if (!addr) {
12419
+ bpf_log(log,
12420
+ "The address of function %s cannot be found\n",
12421
+ tname);
12422
+ return -ENOENT;
12423
+ }
12424
+ }
12425
+
12426
+ if (prog->aux->sleepable) {
12427
+ ret = -EINVAL;
12428
+ switch (prog->type) {
12429
+ case BPF_PROG_TYPE_TRACING:
12430
+ /* fentry/fexit/fmod_ret progs can be sleepable only if they are
12431
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
12432
+ */
12433
+ if (!check_non_sleepable_error_inject(btf_id) &&
12434
+ within_error_injection_list(addr))
12435
+ ret = 0;
12436
+ break;
12437
+ case BPF_PROG_TYPE_LSM:
12438
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
12439
+ * Only some of them are sleepable.
12440
+ */
12441
+ if (check_sleepable_lsm_hook(btf_id))
12442
+ ret = 0;
12443
+ break;
12444
+ default:
12445
+ break;
12446
+ }
12447
+ if (ret) {
12448
+ bpf_log(log, "%s is not sleepable\n", tname);
12449
+ return ret;
12450
+ }
12451
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
12452
+ if (tgt_prog) {
12453
+ bpf_log(log, "can't modify return codes of BPF programs\n");
12454
+ return -EINVAL;
12455
+ }
12456
+ ret = check_attach_modify_return(addr, tname);
12457
+ if (ret) {
12458
+ bpf_log(log, "%s() is not modifiable\n", tname);
12459
+ return ret;
12460
+ }
12461
+ }
12462
+
12463
+ break;
12464
+ }
12465
+ tgt_info->tgt_addr = addr;
12466
+ tgt_info->tgt_name = tname;
12467
+ tgt_info->tgt_type = t;
12468
+ return 0;
12469
+}
12470
+
12471
+static int check_attach_btf_id(struct bpf_verifier_env *env)
12472
+{
12473
+ struct bpf_prog *prog = env->prog;
12474
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
12475
+ struct bpf_attach_target_info tgt_info = {};
12476
+ u32 btf_id = prog->aux->attach_btf_id;
12477
+ struct bpf_trampoline *tr;
12478
+ int ret;
12479
+ u64 key;
12480
+
12481
+ if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
12482
+ prog->type != BPF_PROG_TYPE_LSM) {
12483
+ verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
12484
+ return -EINVAL;
12485
+ }
12486
+
12487
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
12488
+ return check_struct_ops_btf_id(env);
12489
+
12490
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
12491
+ prog->type != BPF_PROG_TYPE_LSM &&
12492
+ prog->type != BPF_PROG_TYPE_EXT)
12493
+ return 0;
12494
+
12495
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
12496
+ if (ret)
12497
+ return ret;
12498
+
12499
+ if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
12500
+ /* to make freplace equivalent to their targets, they need to
12501
+ * inherit env->ops and expected_attach_type for the rest of the
12502
+ * verification
12503
+ */
12504
+ env->ops = bpf_verifier_ops[tgt_prog->type];
12505
+ prog->expected_attach_type = tgt_prog->expected_attach_type;
12506
+ }
12507
+
12508
+ /* store info about the attachment target that will be used later */
12509
+ prog->aux->attach_func_proto = tgt_info.tgt_type;
12510
+ prog->aux->attach_func_name = tgt_info.tgt_name;
12511
+
12512
+ if (tgt_prog) {
12513
+ prog->aux->saved_dst_prog_type = tgt_prog->type;
12514
+ prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
12515
+ }
12516
+
12517
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
12518
+ prog->aux->attach_btf_trace = true;
12519
+ return 0;
12520
+ } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
12521
+ if (!bpf_iter_prog_supported(prog))
12522
+ return -EINVAL;
12523
+ return 0;
12524
+ }
12525
+
12526
+ if (prog->type == BPF_PROG_TYPE_LSM) {
12527
+ ret = bpf_lsm_verify_prog(&env->log, prog);
12528
+ if (ret < 0)
12529
+ return ret;
12530
+ }
12531
+
12532
+ key = bpf_trampoline_compute_key(tgt_prog, btf_id);
12533
+ tr = bpf_trampoline_get(key, &tgt_info);
12534
+ if (!tr)
12535
+ return -ENOMEM;
12536
+
12537
+ prog->aux->dst_trampoline = tr;
12538
+ return 0;
12539
+}
12540
+
12541
+struct btf *bpf_get_btf_vmlinux(void)
+{
+	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+		mutex_lock(&bpf_verifier_lock);
+		if (!btf_vmlinux)
+			btf_vmlinux = btf_parse_vmlinux();
+		mutex_unlock(&bpf_verifier_lock);
+	}
+	return btf_vmlinux;
+}
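bpf_get_btf_vmlinux() above is a classic check/lock/re-check lazy initializer. A user-space sketch of the same shape (expensive_parse() stands in for btf_parse_vmlinux(); error handling and the kernel's caching of ERR_PTR results are omitted):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached;

static void *expensive_parse(void)
{
	static int blob = 42;	/* pretend result of parsing vmlinux BTF */
	return &blob;
}

static void *get_cached(void)
{
	if (!cached) {			/* unlocked fast path, as in the kernel code */
		pthread_mutex_lock(&lock);
		if (!cached)		/* re-check: another thread may have won */
			cached = expensive_parse();
		pthread_mutex_unlock(&lock);
	}
	return cached;
}

int main(void)
{
	printf("first  call: %p\n", get_cached());
	printf("second call: %p\n", get_cached());	/* same pointer, no re-parse */
	return 0;
}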
12551
+
12552
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
12553
+ union bpf_attr __user *uattr)
12554
+{
12555
+ u64 start_time = ktime_get_ns();
643512556 struct bpf_verifier_env *env;
643612557 struct bpf_verifier_log *log;
6437
- int ret = -EINVAL;
12558
+ int i, len, ret = -EINVAL;
12559
+ bool is_priv;
643812560
643912561 /* no program is valid */
644012562 if (ARRAY_SIZE(bpf_verifier_ops) == 0)
....@@ -6448,17 +12570,23 @@
644812570 return -ENOMEM;
644912571 log = &env->log;
645012572
12573
+ len = (*prog)->len;
645112574 env->insn_aux_data =
6452
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
6453
- (*prog)->len));
12575
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
645412576 ret = -ENOMEM;
645512577 if (!env->insn_aux_data)
645612578 goto err_free_env;
12579
+ for (i = 0; i < len; i++)
12580
+ env->insn_aux_data[i].orig_idx = i;
645712581 env->prog = *prog;
645812582 env->ops = bpf_verifier_ops[env->prog->type];
12583
+ is_priv = bpf_capable();
12584
+
12585
+ bpf_get_btf_vmlinux();
645912586
646012587 /* grab the mutex to protect few globals used by verifier */
6461
- mutex_lock(&bpf_verifier_lock);
12588
+ if (!is_priv)
12589
+ mutex_lock(&bpf_verifier_lock);
646212590
646312591 if (attr->log_level || attr->log_buf || attr->log_size) {
646412592 /* user requested verbose verifier output
....@@ -6468,58 +12596,93 @@
646812596 log->ubuf = (char __user *) (unsigned long) attr->log_buf;
646912597 log->len_total = attr->log_size;
647012598
6471
- ret = -EINVAL;
647212599 /* log attributes have to be sane */
6473
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
6474
- !log->level || !log->ubuf)
12600
+ if (!bpf_verifier_log_attr_valid(log)) {
12601
+ ret = -EINVAL;
647512602 goto err_unlock;
12603
+ }
12604
+ }
12605
+
12606
+ if (IS_ERR(btf_vmlinux)) {
12607
+ /* Either gcc or pahole or kernel are broken. */
12608
+ verbose(env, "in-kernel BTF is malformed\n");
12609
+ ret = PTR_ERR(btf_vmlinux);
12610
+ goto skip_full_check;
647612611 }
647712612
647812613 env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
647912614 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
648012615 env->strict_alignment = true;
6481
-
648212616 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
648312617 env->strict_alignment = false;
648412618
6485
- ret = replace_map_fd_with_map_ptr(env);
6486
- if (ret < 0)
6487
- goto skip_full_check;
12619
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
12620
+ env->allow_uninit_stack = bpf_allow_uninit_stack();
12621
+ env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
12622
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
12623
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
12624
+ env->bpf_capable = bpf_capable();
648812625
6489
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
6490
- ret = bpf_prog_offload_verifier_prep(env);
6491
- if (ret)
6492
- goto skip_full_check;
6493
- }
12626
+ if (is_priv)
12627
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
649412628
6495
- env->explored_states = kcalloc(env->prog->len,
12629
+ env->explored_states = kvcalloc(state_htab_size(env),
649612630 sizeof(struct bpf_verifier_state_list *),
649712631 GFP_USER);
649812632 ret = -ENOMEM;
649912633 if (!env->explored_states)
650012634 goto skip_full_check;
650112635
6502
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
12636
+ ret = check_subprogs(env);
12637
+ if (ret < 0)
12638
+ goto skip_full_check;
12639
+
12640
+ ret = check_btf_info(env, attr, uattr);
12641
+ if (ret < 0)
12642
+ goto skip_full_check;
12643
+
12644
+ ret = check_attach_btf_id(env);
12645
+ if (ret)
12646
+ goto skip_full_check;
12647
+
12648
+ ret = resolve_pseudo_ldimm64(env);
12649
+ if (ret < 0)
12650
+ goto skip_full_check;
12651
+
12652
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
12653
+ ret = bpf_prog_offload_verifier_prep(env->prog);
12654
+ if (ret)
12655
+ goto skip_full_check;
12656
+ }
650312657
650412658 ret = check_cfg(env);
650512659 if (ret < 0)
650612660 goto skip_full_check;
650712661
6508
- ret = do_check(env);
6509
- if (env->cur_state) {
6510
- free_verifier_state(env->cur_state, true);
6511
- env->cur_state = NULL;
6512
- }
12662
+ ret = do_check_subprogs(env);
12663
+ ret = ret ?: do_check_main(env);
12664
+
12665
+ if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
12666
+ ret = bpf_prog_offload_finalize(env);
651312667
651412668 skip_full_check:
6515
- while (!pop_stack(env, NULL, NULL));
6516
- free_states(env);
6517
-
6518
- if (ret == 0)
6519
- sanitize_dead_code(env);
12669
+ kvfree(env->explored_states);
652012670
652112671 if (ret == 0)
652212672 ret = check_max_stack_depth(env);
12673
+
12674
+ /* instruction rewrites happen after this point */
12675
+ if (is_priv) {
12676
+ if (ret == 0)
12677
+ opt_hard_wire_dead_code_branches(env);
12678
+ if (ret == 0)
12679
+ ret = opt_remove_dead_code(env);
12680
+ if (ret == 0)
12681
+ ret = opt_remove_nops(env);
12682
+ } else {
12683
+ if (ret == 0)
12684
+ sanitize_dead_code(env);
12685
+ }
652312686
652412687 if (ret == 0)
652512688 /* program is valid, convert *(u32*)(ctx + off) accesses */
....@@ -6528,8 +12691,20 @@
652812691 if (ret == 0)
652912692 ret = fixup_bpf_calls(env);
653012693
12694
+ /* do 32-bit optimization after insn patching has done so those patched
12695
+ * insns could be handled correctly.
12696
+ */
12697
+ if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
12698
+ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
12699
+ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
12700
+ : false;
12701
+ }
12702
+
653112703 if (ret == 0)
653212704 ret = fixup_call_args(env);
12705
+
12706
+ env->verification_time = ktime_get_ns() - start_time;
12707
+ print_verification_stats(env);
653312708
653412709 if (log->level && bpf_verifier_log_full(log))
653512710 ret = -ENOSPC;
....@@ -6559,15 +12734,26 @@
655912734 convert_pseudo_ld_imm64(env);
656012735 }
656112736
12737
+ if (ret == 0)
12738
+ adjust_btf_func(env);
12739
+
656212740 err_release_maps:
656312741 if (!env->prog->aux->used_maps)
656412742 /* if we didn't copy map pointers into bpf_prog_info, release
656512743 * them now. Otherwise free_used_maps() will release them.
656612744 */
656712745 release_maps(env);
12746
+
12747
+ /* extension progs temporarily inherit the attach_type of their targets
12748
+ for verification purposes, so set it back to zero before returning
12749
+ */
12750
+ if (env->prog->type == BPF_PROG_TYPE_EXT)
12751
+ env->prog->expected_attach_type = 0;
12752
+
656812753 *prog = env->prog;
656912754 err_unlock:
6570
- mutex_unlock(&bpf_verifier_lock);
12755
+ if (!is_priv)
12756
+ mutex_unlock(&bpf_verifier_lock);
657112757 vfree(env->insn_aux_data);
657212758 err_free_env:
657312759 kfree(env);