2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/kernel/bpf/verifier.c
....@@ -1,19 +1,14 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
23 * Copyright (c) 2016 Facebook
3
- *
4
- * This program is free software; you can redistribute it and/or
5
- * modify it under the terms of version 2 of the GNU General Public
6
- * License as published by the Free Software Foundation.
7
- *
8
- * This program is distributed in the hope that it will be useful, but
9
- * WITHOUT ANY WARRANTY; without even the implied warranty of
10
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11
- * General Public License for more details.
4
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
125 */
6
+#include <uapi/linux/btf.h>
137 #include <linux/kernel.h>
148 #include <linux/types.h>
159 #include <linux/slab.h>
1610 #include <linux/bpf.h>
11
+#include <linux/btf.h>
1712 #include <linux/bpf_verifier.h>
1813 #include <linux/filter.h>
1914 #include <net/netlink.h>
....@@ -23,16 +18,22 @@
2318 #include <linux/bsearch.h>
2419 #include <linux/sort.h>
2520 #include <linux/perf_event.h>
21
+#include <linux/ctype.h>
22
+#include <linux/error-injection.h>
23
+#include <linux/bpf_lsm.h>
24
+#include <linux/btf_ids.h>
2625
2726 #include "disasm.h"
2827
2928 static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
30
-#define BPF_PROG_TYPE(_id, _name) \
29
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
3130 [_id] = & _name ## _verifier_ops,
3231 #define BPF_MAP_TYPE(_id, _ops)
32
+#define BPF_LINK_TYPE(_id, _name)
3333 #include <linux/bpf_types.h>
3434 #undef BPF_PROG_TYPE
3535 #undef BPF_MAP_TYPE
36
+#undef BPF_LINK_TYPE
3637 };
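The three macros above implement the usual x-macro trick: <linux/bpf_types.h> lists every program, map and link type once, and only BPF_PROG_TYPE() expands to an array element here. As a hedged illustration (the concrete entry lives in bpf_types.h; only its first two arguments matter for this table), a line such as

	BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, ...)

expands at this include site to

	[BPF_PROG_TYPE_SOCKET_FILTER] = &sk_filter_verifier_ops,

while BPF_MAP_TYPE() and BPF_LINK_TYPE() expand to nothing, so map and link entries contribute no elements to bpf_verifier_ops[].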
3738
3839 /* bpf_check() is a static code analyzer that walks eBPF program
....@@ -80,8 +81,8 @@
8081 * (like pointer plus pointer becomes SCALAR_VALUE type)
8182 *
8283 * When verifier sees load or store instructions the type of base register
83
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
84
- * types recognized by check_mem_access() function.
84
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
85
+ * four pointer types recognized by check_mem_access() function.
8586 *
8687 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
8788 * and the range of [ptr, ptr + map's value_size) is accessible.
....@@ -140,6 +141,24 @@
140141 *
141142 * After the call R0 is set to return type of the function and registers R1-R5
142143 * are set to NOT_INIT to indicate that they are no longer readable.
144
+ *
145
+ * The following reference types represent a potential reference to a kernel
146
+ * resource which, after first being allocated, must be checked and freed by
147
+ * the BPF program:
148
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
149
+ *
150
+ * When the verifier sees a helper call return a reference type, it allocates a
151
+ * pointer id for the reference and stores it in the current function state.
152
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
153
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
154
+ * passes through a NULL-check conditional. For the branch wherein the state is
155
+ * changed to CONST_IMM, the verifier releases the reference.
156
+ *
157
+ * For each helper function that allocates a reference, such as
158
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
159
+ * bpf_sk_release(). When a reference type passes into the release function,
160
+ * the verifier also releases the reference. If any unchecked or unreleased
161
+ * reference remains at the end of the program, the verifier rejects it.
143162 */
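As a hedged illustration of the reference rules above (not part of this file; it assumes a libbpf-style TC classifier, and names such as sk_ref_example are invented), a program that acquires a socket reference must NULL-check it and release it on every path, or the verifier rejects the load:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	SEC("tc")
	int sk_ref_example(struct __sk_buff *skb)
	{
		struct bpf_sock_tuple tuple = {};	/* real code would fill this from the packet */
		struct bpf_sock *sk;

		/* R0 is PTR_TO_SOCKET_OR_NULL here and a reference id is recorded */
		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				       BPF_F_CURRENT_NETNS, 0);
		if (!sk)		/* NULL check converts the type to PTR_TO_SOCKET */
			return TC_ACT_OK;
		/* ... use sk ... */
		bpf_sk_release(sk);	/* drop the reference; leaking it fails verification */
		return TC_ACT_OK;
	}

	char LICENSE[] SEC("license") = "GPL";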
144163
145164 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
....@@ -152,11 +171,15 @@
152171 int insn_idx;
153172 int prev_insn_idx;
154173 struct bpf_verifier_stack_elem *next;
174
+ /* length of verifier log at the time this state was pushed on stack */
175
+ u32 log_pos;
155176 };
156177
157
-#define BPF_COMPLEXITY_LIMIT_INSNS 131072
158
-#define BPF_COMPLEXITY_LIMIT_STACK 1024
178
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
159179 #define BPF_COMPLEXITY_LIMIT_STATES 64
180
+
181
+#define BPF_MAP_KEY_POISON (1ULL << 63)
182
+#define BPF_MAP_KEY_SEEN (1ULL << 62)
160183
161184 #define BPF_MAP_PTR_UNPRIV 1UL
162185 #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
....@@ -165,12 +188,12 @@
165188
166189 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
167190 {
168
- return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
191
+ return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
169192 }
170193
171194 static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
172195 {
173
- return aux->map_state & BPF_MAP_PTR_UNPRIV;
196
+ return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
174197 }
175198
176199 static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
....@@ -178,8 +201,31 @@
178201 {
179202 BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
180203 unpriv |= bpf_map_ptr_unpriv(aux);
181
- aux->map_state = (unsigned long)map |
182
- (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
204
+ aux->map_ptr_state = (unsigned long)map |
205
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
206
+}
207
+
208
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
209
+{
210
+ return aux->map_key_state & BPF_MAP_KEY_POISON;
211
+}
212
+
213
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
214
+{
215
+ return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
216
+}
217
+
218
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
219
+{
220
+ return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
221
+}
222
+
223
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
224
+{
225
+ bool poisoned = bpf_map_key_poisoned(aux);
226
+
227
+ aux->map_key_state = state | BPF_MAP_KEY_SEEN |
228
+ (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
183229 }
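The helpers above pack three things into one u64: whether a constant map key has been observed at this instruction (BPF_MAP_KEY_SEEN), whether conflicting keys forced a give-up (BPF_MAP_KEY_POISON), and the key value itself in the low bits. A hedged sketch of how a caller could combine them (record_map_key is an illustrative name; the real call sites appear later in this file):

	/* Illustrative only: remember a constant tail-call key, or poison the
	 * state when different keys reach the same instruction.
	 */
	static void record_map_key(struct bpf_insn_aux_data *aux, u64 key)
	{
		if (bpf_map_key_unseen(aux))
			bpf_map_key_store(aux, key);		/* first constant key seen */
		else if (!bpf_map_key_poisoned(aux) &&
			 bpf_map_key_immediate(aux) != key)
			bpf_map_key_store(aux, BPF_MAP_KEY_POISON); /* keys disagree */
	}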
184230
185231 struct bpf_call_arg_meta {
....@@ -188,10 +234,38 @@
188234 bool pkt_access;
189235 int regno;
190236 int access_size;
237
+ int mem_size;
191238 u64 msize_max_value;
239
+ int ref_obj_id;
240
+ int func_id;
241
+ u32 btf_id;
242
+ u32 ret_btf_id;
192243 };
193244
245
+struct btf *btf_vmlinux;
246
+
194247 static DEFINE_MUTEX(bpf_verifier_lock);
248
+
249
+static const struct bpf_line_info *
250
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
251
+{
252
+ const struct bpf_line_info *linfo;
253
+ const struct bpf_prog *prog;
254
+ u32 i, nr_linfo;
255
+
256
+ prog = env->prog;
257
+ nr_linfo = prog->aux->nr_linfo;
258
+
259
+ if (!nr_linfo || insn_off >= prog->len)
260
+ return NULL;
261
+
262
+ linfo = prog->aux->linfo;
263
+ for (i = 1; i < nr_linfo; i++)
264
+ if (insn_off < linfo[i].insn_off)
265
+ break;
266
+
267
+ return &linfo[i - 1];
268
+}
195269
196270 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
197271 va_list args)
....@@ -206,9 +280,25 @@
206280 n = min(log->len_total - log->len_used - 1, n);
207281 log->kbuf[n] = '\0';
208282
283
+ if (log->level == BPF_LOG_KERNEL) {
284
+ pr_err("BPF:%s\n", log->kbuf);
285
+ return;
286
+ }
209287 if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
210288 log->len_used += n;
211289 else
290
+ log->ubuf = NULL;
291
+}
292
+
293
+static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
294
+{
295
+ char zero = 0;
296
+
297
+ if (!bpf_verifier_log_needed(log))
298
+ return;
299
+
300
+ log->len_used = new_pos;
301
+ if (put_user(zero, log->ubuf + new_pos))
212302 log->ubuf = NULL;
213303 }
214304
....@@ -243,10 +333,167 @@
243333 va_end(args);
244334 }
245335
336
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
337
+ const char *fmt, ...)
338
+{
339
+ va_list args;
340
+
341
+ if (!bpf_verifier_log_needed(log))
342
+ return;
343
+
344
+ va_start(args, fmt);
345
+ bpf_verifier_vlog(log, fmt, args);
346
+ va_end(args);
347
+}
348
+
349
+static const char *ltrim(const char *s)
350
+{
351
+ while (isspace(*s))
352
+ s++;
353
+
354
+ return s;
355
+}
356
+
357
+__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
358
+ u32 insn_off,
359
+ const char *prefix_fmt, ...)
360
+{
361
+ const struct bpf_line_info *linfo;
362
+
363
+ if (!bpf_verifier_log_needed(&env->log))
364
+ return;
365
+
366
+ linfo = find_linfo(env, insn_off);
367
+ if (!linfo || linfo == env->prev_linfo)
368
+ return;
369
+
370
+ if (prefix_fmt) {
371
+ va_list args;
372
+
373
+ va_start(args, prefix_fmt);
374
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
375
+ va_end(args);
376
+ }
377
+
378
+ verbose(env, "%s\n",
379
+ ltrim(btf_name_by_offset(env->prog->aux->btf,
380
+ linfo->line_off)));
381
+
382
+ env->prev_linfo = linfo;
383
+}
384
+
246385 static bool type_is_pkt_pointer(enum bpf_reg_type type)
247386 {
248387 return type == PTR_TO_PACKET ||
249388 type == PTR_TO_PACKET_META;
389
+}
390
+
391
+static bool type_is_sk_pointer(enum bpf_reg_type type)
392
+{
393
+ return type == PTR_TO_SOCKET ||
394
+ type == PTR_TO_SOCK_COMMON ||
395
+ type == PTR_TO_TCP_SOCK ||
396
+ type == PTR_TO_XDP_SOCK;
397
+}
398
+
399
+static bool reg_type_not_null(enum bpf_reg_type type)
400
+{
401
+ return type == PTR_TO_SOCKET ||
402
+ type == PTR_TO_TCP_SOCK ||
403
+ type == PTR_TO_MAP_VALUE ||
404
+ type == PTR_TO_SOCK_COMMON;
405
+}
406
+
407
+static bool reg_type_may_be_null(enum bpf_reg_type type)
408
+{
409
+ return type == PTR_TO_MAP_VALUE_OR_NULL ||
410
+ type == PTR_TO_SOCKET_OR_NULL ||
411
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
412
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
413
+ type == PTR_TO_BTF_ID_OR_NULL ||
414
+ type == PTR_TO_MEM_OR_NULL ||
415
+ type == PTR_TO_RDONLY_BUF_OR_NULL ||
416
+ type == PTR_TO_RDWR_BUF_OR_NULL;
417
+}
418
+
419
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
420
+{
421
+ return reg->type == PTR_TO_MAP_VALUE &&
422
+ map_value_has_spin_lock(reg->map_ptr);
423
+}
424
+
425
+static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
426
+{
427
+ return type == PTR_TO_SOCKET ||
428
+ type == PTR_TO_SOCKET_OR_NULL ||
429
+ type == PTR_TO_TCP_SOCK ||
430
+ type == PTR_TO_TCP_SOCK_OR_NULL ||
431
+ type == PTR_TO_MEM ||
432
+ type == PTR_TO_MEM_OR_NULL;
433
+}
434
+
435
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
436
+{
437
+ return type == ARG_PTR_TO_SOCK_COMMON;
438
+}
439
+
440
+static bool arg_type_may_be_null(enum bpf_arg_type type)
441
+{
442
+ return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
443
+ type == ARG_PTR_TO_MEM_OR_NULL ||
444
+ type == ARG_PTR_TO_CTX_OR_NULL ||
445
+ type == ARG_PTR_TO_SOCKET_OR_NULL ||
446
+ type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
447
+}
448
+
449
+/* Determine whether the function releases some resources allocated by another
450
+ * function call. The first reference type argument will be assumed to be
451
+ * released by release_reference().
452
+ */
453
+static bool is_release_function(enum bpf_func_id func_id)
454
+{
455
+ return func_id == BPF_FUNC_sk_release ||
456
+ func_id == BPF_FUNC_ringbuf_submit ||
457
+ func_id == BPF_FUNC_ringbuf_discard;
458
+}
459
+
460
+static bool may_be_acquire_function(enum bpf_func_id func_id)
461
+{
462
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
463
+ func_id == BPF_FUNC_sk_lookup_udp ||
464
+ func_id == BPF_FUNC_skc_lookup_tcp ||
465
+ func_id == BPF_FUNC_map_lookup_elem ||
466
+ func_id == BPF_FUNC_ringbuf_reserve;
467
+}
468
+
469
+static bool is_acquire_function(enum bpf_func_id func_id,
470
+ const struct bpf_map *map)
471
+{
472
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
473
+
474
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
475
+ func_id == BPF_FUNC_sk_lookup_udp ||
476
+ func_id == BPF_FUNC_skc_lookup_tcp ||
477
+ func_id == BPF_FUNC_ringbuf_reserve)
478
+ return true;
479
+
480
+ if (func_id == BPF_FUNC_map_lookup_elem &&
481
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
482
+ map_type == BPF_MAP_TYPE_SOCKHASH))
483
+ return true;
484
+
485
+ return false;
486
+}
487
+
488
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
489
+{
490
+ return func_id == BPF_FUNC_tcp_sock ||
491
+ func_id == BPF_FUNC_sk_fullsock ||
492
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
493
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
494
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
495
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
496
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
250497 }
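A hedged BPF-side fragment showing why bpf_map_lookup_elem() appears in the acquire lists above only together with sockmap/sockhash (sock_map is an assumed BPF_MAP_TYPE_SOCKMAP; the surrounding program is not shown): the lookup returns a referenced socket, so the result must flow into bpf_sk_release() just like a bpf_sk_lookup_tcp() result:

	__u32 key = 0;
	struct bpf_sock *sk;

	sk = bpf_map_lookup_elem(&sock_map, &key);	/* acquire (SOCKMAP/SOCKHASH only) */
	if (sk) {
		/* ... inspect sk ... */
		bpf_sk_release(sk);			/* required on every path */
	}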
251498
252499 /* string representation of 'enum bpf_reg_type' */
....@@ -261,17 +508,44 @@
261508 [PTR_TO_PACKET] = "pkt",
262509 [PTR_TO_PACKET_META] = "pkt_meta",
263510 [PTR_TO_PACKET_END] = "pkt_end",
511
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
512
+ [PTR_TO_SOCKET] = "sock",
513
+ [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
514
+ [PTR_TO_SOCK_COMMON] = "sock_common",
515
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
516
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
517
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
518
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
519
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
520
+ [PTR_TO_BTF_ID] = "ptr_",
521
+ [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
522
+ [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
523
+ [PTR_TO_MEM] = "mem",
524
+ [PTR_TO_MEM_OR_NULL] = "mem_or_null",
525
+ [PTR_TO_RDONLY_BUF] = "rdonly_buf",
526
+ [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
527
+ [PTR_TO_RDWR_BUF] = "rdwr_buf",
528
+ [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
529
+};
530
+
531
+static char slot_type_char[] = {
532
+ [STACK_INVALID] = '?',
533
+ [STACK_SPILL] = 'r',
534
+ [STACK_MISC] = 'm',
535
+ [STACK_ZERO] = '0',
264536 };
265537
266538 static void print_liveness(struct bpf_verifier_env *env,
267539 enum bpf_reg_liveness live)
268540 {
269
- if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
541
+ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
270542 verbose(env, "_");
271543 if (live & REG_LIVE_READ)
272544 verbose(env, "r");
273545 if (live & REG_LIVE_WRITTEN)
274546 verbose(env, "w");
547
+ if (live & REG_LIVE_DONE)
548
+ verbose(env, "D");
275549 }
276550
277551 static struct bpf_func_state *func(struct bpf_verifier_env *env,
....@@ -280,6 +554,26 @@
280554 struct bpf_verifier_state *cur = env->cur_state;
281555
282556 return cur->frame[reg->frameno];
557
+}
558
+
559
+const char *kernel_type_name(u32 id)
560
+{
561
+ return btf_name_by_offset(btf_vmlinux,
562
+ btf_type_by_id(btf_vmlinux, id)->name_off);
563
+}
564
+
565
+/* The reg state of a pointer or a bounded scalar was saved when
566
+ * it was spilled to the stack.
567
+ */
568
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
569
+{
570
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
571
+}
572
+
573
+static void scrub_spilled_slot(u8 *stype)
574
+{
575
+ if (*stype != STACK_INVALID)
576
+ *stype = STACK_MISC;
283577 }
284578
285579 static void print_verifier_state(struct bpf_verifier_env *env,
....@@ -299,14 +593,20 @@
299593 verbose(env, " R%d", i);
300594 print_liveness(env, reg->live);
301595 verbose(env, "=%s", reg_type_str[t]);
596
+ if (t == SCALAR_VALUE && reg->precise)
597
+ verbose(env, "P");
302598 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
303599 tnum_is_const(reg->var_off)) {
304600 /* reg->off should be 0 for SCALAR_VALUE */
305601 verbose(env, "%lld", reg->var_off.value + reg->off);
306
- if (t == PTR_TO_STACK)
307
- verbose(env, ",call_%d", func(env, reg)->callsite);
308602 } else {
603
+ if (t == PTR_TO_BTF_ID ||
604
+ t == PTR_TO_BTF_ID_OR_NULL ||
605
+ t == PTR_TO_PERCPU_BTF_ID)
606
+ verbose(env, "%s", kernel_type_name(reg->btf_id));
309607 verbose(env, "(id=%d", reg->id);
608
+ if (reg_type_may_be_refcounted_or_null(t))
609
+ verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
310610 if (t != SCALAR_VALUE)
311611 verbose(env, ",off=%d", reg->off);
312612 if (type_is_pkt_pointer(t))
....@@ -344,77 +644,189 @@
344644 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
345645 verbose(env, ",var_off=%s", tn_buf);
346646 }
647
+ if (reg->s32_min_value != reg->smin_value &&
648
+ reg->s32_min_value != S32_MIN)
649
+ verbose(env, ",s32_min_value=%d",
650
+ (int)(reg->s32_min_value));
651
+ if (reg->s32_max_value != reg->smax_value &&
652
+ reg->s32_max_value != S32_MAX)
653
+ verbose(env, ",s32_max_value=%d",
654
+ (int)(reg->s32_max_value));
655
+ if (reg->u32_min_value != reg->umin_value &&
656
+ reg->u32_min_value != U32_MIN)
657
+ verbose(env, ",u32_min_value=%d",
658
+ (int)(reg->u32_min_value));
659
+ if (reg->u32_max_value != reg->umax_value &&
660
+ reg->u32_max_value != U32_MAX)
661
+ verbose(env, ",u32_max_value=%d",
662
+ (int)(reg->u32_max_value));
347663 }
348664 verbose(env, ")");
349665 }
350666 }
351667 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
352
- if (state->stack[i].slot_type[0] == STACK_SPILL) {
353
- verbose(env, " fp%d",
354
- (-i - 1) * BPF_REG_SIZE);
355
- print_liveness(env, state->stack[i].spilled_ptr.live);
356
- verbose(env, "=%s",
357
- reg_type_str[state->stack[i].spilled_ptr.type]);
668
+ char types_buf[BPF_REG_SIZE + 1];
669
+ bool valid = false;
670
+ int j;
671
+
672
+ for (j = 0; j < BPF_REG_SIZE; j++) {
673
+ if (state->stack[i].slot_type[j] != STACK_INVALID)
674
+ valid = true;
675
+ types_buf[j] = slot_type_char[
676
+ state->stack[i].slot_type[j]];
358677 }
359
- if (state->stack[i].slot_type[0] == STACK_ZERO)
360
- verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
678
+ types_buf[BPF_REG_SIZE] = 0;
679
+ if (!valid)
680
+ continue;
681
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
682
+ print_liveness(env, state->stack[i].spilled_ptr.live);
683
+ if (is_spilled_reg(&state->stack[i])) {
684
+ reg = &state->stack[i].spilled_ptr;
685
+ t = reg->type;
686
+ verbose(env, "=%s", reg_type_str[t]);
687
+ if (t == SCALAR_VALUE && reg->precise)
688
+ verbose(env, "P");
689
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
690
+ verbose(env, "%lld", reg->var_off.value + reg->off);
691
+ } else {
692
+ verbose(env, "=%s", types_buf);
693
+ }
694
+ }
695
+ if (state->acquired_refs && state->refs[0].id) {
696
+ verbose(env, " refs=%d", state->refs[0].id);
697
+ for (i = 1; i < state->acquired_refs; i++)
698
+ if (state->refs[i].id)
699
+ verbose(env, ",%d", state->refs[i].id);
361700 }
362701 verbose(env, "\n");
363702 }
364703
365
-static int copy_stack_state(struct bpf_func_state *dst,
366
- const struct bpf_func_state *src)
367
-{
368
- if (!src->stack)
369
- return 0;
370
- if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
371
- /* internal bug, make state invalid to reject the program */
372
- memset(dst, 0, sizeof(*dst));
373
- return -EFAULT;
374
- }
375
- memcpy(dst->stack, src->stack,
376
- sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
377
- return 0;
704
+#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \
705
+static int copy_##NAME##_state(struct bpf_func_state *dst, \
706
+ const struct bpf_func_state *src) \
707
+{ \
708
+ if (!src->FIELD) \
709
+ return 0; \
710
+ if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \
711
+ /* internal bug, make state invalid to reject the program */ \
712
+ memset(dst, 0, sizeof(*dst)); \
713
+ return -EFAULT; \
714
+ } \
715
+ memcpy(dst->FIELD, src->FIELD, \
716
+ sizeof(*src->FIELD) * (src->COUNT / SIZE)); \
717
+ return 0; \
378718 }
719
+/* copy_reference_state() */
720
+COPY_STATE_FN(reference, acquired_refs, refs, 1)
721
+/* copy_stack_state() */
722
+COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
723
+#undef COPY_STATE_FN
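For reference, a sketch of what the first instantiation above generates once the preprocessor is done (modulo whitespace); the stack variant is identical with refs/acquired_refs replaced by stack/allocated_stack and SIZE = BPF_REG_SIZE:

	static int copy_reference_state(struct bpf_func_state *dst,
					const struct bpf_func_state *src)
	{
		if (!src->refs)
			return 0;
		if (WARN_ON_ONCE(dst->acquired_refs < src->acquired_refs)) {
			/* internal bug, make state invalid to reject the program */
			memset(dst, 0, sizeof(*dst));
			return -EFAULT;
		}
		memcpy(dst->refs, src->refs,
		       sizeof(*src->refs) * (src->acquired_refs / 1));
		return 0;
	}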
724
+
725
+#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
726
+static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
727
+ bool copy_old) \
728
+{ \
729
+ u32 old_size = state->COUNT; \
730
+ struct bpf_##NAME##_state *new_##FIELD; \
731
+ int slot = size / SIZE; \
732
+ \
733
+ if (size <= old_size || !size) { \
734
+ if (copy_old) \
735
+ return 0; \
736
+ state->COUNT = slot * SIZE; \
737
+ if (!size && old_size) { \
738
+ kfree(state->FIELD); \
739
+ state->FIELD = NULL; \
740
+ } \
741
+ return 0; \
742
+ } \
743
+ new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
744
+ GFP_KERNEL); \
745
+ if (!new_##FIELD) \
746
+ return -ENOMEM; \
747
+ if (copy_old) { \
748
+ if (state->FIELD) \
749
+ memcpy(new_##FIELD, state->FIELD, \
750
+ sizeof(*new_##FIELD) * (old_size / SIZE)); \
751
+ memset(new_##FIELD + old_size / SIZE, 0, \
752
+ sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
753
+ } \
754
+ state->COUNT = slot * SIZE; \
755
+ kfree(state->FIELD); \
756
+ state->FIELD = new_##FIELD; \
757
+ return 0; \
758
+}
759
+/* realloc_reference_state() */
760
+REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
761
+/* realloc_stack_state() */
762
+REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
763
+#undef REALLOC_STATE_FN
379764
380765 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
381766 * make it consume minimal amount of memory. check_stack_write() access from
382767 * the program calls into realloc_func_state() to grow the stack size.
383
- * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
384
- * which this function copies over. It points to corresponding reg in previous
385
- * bpf_verifier_state which is never reallocated
768
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
769
+ * which realloc_stack_state() copies over. It points to previous
770
+ * bpf_verifier_state which is never reallocated.
386771 */
387
-static int realloc_func_state(struct bpf_func_state *state, int size,
388
- bool copy_old)
772
+static int realloc_func_state(struct bpf_func_state *state, int stack_size,
773
+ int refs_size, bool copy_old)
389774 {
390
- u32 old_size = state->allocated_stack;
391
- struct bpf_stack_state *new_stack;
392
- int slot = size / BPF_REG_SIZE;
775
+ int err = realloc_reference_state(state, refs_size, copy_old);
776
+ if (err)
777
+ return err;
778
+ return realloc_stack_state(state, stack_size, copy_old);
779
+}
393780
394
- if (size <= old_size || !size) {
395
- if (copy_old)
781
+/* Acquire a pointer id from the env and update the state->refs to include
782
+ * this new pointer reference.
783
+ * On success, returns a valid pointer id to associate with the register
784
+ * On failure, returns a negative errno.
785
+ */
786
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
787
+{
788
+ struct bpf_func_state *state = cur_func(env);
789
+ int new_ofs = state->acquired_refs;
790
+ int id, err;
791
+
792
+ err = realloc_reference_state(state, state->acquired_refs + 1, true);
793
+ if (err)
794
+ return err;
795
+ id = ++env->id_gen;
796
+ state->refs[new_ofs].id = id;
797
+ state->refs[new_ofs].insn_idx = insn_idx;
798
+
799
+ return id;
800
+}
801
+
802
+/* release function corresponding to acquire_reference_state(). Idempotent. */
803
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
804
+{
805
+ int i, last_idx;
806
+
807
+ last_idx = state->acquired_refs - 1;
808
+ for (i = 0; i < state->acquired_refs; i++) {
809
+ if (state->refs[i].id == ptr_id) {
810
+ if (last_idx && i != last_idx)
811
+ memcpy(&state->refs[i], &state->refs[last_idx],
812
+ sizeof(*state->refs));
813
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
814
+ state->acquired_refs--;
396815 return 0;
397
- state->allocated_stack = slot * BPF_REG_SIZE;
398
- if (!size && old_size) {
399
- kfree(state->stack);
400
- state->stack = NULL;
401816 }
402
- return 0;
403817 }
404
- new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
405
- GFP_KERNEL);
406
- if (!new_stack)
407
- return -ENOMEM;
408
- if (copy_old) {
409
- if (state->stack)
410
- memcpy(new_stack, state->stack,
411
- sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
412
- memset(new_stack + old_size / BPF_REG_SIZE, 0,
413
- sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
414
- }
415
- state->allocated_stack = slot * BPF_REG_SIZE;
416
- kfree(state->stack);
417
- state->stack = new_stack;
818
+ return -EINVAL;
819
+}
820
+
821
+static int transfer_reference_state(struct bpf_func_state *dst,
822
+ struct bpf_func_state *src)
823
+{
824
+ int err = realloc_reference_state(dst, src->acquired_refs, false);
825
+ if (err)
826
+ return err;
827
+ err = copy_reference_state(dst, src);
828
+ if (err)
829
+ return err;
418830 return 0;
419831 }
420832
....@@ -422,8 +834,16 @@
422834 {
423835 if (!state)
424836 return;
837
+ kfree(state->refs);
425838 kfree(state->stack);
426839 kfree(state);
840
+}
841
+
842
+static void clear_jmp_history(struct bpf_verifier_state *state)
843
+{
844
+ kfree(state->jmp_history);
845
+ state->jmp_history = NULL;
846
+ state->jmp_history_cnt = 0;
427847 }
428848
429849 static void free_verifier_state(struct bpf_verifier_state *state,
....@@ -435,6 +855,7 @@
435855 free_func_state(state->frame[i]);
436856 state->frame[i] = NULL;
437857 }
858
+ clear_jmp_history(state);
438859 if (free_self)
439860 kfree(state);
440861 }
....@@ -447,10 +868,14 @@
447868 {
448869 int err;
449870
450
- err = realloc_func_state(dst, src->allocated_stack, false);
871
+ err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
872
+ false);
451873 if (err)
452874 return err;
453
- memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
875
+ memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
876
+ err = copy_reference_state(dst, src);
877
+ if (err)
878
+ return err;
454879 return copy_stack_state(dst, src);
455880 }
456881
....@@ -458,7 +883,17 @@
458883 const struct bpf_verifier_state *src)
459884 {
460885 struct bpf_func_state *dst;
886
+ u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
461887 int i, err;
888
+
889
+ if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
890
+ kfree(dst_state->jmp_history);
891
+ dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
892
+ if (!dst_state->jmp_history)
893
+ return -ENOMEM;
894
+ }
895
+ memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
896
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
462897
463898 /* if dst has more stack frames then src frame, free them */
464899 for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
....@@ -467,6 +902,11 @@
467902 }
468903 dst_state->speculative = src->speculative;
469904 dst_state->curframe = src->curframe;
905
+ dst_state->active_spin_lock = src->active_spin_lock;
906
+ dst_state->branches = src->branches;
907
+ dst_state->parent = src->parent;
908
+ dst_state->first_insn_idx = src->first_insn_idx;
909
+ dst_state->last_insn_idx = src->last_insn_idx;
470910 for (i = 0; i <= src->curframe; i++) {
471911 dst = dst_state->frame[i];
472912 if (!dst) {
....@@ -482,8 +922,25 @@
482922 return 0;
483923 }
484924
925
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
926
+{
927
+ while (st) {
928
+ u32 br = --st->branches;
929
+
930
+ /* WARN_ON(br > 1) technically makes sense here,
931
+ * but see comment in push_stack(), hence:
932
+ */
933
+ WARN_ONCE((int)br < 0,
934
+ "BUG update_branch_counts:branches_to_explore=%d\n",
935
+ br);
936
+ if (br)
937
+ break;
938
+ st = st->parent;
939
+ }
940
+}
941
+
485942 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
486
- int *insn_idx)
943
+ int *insn_idx, bool pop_log)
487944 {
488945 struct bpf_verifier_state *cur = env->cur_state;
489946 struct bpf_verifier_stack_elem *elem, *head = env->head;
....@@ -497,6 +954,8 @@
497954 if (err)
498955 return err;
499956 }
957
+ if (pop_log)
958
+ bpf_vlog_reset(&env->log, head->log_pos);
500959 if (insn_idx)
501960 *insn_idx = head->insn_idx;
502961 if (prev_insn_idx)
....@@ -524,22 +983,36 @@
524983 elem->insn_idx = insn_idx;
525984 elem->prev_insn_idx = prev_insn_idx;
526985 elem->next = env->head;
986
+ elem->log_pos = env->log.len_used;
527987 env->head = elem;
528988 env->stack_size++;
529989 err = copy_verifier_state(&elem->st, cur);
530990 if (err)
531991 goto err;
532992 elem->st.speculative |= speculative;
533
- if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
534
- verbose(env, "BPF program is too complex\n");
993
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
994
+ verbose(env, "The sequence of %d jumps is too complex.\n",
995
+ env->stack_size);
535996 goto err;
997
+ }
998
+ if (elem->st.parent) {
999
+ ++elem->st.parent->branches;
1000
+ /* WARN_ON(branches > 2) technically makes sense here,
1001
+ * but
1002
+ * 1. speculative states will bump 'branches' for non-branch
1003
+ * instructions
1004
+ * 2. is_state_visited() heuristics may decide not to create
1005
+ * a new state for a sequence of branches and all such current
1006
+ * and cloned states will be pointing to a single parent state
1007
+ * which might have large 'branches' count.
1008
+ */
5361009 }
5371010 return &elem->st;
5381011 err:
5391012 free_verifier_state(env->cur_state, true);
5401013 env->cur_state = NULL;
5411014 /* pop all elements and return */
542
- while (!pop_stack(env, NULL, NULL));
1015
+ while (!pop_stack(env, NULL, NULL, false));
5431016 return NULL;
5441017 }
5451018
....@@ -548,7 +1021,23 @@
5481021 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
5491022 };
5501023
551
-static void __mark_reg_not_init(struct bpf_reg_state *reg);
1024
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
1025
+ struct bpf_reg_state *reg);
1026
+
1027
+/* This helper doesn't clear reg->id */
1028
+static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
1029
+{
1030
+ reg->var_off = tnum_const(imm);
1031
+ reg->smin_value = (s64)imm;
1032
+ reg->smax_value = (s64)imm;
1033
+ reg->umin_value = imm;
1034
+ reg->umax_value = imm;
1035
+
1036
+ reg->s32_min_value = (s32)imm;
1037
+ reg->s32_max_value = (s32)imm;
1038
+ reg->u32_min_value = (u32)imm;
1039
+ reg->u32_max_value = (u32)imm;
1040
+}
5521041
5531042 /* Mark the unknown part of a register (variable offset or scalar value) as
5541043 * known to have the value @imm.
....@@ -558,11 +1047,16 @@
5581047 /* Clear id, off, and union(map_ptr, range) */
5591048 memset(((u8 *)reg) + sizeof(reg->type), 0,
5601049 offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
561
- reg->var_off = tnum_const(imm);
562
- reg->smin_value = (s64)imm;
563
- reg->smax_value = (s64)imm;
564
- reg->umin_value = imm;
565
- reg->umax_value = imm;
1050
+ ___mark_reg_known(reg, imm);
1051
+}
1052
+
1053
+static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
1054
+{
1055
+ reg->var_off = tnum_const_subreg(reg->var_off, imm);
1056
+ reg->s32_min_value = (s32)imm;
1057
+ reg->s32_max_value = (s32)imm;
1058
+ reg->u32_min_value = (u32)imm;
1059
+ reg->u32_max_value = (u32)imm;
5661060 }
5671061
5681062 /* Mark the 'variable offset' part of a register as zero. This should be
....@@ -586,7 +1080,7 @@
5861080 verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
5871081 /* Something bad happened, let's kill all regs */
5881082 for (regno = 0; regno < MAX_BPF_REG; regno++)
589
- __mark_reg_not_init(regs + regno);
1083
+ __mark_reg_not_init(env, regs + regno);
5901084 return;
5911085 }
5921086 __mark_reg_known_zero(regs + regno);
....@@ -617,8 +1111,52 @@
6171111 tnum_equals_const(reg->var_off, 0);
6181112 }
6191113
620
-/* Attempts to improve min/max values based on var_off information */
621
-static void __update_reg_bounds(struct bpf_reg_state *reg)
1114
+/* Reset the min/max bounds of a register */
1115
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
1116
+{
1117
+ reg->smin_value = S64_MIN;
1118
+ reg->smax_value = S64_MAX;
1119
+ reg->umin_value = 0;
1120
+ reg->umax_value = U64_MAX;
1121
+
1122
+ reg->s32_min_value = S32_MIN;
1123
+ reg->s32_max_value = S32_MAX;
1124
+ reg->u32_min_value = 0;
1125
+ reg->u32_max_value = U32_MAX;
1126
+}
1127
+
1128
+static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
1129
+{
1130
+ reg->smin_value = S64_MIN;
1131
+ reg->smax_value = S64_MAX;
1132
+ reg->umin_value = 0;
1133
+ reg->umax_value = U64_MAX;
1134
+}
1135
+
1136
+static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
1137
+{
1138
+ reg->s32_min_value = S32_MIN;
1139
+ reg->s32_max_value = S32_MAX;
1140
+ reg->u32_min_value = 0;
1141
+ reg->u32_max_value = U32_MAX;
1142
+}
1143
+
1144
+static void __update_reg32_bounds(struct bpf_reg_state *reg)
1145
+{
1146
+ struct tnum var32_off = tnum_subreg(reg->var_off);
1147
+
1148
+ /* min signed is max(sign bit) | min(other bits) */
1149
+ reg->s32_min_value = max_t(s32, reg->s32_min_value,
1150
+ var32_off.value | (var32_off.mask & S32_MIN));
1151
+ /* max signed is min(sign bit) | max(other bits) */
1152
+ reg->s32_max_value = min_t(s32, reg->s32_max_value,
1153
+ var32_off.value | (var32_off.mask & S32_MAX));
1154
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
1155
+ reg->u32_max_value = min(reg->u32_max_value,
1156
+ (u32)(var32_off.value | var32_off.mask));
1157
+}
1158
+
1159
+static void __update_reg64_bounds(struct bpf_reg_state *reg)
6221160 {
6231161 /* min signed is max(sign bit) | min(other bits) */
6241162 reg->smin_value = max_t(s64, reg->smin_value,
....@@ -631,8 +1169,48 @@
6311169 reg->var_off.value | reg->var_off.mask);
6321170 }
6331171
1172
+static void __update_reg_bounds(struct bpf_reg_state *reg)
1173
+{
1174
+ __update_reg32_bounds(reg);
1175
+ __update_reg64_bounds(reg);
1176
+}
1177
+
6341178 /* Uses signed min/max values to inform unsigned, and vice-versa */
635
-static void __reg_deduce_bounds(struct bpf_reg_state *reg)
1179
+static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
1180
+{
1181
+ /* Learn sign from signed bounds.
1182
+ * If we cannot cross the sign boundary, then signed and unsigned bounds
1183
+ * are the same, so combine. This works even in the negative case, e.g.
1184
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
1185
+ */
1186
+ if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
1187
+ reg->s32_min_value = reg->u32_min_value =
1188
+ max_t(u32, reg->s32_min_value, reg->u32_min_value);
1189
+ reg->s32_max_value = reg->u32_max_value =
1190
+ min_t(u32, reg->s32_max_value, reg->u32_max_value);
1191
+ return;
1192
+ }
1193
+ /* Learn sign from unsigned bounds. Signed bounds cross the sign
1194
+ * boundary, so we must be careful.
1195
+ */
1196
+ if ((s32)reg->u32_max_value >= 0) {
1197
+ /* Positive. We can't learn anything from the smin, but smax
1198
+ * is positive, hence safe.
1199
+ */
1200
+ reg->s32_min_value = reg->u32_min_value;
1201
+ reg->s32_max_value = reg->u32_max_value =
1202
+ min_t(u32, reg->s32_max_value, reg->u32_max_value);
1203
+ } else if ((s32)reg->u32_min_value < 0) {
1204
+ /* Negative. We can't learn anything from the smax, but smin
1205
+ * is negative, hence safe.
1206
+ */
1207
+ reg->s32_min_value = reg->u32_min_value =
1208
+ max_t(u32, reg->s32_min_value, reg->u32_min_value);
1209
+ reg->s32_max_value = reg->u32_max_value;
1210
+ }
1211
+}
1212
+
1213
+static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
6361214 {
6371215 /* Learn sign from signed bounds.
6381216 * If we cannot cross the sign boundary, then signed and unsigned bounds
....@@ -666,25 +1244,112 @@
6661244 }
6671245 }
6681246
1247
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
1248
+{
1249
+ __reg32_deduce_bounds(reg);
1250
+ __reg64_deduce_bounds(reg);
1251
+}
1252
+
6691253 /* Attempts to improve var_off based on unsigned min/max information */
6701254 static void __reg_bound_offset(struct bpf_reg_state *reg)
6711255 {
672
- reg->var_off = tnum_intersect(reg->var_off,
673
- tnum_range(reg->umin_value,
674
- reg->umax_value));
1256
+ struct tnum var64_off = tnum_intersect(reg->var_off,
1257
+ tnum_range(reg->umin_value,
1258
+ reg->umax_value));
1259
+ struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
1260
+ tnum_range(reg->u32_min_value,
1261
+ reg->u32_max_value));
1262
+
1263
+ reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
6751264 }
6761265
677
-/* Reset the min/max bounds of a register */
678
-static void __mark_reg_unbounded(struct bpf_reg_state *reg)
1266
+static void reg_bounds_sync(struct bpf_reg_state *reg)
6791267 {
680
- reg->smin_value = S64_MIN;
681
- reg->smax_value = S64_MAX;
682
- reg->umin_value = 0;
683
- reg->umax_value = U64_MAX;
1268
+ /* We might have learned new bounds from the var_off. */
1269
+ __update_reg_bounds(reg);
1270
+ /* We might have learned something about the sign bit. */
1271
+ __reg_deduce_bounds(reg);
1272
+ /* We might have learned some bits from the bounds. */
1273
+ __reg_bound_offset(reg);
1274
+ /* Intersecting with the old var_off might have improved our bounds
1275
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
1276
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
1277
+ */
1278
+ __update_reg_bounds(reg);
1279
+}
1280
+
1281
+static bool __reg32_bound_s64(s32 a)
1282
+{
1283
+ return a >= 0 && a <= S32_MAX;
1284
+}
1285
+
1286
+static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
1287
+{
1288
+ reg->umin_value = reg->u32_min_value;
1289
+ reg->umax_value = reg->u32_max_value;
1290
+
1291
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
1292
+ * be positive otherwise set to worse case bounds and refine later
1293
+ * from tnum.
1294
+ */
1295
+ if (__reg32_bound_s64(reg->s32_min_value) &&
1296
+ __reg32_bound_s64(reg->s32_max_value)) {
1297
+ reg->smin_value = reg->s32_min_value;
1298
+ reg->smax_value = reg->s32_max_value;
1299
+ } else {
1300
+ reg->smin_value = 0;
1301
+ reg->smax_value = U32_MAX;
1302
+ }
1303
+}
1304
+
1305
+static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
1306
+{
1307
+ /* special case when 64-bit register has upper 32-bit register
1308
+ * zeroed. Typically happens after zext or <<32, >>32 sequence
1309
+ * allowing us to use 32-bit bounds directly,
1310
+ */
1311
+ if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
1312
+ __reg_assign_32_into_64(reg);
1313
+ } else {
1314
+ /* Otherwise the best we can do is push lower 32bit known and
1315
+ * unknown bits into register (var_off set from jmp logic)
1316
+ * then learn as much as possible from the 64-bit tnum
1317
+ * known and unknown bits. The previous smin/smax bounds are
1318
+ * invalid here because of jmp32 compare so mark them unknown
1319
+ * so they do not impact tnum bounds calculation.
1320
+ */
1321
+ __mark_reg64_unbounded(reg);
1322
+ }
1323
+ reg_bounds_sync(reg);
1324
+}
1325
+
1326
+static bool __reg64_bound_s32(s64 a)
1327
+{
1328
+ return a >= S32_MIN && a <= S32_MAX;
1329
+}
1330
+
1331
+static bool __reg64_bound_u32(u64 a)
1332
+{
1333
+ return a >= U32_MIN && a <= U32_MAX;
1334
+}
1335
+
1336
+static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
1337
+{
1338
+ __mark_reg32_unbounded(reg);
1339
+ if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
1340
+ reg->s32_min_value = (s32)reg->smin_value;
1341
+ reg->s32_max_value = (s32)reg->smax_value;
1342
+ }
1343
+ if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
1344
+ reg->u32_min_value = (u32)reg->umin_value;
1345
+ reg->u32_max_value = (u32)reg->umax_value;
1346
+ }
1347
+ reg_bounds_sync(reg);
6841348 }
6851349
6861350 /* Mark a register as having a completely unknown (scalar) value. */
687
-static void __mark_reg_unknown(struct bpf_reg_state *reg)
1351
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
1352
+ struct bpf_reg_state *reg)
6881353 {
6891354 /*
6901355 * Clear type, id, off, and union(map_ptr, range) and
....@@ -694,6 +1359,7 @@
6941359 reg->type = SCALAR_VALUE;
6951360 reg->var_off = tnum_unknown;
6961361 reg->frameno = 0;
1362
+ reg->precise = !env->bpf_capable;
6971363 __mark_reg_unbounded(reg);
6981364 }
6991365
....@@ -704,15 +1370,16 @@
7041370 verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
7051371 /* Something bad happened, let's kill all regs except FP */
7061372 for (regno = 0; regno < BPF_REG_FP; regno++)
707
- __mark_reg_not_init(regs + regno);
1373
+ __mark_reg_not_init(env, regs + regno);
7081374 return;
7091375 }
710
- __mark_reg_unknown(regs + regno);
1376
+ __mark_reg_unknown(env, regs + regno);
7111377 }
7121378
713
-static void __mark_reg_not_init(struct bpf_reg_state *reg)
1379
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
1380
+ struct bpf_reg_state *reg)
7141381 {
715
- __mark_reg_unknown(reg);
1382
+ __mark_reg_unknown(env, reg);
7161383 reg->type = NOT_INIT;
7171384 }
7181385
....@@ -723,12 +1390,26 @@
7231390 verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
7241391 /* Something bad happened, let's kill all regs except FP */
7251392 for (regno = 0; regno < BPF_REG_FP; regno++)
726
- __mark_reg_not_init(regs + regno);
1393
+ __mark_reg_not_init(env, regs + regno);
7271394 return;
7281395 }
729
- __mark_reg_not_init(regs + regno);
1396
+ __mark_reg_not_init(env, regs + regno);
7301397 }
7311398
1399
+static void mark_btf_ld_reg(struct bpf_verifier_env *env,
1400
+ struct bpf_reg_state *regs, u32 regno,
1401
+ enum bpf_reg_type reg_type, u32 btf_id)
1402
+{
1403
+ if (reg_type == SCALAR_VALUE) {
1404
+ mark_reg_unknown(env, regs, regno);
1405
+ return;
1406
+ }
1407
+ mark_reg_known_zero(env, regs, regno);
1408
+ regs[regno].type = PTR_TO_BTF_ID;
1409
+ regs[regno].btf_id = btf_id;
1410
+}
1411
+
1412
+#define DEF_NOT_SUBREG (0)
7321413 static void init_reg_state(struct bpf_verifier_env *env,
7331414 struct bpf_func_state *state)
7341415 {
....@@ -739,16 +1420,13 @@
7391420 mark_reg_not_init(env, regs, i);
7401421 regs[i].live = REG_LIVE_NONE;
7411422 regs[i].parent = NULL;
1423
+ regs[i].subreg_def = DEF_NOT_SUBREG;
7421424 }
7431425
7441426 /* frame pointer */
7451427 regs[BPF_REG_FP].type = PTR_TO_STACK;
7461428 mark_reg_known_zero(env, regs, BPF_REG_FP);
7471429 regs[BPF_REG_FP].frameno = state->frameno;
748
-
749
- /* 1st arg to a function */
750
- regs[BPF_REG_1].type = PTR_TO_CTX;
751
- mark_reg_known_zero(env, regs, BPF_REG_1);
7521430 }
7531431
7541432 #define BPF_MAIN_FUNC (-1)
....@@ -826,13 +1504,10 @@
8261504 continue;
8271505 if (insn[i].src_reg != BPF_PSEUDO_CALL)
8281506 continue;
829
- if (!env->allow_ptr_leaks) {
830
- verbose(env, "function calls to other bpf functions are allowed for root only\n");
1507
+ if (!env->bpf_capable) {
1508
+ verbose(env,
1509
+ "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
8311510 return -EPERM;
832
- }
833
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
834
- verbose(env, "function calls in offloaded programs are not supported yet\n");
835
- return -EINVAL;
8361511 }
8371512 ret = add_subprog(env, i + insn[i].imm + 1);
8381513 if (ret < 0)
....@@ -844,7 +1519,7 @@
8441519 */
8451520 subprog[env->subprog_cnt].start = insn_cnt;
8461521
847
- if (env->log.level > 1)
1522
+ if (env->log.level & BPF_LOG_LEVEL2)
8481523 for (i = 0; i < env->subprog_cnt; i++)
8491524 verbose(env, "func#%d @%d\n", i, subprog[i].start);
8501525
....@@ -854,7 +1529,14 @@
8541529 for (i = 0; i < insn_cnt; i++) {
8551530 u8 code = insn[i].code;
8561531
857
- if (BPF_CLASS(code) != BPF_JMP)
1532
+ if (code == (BPF_JMP | BPF_CALL) &&
1533
+ insn[i].imm == BPF_FUNC_tail_call &&
1534
+ insn[i].src_reg != BPF_PSEUDO_CALL)
1535
+ subprog[cur_subprog].has_tail_call = true;
1536
+ if (BPF_CLASS(code) == BPF_LD &&
1537
+ (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
1538
+ subprog[cur_subprog].has_ld_abs = true;
1539
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
8581540 goto next;
8591541 if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
8601542 goto next;
....@@ -888,21 +1570,165 @@
8881570 */
8891571 static int mark_reg_read(struct bpf_verifier_env *env,
8901572 const struct bpf_reg_state *state,
891
- struct bpf_reg_state *parent)
1573
+ struct bpf_reg_state *parent, u8 flag)
8921574 {
8931575 bool writes = parent == state->parent; /* Observe write marks */
1576
+ int cnt = 0;
8941577
8951578 while (parent) {
8961579 /* if read wasn't screened by an earlier write ... */
8971580 if (writes && state->live & REG_LIVE_WRITTEN)
8981581 break;
1582
+ if (parent->live & REG_LIVE_DONE) {
1583
+ verbose(env, "verifier BUG type %s var_off %lld off %d\n",
1584
+ reg_type_str[parent->type],
1585
+ parent->var_off.value, parent->off);
1586
+ return -EFAULT;
1587
+ }
1588
+ /* The first condition is more likely to be true than the
1589
+ * second, checked it first.
1590
+ */
1591
+ if ((parent->live & REG_LIVE_READ) == flag ||
1592
+ parent->live & REG_LIVE_READ64)
1593
+ /* The parentage chain never changes and
1594
+ * this parent was already marked as LIVE_READ.
1595
+ * There is no need to keep walking the chain again and
1596
+ * keep re-marking all parents as LIVE_READ.
1597
+ * This case happens when the same register is read
1598
+ * multiple times without writes into it in-between.
1599
+ * Also, if parent has the stronger REG_LIVE_READ64 set,
1600
+ * then no need to set the weak REG_LIVE_READ32.
1601
+ */
1602
+ break;
8991603 /* ... then we depend on parent's value */
900
- parent->live |= REG_LIVE_READ;
1604
+ parent->live |= flag;
1605
+ /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
1606
+ if (flag == REG_LIVE_READ64)
1607
+ parent->live &= ~REG_LIVE_READ32;
9011608 state = parent;
9021609 parent = state->parent;
9031610 writes = true;
1611
+ cnt++;
9041612 }
1613
+
1614
+ if (env->longest_mark_read_walk < cnt)
1615
+ env->longest_mark_read_walk = cnt;
9051616 return 0;
1617
+}
1618
+
1619
+/* This function is supposed to be used by the following 32-bit optimization
1620
+ * code only. It returns TRUE if the source or destination register operates
1621
+ * on 64-bit, otherwise return FALSE.
1622
+ */
1623
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
1624
+ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
1625
+{
1626
+ u8 code, class, op;
1627
+
1628
+ code = insn->code;
1629
+ class = BPF_CLASS(code);
1630
+ op = BPF_OP(code);
1631
+ if (class == BPF_JMP) {
1632
+ /* BPF_EXIT for "main" will reach here. Return TRUE
1633
+ * conservatively.
1634
+ */
1635
+ if (op == BPF_EXIT)
1636
+ return true;
1637
+ if (op == BPF_CALL) {
1638
+ /* BPF to BPF call will reach here because of marking
1639
+ * caller saved clobber with DST_OP_NO_MARK for which we
1640
+ * don't care the register def because they are anyway
1641
+ * marked as NOT_INIT already.
1642
+ */
1643
+ if (insn->src_reg == BPF_PSEUDO_CALL)
1644
+ return false;
1645
+ /* Helper call will reach here because of arg type
1646
+ * check, conservatively return TRUE.
1647
+ */
1648
+ if (t == SRC_OP)
1649
+ return true;
1650
+
1651
+ return false;
1652
+ }
1653
+ }
1654
+
1655
+ if (class == BPF_ALU64 || class == BPF_JMP ||
1656
+ /* BPF_END always use BPF_ALU class. */
1657
+ (class == BPF_ALU && op == BPF_END && insn->imm == 64))
1658
+ return true;
1659
+
1660
+ if (class == BPF_ALU || class == BPF_JMP32)
1661
+ return false;
1662
+
1663
+ if (class == BPF_LDX) {
1664
+ if (t != SRC_OP)
1665
+ return BPF_SIZE(code) == BPF_DW;
1666
+ /* LDX source must be ptr. */
1667
+ return true;
1668
+ }
1669
+
1670
+ if (class == BPF_STX) {
1671
+ if (reg->type != SCALAR_VALUE)
1672
+ return true;
1673
+ return BPF_SIZE(code) == BPF_DW;
1674
+ }
1675
+
1676
+ if (class == BPF_LD) {
1677
+ u8 mode = BPF_MODE(code);
1678
+
1679
+ /* LD_IMM64 */
1680
+ if (mode == BPF_IMM)
1681
+ return true;
1682
+
1683
+ /* Both LD_IND and LD_ABS return 32-bit data. */
1684
+ if (t != SRC_OP)
1685
+ return false;
1686
+
1687
+ /* Implicit ctx ptr. */
1688
+ if (regno == BPF_REG_6)
1689
+ return true;
1690
+
1691
+ /* Explicit source could be any width. */
1692
+ return true;
1693
+ }
1694
+
1695
+ if (class == BPF_ST)
1696
+ /* The only source register for BPF_ST is a ptr. */
1697
+ return true;
1698
+
1699
+ /* Conservatively return true at default. */
1700
+ return true;
1701
+}
1702
+
1703
+/* Return TRUE if INSN doesn't have explicit value define. */
1704
+static bool insn_no_def(struct bpf_insn *insn)
1705
+{
1706
+ u8 class = BPF_CLASS(insn->code);
1707
+
1708
+ return (class == BPF_JMP || class == BPF_JMP32 ||
1709
+ class == BPF_STX || class == BPF_ST);
1710
+}
1711
+
1712
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
1713
+static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
1714
+{
1715
+ if (insn_no_def(insn))
1716
+ return false;
1717
+
1718
+ return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
1719
+}
1720
+
1721
+static void mark_insn_zext(struct bpf_verifier_env *env,
1722
+ struct bpf_reg_state *reg)
1723
+{
1724
+ s32 def_idx = reg->subreg_def;
1725
+
1726
+ if (def_idx == DEF_NOT_SUBREG)
1727
+ return;
1728
+
1729
+ env->insn_aux_data[def_idx - 1].zext_dst = true;
1730
+ /* The dst will be zero extended, so won't be sub-register anymore. */
1731
+ reg->subreg_def = DEF_NOT_SUBREG;
9061732 }
9071733
9081734 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
....@@ -910,34 +1736,631 @@
9101736 {
9111737 struct bpf_verifier_state *vstate = env->cur_state;
9121738 struct bpf_func_state *state = vstate->frame[vstate->curframe];
913
- struct bpf_reg_state *regs = state->regs;
1739
+ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
1740
+ struct bpf_reg_state *reg, *regs = state->regs;
1741
+ bool rw64;
9141742
9151743 if (regno >= MAX_BPF_REG) {
9161744 verbose(env, "R%d is invalid\n", regno);
9171745 return -EINVAL;
9181746 }
9191747
1748
+ reg = &regs[regno];
1749
+ rw64 = is_reg64(env, insn, regno, reg, t);
9201750 if (t == SRC_OP) {
9211751 /* check whether register used as source operand can be read */
922
- if (regs[regno].type == NOT_INIT) {
1752
+ if (reg->type == NOT_INIT) {
9231753 verbose(env, "R%d !read_ok\n", regno);
9241754 return -EACCES;
9251755 }
9261756 /* We don't need to worry about FP liveness because it's read-only */
927
- if (regno != BPF_REG_FP)
928
- return mark_reg_read(env, &regs[regno],
929
- regs[regno].parent);
1757
+ if (regno == BPF_REG_FP)
1758
+ return 0;
1759
+
1760
+ if (rw64)
1761
+ mark_insn_zext(env, reg);
1762
+
1763
+ return mark_reg_read(env, reg, reg->parent,
1764
+ rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
9301765 } else {
9311766 /* check whether register used as dest operand can be written to */
9321767 if (regno == BPF_REG_FP) {
9331768 verbose(env, "frame pointer is read only\n");
9341769 return -EACCES;
9351770 }
936
- regs[regno].live |= REG_LIVE_WRITTEN;
1771
+ reg->live |= REG_LIVE_WRITTEN;
1772
+ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
9371773 if (t == DST_OP)
9381774 mark_reg_unknown(env, regs, regno);
9391775 }
9401776 return 0;
1777
+}
1778
+
1779
+/* for any branch, call, exit record the history of jmps in the given state */
1780
+static int push_jmp_history(struct bpf_verifier_env *env,
1781
+ struct bpf_verifier_state *cur)
1782
+{
1783
+ u32 cnt = cur->jmp_history_cnt;
1784
+ struct bpf_idx_pair *p;
1785
+
1786
+ cnt++;
1787
+ p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
1788
+ if (!p)
1789
+ return -ENOMEM;
1790
+ p[cnt - 1].idx = env->insn_idx;
1791
+ p[cnt - 1].prev_idx = env->prev_insn_idx;
1792
+ cur->jmp_history = p;
1793
+ cur->jmp_history_cnt = cnt;
1794
+ return 0;
1795
+}
1796
+
1797
+/* Backtrack one insn at a time. If idx is not at the top of recorded
1798
+ * history then previous instruction came from straight line execution.
1799
+ */
1800
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
1801
+ u32 *history)
1802
+{
1803
+ u32 cnt = *history;
1804
+
1805
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
1806
+ i = st->jmp_history[cnt - 1].prev_idx;
1807
+ (*history)--;
1808
+ } else {
1809
+ i--;
1810
+ }
1811
+ return i;
1812
+}
1813
+
1814
+/* For given verifier state backtrack_insn() is called from the last insn to
1815
+ * the first insn. Its purpose is to compute a bitmask of registers and
1816
+ * stack slots that needs precision in the parent verifier state.
1817
+ */
1818
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
1819
+ u32 *reg_mask, u64 *stack_mask)
1820
+{
1821
+ const struct bpf_insn_cbs cbs = {
1822
+ .cb_print = verbose,
1823
+ .private_data = env,
1824
+ };
1825
+ struct bpf_insn *insn = env->prog->insnsi + idx;
1826
+ u8 class = BPF_CLASS(insn->code);
1827
+ u8 opcode = BPF_OP(insn->code);
1828
+ u8 mode = BPF_MODE(insn->code);
1829
+ u32 dreg = 1u << insn->dst_reg;
1830
+ u32 sreg = 1u << insn->src_reg;
1831
+ u32 spi;
1832
+
1833
+ if (insn->code == 0)
1834
+ return 0;
1835
+ if (env->log.level & BPF_LOG_LEVEL) {
1836
+ verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
1837
+ verbose(env, "%d: ", idx);
1838
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
1839
+ }
1840
+
1841
+ if (class == BPF_ALU || class == BPF_ALU64) {
1842
+ if (!(*reg_mask & dreg))
1843
+ return 0;
1844
+ if (opcode == BPF_MOV) {
1845
+ if (BPF_SRC(insn->code) == BPF_X) {
1846
+ /* dreg = sreg
1847
+ * dreg needs precision after this insn
1848
+ * sreg needs precision before this insn
1849
+ */
1850
+ *reg_mask &= ~dreg;
1851
+ *reg_mask |= sreg;
1852
+ } else {
1853
+ /* dreg = K
1854
+ * dreg needs precision after this insn.
1855
+ * Corresponding register is already marked
1856
+ * as precise=true in this verifier state.
1857
+ * No further markings in parent are necessary
1858
+ */
1859
+ *reg_mask &= ~dreg;
1860
+ }
1861
+ } else {
1862
+ if (BPF_SRC(insn->code) == BPF_X) {
1863
+ /* dreg += sreg
1864
+ * both dreg and sreg need precision
1865
+ * before this insn
1866
+ */
1867
+ *reg_mask |= sreg;
1868
+ } /* else dreg += K
1869
+ * dreg still needs precision before this insn
1870
+ */
1871
+ }
1872
+ } else if (class == BPF_LDX) {
1873
+ if (!(*reg_mask & dreg))
1874
+ return 0;
1875
+ *reg_mask &= ~dreg;
1876
+
1877
+ /* scalars can only be spilled into stack w/o losing precision.
1878
+ * Load from any other memory can be zero extended.
1879
+ * The desire to keep that precision is already indicated
1880
+ * by 'precise' mark in corresponding register of this state.
1881
+ * No further tracking necessary.
1882
+ */
1883
+ if (insn->src_reg != BPF_REG_FP)
1884
+ return 0;
1885
+
1886
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
1887
+ * that [fp - off] slot contains scalar that needs to be
1888
+ * tracked with precision
1889
+ */
1890
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
1891
+ if (spi >= 64) {
1892
+ verbose(env, "BUG spi %d\n", spi);
1893
+ WARN_ONCE(1, "verifier backtracking bug");
1894
+ return -EFAULT;
1895
+ }
1896
+ *stack_mask |= 1ull << spi;
1897
+ } else if (class == BPF_STX || class == BPF_ST) {
1898
+ if (*reg_mask & dreg)
1899
+ /* stx & st shouldn't be using _scalar_ dst_reg
1900
+ * to access memory. It means backtracking
1901
+ * encountered a case of pointer subtraction.
1902
+ */
1903
+ return -ENOTSUPP;
1904
+ /* scalars can only be spilled into stack */
1905
+ if (insn->dst_reg != BPF_REG_FP)
1906
+ return 0;
1907
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
1908
+ if (spi >= 64) {
1909
+ verbose(env, "BUG spi %d\n", spi);
1910
+ WARN_ONCE(1, "verifier backtracking bug");
1911
+ return -EFAULT;
1912
+ }
1913
+ if (!(*stack_mask & (1ull << spi)))
1914
+ return 0;
1915
+ *stack_mask &= ~(1ull << spi);
1916
+ if (class == BPF_STX)
1917
+ *reg_mask |= sreg;
1918
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
1919
+ if (opcode == BPF_CALL) {
1920
+ if (insn->src_reg == BPF_PSEUDO_CALL)
1921
+ return -ENOTSUPP;
1922
+ /* regular helper call sets R0 */
1923
+ *reg_mask &= ~1;
1924
+ if (*reg_mask & 0x3f) {
1925
+ /* if backtracing was looking for registers R1-R5
1926
+ * they should have been found already.
1927
+ */
1928
+ verbose(env, "BUG regs %x\n", *reg_mask);
1929
+ WARN_ONCE(1, "verifier backtracking bug");
1930
+ return -EFAULT;
1931
+ }
1932
+ } else if (opcode == BPF_EXIT) {
1933
+ return -ENOTSUPP;
1934
+ } else if (BPF_SRC(insn->code) == BPF_X) {
1935
+ if (!(*reg_mask & (dreg | sreg)))
1936
+ return 0;
1937
+ /* dreg <cond> sreg
1938
+ * Both dreg and sreg need precision before
1939
+ * this insn. If only sreg was marked precise
1940
+ * before it would be equally necessary to
1941
+ * propagate it to dreg.
1942
+ */
1943
+ *reg_mask |= (sreg | dreg);
1944
+ /* else dreg <cond> K
1945
+ * Only dreg still needs precision before
1946
+ * this insn, so for the K-based conditional
1947
+ * there is nothing new to be marked.
1948
+ */
1949
+ }
1950
+ } else if (class == BPF_LD) {
1951
+ if (!(*reg_mask & dreg))
1952
+ return 0;
1953
+ *reg_mask &= ~dreg;
1954
+ /* It's ld_imm64 or ld_abs or ld_ind.
1955
+ * For ld_imm64 no further tracking of precision
1956
+ * into parent is necessary
1957
+ */
1958
+ if (mode == BPF_IND || mode == BPF_ABS)
1959
+ /* to be analyzed */
1960
+ return -ENOTSUPP;
1961
+ }
1962
+ return 0;
1963
+}
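As a stand-alone illustration of the mask bookkeeping above (a hedged sketch, not part of the patch): bit i of reg_mask means "ri must be precise before the instruction being walked", so a "dreg += sreg" keeps the destination's request and adds the source, while a "dreg = K" discharges the request entirely.

#include <stdint.h>
#include <stdio.h>

/* Toy model of two of the cases handled above. Bit i set in *reg_mask
 * means "ri must be precise before this instruction". Illustrative only.
 */
static void backtrack_mov_imm(uint32_t *reg_mask, int dst)
{
	*reg_mask &= ~(1u << dst);	/* dreg = K: value fully known here */
}

static void backtrack_alu_reg(uint32_t *reg_mask, int dst, int src)
{
	if (*reg_mask & (1u << dst))
		*reg_mask |= 1u << src;	/* dreg += sreg: both needed before */
}

int main(void)
{
	uint32_t reg_mask = 1u << 6;		/* backtracking wants r6 precise */

	backtrack_alu_reg(&reg_mask, 6, 7);	/* walked back over "r6 += r7" */
	backtrack_mov_imm(&reg_mask, 7);	/* walked back over "r7 = 5"    */
	printf("reg_mask = %#x\n", (unsigned int)reg_mask);	/* 0x40: only r6 still open */
	return 0;
}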
1964
+
1965
+/* the scalar precision tracking algorithm:
1966
+ * . at the start all registers have precise=false.
1967
+ * . scalar ranges are tracked as normal through alu and jmp insns.
1968
+ * . once precise value of the scalar register is used in:
1969
+ * . ptr + scalar alu
1970
+ * . if (scalar cond K|scalar)
1971
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
1972
+ * backtrack through the verifier states and mark all registers and
1973
+ * stack slots with spilled constants that these scalar registers
1974
+ * should be precise.
1975
+ * . during state pruning two registers (or spilled stack slots)
1976
+ * are equivalent if both are not precise.
1977
+ *
1978
+ * Note the verifier cannot simply walk register parentage chain,
1979
+ * since many different registers and stack slots could have been
1980
+ * used to compute a single precise scalar.
1981
+ *
1982
+ * The approach of starting with precise=true for all registers and then
1983
+ * backtrack to mark a register as not precise when the verifier detects
1984
+ * that program doesn't care about specific value (e.g., when helper
1985
+ * takes register as ARG_ANYTHING parameter) is not safe.
1986
+ *
1987
+ * It's ok to walk single parentage chain of the verifier states.
1988
+ * It's possible that this backtracking will go all the way till 1st insn.
1989
+ * All other branches will be explored for needing precision later.
1990
+ *
1991
+ * The backtracking needs to deal with cases like:
1992
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
1993
+ * r9 -= r8
1994
+ * r5 = r9
1995
+ * if r5 > 0x79f goto pc+7
1996
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
1997
+ * r5 += 1
1998
+ * ...
1999
+ * call bpf_perf_event_output#25
2000
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
2001
+ *
2002
+ * and this case:
2003
+ * r6 = 1
2004
+ * call foo // uses callee's r6 inside to compute r0
2005
+ * r0 += r6
2006
+ * if r0 == 0 goto
2007
+ *
2008
+ * to track the above, reg_mask/stack_mask need to be independent for each frame.
2009
+ *
2010
+ * Also if parent's curframe > frame where backtracking started,
2011
+ * the verifier needs to mark registers in both frames, otherwise callees
2012
+ * may incorrectly prune callers. This is similar to
2013
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
2014
+ *
2015
+ * For now backtracking falls back into conservative marking.
2016
+ */
2017
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
2018
+ struct bpf_verifier_state *st)
2019
+{
2020
+ struct bpf_func_state *func;
2021
+ struct bpf_reg_state *reg;
2022
+ int i, j;
2023
+
2024
+ /* big hammer: mark all scalars precise in this path.
2025
+ * pop_stack may still get !precise scalars.
2026
+ * We also skip current state and go straight to first parent state,
2027
+ * because precision markings in current non-checkpointed state are
2028
+ * not needed. See why in the comment in __mark_chain_precision below.
2029
+ */
2030
+ for (st = st->parent; st; st = st->parent) {
2031
+ for (i = 0; i <= st->curframe; i++) {
2032
+ func = st->frame[i];
2033
+ for (j = 0; j < BPF_REG_FP; j++) {
2034
+ reg = &func->regs[j];
2035
+ if (reg->type != SCALAR_VALUE)
2036
+ continue;
2037
+ reg->precise = true;
2038
+ }
2039
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
2040
+ if (!is_spilled_reg(&func->stack[j]))
2041
+ continue;
2042
+ reg = &func->stack[j].spilled_ptr;
2043
+ if (reg->type != SCALAR_VALUE)
2044
+ continue;
2045
+ reg->precise = true;
2046
+ }
2047
+ }
2048
+ }
2049
+}
2050
+
2051
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
2052
+{
2053
+ struct bpf_func_state *func;
2054
+ struct bpf_reg_state *reg;
2055
+ int i, j;
2056
+
2057
+ for (i = 0; i <= st->curframe; i++) {
2058
+ func = st->frame[i];
2059
+ for (j = 0; j < BPF_REG_FP; j++) {
2060
+ reg = &func->regs[j];
2061
+ if (reg->type != SCALAR_VALUE)
2062
+ continue;
2063
+ reg->precise = false;
2064
+ }
2065
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
2066
+ if (!is_spilled_reg(&func->stack[j]))
2067
+ continue;
2068
+ reg = &func->stack[j].spilled_ptr;
2069
+ if (reg->type != SCALAR_VALUE)
2070
+ continue;
2071
+ reg->precise = false;
2072
+ }
2073
+ }
2074
+}
2075
+
2076
+/*
2077
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
2078
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
2079
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
2080
+ * SCALARS, as well as any other registers and slots that contribute to
2081
+ * a tracked state of given registers/stack slots, depending on specific BPF
2082
+ * assembly instructions (see backtrack_insns() for exact instruction handling
2083
+ * logic). This backtracking relies on recorded jmp_history and is able to
2084
+ * traverse entire chain of parent states. This process ends only when all the
2085
+ * necessary registers/slots and their transitive dependencies are marked as
2086
+ * precise.
2087
+ *
2088
+ * One important and subtle aspect is that precise marks *do not matter* in
2089
+ * the currently verified state (current state). It is important to understand
2090
+ * why this is the case.
2091
+ *
2092
+ * First, note that current state is the state that is not yet "checkpointed",
2093
+ * i.e., it is not yet put into env->explored_states, and it has no children
2094
+ * states as well. It's ephemeral, and can end up either a) being discarded if
2095
+ * compatible explored state is found at some point or BPF_EXIT instruction is
2096
+ * reached or b) checkpointed and put into env->explored_states, branching out
2097
+ * into one or more children states.
2098
+ *
2099
+ * In the former case, precise markings in current state are completely
2100
+ * ignored by state comparison code (see regsafe() for details). Only
2101
+ * checkpointed ("old") state precise markings are important, and if old
2102
+ * state's register/slot is precise, regsafe() assumes current state's
2103
+ * register/slot as precise and checks value ranges exactly and precisely. If
2104
+ * states turn out to be compatible, current state's necessary precise
2105
+ * markings and any required parent states' precise markings are enforced
2106
+ * after the fact with propagate_precision() logic. But it's
2107
+ * important to realize that in this case, even after marking current state
2108
+ * registers/slots as precise, we immediately discard current state. So what
2109
+ * actually matters is any of the precise markings propagated into current
2110
+ * state's parent states, which are always checkpointed (due to b) case above).
2111
+ * As such, for scenario a) it doesn't matter if current state has precise
2112
+ * markings set or not.
2113
+ *
2114
+ * Now, for the scenario b), checkpointing and forking into child(ren)
2115
+ * state(s). Note that before current state gets to checkpointing step, any
2116
+ * processed instruction always assumes precise SCALAR register/slot
2117
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
2118
+ * verifier takes this opportunity enthusiastically. Similarly, when
2119
+ * register's value is used to calculate offset or memory address, exact
2120
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
2121
+ * what we mentioned above about state comparison ignoring precise markings
2122
+ * during state comparison, BPF verifier ignores and also assumes precise
2123
+ * markings *at will* during instruction verification process. But as verifier
2124
+ * assumes precision, it also propagates any precision dependencies across
2125
+ * parent states, which are not yet finalized, so can be further restricted
2126
+ * based on new knowledge gained from restrictions enforced by their children
2127
+ * states. This is so that once those parent states are finalized, i.e., when
2128
+ * they have no more active children state, state comparison logic in
2129
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
2130
+ * required for correctness.
2131
+ *
2132
+ * To build a bit more intuition, note also that once a state is checkpointed,
2133
+ * the path we took to get to that state is not important. This is crucial
2134
+ * property for state pruning. When state is checkpointed and finalized at
2135
+ * some instruction index, it can be correctly and safely used to "short
2136
+ * circuit" any *compatible* state that reaches exactly the same instruction
2137
+ * index. I.e., if we jumped to that instruction from a completely different
2138
+ * code path than original finalized state was derived from, it doesn't
2139
+ * matter, current state can be discarded because from that instruction
2140
+ * forward having a compatible state will ensure we will safely reach the
2141
+ * exit. States describe preconditions for further exploration, but completely
2142
+ * forget the history of how we got here.
2143
+ *
2144
+ * This also means that even if we needed precise SCALAR range to get to
2145
+ * finalized state, but from that point forward *that same* SCALAR register is
2146
+ * never used in a precise context (i.e., its precise value is not needed for
2147
+ * correctness), it's correct and safe to mark such register as "imprecise"
2148
+ * (i.e., precise marking set to false). This is what we rely on when we do
2149
+ * not set precise marking in current state. If no child state requires
2150
+ * precision for any given SCALAR register, it's safe to dictate that it can
2151
+ * be imprecise. If any child state does require this register to be precise,
2152
+ * we'll mark it precise later retroactively during precise markings
2153
+ * propagation from child state to parent states.
2154
+ *
2155
+ * Skipping precise marking setting in current state is a mild version of
2156
+ * relying on the above observation. But we can utilize this property even
2157
+ * more aggressively by proactively forgetting any precise marking in the
2158
+ * current state (which we inherited from the parent state), right before we
2159
+ * checkpoint it and branch off into new child state. This is done by
2160
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
2161
+ * finalized states which help in short circuiting more future states.
2162
+ */
2163
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
2164
+ int spi)
2165
+{
2166
+ struct bpf_verifier_state *st = env->cur_state;
2167
+ int first_idx = st->first_insn_idx;
2168
+ int last_idx = env->insn_idx;
2169
+ struct bpf_func_state *func;
2170
+ struct bpf_reg_state *reg;
2171
+ u32 reg_mask = regno >= 0 ? 1u << regno : 0;
2172
+ u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
2173
+ bool skip_first = true;
2174
+ bool new_marks = false;
2175
+ int i, err;
2176
+
2177
+ if (!env->bpf_capable)
2178
+ return 0;
2179
+
2180
+ /* Do sanity checks against current state of register and/or stack
2181
+ * slot, but don't set precise flag in current state, as precision
2182
+ * tracking in the current state is unnecessary.
2183
+ */
2184
+ func = st->frame[frame];
2185
+ if (regno >= 0) {
2186
+ reg = &func->regs[regno];
2187
+ if (reg->type != SCALAR_VALUE) {
2188
+ WARN_ONCE(1, "backtracing misuse");
2189
+ return -EFAULT;
2190
+ }
2191
+ new_marks = true;
2192
+ }
2193
+
2194
+ while (spi >= 0) {
2195
+ if (!is_spilled_reg(&func->stack[spi])) {
2196
+ stack_mask = 0;
2197
+ break;
2198
+ }
2199
+ reg = &func->stack[spi].spilled_ptr;
2200
+ if (reg->type != SCALAR_VALUE) {
2201
+ stack_mask = 0;
2202
+ break;
2203
+ }
2204
+ new_marks = true;
2205
+ break;
2206
+ }
2207
+
2208
+ if (!new_marks)
2209
+ return 0;
2210
+ if (!reg_mask && !stack_mask)
2211
+ return 0;
2212
+
2213
+ for (;;) {
2214
+ DECLARE_BITMAP(mask, 64);
2215
+ u32 history = st->jmp_history_cnt;
2216
+
2217
+ if (env->log.level & BPF_LOG_LEVEL)
2218
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
2219
+
2220
+ if (last_idx < 0) {
2221
+ /* we are at the entry into subprog, which
2222
+ * is expected for global funcs, but only if
2223
+ * requested precise registers are R1-R5
2224
+ * (which are global func's input arguments)
2225
+ */
2226
+ if (st->curframe == 0 &&
2227
+ st->frame[0]->subprogno > 0 &&
2228
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
2229
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
2230
+ bitmap_from_u64(mask, reg_mask);
2231
+ for_each_set_bit(i, mask, 32) {
2232
+ reg = &st->frame[0]->regs[i];
2233
+ if (reg->type != SCALAR_VALUE) {
2234
+ reg_mask &= ~(1u << i);
2235
+ continue;
2236
+ }
2237
+ reg->precise = true;
2238
+ }
2239
+ return 0;
2240
+ }
2241
+
2242
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
2243
+ st->frame[0]->subprogno, reg_mask, stack_mask);
2244
+ WARN_ONCE(1, "verifier backtracking bug");
2245
+ return -EFAULT;
2246
+ }
2247
+
2248
+ for (i = last_idx;;) {
2249
+ if (skip_first) {
2250
+ err = 0;
2251
+ skip_first = false;
2252
+ } else {
2253
+ err = backtrack_insn(env, i, &reg_mask, &stack_mask);
2254
+ }
2255
+ if (err == -ENOTSUPP) {
2256
+ mark_all_scalars_precise(env, st);
2257
+ return 0;
2258
+ } else if (err) {
2259
+ return err;
2260
+ }
2261
+ if (!reg_mask && !stack_mask)
2262
+ /* Found assignment(s) into tracked register in this state.
2263
+ * Since this state is already marked, just return.
2264
+ * Nothing to be tracked further in the parent state.
2265
+ */
2266
+ return 0;
2267
+ if (i == first_idx)
2268
+ break;
2269
+ i = get_prev_insn_idx(st, i, &history);
2270
+ if (i >= env->prog->len) {
2271
+ /* This can happen if backtracking reached insn 0
2272
+ * and there are still reg_mask or stack_mask
2273
+ * to backtrack.
2274
+ * It means the backtracking missed the spot where
2275
+ * a particular register was initialized with a constant.
2276
+ */
2277
+ verbose(env, "BUG backtracking idx %d\n", i);
2278
+ WARN_ONCE(1, "verifier backtracking bug");
2279
+ return -EFAULT;
2280
+ }
2281
+ }
2282
+ st = st->parent;
2283
+ if (!st)
2284
+ break;
2285
+
2286
+ new_marks = false;
2287
+ func = st->frame[frame];
2288
+ bitmap_from_u64(mask, reg_mask);
2289
+ for_each_set_bit(i, mask, 32) {
2290
+ reg = &func->regs[i];
2291
+ if (reg->type != SCALAR_VALUE) {
2292
+ reg_mask &= ~(1u << i);
2293
+ continue;
2294
+ }
2295
+ if (!reg->precise)
2296
+ new_marks = true;
2297
+ reg->precise = true;
2298
+ }
2299
+
2300
+ bitmap_from_u64(mask, stack_mask);
2301
+ for_each_set_bit(i, mask, 64) {
2302
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
2303
+ /* the sequence of instructions:
2304
+ * 2: (bf) r3 = r10
2305
+ * 3: (7b) *(u64 *)(r3 -8) = r0
2306
+ * 4: (79) r4 = *(u64 *)(r10 -8)
2307
+ * doesn't contain jmps. It's backtracked
2308
+ * as a single block.
2309
+ * During backtracking insn 3 is not recognized as
2310
+ * stack access, so at the end of backtracking
2311
+ * stack slot fp-8 is still marked in stack_mask.
2312
+ * However the parent state may not have accessed
2313
+ * fp-8 and it's "unallocated" stack space.
2314
+ * In such a case, fall back to conservative marking.
2315
+ */
2316
+ mark_all_scalars_precise(env, st);
2317
+ return 0;
2318
+ }
2319
+
2320
+ if (!is_spilled_reg(&func->stack[i])) {
2321
+ stack_mask &= ~(1ull << i);
2322
+ continue;
2323
+ }
2324
+ reg = &func->stack[i].spilled_ptr;
2325
+ if (reg->type != SCALAR_VALUE) {
2326
+ stack_mask &= ~(1ull << i);
2327
+ continue;
2328
+ }
2329
+ if (!reg->precise)
2330
+ new_marks = true;
2331
+ reg->precise = true;
2332
+ }
2333
+ if (env->log.level & BPF_LOG_LEVEL) {
2334
+ print_verifier_state(env, func);
2335
+ verbose(env, "parent %s regs=%x stack=%llx marks\n",
2336
+ new_marks ? "didn't have" : "already had",
2337
+ reg_mask, stack_mask);
2338
+ }
2339
+
2340
+ if (!reg_mask && !stack_mask)
2341
+ break;
2342
+ if (!new_marks)
2343
+ break;
2344
+
2345
+ last_idx = st->last_insn_idx;
2346
+ first_idx = st->first_insn_idx;
2347
+ }
2348
+ return 0;
2349
+}
2350
+
2351
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
2352
+{
2353
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
2354
+}
2355
+
2356
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
2357
+{
2358
+ return __mark_chain_precision(env, frame, regno, -1);
2359
+}
2360
+
2361
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
2362
+{
2363
+ return __mark_chain_precision(env, frame, -1, spi);
9412364 }
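For intuition, a hedged walkthrough in the notation the comments above already use (instruction indexes and registers are invented):

  3: r6 = 8
  4: *(u64 *)(r10 - 16) = r6
  ...
  9: r3 = *(u64 *)(r10 - 16)
 10: r2 += r3

Insn 4 spills r6, insn 9 fills r3 from the same slot, and at insn 10 r3 is added to a map value pointer, so its exact value matters. A call like mark_chain_precision(env, BPF_REG_3) then walks backwards: insn 9 moves the request from r3 in reg_mask to the fp-16 slot in stack_mask, insn 4 moves it from that slot to r6, and insn 3 assigns a constant to r6, emptying both masks and ending the walk. Registers and slots that are still pending when the walk crosses into a checkpointed parent state are marked precise=true there.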
9422365
9432366 static bool is_spillable_regtype(enum bpf_reg_type type)
....@@ -950,7 +2373,24 @@
9502373 case PTR_TO_PACKET:
9512374 case PTR_TO_PACKET_META:
9522375 case PTR_TO_PACKET_END:
2376
+ case PTR_TO_FLOW_KEYS:
9532377 case CONST_PTR_TO_MAP:
2378
+ case PTR_TO_SOCKET:
2379
+ case PTR_TO_SOCKET_OR_NULL:
2380
+ case PTR_TO_SOCK_COMMON:
2381
+ case PTR_TO_SOCK_COMMON_OR_NULL:
2382
+ case PTR_TO_TCP_SOCK:
2383
+ case PTR_TO_TCP_SOCK_OR_NULL:
2384
+ case PTR_TO_XDP_SOCK:
2385
+ case PTR_TO_BTF_ID:
2386
+ case PTR_TO_BTF_ID_OR_NULL:
2387
+ case PTR_TO_RDONLY_BUF:
2388
+ case PTR_TO_RDONLY_BUF_OR_NULL:
2389
+ case PTR_TO_RDWR_BUF:
2390
+ case PTR_TO_RDWR_BUF_OR_NULL:
2391
+ case PTR_TO_PERCPU_BTF_ID:
2392
+ case PTR_TO_MEM:
2393
+ case PTR_TO_MEM_OR_NULL:
9542394 return true;
9552395 default:
9562396 return false;
....@@ -968,31 +2408,80 @@
9682408 return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
9692409 }
9702410
2411
+static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
2412
+{
2413
+ return tnum_is_unknown(reg->var_off) &&
2414
+ reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
2415
+ reg->umin_value == 0 && reg->umax_value == U64_MAX &&
2416
+ reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
2417
+ reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
2418
+}
2419
+
2420
+static bool register_is_bounded(struct bpf_reg_state *reg)
2421
+{
2422
+ return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
2423
+}
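A minimal stand-alone sketch of what "completely unbounded" means above (the tnum/var_off test is omitted and the field names only mirror the kernel struct): a freshly loaded scalar has every 64- and 32-bit range at its widest, and tightening any one of them, e.g. by falling through an "if r1 > 100" check, makes register_is_bounded() true.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_reg {
	int64_t  smin, smax;
	uint64_t umin, umax;
	int32_t  s32_min, s32_max;
	uint32_t u32_min, u32_max;
};

static bool toy_is_unbounded(const struct toy_reg *r)
{
	return r->smin == INT64_MIN && r->smax == INT64_MAX &&
	       r->umin == 0 && r->umax == UINT64_MAX &&
	       r->s32_min == INT32_MIN && r->s32_max == INT32_MAX &&
	       r->u32_min == 0 && r->u32_max == UINT32_MAX;
}

int main(void)
{
	struct toy_reg r = {
		INT64_MIN, INT64_MAX, 0, UINT64_MAX,
		INT32_MIN, INT32_MAX, 0, UINT32_MAX,
	};

	printf("fresh scalar unbounded: %d\n", toy_is_unbounded(&r));
	r.umax = 100;	/* e.g. after falling through "if r1 > 100 goto ..." */
	printf("after the guard:        %d\n", toy_is_unbounded(&r));
	return 0;
}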
2424
+
2425
+static bool __is_pointer_value(bool allow_ptr_leaks,
2426
+ const struct bpf_reg_state *reg)
2427
+{
2428
+ if (allow_ptr_leaks)
2429
+ return false;
2430
+
2431
+ return reg->type != SCALAR_VALUE;
2432
+}
2433
+
2434
+/* Copy src state preserving dst->parent and dst->live fields */
2435
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
2436
+{
2437
+ struct bpf_reg_state *parent = dst->parent;
2438
+ enum bpf_reg_liveness live = dst->live;
2439
+
2440
+ *dst = *src;
2441
+ dst->parent = parent;
2442
+ dst->live = live;
2443
+}
2444
+
9712445 static void save_register_state(struct bpf_func_state *state,
972
- int spi, struct bpf_reg_state *reg)
2446
+ int spi, struct bpf_reg_state *reg,
2447
+ int size)
9732448 {
9742449 int i;
9752450
976
- state->stack[spi].spilled_ptr = *reg;
977
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
2451
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
2452
+ if (size == BPF_REG_SIZE)
2453
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
9782454
979
- for (i = 0; i < BPF_REG_SIZE; i++)
980
- state->stack[spi].slot_type[i] = STACK_SPILL;
2455
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
2456
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
2457
+
2458
+ /* size < 8 bytes spill */
2459
+ for (; i; i--)
2460
+ scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
9812461 }
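The byte-granular slot_type[] marking above can be visualized with a small stand-alone sketch (assumptions: BPF_REG_SIZE is 8, 'p'/'m' stand in for STACK_SPILL/STACK_MISC, and scrubbing is simplified to "set to MISC"): a 4-byte spill marks the top four slot_type[] entries as spilled and scrubs the rest.

#include <stdio.h>

#define TOY_REG_SIZE 8
enum { TOY_MISC = 'm', TOY_SPILL = 'p' };

static void toy_mark_spill(char slot_type[TOY_REG_SIZE], int size)
{
	int i;

	for (i = TOY_REG_SIZE; i > TOY_REG_SIZE - size; i--)
		slot_type[i - 1] = TOY_SPILL;
	for (; i; i--)
		slot_type[i - 1] = TOY_MISC;	/* simplified scrub */
}

int main(void)
{
	char slot[TOY_REG_SIZE + 1] = "........";

	toy_mark_spill(slot, 4);	/* e.g. *(u32 *)(r10 - 8) = w1 */
	printf("%s\n", slot);		/* prints "mmmmpppp" */
	return 0;
}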
9822462
983
-/* check_stack_read/write functions track spill/fill of registers,
2463
+static bool is_bpf_st_mem(struct bpf_insn *insn)
2464
+{
2465
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
2466
+}
2467
+
2468
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
9842469 * stack boundary and alignment are checked in check_mem_access()
9852470 */
986
-static int check_stack_write(struct bpf_verifier_env *env,
987
- struct bpf_func_state *state, /* func where register points to */
988
- int off, int size, int value_regno, int insn_idx)
2471
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
2472
+ /* stack frame we're writing to */
2473
+ struct bpf_func_state *state,
2474
+ int off, int size, int value_regno,
2475
+ int insn_idx)
9892476 {
9902477 struct bpf_func_state *cur; /* state of the current function */
9912478 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
2479
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
9922480 struct bpf_reg_state *reg = NULL;
2481
+ u32 dst_reg = insn->dst_reg;
9932482
9942483 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
995
- true);
2484
+ state->acquired_refs, true);
9962485 if (err)
9972486 return err;
9982487 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
....@@ -1008,11 +2497,13 @@
10082497 cur = env->cur_state->frame[env->cur_state->curframe];
10092498 if (value_regno >= 0)
10102499 reg = &cur->regs[value_regno];
1011
- if (!env->allow_ptr_leaks) {
2500
+ if (!env->bypass_spec_v4) {
10122501 bool sanitize = reg && is_spillable_regtype(reg->type);
10132502
10142503 for (i = 0; i < size; i++) {
1015
- if (state->stack[spi].slot_type[i] == STACK_INVALID) {
2504
+ u8 type = state->stack[spi].slot_type[i];
2505
+
2506
+ if (type != STACK_MISC && type != STACK_ZERO) {
10162507 sanitize = true;
10172508 break;
10182509 }
....@@ -1022,12 +2513,34 @@
10222513 env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
10232514 }
10242515
1025
- if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
1026
- !register_is_null(reg) && env->allow_ptr_leaks) {
1027
- save_register_state(state, spi, reg);
2516
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
2517
+ !register_is_null(reg) && env->bpf_capable) {
2518
+ if (dst_reg != BPF_REG_FP) {
2519
+ /* The backtracking logic can only recognize explicit
2520
+ * stack slot address like [fp - 8]. Other spill of
2521
+ * scalar via a different register has to be conservative.
2522
+ * Backtrack from here and mark all registers as precise
2523
+ * that contributed into 'reg' being a constant.
2524
+ */
2525
+ err = mark_chain_precision(env, value_regno);
2526
+ if (err)
2527
+ return err;
2528
+ }
2529
+ save_register_state(state, spi, reg, size);
2530
+ /* Break the relation on a narrowing spill. */
2531
+ if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
2532
+ state->stack[spi].spilled_ptr.id = 0;
2533
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
2534
+ insn->imm != 0 && env->bpf_capable) {
2535
+ struct bpf_reg_state fake_reg = {};
2536
+
2537
+ __mark_reg_known(&fake_reg, (u32)insn->imm);
2538
+ fake_reg.type = SCALAR_VALUE;
2539
+ save_register_state(state, spi, &fake_reg, size);
10282540 } else if (reg && is_spillable_regtype(reg->type)) {
10292541 /* register containing pointer is being spilled into stack */
10302542 if (size != BPF_REG_SIZE) {
2543
+ verbose_linfo(env, insn_idx, "; ");
10312544 verbose(env, "invalid size of register spill\n");
10322545 return -EACCES;
10332546 }
....@@ -1035,16 +2548,16 @@
10352548 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
10362549 return -EINVAL;
10372550 }
1038
- save_register_state(state, spi, reg);
2551
+ save_register_state(state, spi, reg, size);
10392552 } else {
10402553 u8 type = STACK_MISC;
10412554
10422555 /* regular write of data into stack destroys any spilled ptr */
10432556 state->stack[spi].spilled_ptr.type = NOT_INIT;
10442557 /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
1045
- if (state->stack[spi].slot_type[0] == STACK_SPILL)
2558
+ if (is_spilled_reg(&state->stack[spi]))
10462559 for (i = 0; i < BPF_REG_SIZE; i++)
1047
- state->stack[spi].slot_type[i] = STACK_MISC;
2560
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
10482561
10492562 /* only mark the slot as written if all 8 bytes were written
10502563 * otherwise read propagation may incorrectly stop too soon
....@@ -1058,8 +2571,14 @@
10582571 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
10592572
10602573 /* when we zero initialize stack slots mark them as such */
1061
- if (reg && register_is_null(reg))
2574
+ if ((reg && register_is_null(reg)) ||
2575
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
2576
+ /* backtracking doesn't work for STACK_ZERO yet. */
2577
+ err = mark_chain_precision(env, value_regno);
2578
+ if (err)
2579
+ return err;
10622580 type = STACK_ZERO;
2581
+ }
10632582
10642583 /* Mark slots affected by this stack write. */
10652584 for (i = 0; i < size; i++)
....@@ -1069,140 +2588,477 @@
10692588 return 0;
10702589 }
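A hedged example of the BPF_ST handling above (instruction indexes are invented):

  1: *(u64 *)(r10 - 8) = 42
  2: r1 = *(u64 *)(r10 - 8)
  3: if r1 != 42 goto +5

Because insn 1 has no source register, the is_bpf_st_mem() branch builds a temporary known scalar (fake_reg) holding 42 and spills that, so the fill at insn 2 restores a constant rather than STACK_MISC data and the comparison at insn 3 can be evaluated exactly. A store of 0 instead takes the STACK_ZERO path further down.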
10712590
1072
-static int check_stack_read(struct bpf_verifier_env *env,
1073
- struct bpf_func_state *reg_state /* func where register points to */,
1074
- int off, int size, int value_regno)
2591
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
2592
+ * known to contain a variable offset.
2593
+ * This function checks whether the write is permitted and conservatively
2594
+ * tracks the effects of the write, considering that each stack slot in the
2595
+ * dynamic range is potentially written to.
2596
+ *
2597
+ * 'off' includes 'regno->off'.
2598
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
2599
+ * the stack.
2600
+ *
2601
+ * Spilled pointers in range are not marked as written because we don't know
2602
+ * what's going to be actually written. This means that read propagation for
2603
+ * future reads cannot be terminated by this write.
2604
+ *
2605
+ * For privileged programs, uninitialized stack slots are considered
2606
+ * initialized by this write (even though we don't know exactly what offsets
2607
+ * are going to be written to). The idea is that we don't want the verifier to
2608
+ * reject future reads that access slots written to through variable offsets.
2609
+ */
2610
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
2611
+ /* func where register points to */
2612
+ struct bpf_func_state *state,
2613
+ int ptr_regno, int off, int size,
2614
+ int value_regno, int insn_idx)
2615
+{
2616
+ struct bpf_func_state *cur; /* state of the current function */
2617
+ int min_off, max_off;
2618
+ int i, err;
2619
+ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
2620
+ bool writing_zero = false;
2621
+ /* set if the fact that we're writing a zero is used to let any
2622
+ * stack slots remain STACK_ZERO
2623
+ */
2624
+ bool zero_used = false;
2625
+
2626
+ cur = env->cur_state->frame[env->cur_state->curframe];
2627
+ ptr_reg = &cur->regs[ptr_regno];
2628
+ min_off = ptr_reg->smin_value + off;
2629
+ max_off = ptr_reg->smax_value + off + size;
2630
+ if (value_regno >= 0)
2631
+ value_reg = &cur->regs[value_regno];
2632
+ if (value_reg && register_is_null(value_reg))
2633
+ writing_zero = true;
2634
+
2635
+ err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE),
2636
+ state->acquired_refs, true);
2637
+ if (err)
2638
+ return err;
2639
+
2640
+
2641
+ /* Variable offset writes destroy any spilled pointers in range. */
2642
+ for (i = min_off; i < max_off; i++) {
2643
+ u8 new_type, *stype;
2644
+ int slot, spi;
2645
+
2646
+ slot = -i - 1;
2647
+ spi = slot / BPF_REG_SIZE;
2648
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
2649
+
2650
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
2651
+ /* Reject the write if range we may write to has not
2652
+ * been initialized beforehand. If we didn't reject
2653
+ * here, the ptr status would be erased below (even
2654
+ * though not all slots are actually overwritten),
2655
+ * possibly opening the door to leaks.
2656
+ *
2657
+ * We do however catch STACK_INVALID case below, and
2658
+ * only allow reading possibly uninitialized memory
2659
+ * later for CAP_PERFMON, as the write may not happen to
2660
+ * that slot.
2661
+ */
2662
+ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
2663
+ insn_idx, i);
2664
+ return -EINVAL;
2665
+ }
2666
+
2667
+ /* Erase all spilled pointers. */
2668
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
2669
+
2670
+ /* Update the slot type. */
2671
+ new_type = STACK_MISC;
2672
+ if (writing_zero && *stype == STACK_ZERO) {
2673
+ new_type = STACK_ZERO;
2674
+ zero_used = true;
2675
+ }
2676
+ /* If the slot is STACK_INVALID, we check whether it's OK to
2677
+ * pretend that it will be initialized by this write. The slot
2678
+ * might not actually be written to, and so if we mark it as
2679
+ * initialized future reads might leak uninitialized memory.
2680
+ * For privileged programs, we will accept such reads to slots
2681
+ * that may or may not be written because, if we're reject
2682
+ * them, the error would be too confusing.
2683
+ */
2684
+ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
2685
+ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
2686
+ insn_idx, i);
2687
+ return -EINVAL;
2688
+ }
2689
+ *stype = new_type;
2690
+ }
2691
+ if (zero_used) {
2692
+ /* backtracking doesn't work for STACK_ZERO yet. */
2693
+ err = mark_chain_precision(env, value_regno);
2694
+ if (err)
2695
+ return err;
2696
+ }
2697
+ return 0;
2698
+}
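A hedged numeric example of the range computed above (register contents are invented): if the pointer register is fp plus a scalar known to lie in [-24, -16] and the instruction writes 8 bytes at offset 0, then min_off = -24 and max_off = -8, so every stack byte from fp-24 up to fp-9 is treated as potentially written. For a privileged program, any spilled pointer in that range is erased and the bytes become STACK_MISC (or stay STACK_ZERO when the value being written is zero); an unprivileged program is rejected as soon as one byte in the range is neither STACK_MISC nor STACK_ZERO, and even a privileged one is refused on STACK_INVALID bytes unless reading uninitialized stack is permitted (allow_uninit_stack).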
2699
+
2700
+/* When register 'dst_regno' is assigned some values from stack[min_off,
2701
+ * max_off), we set the register's type according to the types of the
2702
+ * respective stack slots. If all the stack values are known to be zeros, then
2703
+ * so is the destination reg. Otherwise, the register is considered to be
2704
+ * SCALAR. This function does not deal with register filling; the caller must
2705
+ * ensure that all spilled registers in the stack range have been marked as
2706
+ * read.
2707
+ */
2708
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
2709
+ /* func where src register points to */
2710
+ struct bpf_func_state *ptr_state,
2711
+ int min_off, int max_off, int dst_regno)
2712
+{
2713
+ struct bpf_verifier_state *vstate = env->cur_state;
2714
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
2715
+ int i, slot, spi;
2716
+ u8 *stype;
2717
+ int zeros = 0;
2718
+
2719
+ for (i = min_off; i < max_off; i++) {
2720
+ slot = -i - 1;
2721
+ spi = slot / BPF_REG_SIZE;
2722
+ stype = ptr_state->stack[spi].slot_type;
2723
+ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
2724
+ break;
2725
+ zeros++;
2726
+ }
2727
+ if (zeros == max_off - min_off) {
2728
+ /* any access_size read into register is zero extended,
2729
+ * so the whole register == const_zero
2730
+ */
2731
+ __mark_reg_const_zero(&state->regs[dst_regno]);
2732
+ /* backtracking doesn't support STACK_ZERO yet,
2733
+ * so mark it precise here, so that later
2734
+ * backtracking can stop here.
2735
+ * Backtracking may not need this if this register
2736
+ * doesn't participate in pointer adjustment.
2737
+ * Forward propagation of precise flag is not
2738
+ * necessary either. This mark is only to stop
2739
+ * backtracking. Any register that contributed
2740
+ * to const 0 was marked precise before spill.
2741
+ */
2742
+ state->regs[dst_regno].precise = true;
2743
+ } else {
2744
+ /* have read misc data from the stack */
2745
+ mark_reg_unknown(env, state->regs, dst_regno);
2746
+ }
2747
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
2748
+}
2749
+
2750
+/* Read the stack at 'off' and put the results into the register indicated by
2751
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
2752
+ * spilled reg.
2753
+ *
2754
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
2755
+ * register.
2756
+ *
2757
+ * The access is assumed to be within the current stack bounds.
2758
+ */
2759
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
2760
+ /* func where src register points to */
2761
+ struct bpf_func_state *reg_state,
2762
+ int off, int size, int dst_regno)
10752763 {
10762764 struct bpf_verifier_state *vstate = env->cur_state;
10772765 struct bpf_func_state *state = vstate->frame[vstate->curframe];
10782766 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
10792767 struct bpf_reg_state *reg;
1080
- u8 *stype;
2768
+ u8 *stype, type;
10812769
1082
- if (reg_state->allocated_stack <= slot) {
1083
- verbose(env, "invalid read from stack off %d+0 size %d\n",
1084
- off, size);
1085
- return -EACCES;
1086
- }
10872770 stype = reg_state->stack[spi].slot_type;
10882771 reg = &reg_state->stack[spi].spilled_ptr;
10892772
1090
- if (stype[0] == STACK_SPILL) {
1091
- if (size != BPF_REG_SIZE) {
2773
+ if (is_spilled_reg(&reg_state->stack[spi])) {
2774
+ u8 spill_size = 1;
2775
+
2776
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
2777
+ spill_size++;
2778
+
2779
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
10922780 if (reg->type != SCALAR_VALUE) {
2781
+ verbose_linfo(env, env->insn_idx, "; ");
10932782 verbose(env, "invalid size of register fill\n");
10942783 return -EACCES;
10952784 }
1096
- if (value_regno >= 0) {
1097
- mark_reg_unknown(env, state->regs, value_regno);
1098
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2785
+
2786
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2787
+ if (dst_regno < 0)
2788
+ return 0;
2789
+
2790
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
2791
+ /* The earlier check_reg_arg() has decided the
2792
+ * subreg_def for this insn. Save it first.
2793
+ */
2794
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
2795
+
2796
+ copy_register_state(&state->regs[dst_regno], reg);
2797
+ state->regs[dst_regno].subreg_def = subreg_def;
2798
+ } else {
2799
+ for (i = 0; i < size; i++) {
2800
+ type = stype[(slot - i) % BPF_REG_SIZE];
2801
+ if (type == STACK_SPILL)
2802
+ continue;
2803
+ if (type == STACK_MISC)
2804
+ continue;
2805
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
2806
+ off, i, size);
2807
+ return -EACCES;
2808
+ }
2809
+ mark_reg_unknown(env, state->regs, dst_regno);
10992810 }
1100
- mark_reg_read(env, reg, reg->parent);
2811
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
11012812 return 0;
11022813 }
1103
- for (i = 1; i < BPF_REG_SIZE; i++) {
1104
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
1105
- verbose(env, "corrupted spill memory\n");
1106
- return -EACCES;
1107
- }
1108
- }
11092814
1110
- if (value_regno >= 0) {
2815
+ if (dst_regno >= 0) {
11112816 /* restore register state from stack */
1112
- state->regs[value_regno] = *reg;
2817
+ copy_register_state(&state->regs[dst_regno], reg);
11132818 /* mark reg as written since spilled pointer state likely
11142819 * has its liveness marks cleared by is_state_visited()
11152820 * which resets stack/reg liveness for state transitions
11162821 */
1117
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2822
+ state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
2823
+ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
2824
+ /* If dst_regno==-1, the caller is asking us whether
2825
+ * it is acceptable to use this value as a SCALAR_VALUE
2826
+ * (e.g. for XADD).
2827
+ * We must not allow unprivileged callers to do that
2828
+ * with spilled pointers.
2829
+ */
2830
+ verbose(env, "leaking pointer from stack off %d\n",
2831
+ off);
2832
+ return -EACCES;
11182833 }
1119
- mark_reg_read(env, reg, reg->parent);
2834
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
11202835 } else {
1121
- int zeros = 0;
1122
-
11232836 for (i = 0; i < size; i++) {
1124
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
2837
+ type = stype[(slot - i) % BPF_REG_SIZE];
2838
+ if (type == STACK_MISC)
11252839 continue;
1126
- if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
1127
- zeros++;
2840
+ if (type == STACK_ZERO)
11282841 continue;
1129
- }
11302842 verbose(env, "invalid read from stack off %d+%d size %d\n",
11312843 off, i, size);
11322844 return -EACCES;
11332845 }
1134
- mark_reg_read(env, reg, reg->parent);
1135
- if (value_regno >= 0) {
1136
- if (zeros == size) {
1137
- /* any size read into register is zero extended,
1138
- * so the whole register == const_zero
1139
- */
1140
- __mark_reg_const_zero(&state->regs[value_regno]);
1141
- } else {
1142
- /* have read misc data from the stack */
1143
- mark_reg_unknown(env, state->regs, value_regno);
1144
- }
1145
- state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1146
- }
2846
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2847
+ if (dst_regno >= 0)
2848
+ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
11472849 }
11482850 return 0;
11492851 }
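A hedged example of the relaxed fill rules above (instruction indexes are invented; r1 is assumed to hold a bounded scalar and the program to be bpf_capable, so insn 1 is tracked as a register spill):

  1: *(u64 *)(r10 - 8) = r1
  2: r2 = *(u32 *)(r10 - 8)

Because the spilled value is a scalar, the narrow read at insn 2 is accepted, but since size != spill_size the destination r2 is marked as an unknown scalar rather than inheriting r1's exact bounds; only a full-width, aligned fill such as "r2 = *(u64 *)(r10 - 8)" copies the spilled register state via copy_register_state().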
11502852
1151
-static int check_stack_access(struct bpf_verifier_env *env,
1152
- const struct bpf_reg_state *reg,
1153
- int off, int size)
2853
+enum stack_access_src {
2854
+ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
2855
+ ACCESS_HELPER = 2, /* the access is performed by a helper */
2856
+};
2857
+
2858
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
2859
+ int regno, int off, int access_size,
2860
+ bool zero_size_allowed,
2861
+ enum stack_access_src type,
2862
+ struct bpf_call_arg_meta *meta);
2863
+
2864
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
11542865 {
1155
- /* Stack accesses must be at a fixed offset, so that we
1156
- * can determine what type of data were returned. See
1157
- * check_stack_read().
2866
+ return cur_regs(env) + regno;
2867
+}
2868
+
2869
+/* Read the stack at 'ptr_regno + off' and put the result into the register
2870
+ * 'dst_regno'.
2871
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
2872
+ * but not its variable offset.
2873
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
2874
+ *
2875
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
2876
+ * filling registers (i.e. reads of spilled register cannot be detected when
2877
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
2878
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
2879
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
2880
+ * instead.
2881
+ */
2882
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
2883
+ int ptr_regno, int off, int size, int dst_regno)
2884
+{
2885
+ /* The state of the source register. */
2886
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2887
+ struct bpf_func_state *ptr_state = func(env, reg);
2888
+ int err;
2889
+ int min_off, max_off;
2890
+
2891
+ /* Note that we pass a NULL meta, so raw access will not be permitted.
11582892 */
1159
- if (!tnum_is_const(reg->var_off)) {
2893
+ err = check_stack_range_initialized(env, ptr_regno, off, size,
2894
+ false, ACCESS_DIRECT, NULL);
2895
+ if (err)
2896
+ return err;
2897
+
2898
+ min_off = reg->smin_value + off;
2899
+ max_off = reg->smax_value + off;
2900
+ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
2901
+ return 0;
2902
+}
2903
+
2904
+/* check_stack_read dispatches to check_stack_read_fixed_off or
2905
+ * check_stack_read_var_off.
2906
+ *
2907
+ * The caller must ensure that the offset falls within the allocated stack
2908
+ * bounds.
2909
+ *
2910
+ * 'dst_regno' is a register which will receive the value from the stack. It
2911
+ * can be -1, meaning that the read value is not going to a register.
2912
+ */
2913
+static int check_stack_read(struct bpf_verifier_env *env,
2914
+ int ptr_regno, int off, int size,
2915
+ int dst_regno)
2916
+{
2917
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2918
+ struct bpf_func_state *state = func(env, reg);
2919
+ int err;
2920
+ /* Some accesses are only permitted with a static offset. */
2921
+ bool var_off = !tnum_is_const(reg->var_off);
2922
+
2923
+ /* The offset is required to be static when reads don't go to a
2924
+ * register, in order to not leak pointers (see
2925
+ * check_stack_read_fixed_off).
2926
+ */
2927
+ if (dst_regno < 0 && var_off) {
11602928 char tn_buf[48];
11612929
11622930 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1163
- verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
2931
+ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
11642932 tn_buf, off, size);
11652933 return -EACCES;
11662934 }
1167
-
1168
- if (off >= 0 || off < -MAX_BPF_STACK) {
1169
- verbose(env, "invalid stack off=%d size=%d\n", off, size);
1170
- return -EACCES;
2935
+ /* Variable offset is prohibited for unprivileged mode for simplicity
2936
+ * since it requires corresponding support in Spectre masking for stack
2937
+ * ALU. See also retrieve_ptr_limit(). The check in
2938
+ * check_stack_access_for_ptr_arithmetic() called by
2939
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
2940
+ * with variable offsets, therefore no check is required here. Further,
2941
+ * just checking it here would be insufficient as speculative stack
2942
+ * writes could still lead to unsafe speculative behaviour.
2943
+ */
2944
+ if (!var_off) {
2945
+ off += reg->var_off.value;
2946
+ err = check_stack_read_fixed_off(env, state, off, size,
2947
+ dst_regno);
2948
+ } else {
2949
+ /* Variable offset stack reads need more conservative handling
2950
+ * than fixed offset ones. Note that dst_regno >= 0 on this
2951
+ * branch.
2952
+ */
2953
+ err = check_stack_read_var_off(env, ptr_regno, off, size,
2954
+ dst_regno);
11712955 }
1172
-
1173
- return 0;
2956
+ return err;
11742957 }
11752958
1176
-/* check read/write into map element returned by bpf_map_lookup_elem() */
1177
-static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
1178
- int size, bool zero_size_allowed)
2959
+
2960
+/* check_stack_write dispatches to check_stack_write_fixed_off or
2961
+ * check_stack_write_var_off.
2962
+ *
2963
+ * 'ptr_regno' is the register used as a pointer into the stack.
2964
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
2965
+ * 'value_regno' is the register whose value we're writing to the stack. It can
2966
+ * be -1, meaning that we're not writing from a register.
2967
+ *
2968
+ * The caller must ensure that the offset falls within the maximum stack size.
2969
+ */
2970
+static int check_stack_write(struct bpf_verifier_env *env,
2971
+ int ptr_regno, int off, int size,
2972
+ int value_regno, int insn_idx)
2973
+{
2974
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
2975
+ struct bpf_func_state *state = func(env, reg);
2976
+ int err;
2977
+
2978
+ if (tnum_is_const(reg->var_off)) {
2979
+ off += reg->var_off.value;
2980
+ err = check_stack_write_fixed_off(env, state, off, size,
2981
+ value_regno, insn_idx);
2982
+ } else {
2983
+ /* Variable offset stack reads need more conservative handling
2984
+ * than fixed offset ones.
2985
+ */
2986
+ err = check_stack_write_var_off(env, state,
2987
+ ptr_regno, off, size,
2988
+ value_regno, insn_idx);
2989
+ }
2990
+ return err;
2991
+}
2992
+
2993
+static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
2994
+ int off, int size, enum bpf_access_type type)
11792995 {
11802996 struct bpf_reg_state *regs = cur_regs(env);
11812997 struct bpf_map *map = regs[regno].map_ptr;
2998
+ u32 cap = bpf_map_flags_to_cap(map);
11822999
1183
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
1184
- off + size > map->value_size) {
1185
- verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
3000
+ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
3001
+ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
11863002 map->value_size, off, size);
11873003 return -EACCES;
11883004 }
3005
+
3006
+ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
3007
+ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
3008
+ map->value_size, off, size);
3009
+ return -EACCES;
3010
+ }
3011
+
11893012 return 0;
11903013 }
11913014
1192
-/* check read/write into a map element with possible variable offset */
1193
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
1194
- int off, int size, bool zero_size_allowed)
3015
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
3016
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
3017
+ int off, int size, u32 mem_size,
3018
+ bool zero_size_allowed)
3019
+{
3020
+ bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
3021
+ struct bpf_reg_state *reg;
3022
+
3023
+ if (off >= 0 && size_ok && (u64)off + size <= mem_size)
3024
+ return 0;
3025
+
3026
+ reg = &cur_regs(env)[regno];
3027
+ switch (reg->type) {
3028
+ case PTR_TO_MAP_VALUE:
3029
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
3030
+ mem_size, off, size);
3031
+ break;
3032
+ case PTR_TO_PACKET:
3033
+ case PTR_TO_PACKET_META:
3034
+ case PTR_TO_PACKET_END:
3035
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
3036
+ off, size, regno, reg->id, off, mem_size);
3037
+ break;
3038
+ case PTR_TO_MEM:
3039
+ default:
3040
+ verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
3041
+ mem_size, off, size);
3042
+ }
3043
+
3044
+ return -EACCES;
3045
+}
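The bounds predicate above can be reproduced as a tiny stand-alone C sketch (names are invented; only the arithmetic mirrors the function): the u64 cast keeps a huge off from wrapping around during the addition.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* An access of 'size' bytes at 'off' fits a region of 'mem_size' bytes iff
 * off >= 0, the size is acceptable, and off + size does not run past the end.
 */
static bool toy_mem_access_ok(int off, int size, uint32_t mem_size,
			      bool zero_size_allowed)
{
	bool size_ok = size > 0 || (size == 0 && zero_size_allowed);

	return off >= 0 && size_ok && (uint64_t)off + size <= mem_size;
}

int main(void)
{
	/* a 48-byte map value, probed at both ends of the offset range */
	printf("%d\n", toy_mem_access_ok(40, 8, 48, false));	/* 1: fits exactly */
	printf("%d\n", toy_mem_access_ok(41, 8, 48, false));	/* 0: one byte past */
	printf("%d\n", toy_mem_access_ok(-4, 8, 48, false));	/* 0: negative off  */
	return 0;
}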
3046
+
3047
+/* check read/write into a memory region with possible variable offset */
3048
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
3049
+ int off, int size, u32 mem_size,
3050
+ bool zero_size_allowed)
11953051 {
11963052 struct bpf_verifier_state *vstate = env->cur_state;
11973053 struct bpf_func_state *state = vstate->frame[vstate->curframe];
11983054 struct bpf_reg_state *reg = &state->regs[regno];
11993055 int err;
12003056
1201
- /* We may have adjusted the register to this map value, so we
3057
+ /* We may have adjusted the register pointing to memory region, so we
12023058 * need to try adding each of min_value and max_value to off
12033059 * to make sure our theoretical access will be safe.
12043060 */
1205
- if (env->log.level)
3061
+ if (env->log.level & BPF_LOG_LEVEL)
12063062 print_verifier_state(env, state);
12073063
12083064 /* The minimum value is only important with signed
....@@ -1219,10 +3075,10 @@
12193075 regno);
12203076 return -EACCES;
12213077 }
1222
- err = __check_map_access(env, regno, reg->smin_value + off, size,
1223
- zero_size_allowed);
3078
+ err = __check_mem_access(env, regno, reg->smin_value + off, size,
3079
+ mem_size, zero_size_allowed);
12243080 if (err) {
1225
- verbose(env, "R%d min value is outside of the array range\n",
3081
+ verbose(env, "R%d min value is outside of the allowed memory range\n",
12263082 regno);
12273083 return err;
12283084 }
....@@ -1232,33 +3088,79 @@
12323088 * If reg->umax_value + off could overflow, treat that as unbounded too.
12333089 */
12343090 if (reg->umax_value >= BPF_MAX_VAR_OFF) {
1235
- verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
3091
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
12363092 regno);
12373093 return -EACCES;
12383094 }
1239
- err = __check_map_access(env, regno, reg->umax_value + off, size,
1240
- zero_size_allowed);
1241
- if (err)
1242
- verbose(env, "R%d max value is outside of the array range\n",
3095
+ err = __check_mem_access(env, regno, reg->umax_value + off, size,
3096
+ mem_size, zero_size_allowed);
3097
+ if (err) {
3098
+ verbose(env, "R%d max value is outside of the allowed memory range\n",
12433099 regno);
3100
+ return err;
3101
+ }
3102
+
3103
+ return 0;
3104
+}
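A hedged numeric example of the two-sided check above: for a 48-byte map value, an access *(u32 *)(reg + 16) where the register's variable part ranges over [0, 40] passes the minimum side (16 + 0 + 4 <= 48) but fails the maximum side (16 + 40 + 4 = 60 > 48), so the verifier reports that the max value is outside of the allowed memory range unless the program narrows the range first.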
3105
+
3106
+/* check read/write into a map element with possible variable offset */
3107
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
3108
+ int off, int size, bool zero_size_allowed)
3109
+{
3110
+ struct bpf_verifier_state *vstate = env->cur_state;
3111
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
3112
+ struct bpf_reg_state *reg = &state->regs[regno];
3113
+ struct bpf_map *map = reg->map_ptr;
3114
+ int err;
3115
+
3116
+ err = check_mem_region_access(env, regno, off, size, map->value_size,
3117
+ zero_size_allowed);
3118
+ if (err)
3119
+ return err;
3120
+
3121
+ if (map_value_has_spin_lock(map)) {
3122
+ u32 lock = map->spin_lock_off;
3123
+
3124
+ /* if any part of struct bpf_spin_lock can be touched by
3125
+ * load/store reject this program.
3126
+ * To check that [x1, x2) overlaps with [y1, y2)
3127
+ * it is sufficient to check x1 < y2 && y1 < x2.
3128
+ */
3129
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
3130
+ lock < reg->umax_value + off + size) {
3131
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
3132
+ return -EACCES;
3133
+ }
3134
+ }
12443135 return err;
12453136 }
12463137
12473138 #define MAX_PACKET_OFF 0xffff
12483139
3140
+static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
3141
+{
3142
+ return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
3143
+}
3144
+
12493145 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
12503146 const struct bpf_call_arg_meta *meta,
12513147 enum bpf_access_type t)
12523148 {
1253
- switch (env->prog->type) {
3149
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
3150
+
3151
+ switch (prog_type) {
3152
+ /* Program types only with direct read access go here! */
12543153 case BPF_PROG_TYPE_LWT_IN:
12553154 case BPF_PROG_TYPE_LWT_OUT:
12563155 case BPF_PROG_TYPE_LWT_SEG6LOCAL:
12573156 case BPF_PROG_TYPE_SK_REUSEPORT:
1258
- /* dst_input() and dst_output() can't write for now */
3157
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
3158
+ case BPF_PROG_TYPE_CGROUP_SKB:
12593159 if (t == BPF_WRITE)
12603160 return false;
1261
- /* fallthrough */
3161
+ fallthrough;
3162
+
3163
+ /* Program types with direct read + write access go here! */
12623164 case BPF_PROG_TYPE_SCHED_CLS:
12633165 case BPF_PROG_TYPE_SCHED_ACT:
12643166 case BPF_PROG_TYPE_XDP:
....@@ -1270,24 +3172,16 @@
12703172
12713173 env->seen_direct_write = true;
12723174 return true;
3175
+
3176
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3177
+ if (t == BPF_WRITE)
3178
+ env->seen_direct_write = true;
3179
+
3180
+ return true;
3181
+
12733182 default:
12743183 return false;
12753184 }
1276
-}
1277
-
1278
-static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
1279
- int off, int size, bool zero_size_allowed)
1280
-{
1281
- struct bpf_reg_state *regs = cur_regs(env);
1282
- struct bpf_reg_state *reg = &regs[regno];
1283
-
1284
- if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
1285
- (u64)off + size > reg->range) {
1286
- verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
1287
- off, size, regno, reg->id, reg->off, reg->range);
1288
- return -EACCES;
1289
- }
1290
- return 0;
12913185 }
12923186
12933187 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
....@@ -1310,20 +3204,36 @@
13103204 regno);
13113205 return -EACCES;
13123206 }
1313
- err = __check_packet_access(env, regno, off, size, zero_size_allowed);
3207
+
3208
+ err = reg->range < 0 ? -EINVAL :
3209
+ __check_mem_access(env, regno, off, size, reg->range,
3210
+ zero_size_allowed);
13143211 if (err) {
13153212 verbose(env, "R%d offset is outside of the packet\n", regno);
13163213 return err;
13173214 }
3215
+
3216
+ /* __check_mem_access has made sure "off + size - 1" is within u16.
3217
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
3218
+ * otherwise find_good_pkt_pointers would have refused to set range info
3219
+ * that __check_mem_access would have rejected this pkt access.
3220
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
3221
+ */
3222
+ env->prog->aux->max_pkt_offset =
3223
+ max_t(u32, env->prog->aux->max_pkt_offset,
3224
+ off + reg->umax_value + size - 1);
3225
+
13183226 return err;
13193227 }
13203228
13213229 /* check access to 'struct bpf_context' fields. Supports fixed offsets only */
13223230 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
1323
- enum bpf_access_type t, enum bpf_reg_type *reg_type)
3231
+ enum bpf_access_type t, enum bpf_reg_type *reg_type,
3232
+ u32 *btf_id)
13243233 {
13253234 struct bpf_insn_access_aux info = {
13263235 .reg_type = *reg_type,
3236
+ .log = &env->log,
13273237 };
13283238
13293239 if (env->ops->is_valid_access &&
....@@ -1337,7 +3247,10 @@
13373247 */
13383248 *reg_type = info.reg_type;
13393249
1340
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
3250
+ if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL)
3251
+ *btf_id = info.btf_id;
3252
+ else
3253
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
13413254 /* remember the offset of last byte accessed in ctx */
13423255 if (env->prog->aux->max_ctx_offset < off + size)
13433256 env->prog->aux->max_ctx_offset = off + size;
....@@ -1348,32 +3261,95 @@
13483261 return -EACCES;
13493262 }
13503263
1351
-static bool __is_pointer_value(bool allow_ptr_leaks,
1352
- const struct bpf_reg_state *reg)
3264
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
3265
+ int size)
13533266 {
1354
- if (allow_ptr_leaks)
1355
- return false;
3267
+ if (size < 0 || off < 0 ||
3268
+ (u64)off + size > sizeof(struct bpf_flow_keys)) {
3269
+ verbose(env, "invalid access to flow keys off=%d size=%d\n",
3270
+ off, size);
3271
+ return -EACCES;
3272
+ }
3273
+ return 0;
3274
+}
13563275
1357
- return reg->type != SCALAR_VALUE;
3276
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
3277
+ u32 regno, int off, int size,
3278
+ enum bpf_access_type t)
3279
+{
3280
+ struct bpf_reg_state *regs = cur_regs(env);
3281
+ struct bpf_reg_state *reg = &regs[regno];
3282
+ struct bpf_insn_access_aux info = {};
3283
+ bool valid;
3284
+
3285
+ if (reg->smin_value < 0) {
3286
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
3287
+ regno);
3288
+ return -EACCES;
3289
+ }
3290
+
3291
+ switch (reg->type) {
3292
+ case PTR_TO_SOCK_COMMON:
3293
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
3294
+ break;
3295
+ case PTR_TO_SOCKET:
3296
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
3297
+ break;
3298
+ case PTR_TO_TCP_SOCK:
3299
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
3300
+ break;
3301
+ case PTR_TO_XDP_SOCK:
3302
+ valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
3303
+ break;
3304
+ default:
3305
+ valid = false;
3306
+ }
3307
+
3308
+
3309
+ if (valid) {
3310
+ env->insn_aux_data[insn_idx].ctx_field_size =
3311
+ info.ctx_field_size;
3312
+ return 0;
3313
+ }
3314
+
3315
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
3316
+ regno, reg_type_str[reg->type], off, size);
3317
+
3318
+ return -EACCES;
13583319 }
13593320
13603321 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
13613322 {
1362
- return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
3323
+ return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
13633324 }
13643325
13653326 static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
13663327 {
1367
- const struct bpf_reg_state *reg = cur_regs(env) + regno;
3328
+ const struct bpf_reg_state *reg = reg_state(env, regno);
13683329
13693330 return reg->type == PTR_TO_CTX;
13703331 }
13713332
3333
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
3334
+{
3335
+ const struct bpf_reg_state *reg = reg_state(env, regno);
3336
+
3337
+ return type_is_sk_pointer(reg->type);
3338
+}
3339
+
13723340 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
13733341 {
1374
- const struct bpf_reg_state *reg = cur_regs(env) + regno;
3342
+ const struct bpf_reg_state *reg = reg_state(env, regno);
13753343
13763344 return type_is_pkt_pointer(reg->type);
3345
+}
3346
+
3347
+static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
3348
+{
3349
+ const struct bpf_reg_state *reg = reg_state(env, regno);
3350
+
3351
+ /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */
3352
+ return reg->type == PTR_TO_FLOW_KEYS;
13773353 }
13783354
13793355 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
....@@ -1449,6 +3425,9 @@
14493425 * right in front, treat it the very same way.
14503426 */
14513427 return check_pkt_ptr_alignment(env, reg, off, size, strict);
3428
+ case PTR_TO_FLOW_KEYS:
3429
+ pointer_desc = "flow keys ";
3430
+ break;
14523431 case PTR_TO_MAP_VALUE:
14533432 pointer_desc = "value ";
14543433 break;
....@@ -1457,11 +3436,23 @@
14573436 break;
14583437 case PTR_TO_STACK:
14593438 pointer_desc = "stack ";
1460
- /* The stack spill tracking logic in check_stack_write()
1461
- * and check_stack_read() relies on stack accesses being
3439
+ /* The stack spill tracking logic in check_stack_write_fixed_off()
3440
+ * and check_stack_read_fixed_off() relies on stack accesses being
14623441 * aligned.
14633442 */
14643443 strict = true;
3444
+ break;
3445
+ case PTR_TO_SOCKET:
3446
+ pointer_desc = "sock ";
3447
+ break;
3448
+ case PTR_TO_SOCK_COMMON:
3449
+ pointer_desc = "sock_common ";
3450
+ break;
3451
+ case PTR_TO_TCP_SOCK:
3452
+ pointer_desc = "tcp_sock ";
3453
+ break;
3454
+ case PTR_TO_XDP_SOCK:
3455
+ pointer_desc = "xdp_sock ";
14653456 break;
14663457 default:
14673458 break;
....@@ -1495,10 +3486,37 @@
14953486 int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
14963487 struct bpf_subprog_info *subprog = env->subprog_info;
14973488 struct bpf_insn *insn = env->prog->insnsi;
3489
+ bool tail_call_reachable = false;
14983490 int ret_insn[MAX_CALL_FRAMES];
14993491 int ret_prog[MAX_CALL_FRAMES];
3492
+ int j;
15003493
15013494 process_func:
3495
+ /* protect against potential stack overflow that might happen when
3496
+ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
3497
+ * depth for such case down to 256 so that the worst case scenario
3498
+ * would result in 8k stack size (32 which is tailcall limit * 256 =
3499
+ * 8k).
3500
+ *
3501
+ * To get the idea what might happen, see an example:
3502
+ * func1 -> sub rsp, 128
3503
+ * subfunc1 -> sub rsp, 256
3504
+ * tailcall1 -> add rsp, 256
3505
+ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
3506
+ * subfunc2 -> sub rsp, 64
3507
+ * subfunc22 -> sub rsp, 128
3508
+ * tailcall2 -> add rsp, 128
3509
+ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
3510
+ *
3511
+ * tailcall will unwind the current stack frame but it will not get rid
3512
+ * of caller's stack as shown on the example above.
3513
+ */
3514
+ if (idx && subprog[idx].has_tail_call && depth >= 256) {
3515
+ verbose(env,
3516
+ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
3517
+ depth);
3518
+ return -EACCES;
3519
+ }
15023520 /* round up to 32-bytes, since this is granularity
15033521 * of interpreter stack size
15043522 */
....@@ -1527,13 +3545,29 @@
15273545 i);
15283546 return -EFAULT;
15293547 }
3548
+
3549
+ if (subprog[idx].has_tail_call)
3550
+ tail_call_reachable = true;
3551
+
15303552 frame++;
15313553 if (frame >= MAX_CALL_FRAMES) {
1532
- WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
1533
- return -EFAULT;
3554
+ verbose(env, "the call stack of %d frames is too deep !\n",
3555
+ frame);
3556
+ return -E2BIG;
15343557 }
15353558 goto process_func;
15363559 }
3560
+ /* if tail call got detected across bpf2bpf calls then mark each of the
3561
+ * currently present subprog frames as tail call reachable subprogs;
3562
+ * this info will be utilized by JIT so that we will be preserving the
3563
+ * tail call counter throughout bpf2bpf calls combined with tailcalls
3564
+ */
3565
+ if (tail_call_reachable)
3566
+ for (j = 0; j < frame; j++)
3567
+ subprog[ret_prog[j]].tail_call_reachable = true;
3568
+ if (subprog[0].tail_call_reachable)
3569
+ env->prog->aux->tail_call_reachable = true;
3570
+
15373571 /* end of for() loop means the last insn of the 'subprog'
15383572 * was reached. Doesn't matter whether it was JA or EXIT
15393573 */
....@@ -1562,8 +3596,8 @@
15623596 }
15633597 #endif
15643598
1565
-static int check_ctx_reg(struct bpf_verifier_env *env,
1566
- const struct bpf_reg_state *reg, int regno)
3599
+int check_ctx_reg(struct bpf_verifier_env *env,
3600
+ const struct bpf_reg_state *reg, int regno)
15673601 {
15683602 /* Access to ctx or passing it to a helper is only allowed in
15693603 * its original, unmodified form.
....@@ -1584,6 +3618,72 @@
15843618 }
15853619
15863620 return 0;
3621
+}
3622
+
3623
+static int __check_buffer_access(struct bpf_verifier_env *env,
3624
+ const char *buf_info,
3625
+ const struct bpf_reg_state *reg,
3626
+ int regno, int off, int size)
3627
+{
3628
+ if (off < 0) {
3629
+ verbose(env,
3630
+ "R%d invalid %s buffer access: off=%d, size=%d\n",
3631
+ regno, buf_info, off, size);
3632
+ return -EACCES;
3633
+ }
3634
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
3635
+ char tn_buf[48];
3636
+
3637
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3638
+ verbose(env,
3639
+ "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
3640
+ regno, off, tn_buf);
3641
+ return -EACCES;
3642
+ }
3643
+
3644
+ return 0;
3645
+}
3646
+
3647
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
3648
+ const struct bpf_reg_state *reg,
3649
+ int regno, int off, int size)
3650
+{
3651
+ int err;
3652
+
3653
+ err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
3654
+ if (err)
3655
+ return err;
3656
+
3657
+ if (off + size > env->prog->aux->max_tp_access)
3658
+ env->prog->aux->max_tp_access = off + size;
3659
+
3660
+ return 0;
3661
+}
3662
+
3663
+static int check_buffer_access(struct bpf_verifier_env *env,
3664
+ const struct bpf_reg_state *reg,
3665
+ int regno, int off, int size,
3666
+ bool zero_size_allowed,
3667
+ const char *buf_info,
3668
+ u32 *max_access)
3669
+{
3670
+ int err;
3671
+
3672
+ err = __check_buffer_access(env, buf_info, reg, regno, off, size);
3673
+ if (err)
3674
+ return err;
3675
+
3676
+ if (off + size > *max_access)
3677
+ *max_access = off + size;
3678
+
3679
+ return 0;
3680
+}
3681
+
3682
+/* BPF architecture zero extends alu32 ops into 64-bit registers */
3683
+static void zext_32_to_64(struct bpf_reg_state *reg)
3684
+{
3685
+ reg->var_off = tnum_subreg(reg->var_off);
3686
+ __reg_assign_32_into_64(reg);
15873687 }
15883688
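zext_32_to_64() encodes the architectural rule that any 32-bit (alu32) BPF operation writes the low subregister and zero-extends the result into the full 64-bit register. A plain C illustration of the arithmetic the verifier is modelling (a standalone sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t r1 = 0xdeadbeef00000003ULL;

        /* An alu32 op such as "w1 += 1" operates on the low 32 bits and
         * zero-extends the result, so the upper half becomes 0.
         */
        uint32_t w1 = (uint32_t)r1 + 1;
        r1 = (uint64_t)w1;

        printf("r1 = %#llx\n", (unsigned long long)r1); /* prints 0x4 */
        return 0;
}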
15893689 /* truncate register to smaller size (in bytes)
....@@ -1607,6 +3707,255 @@
16073707 }
16083708 reg->smin_value = reg->umin_value;
16093709 reg->smax_value = reg->umax_value;
3710
+
3711
+ /* If size is smaller than 32bit register the 32bit register
3712
+ * values are also truncated so we push 64-bit bounds into
3713
+ * 32-bit bounds. Above were truncated < 32-bits already.
3714
+ */
3715
+ if (size >= 4)
3716
+ return;
3717
+ __reg_combine_64_into_32(reg);
3718
+}
3719
+
3720
+static bool bpf_map_is_rdonly(const struct bpf_map *map)
3721
+{
3722
+ /* A map is considered read-only if the following conditions are true:
3723
+ *
3724
+ * 1) BPF program side cannot change any of the map content. The
3725
+ * BPF_F_RDONLY_PROG flag is set throughout the lifetime of a map
3726
+ * and was set at map creation time.
3727
+ * 2) The map value(s) have been initialized from user space by a
3728
+ * loader and then "frozen", such that no new map update/delete
3729
+ * operations from syscall side are possible for the rest of
3730
+ * the map's lifetime from that point onwards.
3731
+ * 3) Any parallel/pending map update/delete operations from syscall
3732
+ * side have been completed. Only after that point, it's safe to
3733
+ * assume that map value(s) are immutable.
3734
+ */
3735
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
3736
+ READ_ONCE(map->frozen) &&
3737
+ !bpf_map_write_active(map);
3738
+}
3739
+
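From user space, the usual way to end up with a map that passes bpf_map_is_rdonly() is to create it with BPF_F_RDONLY_PROG, populate it, and then freeze it. A minimal libbpf sketch — map_fd is assumed to refer to such a map with u32 keys and u64 values; map creation and error reporting are omitted:

#include <bpf/bpf.h>

static int populate_and_freeze(int map_fd)
{
        __u32 key = 0;
        __u64 value = 42;
        int err;

        err = bpf_map_update_elem(map_fd, &key, &value, BPF_ANY);
        if (err)
                return err;

        /* BPF_MAP_FREEZE forbids further syscall-side writes, which is
         * condition 2) in the comment above; the verifier may then treat
         * loads from the map as known constants.
         */
        return bpf_map_freeze(map_fd);
}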
3740
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
3741
+{
3742
+ void *ptr;
3743
+ u64 addr;
3744
+ int err;
3745
+
3746
+ err = map->ops->map_direct_value_addr(map, &addr, off);
3747
+ if (err)
3748
+ return err;
3749
+ ptr = (void *)(long)addr + off;
3750
+
3751
+ switch (size) {
3752
+ case sizeof(u8):
3753
+ *val = (u64)*(u8 *)ptr;
3754
+ break;
3755
+ case sizeof(u16):
3756
+ *val = (u64)*(u16 *)ptr;
3757
+ break;
3758
+ case sizeof(u32):
3759
+ *val = (u64)*(u32 *)ptr;
3760
+ break;
3761
+ case sizeof(u64):
3762
+ *val = *(u64 *)ptr;
3763
+ break;
3764
+ default:
3765
+ return -EINVAL;
3766
+ }
3767
+ return 0;
3768
+}
3769
+
3770
+static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
3771
+ struct bpf_reg_state *regs,
3772
+ int regno, int off, int size,
3773
+ enum bpf_access_type atype,
3774
+ int value_regno)
3775
+{
3776
+ struct bpf_reg_state *reg = regs + regno;
3777
+ const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
3778
+ const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
3779
+ u32 btf_id;
3780
+ int ret;
3781
+
3782
+ if (off < 0) {
3783
+ verbose(env,
3784
+ "R%d is ptr_%s invalid negative access: off=%d\n",
3785
+ regno, tname, off);
3786
+ return -EACCES;
3787
+ }
3788
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
3789
+ char tn_buf[48];
3790
+
3791
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3792
+ verbose(env,
3793
+ "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
3794
+ regno, tname, off, tn_buf);
3795
+ return -EACCES;
3796
+ }
3797
+
3798
+ if (env->ops->btf_struct_access) {
3799
+ ret = env->ops->btf_struct_access(&env->log, t, off, size,
3800
+ atype, &btf_id);
3801
+ } else {
3802
+ if (atype != BPF_READ) {
3803
+ verbose(env, "only read is supported\n");
3804
+ return -EACCES;
3805
+ }
3806
+
3807
+ ret = btf_struct_access(&env->log, t, off, size, atype,
3808
+ &btf_id);
3809
+ }
3810
+
3811
+ if (ret < 0)
3812
+ return ret;
3813
+
3814
+ if (atype == BPF_READ && value_regno >= 0)
3815
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
3816
+
3817
+ return 0;
3818
+}
3819
+
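Loads that land in check_ptr_to_btf_access() come from BTF-typed pointers handed to tracing programs. A hedged sketch of such a program — it assumes a vmlinux.h generated for the running kernel, libbpf's BPF_PROG() macro, and the sched_switch tracepoint signature of this kernel generation; none of this is part of the patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("tp_btf/sched_switch")
int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
             struct task_struct *next)
{
        /* 'next' is PTR_TO_BTF_ID for struct task_struct; the load of
         * next->pid is range- and type-checked through btf_struct_access().
         */
        int pid = next->pid;

        bpf_printk("switching to %d", pid);
        return 0;
}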
3820
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
3821
+ struct bpf_reg_state *regs,
3822
+ int regno, int off, int size,
3823
+ enum bpf_access_type atype,
3824
+ int value_regno)
3825
+{
3826
+ struct bpf_reg_state *reg = regs + regno;
3827
+ struct bpf_map *map = reg->map_ptr;
3828
+ const struct btf_type *t;
3829
+ const char *tname;
3830
+ u32 btf_id;
3831
+ int ret;
3832
+
3833
+ if (!btf_vmlinux) {
3834
+ verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
3835
+ return -ENOTSUPP;
3836
+ }
3837
+
3838
+ if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
3839
+ verbose(env, "map_ptr access not supported for map type %d\n",
3840
+ map->map_type);
3841
+ return -ENOTSUPP;
3842
+ }
3843
+
3844
+ t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
3845
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
3846
+
3847
+ if (!env->allow_ptr_to_map_access) {
3848
+ verbose(env,
3849
+ "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
3850
+ tname);
3851
+ return -EPERM;
3852
+ }
3853
+
3854
+ if (off < 0) {
3855
+ verbose(env, "R%d is %s invalid negative access: off=%d\n",
3856
+ regno, tname, off);
3857
+ return -EACCES;
3858
+ }
3859
+
3860
+ if (atype != BPF_READ) {
3861
+ verbose(env, "only read from %s is supported\n", tname);
3862
+ return -EACCES;
3863
+ }
3864
+
3865
+ ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
3866
+ if (ret < 0)
3867
+ return ret;
3868
+
3869
+ if (value_regno >= 0)
3870
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_id);
3871
+
3872
+ return 0;
3873
+}
3874
+
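The CONST_PTR_TO_MAP case above is what lets a sufficiently privileged (CAP_PERFMON) program introspect its own map definition, in the style of the map_ptr selftests. A hedged sketch — struct bpf_map comes from vmlinux.h, the map layout follows libbpf's BTF-defined map convention, and the attach point is arbitrary:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 16);
        __type(key, __u32);
        __type(value, __u64);
} counters SEC(".maps");

SEC("tp/syscalls/sys_enter_getpid")
int check_map_fields(void *ctx)
{
        struct bpf_map *map = (struct bpf_map *)&counters; /* CONST_PTR_TO_MAP */

        /* Read-only load validated by check_ptr_to_map_access() against
         * struct bpf_map's BTF; writes are rejected.
         */
        if (map->max_entries != 16)
                bpf_printk("unexpected max_entries");

        return 0;
}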
3875
+/* Check that the stack access at the given offset is within bounds. The
3876
+ * maximum valid offset is -1.
3877
+ *
3878
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
3879
+ * -state->allocated_stack for reads.
3880
+ */
3881
+static int check_stack_slot_within_bounds(int off,
3882
+ struct bpf_func_state *state,
3883
+ enum bpf_access_type t)
3884
+{
3885
+ int min_valid_off;
3886
+
3887
+ if (t == BPF_WRITE)
3888
+ min_valid_off = -MAX_BPF_STACK;
3889
+ else
3890
+ min_valid_off = -state->allocated_stack;
3891
+
3892
+ if (off < min_valid_off || off > -1)
3893
+ return -EACCES;
3894
+ return 0;
3895
+}
3896
+
3897
+/* Check that the stack access at 'regno + off' falls within the maximum stack
3898
+ * bounds.
3899
+ *
3900
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
3901
+ */
3902
+static int check_stack_access_within_bounds(
3903
+ struct bpf_verifier_env *env,
3904
+ int regno, int off, int access_size,
3905
+ enum stack_access_src src, enum bpf_access_type type)
3906
+{
3907
+ struct bpf_reg_state *regs = cur_regs(env);
3908
+ struct bpf_reg_state *reg = regs + regno;
3909
+ struct bpf_func_state *state = func(env, reg);
3910
+ int min_off, max_off;
3911
+ int err;
3912
+ char *err_extra;
3913
+
3914
+ if (src == ACCESS_HELPER)
3915
+ /* We don't know if helpers are reading or writing (or both). */
3916
+ err_extra = " indirect access to";
3917
+ else if (type == BPF_READ)
3918
+ err_extra = " read from";
3919
+ else
3920
+ err_extra = " write to";
3921
+
3922
+ if (tnum_is_const(reg->var_off)) {
3923
+ min_off = reg->var_off.value + off;
3924
+ if (access_size > 0)
3925
+ max_off = min_off + access_size - 1;
3926
+ else
3927
+ max_off = min_off;
3928
+ } else {
3929
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
3930
+ reg->smin_value <= -BPF_MAX_VAR_OFF) {
3931
+ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
3932
+ err_extra, regno);
3933
+ return -EACCES;
3934
+ }
3935
+ min_off = reg->smin_value + off;
3936
+ if (access_size > 0)
3937
+ max_off = reg->smax_value + off + access_size - 1;
3938
+ else
3939
+ max_off = min_off;
3940
+ }
3941
+
3942
+ err = check_stack_slot_within_bounds(min_off, state, type);
3943
+ if (!err)
3944
+ err = check_stack_slot_within_bounds(max_off, state, type);
3945
+
3946
+ if (err) {
3947
+ if (tnum_is_const(reg->var_off)) {
3948
+ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
3949
+ err_extra, regno, off, access_size);
3950
+ } else {
3951
+ char tn_buf[48];
3952
+
3953
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
3954
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
3955
+ err_extra, regno, tn_buf, access_size);
3956
+ }
3957
+ }
3958
+ return err;
16103959 }
16113960
16123961 /* check whether memory at (regno + off) is accessible for t = (read | write)
....@@ -1642,13 +3991,44 @@
16423991 verbose(env, "R%d leaks addr into map\n", value_regno);
16433992 return -EACCES;
16443993 }
1645
-
3994
+ err = check_map_access_type(env, regno, off, size, t);
3995
+ if (err)
3996
+ return err;
16463997 err = check_map_access(env, regno, off, size, false);
3998
+ if (!err && t == BPF_READ && value_regno >= 0) {
3999
+ struct bpf_map *map = reg->map_ptr;
4000
+
4001
+ /* if map is read-only, track its contents as scalars */
4002
+ if (tnum_is_const(reg->var_off) &&
4003
+ bpf_map_is_rdonly(map) &&
4004
+ map->ops->map_direct_value_addr) {
4005
+ int map_off = off + reg->var_off.value;
4006
+ u64 val = 0;
4007
+
4008
+ err = bpf_map_direct_read(map, map_off, size,
4009
+ &val);
4010
+ if (err)
4011
+ return err;
4012
+
4013
+ regs[value_regno].type = SCALAR_VALUE;
4014
+ __mark_reg_known(&regs[value_regno], val);
4015
+ } else {
4016
+ mark_reg_unknown(env, regs, value_regno);
4017
+ }
4018
+ }
4019
+ } else if (reg->type == PTR_TO_MEM) {
4020
+ if (t == BPF_WRITE && value_regno >= 0 &&
4021
+ is_pointer_value(env, value_regno)) {
4022
+ verbose(env, "R%d leaks addr into mem\n", value_regno);
4023
+ return -EACCES;
4024
+ }
4025
+ err = check_mem_region_access(env, regno, off, size,
4026
+ reg->mem_size, false);
16474027 if (!err && t == BPF_READ && value_regno >= 0)
16484028 mark_reg_unknown(env, regs, value_regno);
1649
-
16504029 } else if (reg->type == PTR_TO_CTX) {
16514030 enum bpf_reg_type reg_type = SCALAR_VALUE;
4031
+ u32 btf_id = 0;
16524032
16534033 if (t == BPF_WRITE && value_regno >= 0 &&
16544034 is_pointer_value(env, value_regno)) {
....@@ -1660,23 +4040,37 @@
16604040 if (err < 0)
16614041 return err;
16624042
1663
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
4043
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
4044
+ if (err)
4045
+ verbose_linfo(env, insn_idx, "; ");
16644046 if (!err && t == BPF_READ && value_regno >= 0) {
16654047 /* ctx access returns either a scalar, or a
16664048 * PTR_TO_PACKET[_META,_END]. In the latter
16674049 * case, we know the offset is zero.
16684050 */
1669
- if (reg_type == SCALAR_VALUE)
4051
+ if (reg_type == SCALAR_VALUE) {
16704052 mark_reg_unknown(env, regs, value_regno);
1671
- else
4053
+ } else {
16724054 mark_reg_known_zero(env, regs,
16734055 value_regno);
4056
+ if (reg_type_may_be_null(reg_type))
4057
+ regs[value_regno].id = ++env->id_gen;
4058
+ /* A load of ctx field could have different
4059
+ * actual load size with the one encoded in the
4060
+ * insn. When the dst is PTR, it is for sure not
4061
+ * a sub-register.
4062
+ */
4063
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG;
4064
+ if (reg_type == PTR_TO_BTF_ID ||
4065
+ reg_type == PTR_TO_BTF_ID_OR_NULL)
4066
+ regs[value_regno].btf_id = btf_id;
4067
+ }
16744068 regs[value_regno].type = reg_type;
16754069 }
16764070
16774071 } else if (reg->type == PTR_TO_STACK) {
1678
- off += reg->var_off.value;
1679
- err = check_stack_access(env, reg, off, size);
4072
+ /* Basic bounds checks. */
4073
+ err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
16804074 if (err)
16814075 return err;
16824076
....@@ -1685,12 +4079,12 @@
16854079 if (err)
16864080 return err;
16874081
1688
- if (t == BPF_WRITE)
1689
- err = check_stack_write(env, state, off, size,
1690
- value_regno, insn_idx);
1691
- else
1692
- err = check_stack_read(env, state, off, size,
4082
+ if (t == BPF_READ)
4083
+ err = check_stack_read(env, regno, off, size,
16934084 value_regno);
4085
+ else
4086
+ err = check_stack_write(env, regno, off, size,
4087
+ value_regno, insn_idx);
16944088 } else if (reg_is_pkt_pointer(reg)) {
16954089 if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
16964090 verbose(env, "cannot write into packet\n");
....@@ -1703,6 +4097,53 @@
17034097 return -EACCES;
17044098 }
17054099 err = check_packet_access(env, regno, off, size, false);
4100
+ if (!err && t == BPF_READ && value_regno >= 0)
4101
+ mark_reg_unknown(env, regs, value_regno);
4102
+ } else if (reg->type == PTR_TO_FLOW_KEYS) {
4103
+ if (t == BPF_WRITE && value_regno >= 0 &&
4104
+ is_pointer_value(env, value_regno)) {
4105
+ verbose(env, "R%d leaks addr into flow keys\n",
4106
+ value_regno);
4107
+ return -EACCES;
4108
+ }
4109
+
4110
+ err = check_flow_keys_access(env, off, size);
4111
+ if (!err && t == BPF_READ && value_regno >= 0)
4112
+ mark_reg_unknown(env, regs, value_regno);
4113
+ } else if (type_is_sk_pointer(reg->type)) {
4114
+ if (t == BPF_WRITE) {
4115
+ verbose(env, "R%d cannot write into %s\n",
4116
+ regno, reg_type_str[reg->type]);
4117
+ return -EACCES;
4118
+ }
4119
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
4120
+ if (!err && value_regno >= 0)
4121
+ mark_reg_unknown(env, regs, value_regno);
4122
+ } else if (reg->type == PTR_TO_TP_BUFFER) {
4123
+ err = check_tp_buffer_access(env, reg, regno, off, size);
4124
+ if (!err && t == BPF_READ && value_regno >= 0)
4125
+ mark_reg_unknown(env, regs, value_regno);
4126
+ } else if (reg->type == PTR_TO_BTF_ID) {
4127
+ err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
4128
+ value_regno);
4129
+ } else if (reg->type == CONST_PTR_TO_MAP) {
4130
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t,
4131
+ value_regno);
4132
+ } else if (reg->type == PTR_TO_RDONLY_BUF) {
4133
+ if (t == BPF_WRITE) {
4134
+ verbose(env, "R%d cannot write into %s\n",
4135
+ regno, reg_type_str[reg->type]);
4136
+ return -EACCES;
4137
+ }
4138
+ err = check_buffer_access(env, reg, regno, off, size, false,
4139
+ "rdonly",
4140
+ &env->prog->aux->max_rdonly_access);
4141
+ if (!err && value_regno >= 0)
4142
+ mark_reg_unknown(env, regs, value_regno);
4143
+ } else if (reg->type == PTR_TO_RDWR_BUF) {
4144
+ err = check_buffer_access(env, reg, regno, off, size, false,
4145
+ "rdwr",
4146
+ &env->prog->aux->max_rdwr_access);
17064147 if (!err && t == BPF_READ && value_regno >= 0)
17074148 mark_reg_unknown(env, regs, value_regno);
17084149 } else {
....@@ -1745,10 +4186,12 @@
17454186 }
17464187
17474188 if (is_ctx_reg(env, insn->dst_reg) ||
1748
- is_pkt_reg(env, insn->dst_reg)) {
4189
+ is_pkt_reg(env, insn->dst_reg) ||
4190
+ is_flow_key_reg(env, insn->dst_reg) ||
4191
+ is_sk_reg(env, insn->dst_reg)) {
17494192 verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
1750
- insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ?
1751
- "context" : "packet");
4193
+ insn->dst_reg,
4194
+ reg_type_str[reg_state(env, insn->dst_reg)->type]);
17524195 return -EACCES;
17534196 }
17544197
....@@ -1763,73 +4206,65 @@
17634206 BPF_SIZE(insn->code), BPF_WRITE, -1, true);
17644207 }
17654208
1766
-static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
1767
- int off, int access_size,
1768
- bool zero_size_allowed)
1769
-{
1770
- struct bpf_reg_state *reg = cur_regs(env) + regno;
1771
-
1772
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
1773
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
1774
- if (tnum_is_const(reg->var_off)) {
1775
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
1776
- regno, off, access_size);
1777
- } else {
1778
- char tn_buf[48];
1779
-
1780
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1781
- verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
1782
- regno, tn_buf, access_size);
1783
- }
1784
- return -EACCES;
1785
- }
1786
- return 0;
1787
-}
1788
-
1789
-/* when register 'regno' is passed into function that will read 'access_size'
1790
- * bytes from that pointer, make sure that it's within stack boundary
1791
- * and all elements of stack are initialized.
1792
- * Unlike most pointer bounds-checking functions, this one doesn't take an
1793
- * 'off' argument, so it has to add in reg->off itself.
4209
+/* When register 'regno' is used to read the stack (either directly or through
4210
+ * a helper function) make sure that it's within stack boundary and, depending
4211
+ * on the access type, that all elements of the stack are initialized.
4212
+ *
4213
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
4214
+ *
4215
+ * All registers that have been spilled on the stack in the slots within the
4216
+ * read offsets are marked as read.
17944217 */
1795
-static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1796
- int access_size, bool zero_size_allowed,
1797
- struct bpf_call_arg_meta *meta)
4218
+static int check_stack_range_initialized(
4219
+ struct bpf_verifier_env *env, int regno, int off,
4220
+ int access_size, bool zero_size_allowed,
4221
+ enum stack_access_src type, struct bpf_call_arg_meta *meta)
17984222 {
1799
- struct bpf_reg_state *reg = cur_regs(env) + regno;
4223
+ struct bpf_reg_state *reg = reg_state(env, regno);
18004224 struct bpf_func_state *state = func(env, reg);
18014225 int err, min_off, max_off, i, j, slot, spi;
4226
+ char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
4227
+ enum bpf_access_type bounds_check_type;
4228
+ /* Some accesses can write anything into the stack, others are
4229
+ * read-only.
4230
+ */
4231
+ bool clobber = false;
18024232
1803
- if (reg->type != PTR_TO_STACK) {
1804
- /* Allow zero-byte read from NULL, regardless of pointer type */
1805
- if (zero_size_allowed && access_size == 0 &&
1806
- register_is_null(reg))
1807
- return 0;
1808
-
1809
- verbose(env, "R%d type=%s expected=%s\n", regno,
1810
- reg_type_str[reg->type],
1811
- reg_type_str[PTR_TO_STACK]);
4233
+ if (access_size == 0 && !zero_size_allowed) {
4234
+ verbose(env, "invalid zero-sized read\n");
18124235 return -EACCES;
18134236 }
18144237
4238
+ if (type == ACCESS_HELPER) {
4239
+ /* The bounds checks for writes are more permissive than for
4240
+ * reads. However, if raw_mode is not set, we'll do extra
4241
+ * checks below.
4242
+ */
4243
+ bounds_check_type = BPF_WRITE;
4244
+ clobber = true;
4245
+ } else {
4246
+ bounds_check_type = BPF_READ;
4247
+ }
4248
+ err = check_stack_access_within_bounds(env, regno, off, access_size,
4249
+ type, bounds_check_type);
4250
+ if (err)
4251
+ return err;
4252
+
4253
+
18154254 if (tnum_is_const(reg->var_off)) {
1816
- min_off = max_off = reg->var_off.value + reg->off;
1817
- err = __check_stack_boundary(env, regno, min_off, access_size,
1818
- zero_size_allowed);
1819
- if (err)
1820
- return err;
4255
+ min_off = max_off = reg->var_off.value + off;
18214256 } else {
18224257 /* Variable offset is prohibited for unprivileged mode for
18234258 * simplicity since it requires corresponding support in
18244259 * Spectre masking for stack ALU.
18254260 * See also retrieve_ptr_limit().
18264261 */
1827
- if (!env->allow_ptr_leaks) {
4262
+ if (!env->bypass_spec_v1) {
18284263 char tn_buf[48];
18294264
18304265 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1831
- verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
1832
- regno, tn_buf);
4266
+ verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
4267
+ regno, err_extra, tn_buf);
18334268 return -EACCES;
18344269 }
18354270 /* Only initialized buffer on stack is allowed to be accessed
....@@ -1841,28 +4276,8 @@
18414276 if (meta && meta->raw_mode)
18424277 meta = NULL;
18434278
1844
- if (reg->smax_value >= BPF_MAX_VAR_OFF ||
1845
- reg->smax_value <= -BPF_MAX_VAR_OFF) {
1846
- verbose(env, "R%d unbounded indirect variable offset stack access\n",
1847
- regno);
1848
- return -EACCES;
1849
- }
1850
- min_off = reg->smin_value + reg->off;
1851
- max_off = reg->smax_value + reg->off;
1852
- err = __check_stack_boundary(env, regno, min_off, access_size,
1853
- zero_size_allowed);
1854
- if (err) {
1855
- verbose(env, "R%d min value is outside of stack bound\n",
1856
- regno);
1857
- return err;
1858
- }
1859
- err = __check_stack_boundary(env, regno, max_off, access_size,
1860
- zero_size_allowed);
1861
- if (err) {
1862
- verbose(env, "R%d max value is outside of stack bound\n",
1863
- regno);
1864
- return err;
1865
- }
4279
+ min_off = reg->smin_value + off;
4280
+ max_off = reg->smax_value + off;
18664281 }
18674282
18684283 if (meta && meta->raw_mode) {
....@@ -1882,28 +4297,38 @@
18824297 if (*stype == STACK_MISC)
18834298 goto mark;
18844299 if (*stype == STACK_ZERO) {
1885
- /* helper can write anything into the stack */
1886
- *stype = STACK_MISC;
4300
+ if (clobber) {
4301
+ /* helper can write anything into the stack */
4302
+ *stype = STACK_MISC;
4303
+ }
18874304 goto mark;
18884305 }
1889
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
1890
- state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
1891
- __mark_reg_unknown(&state->stack[spi].spilled_ptr);
1892
- for (j = 0; j < BPF_REG_SIZE; j++)
1893
- state->stack[spi].slot_type[j] = STACK_MISC;
4306
+
4307
+ if (is_spilled_reg(&state->stack[spi]) &&
4308
+ state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
4309
+ goto mark;
4310
+
4311
+ if (is_spilled_reg(&state->stack[spi]) &&
4312
+ (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
4313
+ env->allow_ptr_leaks)) {
4314
+ if (clobber) {
4315
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
4316
+ for (j = 0; j < BPF_REG_SIZE; j++)
4317
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
4318
+ }
18944319 goto mark;
18954320 }
18964321
18974322 err:
18984323 if (tnum_is_const(reg->var_off)) {
1899
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
1900
- min_off, i - min_off, access_size);
4324
+ verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
4325
+ err_extra, regno, min_off, i - min_off, access_size);
19014326 } else {
19024327 char tn_buf[48];
19034328
19044329 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1905
- verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
1906
- tn_buf, i - min_off, access_size);
4330
+ verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
4331
+ err_extra, regno, tn_buf, i - min_off, access_size);
19074332 }
19084333 return -EACCES;
19094334 mark:
....@@ -1911,7 +4336,8 @@
19114336 * the whole slot to be marked as 'read'
19124337 */
19134338 mark_reg_read(env, &state->stack[spi].spilled_ptr,
1914
- state->stack[spi].spilled_ptr.parent);
4339
+ state->stack[spi].spilled_ptr.parent,
4340
+ REG_LIVE_READ64);
19154341 }
19164342 return update_stack_depth(env, state, min_off);
19174343 }
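The most common caller of this path is a helper that fills a buffer living on the BPF stack. A small sketch, not part of the patch (section name and helper usage follow libbpf conventions): bpf_get_current_comm() takes ARG_PTR_TO_UNINIT_MEM plus ARG_CONST_SIZE, so meta->raw_mode is set and only the bounds check applies to the still-uninitialized bytes.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("tracepoint/syscalls/sys_enter_execve")
int log_comm(void *ctx)
{
        char comm[16];          /* stack slots [fp-16, fp) of this frame */

        /* check_helper_mem_access() -> check_stack_range_initialized():
         * raw_mode means the helper overwrites the buffer, so the slots
         * don't have to be initialized beforehand.
         */
        if (bpf_get_current_comm(comm, sizeof(comm)))
                return 0;

        bpf_printk("exec by %s", comm);
        return 0;
}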
....@@ -1928,12 +4354,125 @@
19284354 return check_packet_access(env, regno, reg->off, access_size,
19294355 zero_size_allowed);
19304356 case PTR_TO_MAP_VALUE:
4357
+ if (check_map_access_type(env, regno, reg->off, access_size,
4358
+ meta && meta->raw_mode ? BPF_WRITE :
4359
+ BPF_READ))
4360
+ return -EACCES;
19314361 return check_map_access(env, regno, reg->off, access_size,
19324362 zero_size_allowed);
1933
- default: /* scalar_value|ptr_to_stack or invalid ptr */
1934
- return check_stack_boundary(env, regno, access_size,
1935
- zero_size_allowed, meta);
4363
+ case PTR_TO_MEM:
4364
+ return check_mem_region_access(env, regno, reg->off,
4365
+ access_size, reg->mem_size,
4366
+ zero_size_allowed);
4367
+ case PTR_TO_RDONLY_BUF:
4368
+ if (meta && meta->raw_mode)
4369
+ return -EACCES;
4370
+ return check_buffer_access(env, reg, regno, reg->off,
4371
+ access_size, zero_size_allowed,
4372
+ "rdonly",
4373
+ &env->prog->aux->max_rdonly_access);
4374
+ case PTR_TO_RDWR_BUF:
4375
+ return check_buffer_access(env, reg, regno, reg->off,
4376
+ access_size, zero_size_allowed,
4377
+ "rdwr",
4378
+ &env->prog->aux->max_rdwr_access);
4379
+ case PTR_TO_STACK:
4380
+ return check_stack_range_initialized(
4381
+ env,
4382
+ regno, reg->off, access_size,
4383
+ zero_size_allowed, ACCESS_HELPER, meta);
4384
+ default: /* scalar_value or invalid ptr */
4385
+ /* Allow zero-byte read from NULL, regardless of pointer type */
4386
+ if (zero_size_allowed && access_size == 0 &&
4387
+ register_is_null(reg))
4388
+ return 0;
4389
+
4390
+ verbose(env, "R%d type=%s expected=%s\n", regno,
4391
+ reg_type_str[reg->type],
4392
+ reg_type_str[PTR_TO_STACK]);
4393
+ return -EACCES;
19364394 }
4395
+}
4396
+
4397
+/* Implementation details:
4398
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
4399
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
4400
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
4401
+ * value_or_null->value transition, since the verifier only cares about
4402
+ * the range of access to valid map value pointer and doesn't care about actual
4403
+ * address of the map element.
4404
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
4405
+ * reg->id > 0 after value_or_null->value transition. By doing so
4406
+ * two bpf_map_lookups will be considered two different pointers that
4407
+ * point to different bpf_spin_locks.
4408
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
4409
+ * dead-locks.
4410
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
4411
+ * reg_is_refcounted() logic. The verifier needs to remember only
4412
+ * one spin_lock instead of array of acquired_refs.
4413
+ * cur_state->active_spin_lock remembers which map value element got locked
4414
+ * and clears it after bpf_spin_unlock.
4415
+ */
4416
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
4417
+ bool is_lock)
4418
+{
4419
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
4420
+ struct bpf_verifier_state *cur = env->cur_state;
4421
+ bool is_const = tnum_is_const(reg->var_off);
4422
+ struct bpf_map *map = reg->map_ptr;
4423
+ u64 val = reg->var_off.value;
4424
+
4425
+ if (!is_const) {
4426
+ verbose(env,
4427
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
4428
+ regno);
4429
+ return -EINVAL;
4430
+ }
4431
+ if (!map->btf) {
4432
+ verbose(env,
4433
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
4434
+ map->name);
4435
+ return -EINVAL;
4436
+ }
4437
+ if (!map_value_has_spin_lock(map)) {
4438
+ if (map->spin_lock_off == -E2BIG)
4439
+ verbose(env,
4440
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
4441
+ map->name);
4442
+ else if (map->spin_lock_off == -ENOENT)
4443
+ verbose(env,
4444
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
4445
+ map->name);
4446
+ else
4447
+ verbose(env,
4448
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
4449
+ map->name);
4450
+ return -EINVAL;
4451
+ }
4452
+ if (map->spin_lock_off != val + reg->off) {
4453
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
4454
+ val + reg->off);
4455
+ return -EINVAL;
4456
+ }
4457
+ if (is_lock) {
4458
+ if (cur->active_spin_lock) {
4459
+ verbose(env,
4460
+ "Locking two bpf_spin_locks are not allowed\n");
4461
+ return -EINVAL;
4462
+ }
4463
+ cur->active_spin_lock = reg->id;
4464
+ } else {
4465
+ if (!cur->active_spin_lock) {
4466
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
4467
+ return -EINVAL;
4468
+ }
4469
+ if (cur->active_spin_lock != reg->id) {
4470
+ verbose(env, "bpf_spin_unlock of different lock\n");
4471
+ return -EINVAL;
4472
+ }
4473
+ cur->active_spin_lock = 0;
4474
+ }
4475
+ return 0;
19374476 }
19384477
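At the program level the bookkeeping above corresponds to the usual pattern: one struct bpf_spin_lock embedded in a map value, taken and released around a short critical section. A hedged sketch (BTF-defined array map so the spin_lock offset is known; section name per current libbpf; not part of this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct counter_val {
        struct bpf_spin_lock lock;      /* located via the map's BTF */
        __u64 packets;
};

struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, struct counter_val);
} counters SEC(".maps");

SEC("tc")
int count_pkt(struct __sk_buff *skb)
{
        __u32 key = 0;
        struct counter_val *val = bpf_map_lookup_elem(&counters, &key);

        if (!val)
                return 0;

        bpf_spin_lock(&val->lock);      /* process_spin_lock(.., is_lock=true) */
        val->packets++;
        bpf_spin_unlock(&val->lock);    /* must name the same lock (reg->id) */
        return 0;
}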
19394478 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
....@@ -1949,12 +4488,215 @@
19494488 type == ARG_CONST_SIZE_OR_ZERO;
19504489 }
19514490
1952
-static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
4491
+static bool arg_type_is_alloc_size(enum bpf_arg_type type)
4492
+{
4493
+ return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
4494
+}
4495
+
4496
+static bool arg_type_is_int_ptr(enum bpf_arg_type type)
4497
+{
4498
+ return type == ARG_PTR_TO_INT ||
4499
+ type == ARG_PTR_TO_LONG;
4500
+}
4501
+
4502
+static int int_ptr_type_to_size(enum bpf_arg_type type)
4503
+{
4504
+ if (type == ARG_PTR_TO_INT)
4505
+ return sizeof(u32);
4506
+ else if (type == ARG_PTR_TO_LONG)
4507
+ return sizeof(u64);
4508
+
4509
+ return -EINVAL;
4510
+}
4511
+
4512
+static int resolve_map_arg_type(struct bpf_verifier_env *env,
4513
+ const struct bpf_call_arg_meta *meta,
4514
+ enum bpf_arg_type *arg_type)
4515
+{
4516
+ if (!meta->map_ptr) {
4517
+ /* kernel subsystem misconfigured verifier */
4518
+ verbose(env, "invalid map_ptr to access map->type\n");
4519
+ return -EACCES;
4520
+ }
4521
+
4522
+ switch (meta->map_ptr->map_type) {
4523
+ case BPF_MAP_TYPE_SOCKMAP:
4524
+ case BPF_MAP_TYPE_SOCKHASH:
4525
+ if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
4526
+ *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
4527
+ } else {
4528
+ verbose(env, "invalid arg_type for sockmap/sockhash\n");
4529
+ return -EINVAL;
4530
+ }
4531
+ break;
4532
+
4533
+ default:
4534
+ break;
4535
+ }
4536
+ return 0;
4537
+}
4538
+
4539
+struct bpf_reg_types {
4540
+ const enum bpf_reg_type types[10];
4541
+ u32 *btf_id;
4542
+};
4543
+
4544
+static const struct bpf_reg_types map_key_value_types = {
4545
+ .types = {
4546
+ PTR_TO_STACK,
4547
+ PTR_TO_PACKET,
4548
+ PTR_TO_PACKET_META,
4549
+ PTR_TO_MAP_VALUE,
4550
+ },
4551
+};
4552
+
4553
+static const struct bpf_reg_types sock_types = {
4554
+ .types = {
4555
+ PTR_TO_SOCK_COMMON,
4556
+ PTR_TO_SOCKET,
4557
+ PTR_TO_TCP_SOCK,
4558
+ PTR_TO_XDP_SOCK,
4559
+ },
4560
+};
4561
+
4562
+#ifdef CONFIG_NET
4563
+static const struct bpf_reg_types btf_id_sock_common_types = {
4564
+ .types = {
4565
+ PTR_TO_SOCK_COMMON,
4566
+ PTR_TO_SOCKET,
4567
+ PTR_TO_TCP_SOCK,
4568
+ PTR_TO_XDP_SOCK,
4569
+ PTR_TO_BTF_ID,
4570
+ },
4571
+ .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
4572
+};
4573
+#endif
4574
+
4575
+static const struct bpf_reg_types mem_types = {
4576
+ .types = {
4577
+ PTR_TO_STACK,
4578
+ PTR_TO_PACKET,
4579
+ PTR_TO_PACKET_META,
4580
+ PTR_TO_MAP_VALUE,
4581
+ PTR_TO_MEM,
4582
+ PTR_TO_RDONLY_BUF,
4583
+ PTR_TO_RDWR_BUF,
4584
+ },
4585
+};
4586
+
4587
+static const struct bpf_reg_types int_ptr_types = {
4588
+ .types = {
4589
+ PTR_TO_STACK,
4590
+ PTR_TO_PACKET,
4591
+ PTR_TO_PACKET_META,
4592
+ PTR_TO_MAP_VALUE,
4593
+ },
4594
+};
4595
+
4596
+static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
4597
+static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
4598
+static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
4599
+static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
4600
+static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
4601
+static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
4602
+static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
4603
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
4604
+
4605
+static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
4606
+ [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
4607
+ [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
4608
+ [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
4609
+ [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
4610
+ [ARG_CONST_SIZE] = &scalar_types,
4611
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
4612
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
4613
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
4614
+ [ARG_PTR_TO_CTX] = &context_types,
4615
+ [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
4616
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
4617
+#ifdef CONFIG_NET
4618
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
4619
+#endif
4620
+ [ARG_PTR_TO_SOCKET] = &fullsock_types,
4621
+ [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
4622
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
4623
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
4624
+ [ARG_PTR_TO_MEM] = &mem_types,
4625
+ [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
4626
+ [ARG_PTR_TO_UNINIT_MEM] = &mem_types,
4627
+ [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
4628
+ [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
4629
+ [ARG_PTR_TO_INT] = &int_ptr_types,
4630
+ [ARG_PTR_TO_LONG] = &int_ptr_types,
4631
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
4632
+};
4633
+
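As a concrete consequence of map_key_value_types, a lookup key may live on the BPF stack (or in packet data or another map value) and be passed to the helper directly. A minimal sketch, not part of the patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1024);
        __type(key, __u32);
        __type(value, __u64);
} flows SEC(".maps");

SEC("xdp")
int lookup_flow(struct xdp_md *ctx)
{
        __u32 key = 7;  /* PTR_TO_STACK, accepted for ARG_PTR_TO_MAP_KEY */
        __u64 *val = bpf_map_lookup_elem(&flows, &key);

        if (val)
                __sync_fetch_and_add(val, 1);
        return XDP_PASS;
}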
4634
+static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
19534635 enum bpf_arg_type arg_type,
1954
- struct bpf_call_arg_meta *meta)
4636
+ const u32 *arg_btf_id)
19554637 {
19564638 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
1957
- enum bpf_reg_type expected_type, type = reg->type;
4639
+ enum bpf_reg_type expected, type = reg->type;
4640
+ const struct bpf_reg_types *compatible;
4641
+ int i, j;
4642
+
4643
+ compatible = compatible_reg_types[arg_type];
4644
+ if (!compatible) {
4645
+ verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
4646
+ return -EFAULT;
4647
+ }
4648
+
4649
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
4650
+ expected = compatible->types[i];
4651
+ if (expected == NOT_INIT)
4652
+ break;
4653
+
4654
+ if (type == expected)
4655
+ goto found;
4656
+ }
4657
+
4658
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
4659
+ for (j = 0; j + 1 < i; j++)
4660
+ verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
4661
+ verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
4662
+ return -EACCES;
4663
+
4664
+found:
4665
+ if (type == PTR_TO_BTF_ID) {
4666
+ if (!arg_btf_id) {
4667
+ if (!compatible->btf_id) {
4668
+ verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
4669
+ return -EFAULT;
4670
+ }
4671
+ arg_btf_id = compatible->btf_id;
4672
+ }
4673
+
4674
+ if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id,
4675
+ *arg_btf_id)) {
4676
+ verbose(env, "R%d is of type %s but %s is expected\n",
4677
+ regno, kernel_type_name(reg->btf_id),
4678
+ kernel_type_name(*arg_btf_id));
4679
+ return -EACCES;
4680
+ }
4681
+
4682
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
4683
+ verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
4684
+ regno);
4685
+ return -EACCES;
4686
+ }
4687
+ }
4688
+
4689
+ return 0;
4690
+}
4691
+
4692
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
4693
+ struct bpf_call_arg_meta *meta,
4694
+ const struct bpf_func_proto *fn)
4695
+{
4696
+ u32 regno = BPF_REG_1 + arg;
4697
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
4698
+ enum bpf_arg_type arg_type = fn->arg_type[arg];
4699
+ enum bpf_reg_type type = reg->type;
19584700 int err = 0;
19594701
19604702 if (arg_type == ARG_DONTCARE)
....@@ -1979,45 +4721,39 @@
19794721 return -EACCES;
19804722 }
19814723
1982
- if (arg_type == ARG_PTR_TO_MAP_KEY ||
1983
- arg_type == ARG_PTR_TO_MAP_VALUE) {
1984
- expected_type = PTR_TO_STACK;
1985
- if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
1986
- type != expected_type)
1987
- goto err_type;
1988
- } else if (arg_type == ARG_CONST_SIZE ||
1989
- arg_type == ARG_CONST_SIZE_OR_ZERO) {
1990
- expected_type = SCALAR_VALUE;
1991
- if (type != expected_type)
1992
- goto err_type;
1993
- } else if (arg_type == ARG_CONST_MAP_PTR) {
1994
- expected_type = CONST_PTR_TO_MAP;
1995
- if (type != expected_type)
1996
- goto err_type;
1997
- } else if (arg_type == ARG_PTR_TO_CTX) {
1998
- expected_type = PTR_TO_CTX;
1999
- if (type != expected_type)
2000
- goto err_type;
4724
+ if (arg_type == ARG_PTR_TO_MAP_VALUE ||
4725
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
4726
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
4727
+ err = resolve_map_arg_type(env, meta, &arg_type);
4728
+ if (err)
4729
+ return err;
4730
+ }
4731
+
4732
+ if (register_is_null(reg) && arg_type_may_be_null(arg_type))
4733
+ /* A NULL register has a SCALAR_VALUE type, so skip
4734
+ * type checking.
4735
+ */
4736
+ goto skip_type_check;
4737
+
4738
+ err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
4739
+ if (err)
4740
+ return err;
4741
+
4742
+ if (type == PTR_TO_CTX) {
20014743 err = check_ctx_reg(env, reg, regno);
20024744 if (err < 0)
20034745 return err;
2004
- } else if (arg_type_is_mem_ptr(arg_type)) {
2005
- expected_type = PTR_TO_STACK;
2006
- /* One exception here. In case function allows for NULL to be
2007
- * passed in as argument, it's a SCALAR_VALUE type. Final test
2008
- * happens during stack boundary checking.
2009
- */
2010
- if (register_is_null(reg) &&
2011
- arg_type == ARG_PTR_TO_MEM_OR_NULL)
2012
- /* final test in check_stack_boundary() */;
2013
- else if (!type_is_pkt_pointer(type) &&
2014
- type != PTR_TO_MAP_VALUE &&
2015
- type != expected_type)
2016
- goto err_type;
2017
- meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
2018
- } else {
2019
- verbose(env, "unsupported arg_type %d\n", arg_type);
2020
- return -EFAULT;
4746
+ }
4747
+
4748
+skip_type_check:
4749
+ if (reg->ref_obj_id) {
4750
+ if (meta->ref_obj_id) {
4751
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
4752
+ regno, reg->ref_obj_id,
4753
+ meta->ref_obj_id);
4754
+ return -EFAULT;
4755
+ }
4756
+ meta->ref_obj_id = reg->ref_obj_id;
20214757 }
20224758
20234759 if (arg_type == ARG_CONST_MAP_PTR) {
....@@ -2040,7 +4776,10 @@
20404776 err = check_helper_mem_access(env, regno,
20414777 meta->map_ptr->key_size, false,
20424778 NULL);
2043
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
4779
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
4780
+ (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
4781
+ !register_is_null(reg)) ||
4782
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
20444783 /* bpf_map_xxx(..., map_ptr, ..., value) call:
20454784 * check [value, value + map->value_size) validity
20464785 */
....@@ -2049,14 +4788,42 @@
20494788 verbose(env, "invalid map_ptr to access map->value\n");
20504789 return -EACCES;
20514790 }
4791
+ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
20524792 err = check_helper_mem_access(env, regno,
20534793 meta->map_ptr->value_size, false,
2054
- NULL);
4794
+ meta);
4795
+ } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
4796
+ if (!reg->btf_id) {
4797
+ verbose(env, "Helper has invalid btf_id in R%d\n", regno);
4798
+ return -EACCES;
4799
+ }
4800
+ meta->ret_btf_id = reg->btf_id;
4801
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
4802
+ if (meta->func_id == BPF_FUNC_spin_lock) {
4803
+ if (process_spin_lock(env, regno, true))
4804
+ return -EACCES;
4805
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
4806
+ if (process_spin_lock(env, regno, false))
4807
+ return -EACCES;
4808
+ } else {
4809
+ verbose(env, "verifier internal error\n");
4810
+ return -EFAULT;
4811
+ }
4812
+ } else if (arg_type_is_mem_ptr(arg_type)) {
4813
+ /* The access to this pointer is only checked when we hit the
4814
+ * next is_mem_size argument below.
4815
+ */
4816
+ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
20554817 } else if (arg_type_is_mem_size(arg_type)) {
20564818 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
20574819
2058
- /* remember the mem_size which may be used later
2059
- * to refine return values.
4820
+ /* This is used to refine r0 return value bounds for helpers
4821
+ * that enforce this value as an upper bound on return values.
4822
+ * See do_refine_retval_range() for helpers that can refine
4823
+ * the return value. C type of helper is u32 so we pull register
4824
+ * bound from umax_value however, if negative verifier errors
4825
+ * out. Only upper bounds can be learned because retval is an
4826
+ * int type and negative retvals are allowed.
20604827 */
20614828 meta->msize_max_value = reg->umax_value;
20624829
....@@ -2093,13 +4860,62 @@
20934860 err = check_helper_mem_access(env, regno - 1,
20944861 reg->umax_value,
20954862 zero_size_allowed, meta);
4863
+ if (!err)
4864
+ err = mark_chain_precision(env, regno);
4865
+ } else if (arg_type_is_alloc_size(arg_type)) {
4866
+ if (!tnum_is_const(reg->var_off)) {
4867
+ verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
4868
+ regno);
4869
+ return -EACCES;
4870
+ }
4871
+ meta->mem_size = reg->var_off.value;
4872
+ } else if (arg_type_is_int_ptr(arg_type)) {
4873
+ int size = int_ptr_type_to_size(arg_type);
4874
+
4875
+ err = check_helper_mem_access(env, regno, size, false, meta);
4876
+ if (err)
4877
+ return err;
4878
+ err = check_ptr_alignment(env, reg, 0, size, true);
20964879 }
20974880
20984881 return err;
2099
-err_type:
2100
- verbose(env, "R%d type=%s expected=%s\n", regno,
2101
- reg_type_str[type], reg_type_str[expected_type]);
2102
- return -EACCES;
4882
+}
4883
+
4884
+static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
4885
+{
4886
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
4887
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
4888
+
4889
+ if (func_id != BPF_FUNC_map_update_elem)
4890
+ return false;
4891
+
4892
+ /* It's not possible to get access to a locked struct sock in these
4893
+ * contexts, so updating is safe.
4894
+ */
4895
+ switch (type) {
4896
+ case BPF_PROG_TYPE_TRACING:
4897
+ if (eatype == BPF_TRACE_ITER)
4898
+ return true;
4899
+ break;
4900
+ case BPF_PROG_TYPE_SOCKET_FILTER:
4901
+ case BPF_PROG_TYPE_SCHED_CLS:
4902
+ case BPF_PROG_TYPE_SCHED_ACT:
4903
+ case BPF_PROG_TYPE_XDP:
4904
+ case BPF_PROG_TYPE_SK_REUSEPORT:
4905
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
4906
+ case BPF_PROG_TYPE_SK_LOOKUP:
4907
+ return true;
4908
+ default:
4909
+ break;
4910
+ }
4911
+
4912
+ verbose(env, "cannot update sockmap in this context\n");
4913
+ return false;
4914
+}
4915
+
4916
+static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
4917
+{
4918
+ return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
21034919 }
21044920
21054921 static int check_map_func_compatibility(struct bpf_verifier_env *env,
....@@ -2117,7 +4933,15 @@
21174933 case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
21184934 if (func_id != BPF_FUNC_perf_event_read &&
21194935 func_id != BPF_FUNC_perf_event_output &&
2120
- func_id != BPF_FUNC_perf_event_read_value)
4936
+ func_id != BPF_FUNC_skb_output &&
4937
+ func_id != BPF_FUNC_perf_event_read_value &&
4938
+ func_id != BPF_FUNC_xdp_output)
4939
+ goto error;
4940
+ break;
4941
+ case BPF_MAP_TYPE_RINGBUF:
4942
+ if (func_id != BPF_FUNC_ringbuf_output &&
4943
+ func_id != BPF_FUNC_ringbuf_reserve &&
4944
+ func_id != BPF_FUNC_ringbuf_query)
21214945 goto error;
21224946 break;
21234947 case BPF_MAP_TYPE_STACK_TRACE:
....@@ -2130,23 +4954,26 @@
21304954 goto error;
21314955 break;
21324956 case BPF_MAP_TYPE_CGROUP_STORAGE:
4957
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
21334958 if (func_id != BPF_FUNC_get_local_storage)
21344959 goto error;
21354960 break;
2136
- /* devmap returns a pointer to a live net_device ifindex that we cannot
2137
- * allow to be modified from bpf side. So do not allow lookup elements
2138
- * for now.
2139
- */
21404961 case BPF_MAP_TYPE_DEVMAP:
2141
- if (func_id != BPF_FUNC_redirect_map)
4962
+ case BPF_MAP_TYPE_DEVMAP_HASH:
4963
+ if (func_id != BPF_FUNC_redirect_map &&
4964
+ func_id != BPF_FUNC_map_lookup_elem)
21424965 goto error;
21434966 break;
21444967 /* Restrict bpf side of cpumap and xskmap, open when use-cases
21454968 * appear.
21464969 */
21474970 case BPF_MAP_TYPE_CPUMAP:
2148
- case BPF_MAP_TYPE_XSKMAP:
21494971 if (func_id != BPF_FUNC_redirect_map)
4972
+ goto error;
4973
+ break;
4974
+ case BPF_MAP_TYPE_XSKMAP:
4975
+ if (func_id != BPF_FUNC_redirect_map &&
4976
+ func_id != BPF_FUNC_map_lookup_elem)
21504977 goto error;
21514978 break;
21524979 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
....@@ -2158,18 +4985,41 @@
21584985 if (func_id != BPF_FUNC_sk_redirect_map &&
21594986 func_id != BPF_FUNC_sock_map_update &&
21604987 func_id != BPF_FUNC_map_delete_elem &&
2161
- func_id != BPF_FUNC_msg_redirect_map)
4988
+ func_id != BPF_FUNC_msg_redirect_map &&
4989
+ func_id != BPF_FUNC_sk_select_reuseport &&
4990
+ func_id != BPF_FUNC_map_lookup_elem &&
4991
+ !may_update_sockmap(env, func_id))
21624992 goto error;
21634993 break;
21644994 case BPF_MAP_TYPE_SOCKHASH:
21654995 if (func_id != BPF_FUNC_sk_redirect_hash &&
21664996 func_id != BPF_FUNC_sock_hash_update &&
21674997 func_id != BPF_FUNC_map_delete_elem &&
2168
- func_id != BPF_FUNC_msg_redirect_hash)
4998
+ func_id != BPF_FUNC_msg_redirect_hash &&
4999
+ func_id != BPF_FUNC_sk_select_reuseport &&
5000
+ func_id != BPF_FUNC_map_lookup_elem &&
5001
+ !may_update_sockmap(env, func_id))
21695002 goto error;
21705003 break;
21715004 case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
21725005 if (func_id != BPF_FUNC_sk_select_reuseport)
5006
+ goto error;
5007
+ break;
5008
+ case BPF_MAP_TYPE_QUEUE:
5009
+ case BPF_MAP_TYPE_STACK:
5010
+ if (func_id != BPF_FUNC_map_peek_elem &&
5011
+ func_id != BPF_FUNC_map_pop_elem &&
5012
+ func_id != BPF_FUNC_map_push_elem)
5013
+ goto error;
5014
+ break;
5015
+ case BPF_MAP_TYPE_SK_STORAGE:
5016
+ if (func_id != BPF_FUNC_sk_storage_get &&
5017
+ func_id != BPF_FUNC_sk_storage_delete)
5018
+ goto error;
5019
+ break;
5020
+ case BPF_MAP_TYPE_INODE_STORAGE:
5021
+ if (func_id != BPF_FUNC_inode_storage_get &&
5022
+ func_id != BPF_FUNC_inode_storage_delete)
21735023 goto error;
21745024 break;
21755025 default:
....@@ -2181,15 +5031,23 @@
21815031 case BPF_FUNC_tail_call:
21825032 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
21835033 goto error;
2184
- if (env->subprog_cnt > 1) {
2185
- verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
5034
+ if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
5035
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
21865036 return -EINVAL;
21875037 }
21885038 break;
21895039 case BPF_FUNC_perf_event_read:
21905040 case BPF_FUNC_perf_event_output:
21915041 case BPF_FUNC_perf_event_read_value:
5042
+ case BPF_FUNC_skb_output:
5043
+ case BPF_FUNC_xdp_output:
21925044 if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
5045
+ goto error;
5046
+ break;
5047
+ case BPF_FUNC_ringbuf_output:
5048
+ case BPF_FUNC_ringbuf_reserve:
5049
+ case BPF_FUNC_ringbuf_query:
5050
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
21935051 goto error;
21945052 break;
21955053 case BPF_FUNC_get_stackid:
....@@ -2203,6 +5061,7 @@
22035061 break;
22045062 case BPF_FUNC_redirect_map:
22055063 if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
5064
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
22065065 map->map_type != BPF_MAP_TYPE_CPUMAP &&
22075066 map->map_type != BPF_MAP_TYPE_XSKMAP)
22085067 goto error;
....@@ -2220,11 +5079,31 @@
22205079 goto error;
22215080 break;
22225081 case BPF_FUNC_get_local_storage:
2223
- if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
5082
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
5083
+ map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
22245084 goto error;
22255085 break;
22265086 case BPF_FUNC_sk_select_reuseport:
2227
- if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
5087
+ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
5088
+ map->map_type != BPF_MAP_TYPE_SOCKMAP &&
5089
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
5090
+ goto error;
5091
+ break;
5092
+ case BPF_FUNC_map_peek_elem:
5093
+ case BPF_FUNC_map_pop_elem:
5094
+ case BPF_FUNC_map_push_elem:
5095
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
5096
+ map->map_type != BPF_MAP_TYPE_STACK)
5097
+ goto error;
5098
+ break;
5099
+ case BPF_FUNC_sk_storage_get:
5100
+ case BPF_FUNC_sk_storage_delete:
5101
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
5102
+ goto error;
5103
+ break;
5104
+ case BPF_FUNC_inode_storage_get:
5105
+ case BPF_FUNC_inode_storage_delete:
5106
+ if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
22285107 goto error;
22295108 break;
22305109 default:
....@@ -2287,49 +5166,142 @@
22875166 return true;
22885167 }
22895168
2290
-static int check_func_proto(const struct bpf_func_proto *fn)
5169
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
5170
+{
5171
+ int count = 0;
5172
+
5173
+ if (arg_type_may_be_refcounted(fn->arg1_type))
5174
+ count++;
5175
+ if (arg_type_may_be_refcounted(fn->arg2_type))
5176
+ count++;
5177
+ if (arg_type_may_be_refcounted(fn->arg3_type))
5178
+ count++;
5179
+ if (arg_type_may_be_refcounted(fn->arg4_type))
5180
+ count++;
5181
+ if (arg_type_may_be_refcounted(fn->arg5_type))
5182
+ count++;
5183
+
5184
+ /* A reference acquiring function cannot acquire
5185
+ * another refcounted ptr.
5186
+ */
5187
+ if (may_be_acquire_function(func_id) && count)
5188
+ return false;
5189
+
5190
+ /* We only support one arg being unreferenced at the moment,
5191
+ * which is sufficient for the helper functions we have right now.
5192
+ */
5193
+ return count <= 1;
5194
+}
5195
+
5196
+static bool check_btf_id_ok(const struct bpf_func_proto *fn)
5197
+{
5198
+ int i;
5199
+
5200
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
5201
+ if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
5202
+ return false;
5203
+
5204
+ if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
5205
+ return false;
5206
+ }
5207
+
5208
+ return true;
5209
+}
5210
+
5211
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
22915212 {
22925213 return check_raw_mode_ok(fn) &&
2293
- check_arg_pair_ok(fn) ? 0 : -EINVAL;
5214
+ check_arg_pair_ok(fn) &&
5215
+ check_btf_id_ok(fn) &&
5216
+ check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
22945217 }
22955218
22965219 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
22975220 * are now invalid, so turn them into unknown SCALAR_VALUE.
22985221 */
2299
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
2300
- struct bpf_func_state *state)
2301
-{
2302
- struct bpf_reg_state *regs = state->regs, *reg;
2303
- int i;
2304
-
2305
- for (i = 0; i < MAX_BPF_REG; i++)
2306
- if (reg_is_pkt_pointer_any(&regs[i]))
2307
- mark_reg_unknown(env, regs, i);
2308
-
2309
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2310
- if (state->stack[i].slot_type[0] != STACK_SPILL)
2311
- continue;
2312
- reg = &state->stack[i].spilled_ptr;
2313
- if (reg_is_pkt_pointer_any(reg))
2314
- __mark_reg_unknown(reg);
2315
- }
2316
-}
2317
-
23185222 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
23195223 {
2320
- struct bpf_verifier_state *vstate = env->cur_state;
5224
+ struct bpf_func_state *state;
5225
+ struct bpf_reg_state *reg;
5226
+
5227
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
5228
+ if (reg_is_pkt_pointer_any(reg))
5229
+ __mark_reg_unknown(env, reg);
5230
+ }));
5231
+}
5232
+
5233
+enum {
5234
+ AT_PKT_END = -1,
5235
+ BEYOND_PKT_END = -2,
5236
+};
5237
+
5238
+static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
5239
+{
5240
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
5241
+ struct bpf_reg_state *reg = &state->regs[regn];
5242
+
5243
+ if (reg->type != PTR_TO_PACKET)
5244
+ /* PTR_TO_PACKET_META is not supported yet */
5245
+ return;
5246
+
5247
+ /* The 'reg' is pkt > pkt_end or pkt >= pkt_end.
5248
+ * How far beyond pkt_end it goes is unknown.
5249
+ * if (!range_open) it's the case of pkt >= pkt_end
5250
+ * if (range_open) it's the case of pkt > pkt_end
5251
+ * hence this pointer is at least 1 byte bigger than pkt_end
5252
+ */
5253
+ if (range_open)
5254
+ reg->range = BEYOND_PKT_END;
5255
+ else
5256
+ reg->range = AT_PKT_END;
5257
+}
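For context, the comparisons that end up annotated here come from the usual packet bounds checks in programs. The following is only an illustrative XDP-style sketch, not part of the patch; the SEC() macro and includes follow libbpf conventions and the program name is invented:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int pkt_end_example(struct xdp_md *ctx)
{
        void *data = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;

        /* "pkt > pkt_end" comparison: in the taken branch 'data + 14' lies
         * past the end of the packet; in the fall-through branch 14 bytes
         * starting at 'data' are provably accessible.
         */
        if (data + 14 > data_end)
                return XDP_DROP;

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";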
5258
+
5259
+/* The pointer with the specified id has released its reference to kernel
5260
+ * resources. Identify all copies of the same pointer and clear the reference.
5261
+ */
5262
+static int release_reference(struct bpf_verifier_env *env,
5263
+ int ref_obj_id)
5264
+{
5265
+ struct bpf_func_state *state;
5266
+ struct bpf_reg_state *reg;
5267
+ int err;
5268
+
5269
+ err = release_reference_state(cur_func(env), ref_obj_id);
5270
+ if (err)
5271
+ return err;
5272
+
5273
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
5274
+ if (reg->ref_obj_id == ref_obj_id) {
5275
+ if (!env->allow_ptr_leaks)
5276
+ __mark_reg_not_init(env, reg);
5277
+ else
5278
+ __mark_reg_unknown(env, reg);
5279
+ }
5280
+ }));
5281
+
5282
+ return 0;
5283
+}
5284
+
5285
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
5286
+ struct bpf_reg_state *regs)
5287
+{
23215288 int i;
23225289
2323
- for (i = 0; i <= vstate->curframe; i++)
2324
- __clear_all_pkt_pointers(env, vstate->frame[i]);
5290
+ /* after the call registers r0 - r5 were scratched */
5291
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
5292
+ mark_reg_not_init(env, regs, caller_saved[i]);
5293
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
5294
+ }
23255295 }
23265296
23275297 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
23285298 int *insn_idx)
23295299 {
23305300 struct bpf_verifier_state *state = env->cur_state;
5301
+ struct bpf_func_info_aux *func_info_aux;
23315302 struct bpf_func_state *caller, *callee;
2332
- int i, subprog, target_insn;
5303
+ int i, err, subprog, target_insn;
5304
+ bool is_global = false;
23335305
23345306 if (state->curframe + 1 >= MAX_CALL_FRAMES) {
23355307 verbose(env, "the call stack of %d frames is too deep\n",
....@@ -2352,6 +5324,33 @@
23525324 return -EFAULT;
23535325 }
23545326
5327
+ func_info_aux = env->prog->aux->func_info_aux;
5328
+ if (func_info_aux)
5329
+ is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
5330
+ err = btf_check_func_arg_match(env, subprog, caller->regs);
5331
+ if (err == -EFAULT)
5332
+ return err;
5333
+ if (is_global) {
5334
+ if (err) {
5335
+ verbose(env, "Caller passes invalid args into func#%d\n",
5336
+ subprog);
5337
+ return err;
5338
+ } else {
5339
+ if (env->log.level & BPF_LOG_LEVEL)
5340
+ verbose(env,
5341
+ "Func#%d is global and valid. Skipping.\n",
5342
+ subprog);
5343
+ clear_caller_saved_regs(env, caller->regs);
5344
+
5345
+ /* All global functions return a 64-bit SCALAR_VALUE */
5346
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
5347
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
5348
+
5349
+ /* continue with next insn after call */
5350
+ return 0;
5351
+ }
5352
+ }
5353
+
23555354 callee = kzalloc(sizeof(*callee), GFP_KERNEL);
23565355 if (!callee)
23575356 return -ENOMEM;
....@@ -2367,17 +5366,18 @@
23675366 state->curframe + 1 /* frameno within this callchain */,
23685367 subprog /* subprog number within this prog */);
23695368
5369
+ /* Transfer references to the callee */
5370
+ err = transfer_reference_state(callee, caller);
5371
+ if (err)
5372
+ return err;
5373
+
23705374 /* copy r1 - r5 args that callee can access. The copy includes parent
23715375 * pointers, which connects us up to the liveness chain
23725376 */
23735377 for (i = BPF_REG_1; i <= BPF_REG_5; i++)
23745378 callee->regs[i] = caller->regs[i];
23755379
2376
- /* after the call registers r0 - r5 were scratched */
2377
- for (i = 0; i < CALLER_SAVED_REGS; i++) {
2378
- mark_reg_not_init(env, caller->regs, caller_saved[i]);
2379
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
2380
- }
5380
+ clear_caller_saved_regs(env, caller->regs);
23815381
23825382 /* only increment it after check_reg_arg() finished */
23835383 state->curframe++;
....@@ -2385,7 +5385,7 @@
23855385 /* and go analyze first insn of the callee */
23865386 *insn_idx = target_insn;
23875387
2388
- if (env->log.level) {
5388
+ if (env->log.level & BPF_LOG_LEVEL) {
23895389 verbose(env, "caller:\n");
23905390 print_verifier_state(env, caller);
23915391 verbose(env, "callee:\n");
....@@ -2399,6 +5399,7 @@
23995399 struct bpf_verifier_state *state = env->cur_state;
24005400 struct bpf_func_state *caller, *callee;
24015401 struct bpf_reg_state *r0;
5402
+ int err;
24025403
24035404 callee = state->frame[state->curframe];
24045405 r0 = &callee->regs[BPF_REG_0];
....@@ -2418,8 +5419,13 @@
24185419 /* return to the caller whatever r0 had in the callee */
24195420 caller->regs[BPF_REG_0] = *r0;
24205421
5422
+ /* Transfer references to the caller */
5423
+ err = transfer_reference_state(caller, callee);
5424
+ if (err)
5425
+ return err;
5426
+
24215427 *insn_idx = callee->callsite + 1;
2422
- if (env->log.level) {
5428
+ if (env->log.level & BPF_LOG_LEVEL) {
24235429 verbose(env, "returning from callee:\n");
24245430 print_verifier_state(env, callee);
24255431 verbose(env, "to caller at %d:\n", *insn_idx);
....@@ -2431,44 +5437,24 @@
24315437 return 0;
24325438 }
24335439
2434
-static int do_refine_retval_range(struct bpf_verifier_env *env,
2435
- struct bpf_reg_state *regs, int ret_type,
2436
- int func_id, struct bpf_call_arg_meta *meta)
5440
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
5441
+ int func_id,
5442
+ struct bpf_call_arg_meta *meta)
24375443 {
24385444 struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
2439
- struct bpf_reg_state tmp_reg = *ret_reg;
2440
- bool ret;
24415445
24425446 if (ret_type != RET_INTEGER ||
24435447 (func_id != BPF_FUNC_get_stack &&
2444
- func_id != BPF_FUNC_probe_read_str))
2445
- return 0;
5448
+ func_id != BPF_FUNC_probe_read_str &&
5449
+ func_id != BPF_FUNC_probe_read_kernel_str &&
5450
+ func_id != BPF_FUNC_probe_read_user_str))
5451
+ return;
24465452
2447
- /* Error case where ret is in interval [S32MIN, -1]. */
2448
- ret_reg->smin_value = S32_MIN;
2449
- ret_reg->smax_value = -1;
2450
-
2451
- __reg_deduce_bounds(ret_reg);
2452
- __reg_bound_offset(ret_reg);
2453
- __update_reg_bounds(ret_reg);
2454
-
2455
- ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
2456
- if (!ret)
2457
- return -EFAULT;
2458
-
2459
- *ret_reg = tmp_reg;
2460
-
2461
- /* Success case where ret is in range [0, msize_max_value]. */
2462
- ret_reg->smin_value = 0;
24635453 ret_reg->smax_value = meta->msize_max_value;
2464
- ret_reg->umin_value = ret_reg->smin_value;
2465
- ret_reg->umax_value = ret_reg->smax_value;
2466
-
2467
- __reg_deduce_bounds(ret_reg);
2468
- __reg_bound_offset(ret_reg);
2469
- __update_reg_bounds(ret_reg);
2470
-
2471
- return 0;
5454
+ ret_reg->s32_max_value = meta->msize_max_value;
5455
+ ret_reg->smin_value = -MAX_ERRNO;
5456
+ ret_reg->s32_min_value = -MAX_ERRNO;
5457
+ reg_bounds_sync(ret_reg);
24725458 }
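The practical effect of the refined range is that the helper's return value can be trusted as a bounded length after a simple sign check. A minimal, hedged kprobe-style sketch in libbpf conventions (the attach point, program and buffer names are assumptions of the example):

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/do_nanosleep")
int get_stack_example(struct pt_regs *ctx)
{
        char buf[128];
        long n;

        n = bpf_get_stack(ctx, buf, sizeof(buf), 0);
        if (n < 0)
                return 0;

        /* At this point the verifier sees n in [0, sizeof(buf)], which is
         * what allows n to be used later as an offset or size relative to
         * buf without a separate upper-bound check.
         */
        return 0;
}

char _license[] SEC("license") = "GPL";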
24735459
24745460 static int
....@@ -2476,25 +5462,91 @@
24765462 int func_id, int insn_idx)
24775463 {
24785464 struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
5465
+ struct bpf_map *map = meta->map_ptr;
24795466
24805467 if (func_id != BPF_FUNC_tail_call &&
24815468 func_id != BPF_FUNC_map_lookup_elem &&
24825469 func_id != BPF_FUNC_map_update_elem &&
2483
- func_id != BPF_FUNC_map_delete_elem)
5470
+ func_id != BPF_FUNC_map_delete_elem &&
5471
+ func_id != BPF_FUNC_map_push_elem &&
5472
+ func_id != BPF_FUNC_map_pop_elem &&
5473
+ func_id != BPF_FUNC_map_peek_elem)
24845474 return 0;
24855475
2486
- if (meta->map_ptr == NULL) {
5476
+ if (map == NULL) {
24875477 verbose(env, "kernel subsystem misconfigured verifier\n");
24885478 return -EINVAL;
24895479 }
24905480
2491
- if (!BPF_MAP_PTR(aux->map_state))
5481
+ /* In case of read-only, some additional restrictions
5482
+ * need to be applied in order to prevent altering the
5483
+ * state of the map from program side.
5484
+ */
5485
+ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
5486
+ (func_id == BPF_FUNC_map_delete_elem ||
5487
+ func_id == BPF_FUNC_map_update_elem ||
5488
+ func_id == BPF_FUNC_map_push_elem ||
5489
+ func_id == BPF_FUNC_map_pop_elem)) {
5490
+ verbose(env, "write into map forbidden\n");
5491
+ return -EACCES;
5492
+ }
5493
+
5494
+ if (!BPF_MAP_PTR(aux->map_ptr_state))
24925495 bpf_map_ptr_store(aux, meta->map_ptr,
2493
- meta->map_ptr->unpriv_array);
2494
- else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
5496
+ !meta->map_ptr->bypass_spec_v1);
5497
+ else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
24955498 bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
2496
- meta->map_ptr->unpriv_array);
5499
+ !meta->map_ptr->bypass_spec_v1);
24975500 return 0;
5501
+}
5502
+
5503
+static int
5504
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
5505
+ int func_id, int insn_idx)
5506
+{
5507
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
5508
+ struct bpf_reg_state *regs = cur_regs(env), *reg;
5509
+ struct bpf_map *map = meta->map_ptr;
5510
+ u64 val, max;
5511
+ int err;
5512
+
5513
+ if (func_id != BPF_FUNC_tail_call)
5514
+ return 0;
5515
+ if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
5516
+ verbose(env, "kernel subsystem misconfigured verifier\n");
5517
+ return -EINVAL;
5518
+ }
5519
+
5520
+ reg = &regs[BPF_REG_3];
5521
+ val = reg->var_off.value;
5522
+ max = map->max_entries;
5523
+
5524
+ if (!(register_is_const(reg) && val < max)) {
5525
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
5526
+ return 0;
5527
+ }
5528
+
5529
+ err = mark_chain_precision(env, BPF_REG_3);
5530
+ if (err)
5531
+ return err;
5532
+ if (bpf_map_key_unseen(aux))
5533
+ bpf_map_key_store(aux, val);
5534
+ else if (!bpf_map_key_poisoned(aux) &&
5535
+ bpf_map_key_immediate(aux) != val)
5536
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
5537
+ return 0;
5538
+}
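The effect of this key tracking is easiest to see from the program side: only a constant, in-range index lets the verifier record the key (which later JITs may use to turn the tail call into a direct jump), while a data-dependent index poisons it and keeps the generic path. An illustrative sketch in libbpf style; the map layout, section name and program name are invented for the example:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 4);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

SEC("xdp")
int tail_call_example(struct xdp_md *ctx)
{
        /* Constant index below max_entries: the key can be recorded. */
        bpf_tail_call(ctx, &jmp_table, 1);

        /* Data-dependent index: the key is poisoned and the generic,
         * bounds-checked tail call stays in place.
         */
        bpf_tail_call(ctx, &jmp_table, ctx->ingress_ifindex & 3);

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";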
5539
+
5540
+static int check_reference_leak(struct bpf_verifier_env *env)
5541
+{
5542
+ struct bpf_func_state *state = cur_func(env);
5543
+ int i;
5544
+
5545
+ for (i = 0; i < state->acquired_refs; i++) {
5546
+ verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
5547
+ state->refs[i].id, state->refs[i].insn_idx);
5548
+ }
5549
+ return state->acquired_refs ? -EINVAL : 0;
24985550 }
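A program-side illustration of what check_reference_leak() and release_reference() enforce: every socket returned by an acquiring helper must be released on every path. This is a hedged sketch in libbpf style; the section name, tuple setup and includes are assumptions of the example, not taken from this file:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int sk_ref_example(struct __sk_buff *skb)
{
        struct bpf_sock_tuple tuple = {};
        struct bpf_sock *sk;

        tuple.ipv4.dport = bpf_htons(80);

        sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                               BPF_F_CURRENT_NETNS, 0);
        if (sk)
                bpf_sk_release(sk);     /* dropping this release makes the
                                         * verifier report an unreleased
                                         * reference and reject the program
                                         */
        return 0;
}

char _license[] SEC("license") = "GPL";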
24995551
25005552 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
....@@ -2526,6 +5578,11 @@
25265578 return -EINVAL;
25275579 }
25285580
5581
+ if (fn->allowed && !fn->allowed(env->prog)) {
5582
+ verbose(env, "helper call is not allowed in probe\n");
5583
+ return -EINVAL;
5584
+ }
5585
+
25295586 /* With LD_ABS/IND some JITs save/restore skb from r1. */
25305587 changes_data = bpf_helper_changes_pkt_data(fn->func);
25315588 if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
....@@ -2537,31 +5594,26 @@
25375594 memset(&meta, 0, sizeof(meta));
25385595 meta.pkt_access = fn->pkt_access;
25395596
2540
- err = check_func_proto(fn);
5597
+ err = check_func_proto(fn, func_id);
25415598 if (err) {
25425599 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
25435600 func_id_name(func_id), func_id);
25445601 return err;
25455602 }
25465603
5604
+ meta.func_id = func_id;
25475605 /* check args */
2548
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
2549
- if (err)
2550
- return err;
2551
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
2552
- if (err)
2553
- return err;
2554
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
2555
- if (err)
2556
- return err;
2557
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
2558
- if (err)
2559
- return err;
2560
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
5606
+ for (i = 0; i < 5; i++) {
5607
+ err = check_func_arg(env, i, &meta, fn);
5608
+ if (err)
5609
+ return err;
5610
+ }
5611
+
5612
+ err = record_func_map(env, &meta, func_id, insn_idx);
25615613 if (err)
25625614 return err;
25635615
2564
- err = record_func_map(env, &meta, func_id, insn_idx);
5616
+ err = record_func_key(env, &meta, func_id, insn_idx);
25655617 if (err)
25665618 return err;
25675619
....@@ -2573,6 +5625,21 @@
25735625 BPF_WRITE, -1, false);
25745626 if (err)
25755627 return err;
5628
+ }
5629
+
5630
+ if (func_id == BPF_FUNC_tail_call) {
5631
+ err = check_reference_leak(env);
5632
+ if (err) {
5633
+ verbose(env, "tail_call would lead to reference leak\n");
5634
+ return err;
5635
+ }
5636
+ } else if (is_release_function(func_id)) {
5637
+ err = release_reference(env, meta.ref_obj_id);
5638
+ if (err) {
5639
+ verbose(env, "func %s#%d reference has not been acquired before\n",
5640
+ func_id_name(func_id), func_id);
5641
+ return err;
5642
+ }
25765643 }
25775644
25785645 regs = cur_regs(env);
....@@ -2592,6 +5659,9 @@
25925659 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
25935660 }
25945661
5662
+ /* helper call returns 64-bit value. */
5663
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
5664
+
25955665 /* update return register (already marked as written above) */
25965666 if (fn->ret_type == RET_INTEGER) {
25975667 /* sets type to SCALAR_VALUE */
....@@ -2600,10 +5670,6 @@
26005670 regs[BPF_REG_0].type = NOT_INIT;
26015671 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
26025672 fn->ret_type == RET_PTR_TO_MAP_VALUE) {
2603
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE)
2604
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
2605
- else
2606
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
26075673 /* There is no offset yet applied, variable or fixed */
26085674 mark_reg_known_zero(env, regs, BPF_REG_0);
26095675 /* remember map_ptr, so that check_map_access()
....@@ -2616,22 +5682,99 @@
26165682 return -EINVAL;
26175683 }
26185684 regs[BPF_REG_0].map_ptr = meta.map_ptr;
2619
- regs[BPF_REG_0].id = ++env->id_gen;
5685
+ if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
5686
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
5687
+ if (map_value_has_spin_lock(meta.map_ptr))
5688
+ regs[BPF_REG_0].id = ++env->id_gen;
5689
+ } else {
5690
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
5691
+ }
5692
+ } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
5693
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5694
+ regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
5695
+ } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
5696
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5697
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
5698
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
5699
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5700
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
5701
+ } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
5702
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5703
+ regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
5704
+ regs[BPF_REG_0].mem_size = meta.mem_size;
5705
+ } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
5706
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
5707
+ const struct btf_type *t;
5708
+
5709
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5710
+ t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
5711
+ if (!btf_type_is_struct(t)) {
5712
+ u32 tsize;
5713
+ const struct btf_type *ret;
5714
+ const char *tname;
5715
+
5716
+ /* resolve the type size of ksym. */
5717
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
5718
+ if (IS_ERR(ret)) {
5719
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
5720
+ verbose(env, "unable to resolve the size of type '%s': %ld\n",
5721
+ tname, PTR_ERR(ret));
5722
+ return -EINVAL;
5723
+ }
5724
+ regs[BPF_REG_0].type =
5725
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
5726
+ PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
5727
+ regs[BPF_REG_0].mem_size = tsize;
5728
+ } else {
5729
+ regs[BPF_REG_0].type =
5730
+ fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
5731
+ PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
5732
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
5733
+ }
5734
+ } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
5735
+ int ret_btf_id;
5736
+
5737
+ mark_reg_known_zero(env, regs, BPF_REG_0);
5738
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL;
5739
+ ret_btf_id = *fn->ret_btf_id;
5740
+ if (ret_btf_id == 0) {
5741
+ verbose(env, "invalid return type %d of func %s#%d\n",
5742
+ fn->ret_type, func_id_name(func_id), func_id);
5743
+ return -EINVAL;
5744
+ }
5745
+ regs[BPF_REG_0].btf_id = ret_btf_id;
26205746 } else {
26215747 verbose(env, "unknown return type %d of func %s#%d\n",
26225748 fn->ret_type, func_id_name(func_id), func_id);
26235749 return -EINVAL;
26245750 }
26255751
2626
- err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
2627
- if (err)
2628
- return err;
5752
+ if (reg_type_may_be_null(regs[BPF_REG_0].type))
5753
+ regs[BPF_REG_0].id = ++env->id_gen;
5754
+
5755
+ if (is_ptr_cast_function(func_id)) {
5756
+ /* For release_reference() */
5757
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
5758
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
5759
+ int id = acquire_reference_state(env, insn_idx);
5760
+
5761
+ if (id < 0)
5762
+ return id;
5763
+ /* For mark_ptr_or_null_reg() */
5764
+ regs[BPF_REG_0].id = id;
5765
+ /* For release_reference() */
5766
+ regs[BPF_REG_0].ref_obj_id = id;
5767
+ }
5768
+
5769
+ do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
26295770
26305771 err = check_map_func_compatibility(env, meta.map_ptr, func_id);
26315772 if (err)
26325773 return err;
26335774
2634
- if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
5775
+ if ((func_id == BPF_FUNC_get_stack ||
5776
+ func_id == BPF_FUNC_get_task_stack) &&
5777
+ !env->prog->has_callchain_buf) {
26355778 const char *err_str;
26365779
26375780 #ifdef CONFIG_PERF_EVENTS
....@@ -2649,6 +5792,9 @@
26495792 env->prog->has_callchain_buf = true;
26505793 }
26515794
5795
+ if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
5796
+ env->prog->call_get_stack = true;
5797
+
26525798 if (changes_data)
26535799 clear_all_pkt_pointers(env);
26545800 return 0;
....@@ -2664,10 +5810,30 @@
26645810 return res < a;
26655811 }
26665812
5813
+static bool signed_add32_overflows(s32 a, s32 b)
5814
+{
5815
+ /* Do the add in u32, where overflow is well-defined */
5816
+ s32 res = (s32)((u32)a + (u32)b);
5817
+
5818
+ if (b < 0)
5819
+ return res > a;
5820
+ return res < a;
5821
+}
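The overflow checks above can be sanity-checked by replaying the same idiom in ordinary userspace C: do the add where wrap-around is well-defined (unsigned), then see whether the result moved in the wrong direction. This is only an illustrative sketch, not verifier code; stdint types stand in for the kernel's s32/u32, and converting an out-of-range unsigned value back to a signed type is implementation-defined in ISO C but behaves as two's complement on the compilers the kernel supports:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the verifier idiom used by signed_add_overflows()/
 * signed_add32_overflows().
 */
static int add32_overflows(int32_t a, int32_t b)
{
        int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);

        if (b < 0)
                return res > a; /* adding a negative value must not increase a */
        return res < a;         /* adding a non-negative value must not decrease a */
}

int main(void)
{
        printf("%d\n", add32_overflows(INT32_MAX, 1)); /* 1: overflows */
        printf("%d\n", add32_overflows(-5, 3));        /* 0: fine */
        return 0;
}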
5822
+
26675823 static bool signed_sub_overflows(s64 a, s64 b)
26685824 {
26695825 /* Do the sub in u64, where overflow is well-defined */
26705826 s64 res = (s64)((u64)a - (u64)b);
5827
+
5828
+ if (b < 0)
5829
+ return res < a;
5830
+ return res > a;
5831
+}
5832
+
5833
+static bool signed_sub32_overflows(s32 a, s32 b)
5834
+{
5835
+ /* Do the sub in u32, where overflow is well-defined */
5836
+ s32 res = (s32)((u32)a - (u32)b);
26715837
26725838 if (b < 0)
26735839 return res < a;
....@@ -2756,7 +5922,7 @@
27565922 static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
27575923 const struct bpf_insn *insn)
27585924 {
2759
- return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
5925
+ return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
27605926 }
27615927
27625928 static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
....@@ -2905,7 +6071,7 @@
29056071 */
29066072 if (!ptr_is_dst_reg) {
29076073 tmp = *dst_reg;
2908
- *dst_reg = *ptr_reg;
6074
+ copy_register_state(dst_reg, ptr_reg);
29096075 }
29106076 ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
29116077 env->insn_idx);
....@@ -2924,7 +6090,7 @@
29246090 * rewrite/sanitize them.
29256091 */
29266092 if (!vstate->speculative)
2927
- env->insn_aux_data[env->insn_idx].seen = true;
6093
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
29286094 }
29296095
29306096 static int sanitize_err(struct bpf_verifier_env *env,
....@@ -2966,6 +6132,40 @@
29666132 return -EACCES;
29676133 }
29686134
6135
+/* check that stack access falls within stack limits and that 'reg' doesn't
6136
+ * have a variable offset.
6137
+ *
6138
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
6139
+ * requires corresponding support in Spectre masking for stack ALU. See also
6140
+ * retrieve_ptr_limit().
6141
+ *
6143
+ * 'off' includes 'reg->off'.
6144
+ */
6145
+static int check_stack_access_for_ptr_arithmetic(
6146
+ struct bpf_verifier_env *env,
6147
+ int regno,
6148
+ const struct bpf_reg_state *reg,
6149
+ int off)
6150
+{
6151
+ if (!tnum_is_const(reg->var_off)) {
6152
+ char tn_buf[48];
6153
+
6154
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
6155
+ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
6156
+ regno, tn_buf, off);
6157
+ return -EACCES;
6158
+ }
6159
+
6160
+ if (off >= 0 || off < -MAX_BPF_STACK) {
6161
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
6162
+ "prohibited for !root; off=%d\n", regno, off);
6163
+ return -EACCES;
6164
+ }
6165
+
6166
+ return 0;
6167
+}
6168
+
29696169 static int sanitize_check_bounds(struct bpf_verifier_env *env,
29706170 const struct bpf_insn *insn,
29716171 const struct bpf_reg_state *dst_reg)
....@@ -2975,17 +6175,14 @@
29756175 /* For unprivileged we require that resulting offset must be in bounds
29766176 * in order to be able to sanitize access later on.
29776177 */
2978
- if (env->allow_ptr_leaks)
6178
+ if (env->bypass_spec_v1)
29796179 return 0;
29806180
29816181 switch (dst_reg->type) {
29826182 case PTR_TO_STACK:
2983
- if (check_stack_access(env, dst_reg, dst_reg->off +
2984
- dst_reg->var_off.value, 1)) {
2985
- verbose(env, "R%d stack pointer arithmetic goes out of range, "
2986
- "prohibited for !root\n", dst);
6183
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
6184
+ dst_reg->off + dst_reg->var_off.value))
29876185 return -EACCES;
2988
- }
29896186 break;
29906187 case PTR_TO_MAP_VALUE:
29916188 if (check_map_access(env, dst, dst_reg->off, 1, false)) {
....@@ -3031,32 +6228,46 @@
30316228 /* Taint dst register if offset had invalid bounds derived from
30326229 * e.g. dead branches.
30336230 */
3034
- __mark_reg_unknown(dst_reg);
6231
+ __mark_reg_unknown(env, dst_reg);
30356232 return 0;
30366233 }
30376234
30386235 if (BPF_CLASS(insn->code) != BPF_ALU64) {
30396236 /* 32-bit ALU ops on pointers produce (meaningless) scalars */
6237
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
6238
+ __mark_reg_unknown(env, dst_reg);
6239
+ return 0;
6240
+ }
6241
+
30406242 verbose(env,
30416243 "R%d 32-bit pointer arithmetic prohibited\n",
30426244 dst);
30436245 return -EACCES;
30446246 }
30456247
3046
- if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
3047
- verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
3048
- dst);
6248
+ switch (ptr_reg->type) {
6249
+ case PTR_TO_MAP_VALUE_OR_NULL:
6250
+ verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
6251
+ dst, reg_type_str[ptr_reg->type]);
30496252 return -EACCES;
3050
- }
3051
- if (ptr_reg->type == CONST_PTR_TO_MAP) {
3052
- verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
3053
- dst);
6253
+ case CONST_PTR_TO_MAP:
6254
+ /* smin_val represents the known value */
6255
+ if (known && smin_val == 0 && opcode == BPF_ADD)
6256
+ break;
6257
+ fallthrough;
6258
+ case PTR_TO_PACKET_END:
6259
+ case PTR_TO_SOCKET:
6260
+ case PTR_TO_SOCK_COMMON:
6261
+ case PTR_TO_TCP_SOCK:
6262
+ case PTR_TO_XDP_SOCK:
6263
+reject:
6264
+ verbose(env, "R%d pointer arithmetic on %s prohibited\n",
6265
+ dst, reg_type_str[ptr_reg->type]);
30546266 return -EACCES;
3055
- }
3056
- if (ptr_reg->type == PTR_TO_PACKET_END) {
3057
- verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
3058
- dst);
3059
- return -EACCES;
6267
+ default:
6268
+ if (reg_type_may_be_null(ptr_reg->type))
6269
+ goto reject;
6270
+ break;
30606271 }
30616272
30626273 /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
....@@ -3068,6 +6279,9 @@
30686279 if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
30696280 !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
30706281 return -EINVAL;
6282
+
6283
+ /* pointer types do not carry 32-bit bounds at the moment. */
6284
+ __mark_reg32_unbounded(dst_reg);
30716285
30726286 if (sanitize_needed(opcode)) {
30736287 ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
....@@ -3203,11 +6417,7 @@
32036417
32046418 if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
32056419 return -EINVAL;
3206
-
3207
- __update_reg_bounds(dst_reg);
3208
- __reg_deduce_bounds(dst_reg);
3209
- __reg_bound_offset(dst_reg);
3210
-
6420
+ reg_bounds_sync(dst_reg);
32116421 if (sanitize_check_bounds(env, insn, dst_reg) < 0)
32126422 return -EACCES;
32136423 if (sanitize_needed(opcode)) {
....@@ -3218,6 +6428,573 @@
32186428 }
32196429
32206430 return 0;
6431
+}
6432
+
6433
+static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
6434
+ struct bpf_reg_state *src_reg)
6435
+{
6436
+ s32 smin_val = src_reg->s32_min_value;
6437
+ s32 smax_val = src_reg->s32_max_value;
6438
+ u32 umin_val = src_reg->u32_min_value;
6439
+ u32 umax_val = src_reg->u32_max_value;
6440
+
6441
+ if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
6442
+ signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
6443
+ dst_reg->s32_min_value = S32_MIN;
6444
+ dst_reg->s32_max_value = S32_MAX;
6445
+ } else {
6446
+ dst_reg->s32_min_value += smin_val;
6447
+ dst_reg->s32_max_value += smax_val;
6448
+ }
6449
+ if (dst_reg->u32_min_value + umin_val < umin_val ||
6450
+ dst_reg->u32_max_value + umax_val < umax_val) {
6451
+ dst_reg->u32_min_value = 0;
6452
+ dst_reg->u32_max_value = U32_MAX;
6453
+ } else {
6454
+ dst_reg->u32_min_value += umin_val;
6455
+ dst_reg->u32_max_value += umax_val;
6456
+ }
6457
+}
6458
+
6459
+static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
6460
+ struct bpf_reg_state *src_reg)
6461
+{
6462
+ s64 smin_val = src_reg->smin_value;
6463
+ s64 smax_val = src_reg->smax_value;
6464
+ u64 umin_val = src_reg->umin_value;
6465
+ u64 umax_val = src_reg->umax_value;
6466
+
6467
+ if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
6468
+ signed_add_overflows(dst_reg->smax_value, smax_val)) {
6469
+ dst_reg->smin_value = S64_MIN;
6470
+ dst_reg->smax_value = S64_MAX;
6471
+ } else {
6472
+ dst_reg->smin_value += smin_val;
6473
+ dst_reg->smax_value += smax_val;
6474
+ }
6475
+ if (dst_reg->umin_value + umin_val < umin_val ||
6476
+ dst_reg->umax_value + umax_val < umax_val) {
6477
+ dst_reg->umin_value = 0;
6478
+ dst_reg->umax_value = U64_MAX;
6479
+ } else {
6480
+ dst_reg->umin_value += umin_val;
6481
+ dst_reg->umax_value += umax_val;
6482
+ }
6483
+}
6484
+
6485
+static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
6486
+ struct bpf_reg_state *src_reg)
6487
+{
6488
+ s32 smin_val = src_reg->s32_min_value;
6489
+ s32 smax_val = src_reg->s32_max_value;
6490
+ u32 umin_val = src_reg->u32_min_value;
6491
+ u32 umax_val = src_reg->u32_max_value;
6492
+
6493
+ if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
6494
+ signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
6495
+ /* Overflow possible, we know nothing */
6496
+ dst_reg->s32_min_value = S32_MIN;
6497
+ dst_reg->s32_max_value = S32_MAX;
6498
+ } else {
6499
+ dst_reg->s32_min_value -= smax_val;
6500
+ dst_reg->s32_max_value -= smin_val;
6501
+ }
6502
+ if (dst_reg->u32_min_value < umax_val) {
6503
+ /* Overflow possible, we know nothing */
6504
+ dst_reg->u32_min_value = 0;
6505
+ dst_reg->u32_max_value = U32_MAX;
6506
+ } else {
6507
+ /* Cannot overflow (as long as bounds are consistent) */
6508
+ dst_reg->u32_min_value -= umax_val;
6509
+ dst_reg->u32_max_value -= umin_val;
6510
+ }
6511
+}
6512
+
6513
+static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
6514
+ struct bpf_reg_state *src_reg)
6515
+{
6516
+ s64 smin_val = src_reg->smin_value;
6517
+ s64 smax_val = src_reg->smax_value;
6518
+ u64 umin_val = src_reg->umin_value;
6519
+ u64 umax_val = src_reg->umax_value;
6520
+
6521
+ if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
6522
+ signed_sub_overflows(dst_reg->smax_value, smin_val)) {
6523
+ /* Overflow possible, we know nothing */
6524
+ dst_reg->smin_value = S64_MIN;
6525
+ dst_reg->smax_value = S64_MAX;
6526
+ } else {
6527
+ dst_reg->smin_value -= smax_val;
6528
+ dst_reg->smax_value -= smin_val;
6529
+ }
6530
+ if (dst_reg->umin_value < umax_val) {
6531
+ /* Overflow possible, we know nothing */
6532
+ dst_reg->umin_value = 0;
6533
+ dst_reg->umax_value = U64_MAX;
6534
+ } else {
6535
+ /* Cannot overflow (as long as bounds are consistent) */
6536
+ dst_reg->umin_value -= umax_val;
6537
+ dst_reg->umax_value -= umin_val;
6538
+ }
6539
+}
6540
+
6541
+static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
6542
+ struct bpf_reg_state *src_reg)
6543
+{
6544
+ s32 smin_val = src_reg->s32_min_value;
6545
+ u32 umin_val = src_reg->u32_min_value;
6546
+ u32 umax_val = src_reg->u32_max_value;
6547
+
6548
+ if (smin_val < 0 || dst_reg->s32_min_value < 0) {
6549
+ /* Ain't nobody got time to multiply that sign */
6550
+ __mark_reg32_unbounded(dst_reg);
6551
+ return;
6552
+ }
6553
+ /* Both values are positive, so we can work with unsigned and
6554
+ * copy the result to signed (unless it exceeds S32_MAX).
6555
+ */
6556
+ if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
6557
+ /* Potential overflow, we know nothing */
6558
+ __mark_reg32_unbounded(dst_reg);
6559
+ return;
6560
+ }
6561
+ dst_reg->u32_min_value *= umin_val;
6562
+ dst_reg->u32_max_value *= umax_val;
6563
+ if (dst_reg->u32_max_value > S32_MAX) {
6564
+ /* Overflow possible, we know nothing */
6565
+ dst_reg->s32_min_value = S32_MIN;
6566
+ dst_reg->s32_max_value = S32_MAX;
6567
+ } else {
6568
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6569
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6570
+ }
6571
+}
6572
+
6573
+static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
6574
+ struct bpf_reg_state *src_reg)
6575
+{
6576
+ s64 smin_val = src_reg->smin_value;
6577
+ u64 umin_val = src_reg->umin_value;
6578
+ u64 umax_val = src_reg->umax_value;
6579
+
6580
+ if (smin_val < 0 || dst_reg->smin_value < 0) {
6581
+ /* Ain't nobody got time to multiply that sign */
6582
+ __mark_reg64_unbounded(dst_reg);
6583
+ return;
6584
+ }
6585
+ /* Both values are positive, so we can work with unsigned and
6586
+ * copy the result to signed (unless it exceeds S64_MAX).
6587
+ */
6588
+ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
6589
+ /* Potential overflow, we know nothing */
6590
+ __mark_reg64_unbounded(dst_reg);
6591
+ return;
6592
+ }
6593
+ dst_reg->umin_value *= umin_val;
6594
+ dst_reg->umax_value *= umax_val;
6595
+ if (dst_reg->umax_value > S64_MAX) {
6596
+ /* Overflow possible, we know nothing */
6597
+ dst_reg->smin_value = S64_MIN;
6598
+ dst_reg->smax_value = S64_MAX;
6599
+ } else {
6600
+ dst_reg->smin_value = dst_reg->umin_value;
6601
+ dst_reg->smax_value = dst_reg->umax_value;
6602
+ }
6603
+}
6604
+
6605
+static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
6606
+ struct bpf_reg_state *src_reg)
6607
+{
6608
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6609
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6610
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6611
+ s32 smin_val = src_reg->s32_min_value;
6612
+ u32 umax_val = src_reg->u32_max_value;
6613
+
6614
+ if (src_known && dst_known) {
6615
+ __mark_reg32_known(dst_reg, var32_off.value);
6616
+ return;
6617
+ }
6618
+
6619
+ /* We get our minimum from the var_off, since that's inherently
6620
+ * bitwise. Our maximum is the minimum of the operands' maxima.
6621
+ */
6622
+ dst_reg->u32_min_value = var32_off.value;
6623
+ dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
6624
+ if (dst_reg->s32_min_value < 0 || smin_val < 0) {
6625
+ /* Lose signed bounds when ANDing negative numbers,
6626
+ * ain't nobody got time for that.
6627
+ */
6628
+ dst_reg->s32_min_value = S32_MIN;
6629
+ dst_reg->s32_max_value = S32_MAX;
6630
+ } else {
6631
+ /* ANDing two positives gives a positive, so safe to
6632
+ * cast result into s64.
6633
+ */
6634
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6635
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6636
+ }
6637
+}
6638
+
6639
+static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
6640
+ struct bpf_reg_state *src_reg)
6641
+{
6642
+ bool src_known = tnum_is_const(src_reg->var_off);
6643
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6644
+ s64 smin_val = src_reg->smin_value;
6645
+ u64 umax_val = src_reg->umax_value;
6646
+
6647
+ if (src_known && dst_known) {
6648
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6649
+ return;
6650
+ }
6651
+
6652
+ /* We get our minimum from the var_off, since that's inherently
6653
+ * bitwise. Our maximum is the minimum of the operands' maxima.
6654
+ */
6655
+ dst_reg->umin_value = dst_reg->var_off.value;
6656
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
6657
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
6658
+ /* Lose signed bounds when ANDing negative numbers,
6659
+ * ain't nobody got time for that.
6660
+ */
6661
+ dst_reg->smin_value = S64_MIN;
6662
+ dst_reg->smax_value = S64_MAX;
6663
+ } else {
6664
+ /* ANDing two positives gives a positive, so safe to
6665
+ * cast result into s64.
6666
+ */
6667
+ dst_reg->smin_value = dst_reg->umin_value;
6668
+ dst_reg->smax_value = dst_reg->umax_value;
6669
+ }
6670
+ /* We may learn something more from the var_off */
6671
+ __update_reg_bounds(dst_reg);
6672
+}
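The bound relied on above ("our maximum is the minimum of the operands' maxima") can be checked outside the kernel with a small brute-force loop. This is only an illustrative userspace sketch with made-up limits, not verifier code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t xmax = 200, ymax = 100;
        uint32_t worst = 0;

        /* For x <= xmax and y <= ymax, x & y never exceeds min(xmax, ymax). */
        for (uint32_t x = 0; x <= xmax; x++)
                for (uint32_t y = 0; y <= ymax; y++)
                        if ((x & y) > worst)
                                worst = x & y;

        printf("max(x & y) = %u, bound = %u\n", worst,
               xmax < ymax ? xmax : ymax);
        return 0;
}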
6673
+
6674
+static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
6675
+ struct bpf_reg_state *src_reg)
6676
+{
6677
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6678
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6679
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6680
+ s32 smin_val = src_reg->s32_min_value;
6681
+ u32 umin_val = src_reg->u32_min_value;
6682
+
6683
+ if (src_known && dst_known) {
6684
+ __mark_reg32_known(dst_reg, var32_off.value);
6685
+ return;
6686
+ }
6687
+
6688
+ /* We get our maximum from the var_off, and our minimum is the
6689
+ * maximum of the operands' minima
6690
+ */
6691
+ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
6692
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
6693
+ if (dst_reg->s32_min_value < 0 || smin_val < 0) {
6694
+ /* Lose signed bounds when ORing negative numbers,
6695
+ * ain't nobody got time for that.
6696
+ */
6697
+ dst_reg->s32_min_value = S32_MIN;
6698
+ dst_reg->s32_max_value = S32_MAX;
6699
+ } else {
6700
+ /* ORing two positives gives a positive, so safe to
6701
+ * cast result into s64.
6702
+ */
6703
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6704
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6705
+ }
6706
+}
6707
+
6708
+static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
6709
+ struct bpf_reg_state *src_reg)
6710
+{
6711
+ bool src_known = tnum_is_const(src_reg->var_off);
6712
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6713
+ s64 smin_val = src_reg->smin_value;
6714
+ u64 umin_val = src_reg->umin_value;
6715
+
6716
+ if (src_known && dst_known) {
6717
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6718
+ return;
6719
+ }
6720
+
6721
+ /* We get our maximum from the var_off, and our minimum is the
6722
+ * maximum of the operands' minima
6723
+ */
6724
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
6725
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
6726
+ if (dst_reg->smin_value < 0 || smin_val < 0) {
6727
+ /* Lose signed bounds when ORing negative numbers,
6728
+ * ain't nobody got time for that.
6729
+ */
6730
+ dst_reg->smin_value = S64_MIN;
6731
+ dst_reg->smax_value = S64_MAX;
6732
+ } else {
6733
+ /* ORing two positives gives a positive, so safe to
6734
+ * cast result into s64.
6735
+ */
6736
+ dst_reg->smin_value = dst_reg->umin_value;
6737
+ dst_reg->smax_value = dst_reg->umax_value;
6738
+ }
6739
+ /* We may learn something more from the var_off */
6740
+ __update_reg_bounds(dst_reg);
6741
+}
6742
+
6743
+static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
6744
+ struct bpf_reg_state *src_reg)
6745
+{
6746
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
6747
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
6748
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
6749
+ s32 smin_val = src_reg->s32_min_value;
6750
+
6751
+ if (src_known && dst_known) {
6752
+ __mark_reg32_known(dst_reg, var32_off.value);
6753
+ return;
6754
+ }
6755
+
6756
+ /* We get both minimum and maximum from the var32_off. */
6757
+ dst_reg->u32_min_value = var32_off.value;
6758
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
6759
+
6760
+ if (dst_reg->s32_min_value >= 0 && smin_val >= 0) {
6761
+ /* XORing two positive sign numbers gives a positive,
6762
+ * so safe to cast u32 result into s32.
6763
+ */
6764
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
6765
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
6766
+ } else {
6767
+ dst_reg->s32_min_value = S32_MIN;
6768
+ dst_reg->s32_max_value = S32_MAX;
6769
+ }
6770
+}
6771
+
6772
+static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
6773
+ struct bpf_reg_state *src_reg)
6774
+{
6775
+ bool src_known = tnum_is_const(src_reg->var_off);
6776
+ bool dst_known = tnum_is_const(dst_reg->var_off);
6777
+ s64 smin_val = src_reg->smin_value;
6778
+
6779
+ if (src_known && dst_known) {
6780
+ /* dst_reg->var_off.value has been updated earlier */
6781
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
6782
+ return;
6783
+ }
6784
+
6785
+ /* We get both minimum and maximum from the var_off. */
6786
+ dst_reg->umin_value = dst_reg->var_off.value;
6787
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
6788
+
6789
+ if (dst_reg->smin_value >= 0 && smin_val >= 0) {
6790
+ /* XORing two positive sign numbers gives a positive,
6791
+ * so safe to cast u64 result into s64.
6792
+ */
6793
+ dst_reg->smin_value = dst_reg->umin_value;
6794
+ dst_reg->smax_value = dst_reg->umax_value;
6795
+ } else {
6796
+ dst_reg->smin_value = S64_MIN;
6797
+ dst_reg->smax_value = S64_MAX;
6798
+ }
6799
+
6800
+ __update_reg_bounds(dst_reg);
6801
+}
6802
+
6803
+static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
6804
+ u64 umin_val, u64 umax_val)
6805
+{
6806
+ /* We lose all sign bit information (except what we can pick
6807
+ * up from var_off)
6808
+ */
6809
+ dst_reg->s32_min_value = S32_MIN;
6810
+ dst_reg->s32_max_value = S32_MAX;
6811
+ /* If we might shift our top bit out, then we know nothing */
6812
+ if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
6813
+ dst_reg->u32_min_value = 0;
6814
+ dst_reg->u32_max_value = U32_MAX;
6815
+ } else {
6816
+ dst_reg->u32_min_value <<= umin_val;
6817
+ dst_reg->u32_max_value <<= umax_val;
6818
+ }
6819
+}
6820
+
6821
+static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
6822
+ struct bpf_reg_state *src_reg)
6823
+{
6824
+ u32 umax_val = src_reg->u32_max_value;
6825
+ u32 umin_val = src_reg->u32_min_value;
6826
+ /* u32 alu operation will zext upper bits */
6827
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
6828
+
6829
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
6830
+ dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
6831
+ /* Not required, but to be careful, mark reg64 bounds as unknown so
6832
+ * that we are forced to pick them up from tnum and zext later and
6833
+ * if some path skips this step we are still safe.
6834
+ */
6835
+ __mark_reg64_unbounded(dst_reg);
6836
+ __update_reg32_bounds(dst_reg);
6837
+}
6838
+
6839
+static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
6840
+ u64 umin_val, u64 umax_val)
6841
+{
6842
+ /* Special case <<32 because it is a common compiler pattern to sign
6843
+ * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
6844
+ * positive we know this shift will also be positive so we can track
6845
+ * bounds correctly. Otherwise we lose all sign bit information except
6846
+ * what we can pick up from var_off. Perhaps we can generalize this
6847
+ * later to shifts of any length.
6848
+ */
6849
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
6850
+ dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
6851
+ else
6852
+ dst_reg->smax_value = S64_MAX;
6853
+
6854
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
6855
+ dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
6856
+ else
6857
+ dst_reg->smin_value = S64_MIN;
6858
+
6859
+ /* If we might shift our top bit out, then we know nothing */
6860
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
6861
+ dst_reg->umin_value = 0;
6862
+ dst_reg->umax_value = U64_MAX;
6863
+ } else {
6864
+ dst_reg->umin_value <<= umin_val;
6865
+ dst_reg->umax_value <<= umax_val;
6866
+ }
6867
+}
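The "<<32 then s>>32" pattern that the special case above targets is the standard way compilers sign-extend a 32-bit subregister into 64 bits. A userspace sketch of the same arithmetic (arithmetic right shift of a negative signed value is implementation-defined in ISO C but is what the kernel's supported compilers produce):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t reg = 0x00000000fffffff6ULL;   /* lower 32 bits hold -10 */

        /* BPF_LSH by 32 followed by BPF_ARSH by 32 sign-extends the subreg. */
        int64_t extended = (int64_t)(reg << 32) >> 32;

        printf("%lld\n", (long long)extended);  /* prints -10 */
        return 0;
}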
6868
+
6869
+static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
6870
+ struct bpf_reg_state *src_reg)
6871
+{
6872
+ u64 umax_val = src_reg->umax_value;
6873
+ u64 umin_val = src_reg->umin_value;
6874
+
6875
+ /* scalar64 calc uses 32bit unshifted bounds so must be called first */
6876
+ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
6877
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
6878
+
6879
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
6880
+ /* We may learn something more from the var_off */
6881
+ __update_reg_bounds(dst_reg);
6882
+}
6883
+
6884
+static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
6885
+ struct bpf_reg_state *src_reg)
6886
+{
6887
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
6888
+ u32 umax_val = src_reg->u32_max_value;
6889
+ u32 umin_val = src_reg->u32_min_value;
6890
+
6891
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
6892
+ * be negative, then either:
6893
+ * 1) src_reg might be zero, so the sign bit of the result is
6894
+ * unknown, so we lose our signed bounds
6895
+ * 2) it's known negative, thus the unsigned bounds capture the
6896
+ * signed bounds
6897
+ * 3) the signed bounds cross zero, so they tell us nothing
6898
+ * about the result
6899
+ * If the value in dst_reg is known nonnegative, then again the
6900
+ * unsigned bounds capture the signed bounds.
6901
+ * Thus, in all cases it suffices to blow away our signed bounds
6902
+ * and rely on inferring new ones from the unsigned bounds and
6903
+ * var_off of the result.
6904
+ */
6905
+ dst_reg->s32_min_value = S32_MIN;
6906
+ dst_reg->s32_max_value = S32_MAX;
6907
+
6908
+ dst_reg->var_off = tnum_rshift(subreg, umin_val);
6909
+ dst_reg->u32_min_value >>= umax_val;
6910
+ dst_reg->u32_max_value >>= umin_val;
6911
+
6912
+ __mark_reg64_unbounded(dst_reg);
6913
+ __update_reg32_bounds(dst_reg);
6914
+}
6915
+
6916
+static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
6917
+ struct bpf_reg_state *src_reg)
6918
+{
6919
+ u64 umax_val = src_reg->umax_value;
6920
+ u64 umin_val = src_reg->umin_value;
6921
+
6922
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
6923
+ * be negative, then either:
6924
+ * 1) src_reg might be zero, so the sign bit of the result is
6925
+ * unknown, so we lose our signed bounds
6926
+ * 2) it's known negative, thus the unsigned bounds capture the
6927
+ * signed bounds
6928
+ * 3) the signed bounds cross zero, so they tell us nothing
6929
+ * about the result
6930
+ * If the value in dst_reg is known nonnegative, then again the
6931
+ * unsigned bounds capture the signed bounds.
6932
+ * Thus, in all cases it suffices to blow away our signed bounds
6933
+ * and rely on inferring new ones from the unsigned bounds and
6934
+ * var_off of the result.
6935
+ */
6936
+ dst_reg->smin_value = S64_MIN;
6937
+ dst_reg->smax_value = S64_MAX;
6938
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
6939
+ dst_reg->umin_value >>= umax_val;
6940
+ dst_reg->umax_value >>= umin_val;
6941
+
6942
+ /* It's not easy to operate on alu32 bounds here because it depends
6943
+ * on bits being shifted in. Take easy way out and mark unbounded
6944
+ * so we can recalculate later from tnum.
6945
+ */
6946
+ __mark_reg32_unbounded(dst_reg);
6947
+ __update_reg_bounds(dst_reg);
6948
+}
6949
+
6950
+static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
6951
+ struct bpf_reg_state *src_reg)
6952
+{
6953
+ u64 umin_val = src_reg->u32_min_value;
6954
+
6955
+ /* Upon reaching here, src_known is true and
6956
+ * umax_val is equal to umin_val.
6957
+ */
6958
+ dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
6959
+ dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
6960
+
6961
+ dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
6962
+
6963
+ /* blow away the dst_reg umin_value/umax_value and rely on
6964
+ * dst_reg var_off to refine the result.
6965
+ */
6966
+ dst_reg->u32_min_value = 0;
6967
+ dst_reg->u32_max_value = U32_MAX;
6968
+
6969
+ __mark_reg64_unbounded(dst_reg);
6970
+ __update_reg32_bounds(dst_reg);
6971
+}
6972
+
6973
+static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
6974
+ struct bpf_reg_state *src_reg)
6975
+{
6976
+ u64 umin_val = src_reg->umin_value;
6977
+
6978
+ /* Upon reaching here, src_known is true and umax_val is equal
6979
+ * to umin_val.
6980
+ */
6981
+ dst_reg->smin_value >>= umin_val;
6982
+ dst_reg->smax_value >>= umin_val;
6983
+
6984
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
6985
+
6986
+ /* blow away the dst_reg umin_value/umax_value and rely on
6987
+ * dst_reg var_off to refine the result.
6988
+ */
6989
+ dst_reg->umin_value = 0;
6990
+ dst_reg->umax_value = U64_MAX;
6991
+
6992
+ /* It's not easy to operate on alu32 bounds here because it depends
6993
+ * on bits being shifted in from upper 32-bits. Take easy way out
6994
+ * and mark unbounded so we can recalculate later from tnum.
6995
+ */
6996
+ __mark_reg32_unbounded(dst_reg);
6997
+ __update_reg_bounds(dst_reg);
32216998 }
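The distinction the RSH and ARSH helpers above preserve is simply logical versus arithmetic right shift. A userspace sketch (again relying on the usual arithmetic-shift behaviour of the kernel's compilers for signed values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t  s = -16;
        uint64_t u = (uint64_t)s;

        /* BPF_ARSH keeps the sign, so signed min/max can simply be shifted;
         * BPF_RSH shifts in zero bits, so signed bounds are discarded.
         */
        printf("arsh: %lld\n", (long long)(s >> 2));            /* -4 */
        printf("rsh:  0x%llx\n", (unsigned long long)(u >> 2)); /* 0x3ffffffffffffffc */
        return 0;
}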
32226999
32237000 /* WARNING: This function does calculations on 64-bit values, but the actual
....@@ -3231,40 +7008,52 @@
32317008 {
32327009 struct bpf_reg_state *regs = cur_regs(env);
32337010 u8 opcode = BPF_OP(insn->code);
3234
- bool src_known, dst_known;
7011
+ bool src_known;
32357012 s64 smin_val, smax_val;
32367013 u64 umin_val, umax_val;
7014
+ s32 s32_min_val, s32_max_val;
7015
+ u32 u32_min_val, u32_max_val;
32377016 u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
7017
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
32387018 int ret;
3239
-
3240
- if (insn_bitness == 32) {
3241
- /* Relevant for 32-bit RSH: Information can propagate towards
3242
- * LSB, so it isn't sufficient to only truncate the output to
3243
- * 32 bits.
3244
- */
3245
- coerce_reg_to_size(dst_reg, 4);
3246
- coerce_reg_to_size(&src_reg, 4);
3247
- }
32487019
32497020 smin_val = src_reg.smin_value;
32507021 smax_val = src_reg.smax_value;
32517022 umin_val = src_reg.umin_value;
32527023 umax_val = src_reg.umax_value;
3253
- src_known = tnum_is_const(src_reg.var_off);
3254
- dst_known = tnum_is_const(dst_reg->var_off);
32557024
3256
- if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
3257
- smin_val > smax_val || umin_val > umax_val) {
3258
- /* Taint dst register if offset had invalid bounds derived from
3259
- * e.g. dead branches.
3260
- */
3261
- __mark_reg_unknown(dst_reg);
3262
- return 0;
7025
+ s32_min_val = src_reg.s32_min_value;
7026
+ s32_max_val = src_reg.s32_max_value;
7027
+ u32_min_val = src_reg.u32_min_value;
7028
+ u32_max_val = src_reg.u32_max_value;
7029
+
7030
+ if (alu32) {
7031
+ src_known = tnum_subreg_is_const(src_reg.var_off);
7032
+ if ((src_known &&
7033
+ (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) ||
7034
+ s32_min_val > s32_max_val || u32_min_val > u32_max_val) {
7035
+ /* Taint dst register if offset had invalid bounds
7036
+ * derived from e.g. dead branches.
7037
+ */
7038
+ __mark_reg_unknown(env, dst_reg);
7039
+ return 0;
7040
+ }
7041
+ } else {
7042
+ src_known = tnum_is_const(src_reg.var_off);
7043
+ if ((src_known &&
7044
+ (smin_val != smax_val || umin_val != umax_val)) ||
7045
+ smin_val > smax_val || umin_val > umax_val) {
7046
+ /* Taint dst register if offset had invalid bounds
7047
+ * derived from e.g. dead branches.
7048
+ */
7049
+ __mark_reg_unknown(env, dst_reg);
7050
+ return 0;
7051
+ }
32637052 }
32647053
32657054 if (!src_known &&
32667055 opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
3267
- __mark_reg_unknown(dst_reg);
7056
+ __mark_reg_unknown(env, dst_reg);
32687057 return 0;
32697058 }
32707059
....@@ -3274,132 +7063,50 @@
32747063 return sanitize_err(env, insn, ret, NULL, NULL);
32757064 }
32767065
7066
+ /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
7067
+ * There are two classes of instructions: The first class we track both
7068
+ * alu32 and alu64 sign/unsigned bounds independently; this provides the
7069
+ * greatest amount of precision when alu operations are mixed with jmp32
7070
+ * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_AND,
7071
+ * and BPF_OR. This is possible because these ops have fairly easy to
7072
+ * understand and calculate behavior in both 32-bit and 64-bit alu ops.
7073
+ * See alu32 verifier tests for examples. The second class of
7074
+ * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however, is not so easy
7075
+ * with regards to tracking sign/unsigned bounds because the bits may
7076
+ * cross subreg boundaries in the alu64 case. When this happens we mark
7077
+ * the reg unbounded in the subreg bound space and use the resulting
7078
+ * tnum to calculate an approximation of the sign/unsigned bounds.
7079
+ */
32777080 switch (opcode) {
32787081 case BPF_ADD:
3279
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
3280
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
3281
- dst_reg->smin_value = S64_MIN;
3282
- dst_reg->smax_value = S64_MAX;
3283
- } else {
3284
- dst_reg->smin_value += smin_val;
3285
- dst_reg->smax_value += smax_val;
3286
- }
3287
- if (dst_reg->umin_value + umin_val < umin_val ||
3288
- dst_reg->umax_value + umax_val < umax_val) {
3289
- dst_reg->umin_value = 0;
3290
- dst_reg->umax_value = U64_MAX;
3291
- } else {
3292
- dst_reg->umin_value += umin_val;
3293
- dst_reg->umax_value += umax_val;
3294
- }
7082
+ scalar32_min_max_add(dst_reg, &src_reg);
7083
+ scalar_min_max_add(dst_reg, &src_reg);
32957084 dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
32967085 break;
32977086 case BPF_SUB:
3298
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
3299
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
3300
- /* Overflow possible, we know nothing */
3301
- dst_reg->smin_value = S64_MIN;
3302
- dst_reg->smax_value = S64_MAX;
3303
- } else {
3304
- dst_reg->smin_value -= smax_val;
3305
- dst_reg->smax_value -= smin_val;
3306
- }
3307
- if (dst_reg->umin_value < umax_val) {
3308
- /* Overflow possible, we know nothing */
3309
- dst_reg->umin_value = 0;
3310
- dst_reg->umax_value = U64_MAX;
3311
- } else {
3312
- /* Cannot overflow (as long as bounds are consistent) */
3313
- dst_reg->umin_value -= umax_val;
3314
- dst_reg->umax_value -= umin_val;
3315
- }
7087
+ scalar32_min_max_sub(dst_reg, &src_reg);
7088
+ scalar_min_max_sub(dst_reg, &src_reg);
33167089 dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
33177090 break;
33187091 case BPF_MUL:
33197092 dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
3320
- if (smin_val < 0 || dst_reg->smin_value < 0) {
3321
- /* Ain't nobody got time to multiply that sign */
3322
- __mark_reg_unbounded(dst_reg);
3323
- __update_reg_bounds(dst_reg);
3324
- break;
3325
- }
3326
- /* Both values are positive, so we can work with unsigned and
3327
- * copy the result to signed (unless it exceeds S64_MAX).
3328
- */
3329
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
3330
- /* Potential overflow, we know nothing */
3331
- __mark_reg_unbounded(dst_reg);
3332
- /* (except what we can learn from the var_off) */
3333
- __update_reg_bounds(dst_reg);
3334
- break;
3335
- }
3336
- dst_reg->umin_value *= umin_val;
3337
- dst_reg->umax_value *= umax_val;
3338
- if (dst_reg->umax_value > S64_MAX) {
3339
- /* Overflow possible, we know nothing */
3340
- dst_reg->smin_value = S64_MIN;
3341
- dst_reg->smax_value = S64_MAX;
3342
- } else {
3343
- dst_reg->smin_value = dst_reg->umin_value;
3344
- dst_reg->smax_value = dst_reg->umax_value;
3345
- }
7093
+ scalar32_min_max_mul(dst_reg, &src_reg);
7094
+ scalar_min_max_mul(dst_reg, &src_reg);
33467095 break;
33477096 case BPF_AND:
3348
- if (src_known && dst_known) {
3349
- __mark_reg_known(dst_reg, dst_reg->var_off.value &
3350
- src_reg.var_off.value);
3351
- break;
3352
- }
3353
- /* We get our minimum from the var_off, since that's inherently
3354
- * bitwise. Our maximum is the minimum of the operands' maxima.
3355
- */
33567097 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
3357
- dst_reg->umin_value = dst_reg->var_off.value;
3358
- dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
3359
- if (dst_reg->smin_value < 0 || smin_val < 0) {
3360
- /* Lose signed bounds when ANDing negative numbers,
3361
- * ain't nobody got time for that.
3362
- */
3363
- dst_reg->smin_value = S64_MIN;
3364
- dst_reg->smax_value = S64_MAX;
3365
- } else {
3366
- /* ANDing two positives gives a positive, so safe to
3367
- * cast result into s64.
3368
- */
3369
- dst_reg->smin_value = dst_reg->umin_value;
3370
- dst_reg->smax_value = dst_reg->umax_value;
3371
- }
3372
- /* We may learn something more from the var_off */
3373
- __update_reg_bounds(dst_reg);
7098
+ scalar32_min_max_and(dst_reg, &src_reg);
7099
+ scalar_min_max_and(dst_reg, &src_reg);
33747100 break;
33757101 case BPF_OR:
3376
- if (src_known && dst_known) {
3377
- __mark_reg_known(dst_reg, dst_reg->var_off.value |
3378
- src_reg.var_off.value);
3379
- break;
3380
- }
3381
- /* We get our maximum from the var_off, and our minimum is the
3382
- * maximum of the operands' minima
3383
- */
33847102 dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
3385
- dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
3386
- dst_reg->umax_value = dst_reg->var_off.value |
3387
- dst_reg->var_off.mask;
3388
- if (dst_reg->smin_value < 0 || smin_val < 0) {
3389
- /* Lose signed bounds when ORing negative numbers,
3390
- * ain't nobody got time for that.
3391
- */
3392
- dst_reg->smin_value = S64_MIN;
3393
- dst_reg->smax_value = S64_MAX;
3394
- } else {
3395
- /* ORing two positives gives a positive, so safe to
3396
- * cast result into s64.
3397
- */
3398
- dst_reg->smin_value = dst_reg->umin_value;
3399
- dst_reg->smax_value = dst_reg->umax_value;
3400
- }
3401
- /* We may learn something more from the var_off */
3402
- __update_reg_bounds(dst_reg);
7103
+ scalar32_min_max_or(dst_reg, &src_reg);
7104
+ scalar_min_max_or(dst_reg, &src_reg);
7105
+ break;
7106
+ case BPF_XOR:
7107
+ dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
7108
+ scalar32_min_max_xor(dst_reg, &src_reg);
7109
+ scalar_min_max_xor(dst_reg, &src_reg);
34037110 break;
34047111 case BPF_LSH:
34057112 if (umax_val >= insn_bitness) {
....@@ -3409,22 +7116,10 @@
34097116 mark_reg_unknown(env, regs, insn->dst_reg);
34107117 break;
34117118 }
3412
- /* We lose all sign bit information (except what we can pick
3413
- * up from var_off)
3414
- */
3415
- dst_reg->smin_value = S64_MIN;
3416
- dst_reg->smax_value = S64_MAX;
3417
- /* If we might shift our top bit out, then we know nothing */
3418
- if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
3419
- dst_reg->umin_value = 0;
3420
- dst_reg->umax_value = U64_MAX;
3421
- } else {
3422
- dst_reg->umin_value <<= umin_val;
3423
- dst_reg->umax_value <<= umax_val;
3424
- }
3425
- dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
3426
- /* We may learn something more from the var_off */
3427
- __update_reg_bounds(dst_reg);
7119
+ if (alu32)
7120
+ scalar32_min_max_lsh(dst_reg, &src_reg);
7121
+ else
7122
+ scalar_min_max_lsh(dst_reg, &src_reg);
34287123 break;
34297124 case BPF_RSH:
34307125 if (umax_val >= insn_bitness) {
....@@ -3434,27 +7129,10 @@
34347129 mark_reg_unknown(env, regs, insn->dst_reg);
34357130 break;
34367131 }
3437
- /* BPF_RSH is an unsigned shift. If the value in dst_reg might
3438
- * be negative, then either:
3439
- * 1) src_reg might be zero, so the sign bit of the result is
3440
- * unknown, so we lose our signed bounds
3441
- * 2) it's known negative, thus the unsigned bounds capture the
3442
- * signed bounds
3443
- * 3) the signed bounds cross zero, so they tell us nothing
3444
- * about the result
3445
- * If the value in dst_reg is known nonnegative, then again the
3446
-		 * unsigned bounds capture the signed bounds.
3447
- * Thus, in all cases it suffices to blow away our signed bounds
3448
- * and rely on inferring new ones from the unsigned bounds and
3449
- * var_off of the result.
3450
- */
3451
- dst_reg->smin_value = S64_MIN;
3452
- dst_reg->smax_value = S64_MAX;
3453
- dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
3454
- dst_reg->umin_value >>= umax_val;
3455
- dst_reg->umax_value >>= umin_val;
3456
- /* We may learn something more from the var_off */
3457
- __update_reg_bounds(dst_reg);
7132
+ if (alu32)
7133
+ scalar32_min_max_rsh(dst_reg, &src_reg);
7134
+ else
7135
+ scalar_min_max_rsh(dst_reg, &src_reg);
34587136 break;
34597137 case BPF_ARSH:
34607138 if (umax_val >= insn_bitness) {
....@@ -3464,40 +7142,20 @@
34647142 mark_reg_unknown(env, regs, insn->dst_reg);
34657143 break;
34667144 }
3467
-
3468
- /* Upon reaching here, src_known is true and
3469
- * umax_val is equal to umin_val.
3470
- */
3471
- if (insn_bitness == 32) {
3472
- dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val);
3473
- dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val);
3474
- } else {
3475
- dst_reg->smin_value >>= umin_val;
3476
- dst_reg->smax_value >>= umin_val;
3477
- }
3478
-
3479
- dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val,
3480
- insn_bitness);
3481
-
3482
- /* blow away the dst_reg umin_value/umax_value and rely on
3483
- * dst_reg var_off to refine the result.
3484
- */
3485
- dst_reg->umin_value = 0;
3486
- dst_reg->umax_value = U64_MAX;
3487
- __update_reg_bounds(dst_reg);
7145
+ if (alu32)
7146
+ scalar32_min_max_arsh(dst_reg, &src_reg);
7147
+ else
7148
+ scalar_min_max_arsh(dst_reg, &src_reg);
34887149 break;
34897150 default:
34907151 mark_reg_unknown(env, regs, insn->dst_reg);
34917152 break;
34927153 }
34937154
3494
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
3495
- /* 32-bit ALU ops are (32,32)->32 */
3496
- coerce_reg_to_size(dst_reg, 4);
3497
- }
3498
-
3499
- __reg_deduce_bounds(dst_reg);
3500
- __reg_bound_offset(dst_reg);
7155
+ /* ALU32 ops are zero extended into 64bit register */
7156
+ if (alu32)
7157
+ zext_32_to_64(dst_reg);
7158
+ reg_bounds_sync(dst_reg);
35017159 return 0;
35027160 }
35037161
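
The open-coded BPF_AND/BPF_OR bounds math removed above is replaced by the scalar{32,}_min_max_*() helpers, which all lean on the tnum (tracked number) encoding of known and unknown bits. The standalone sketch below is a simplified model of that encoding: the struct and helper names are local stand-ins, the and/or combination rules are intended to mirror kernel/bpf/tnum.c, and the derived [umin, umax] range follows the same value / value|mask reasoning the verifier uses.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of struct tnum: value holds bits known to be 1,
 * mask holds bits whose value is unknown.
 */
struct tn { uint64_t value, mask; };

static struct tn tn_and(struct tn a, struct tn b)
{
	uint64_t alpha = a.value | a.mask;	/* bits that may be 1 in a */
	uint64_t beta  = b.value | b.mask;	/* bits that may be 1 in b */
	uint64_t v     = a.value & b.value;	/* bits known 1 in both */

	return (struct tn){ v, alpha & beta & ~v };
}

static struct tn tn_or(struct tn a, struct tn b)
{
	uint64_t v  = a.value | b.value;	/* known 1 in either */
	uint64_t mu = a.mask | b.mask;		/* unknown in either */

	return (struct tn){ v, mu & ~v };
}

int main(void)
{
	/* r1 has an unknown low nibble, r2 is the constant 0x0c */
	struct tn r1 = { 0x0, 0xf }, r2 = { 0xc, 0x0 };
	struct tn res_and = tn_and(r1, r2), res_or = tn_or(r1, r2);

	/* umin comes from the known bits, umax from known | unknown */
	printf("AND -> value %#llx mask %#llx, range [%llu, %llu]\n",
	       (unsigned long long)res_and.value,
	       (unsigned long long)res_and.mask,
	       (unsigned long long)res_and.value,
	       (unsigned long long)(res_and.value | res_and.mask));
	printf("OR  -> value %#llx mask %#llx, range [%llu, %llu]\n",
	       (unsigned long long)res_or.value,
	       (unsigned long long)res_or.mask,
	       (unsigned long long)res_or.value,
	       (unsigned long long)(res_or.value | res_or.mask));
	return 0;
}
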
....@@ -3512,11 +7170,17 @@
35127170 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
35137171 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
35147172 u8 opcode = BPF_OP(insn->code);
7173
+ int err;
35157174
35167175 dst_reg = &regs[insn->dst_reg];
35177176 src_reg = NULL;
35187177 if (dst_reg->type != SCALAR_VALUE)
35197178 ptr_reg = dst_reg;
7179
+ else
7180
+ /* Make sure ID is cleared otherwise dst_reg min/max could be
7181
+ * incorrectly propagated into other registers by find_equal_scalars()
7182
+ */
7183
+ dst_reg->id = 0;
35207184 if (BPF_SRC(insn->code) == BPF_X) {
35217185 src_reg = &regs[insn->src_reg];
35227186 if (src_reg->type != SCALAR_VALUE) {
....@@ -3538,13 +7202,24 @@
35387202 * This is legal, but we have to reverse our
35397203 * src/dest handling in computing the range
35407204 */
7205
+ err = mark_chain_precision(env, insn->dst_reg);
7206
+ if (err)
7207
+ return err;
35417208 return adjust_ptr_min_max_vals(env, insn,
35427209 src_reg, dst_reg);
35437210 }
35447211 } else if (ptr_reg) {
35457212 /* pointer += scalar */
7213
+ err = mark_chain_precision(env, insn->src_reg);
7214
+ if (err)
7215
+ return err;
35467216 return adjust_ptr_min_max_vals(env, insn,
35477217 dst_reg, src_reg);
7218
+ } else if (dst_reg->precise) {
7219
+ /* if dst_reg is precise, src_reg should be precise as well */
7220
+ err = mark_chain_precision(env, insn->src_reg);
7221
+ if (err)
7222
+ return err;
35487223 }
35497224 } else {
35507225 /* Pretend the src is a reg with a known value, since we only
....@@ -3644,8 +7319,15 @@
36447319 /* case: R1 = R2
36457320 * copy register state to dest reg
36467321 */
3647
- *dst_reg = *src_reg;
7322
+ if (src_reg->type == SCALAR_VALUE && !src_reg->id)
7323
+ /* Assign src and dst registers the same ID
7324
+ * that will be used by find_equal_scalars()
7325
+ * to propagate min/max range.
7326
+ */
7327
+ src_reg->id = ++env->id_gen;
7328
+ copy_register_state(dst_reg, src_reg);
36487329 dst_reg->live |= REG_LIVE_WRITTEN;
7330
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
36497331 } else {
36507332 /* R1 = (u32) R2 */
36517333 if (is_pointer_value(env, insn->src_reg)) {
....@@ -3654,13 +7336,20 @@
36547336 insn->src_reg);
36557337 return -EACCES;
36567338 } else if (src_reg->type == SCALAR_VALUE) {
3657
- *dst_reg = *src_reg;
7339
+ copy_register_state(dst_reg, src_reg);
7340
+ /* Make sure ID is cleared otherwise
7341
+ * dst_reg min/max could be incorrectly
7342
+ * propagated into src_reg by find_equal_scalars()
7343
+ */
7344
+ dst_reg->id = 0;
36587345 dst_reg->live |= REG_LIVE_WRITTEN;
7346
+ dst_reg->subreg_def = env->insn_idx + 1;
36597347 } else {
36607348 mark_reg_unknown(env, regs,
36617349 insn->dst_reg);
36627350 }
3663
- coerce_reg_to_size(dst_reg, 4);
7351
+ zext_32_to_64(dst_reg);
7352
+ reg_bounds_sync(dst_reg);
36647353 }
36657354 } else {
36667355 /* case: R = imm
....@@ -3711,11 +7400,6 @@
37117400 return -EINVAL;
37127401 }
37137402
3714
- if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
3715
- verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
3716
- return -EINVAL;
3717
- }
3718
-
37197403 if ((opcode == BPF_LSH || opcode == BPF_RSH ||
37207404 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
37217405 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
....@@ -3742,10 +7426,9 @@
37427426 enum bpf_reg_type type,
37437427 bool range_right_open)
37447428 {
3745
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
3746
- struct bpf_reg_state *regs = state->regs, *reg;
3747
- u16 new_range;
3748
- int i, j;
7429
+ struct bpf_func_state *state;
7430
+ struct bpf_reg_state *reg;
7431
+ int new_range;
37497432
37507433 if (dst_reg->off < 0 ||
37517434 (dst_reg->off == 0 && range_right_open))
....@@ -3810,33 +7493,90 @@
38107493 * the range won't allow anything.
38117494 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
38127495 */
3813
- for (i = 0; i < MAX_BPF_REG; i++)
3814
- if (regs[i].type == type && regs[i].id == dst_reg->id)
7496
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
7497
+ if (reg->type == type && reg->id == dst_reg->id)
38157498 /* keep the maximum range already checked */
3816
- regs[i].range = max(regs[i].range, new_range);
3817
-
3818
- for (j = 0; j <= vstate->curframe; j++) {
3819
- state = vstate->frame[j];
3820
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
3821
- if (state->stack[i].slot_type[0] != STACK_SPILL)
3822
- continue;
3823
- reg = &state->stack[i].spilled_ptr;
3824
- if (reg->type == type && reg->id == dst_reg->id)
3825
- reg->range = max(reg->range, new_range);
3826
- }
3827
- }
7499
+ reg->range = max(reg->range, new_range);
7500
+ }));
38287501 }
38297502
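
find_good_pkt_pointers() is what rewards the usual packet bounds check: once one copy of a packet pointer has been compared against pkt_end, every register and spilled slot sharing the same id gets its ->range widened. The XDP snippet below shows the program-side idiom this serves; it assumes a stock libbpf/clang BPF build (bpf_helpers.h, bpf_endian.h) and is a generic illustration, not code from this change.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int drop_non_ip(struct xdp_md *ctx)
{
	void *data     = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* This comparison is what lets the verifier widen the range of
	 * every copy of the packet pointer (find_good_pkt_pointers()),
	 * so the eth->h_proto load below is provably in bounds.
	 */
	if ((void *)(eth + 1) > data_end)
		return XDP_PASS;

	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
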
3830
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
3831
- * and return:
3832
- * 1 - branch will be taken and "goto target" will be executed
3833
- * 0 - branch will not be taken and fall-through to next insn
3834
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
3835
- */
3836
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
7503
+static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
38377504 {
3838
- if (__is_pointer_value(false, reg))
3839
- return -1;
7505
+ struct tnum subreg = tnum_subreg(reg->var_off);
7506
+ s32 sval = (s32)val;
7507
+
7508
+ switch (opcode) {
7509
+ case BPF_JEQ:
7510
+ if (tnum_is_const(subreg))
7511
+ return !!tnum_equals_const(subreg, val);
7512
+ break;
7513
+ case BPF_JNE:
7514
+ if (tnum_is_const(subreg))
7515
+ return !tnum_equals_const(subreg, val);
7516
+ break;
7517
+ case BPF_JSET:
7518
+ if ((~subreg.mask & subreg.value) & val)
7519
+ return 1;
7520
+ if (!((subreg.mask | subreg.value) & val))
7521
+ return 0;
7522
+ break;
7523
+ case BPF_JGT:
7524
+ if (reg->u32_min_value > val)
7525
+ return 1;
7526
+ else if (reg->u32_max_value <= val)
7527
+ return 0;
7528
+ break;
7529
+ case BPF_JSGT:
7530
+ if (reg->s32_min_value > sval)
7531
+ return 1;
7532
+ else if (reg->s32_max_value <= sval)
7533
+ return 0;
7534
+ break;
7535
+ case BPF_JLT:
7536
+ if (reg->u32_max_value < val)
7537
+ return 1;
7538
+ else if (reg->u32_min_value >= val)
7539
+ return 0;
7540
+ break;
7541
+ case BPF_JSLT:
7542
+ if (reg->s32_max_value < sval)
7543
+ return 1;
7544
+ else if (reg->s32_min_value >= sval)
7545
+ return 0;
7546
+ break;
7547
+ case BPF_JGE:
7548
+ if (reg->u32_min_value >= val)
7549
+ return 1;
7550
+ else if (reg->u32_max_value < val)
7551
+ return 0;
7552
+ break;
7553
+ case BPF_JSGE:
7554
+ if (reg->s32_min_value >= sval)
7555
+ return 1;
7556
+ else if (reg->s32_max_value < sval)
7557
+ return 0;
7558
+ break;
7559
+ case BPF_JLE:
7560
+ if (reg->u32_max_value <= val)
7561
+ return 1;
7562
+ else if (reg->u32_min_value > val)
7563
+ return 0;
7564
+ break;
7565
+ case BPF_JSLE:
7566
+ if (reg->s32_max_value <= sval)
7567
+ return 1;
7568
+ else if (reg->s32_min_value > sval)
7569
+ return 0;
7570
+ break;
7571
+ }
7572
+
7573
+ return -1;
7574
+}
7575
+
7576
+
7577
+static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
7578
+{
7579
+ s64 sval = (s64)val;
38407580
38417581 switch (opcode) {
38427582 case BPF_JEQ:
....@@ -3847,6 +7587,12 @@
38477587 if (tnum_is_const(reg->var_off))
38487588 return !tnum_equals_const(reg->var_off, val);
38497589 break;
7590
+ case BPF_JSET:
7591
+ if ((~reg->var_off.mask & reg->var_off.value) & val)
7592
+ return 1;
7593
+ if (!((reg->var_off.mask | reg->var_off.value) & val))
7594
+ return 0;
7595
+ break;
38507596 case BPF_JGT:
38517597 if (reg->umin_value > val)
38527598 return 1;
....@@ -3854,9 +7600,9 @@
38547600 return 0;
38557601 break;
38567602 case BPF_JSGT:
3857
- if (reg->smin_value > (s64)val)
7603
+ if (reg->smin_value > sval)
38587604 return 1;
3859
- else if (reg->smax_value < (s64)val)
7605
+ else if (reg->smax_value <= sval)
38607606 return 0;
38617607 break;
38627608 case BPF_JLT:
....@@ -3866,9 +7612,9 @@
38667612 return 0;
38677613 break;
38687614 case BPF_JSLT:
3869
- if (reg->smax_value < (s64)val)
7615
+ if (reg->smax_value < sval)
38707616 return 1;
3871
- else if (reg->smin_value >= (s64)val)
7617
+ else if (reg->smin_value >= sval)
38727618 return 0;
38737619 break;
38747620 case BPF_JGE:
....@@ -3878,9 +7624,9 @@
38787624 return 0;
38797625 break;
38807626 case BPF_JSGE:
3881
- if (reg->smin_value >= (s64)val)
7627
+ if (reg->smin_value >= sval)
38827628 return 1;
3883
- else if (reg->smax_value < (s64)val)
7629
+ else if (reg->smax_value < sval)
38847630 return 0;
38857631 break;
38867632 case BPF_JLE:
....@@ -3890,13 +7636,109 @@
38907636 return 0;
38917637 break;
38927638 case BPF_JSLE:
3893
- if (reg->smax_value <= (s64)val)
7639
+ if (reg->smax_value <= sval)
38947640 return 1;
3895
- else if (reg->smin_value > (s64)val)
7641
+ else if (reg->smin_value > sval)
38967642 return 0;
38977643 break;
38987644 }
38997645
7646
+ return -1;
7647
+}
7648
+
7649
+/* compute branch direction of the expression "if (reg opcode val) goto target;"
7650
+ * and return:
7651
+ * 1 - branch will be taken and "goto target" will be executed
7652
+ * 0 - branch will not be taken and fall-through to next insn
7653
+ * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
7654
+ * range [0,10]
7655
+ */
7656
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
7657
+ bool is_jmp32)
7658
+{
7659
+ if (__is_pointer_value(false, reg)) {
7660
+ if (!reg_type_not_null(reg->type))
7661
+ return -1;
7662
+
7663
+		/* If the pointer is valid, tests against zero will fail, so
7664
+		 * we can use this to decide the branch direction.
7665
+ */
7666
+ if (val != 0)
7667
+ return -1;
7668
+
7669
+ switch (opcode) {
7670
+ case BPF_JEQ:
7671
+ return 0;
7672
+ case BPF_JNE:
7673
+ return 1;
7674
+ default:
7675
+ return -1;
7676
+ }
7677
+ }
7678
+
7679
+ if (is_jmp32)
7680
+ return is_branch32_taken(reg, val, opcode);
7681
+ return is_branch64_taken(reg, val, opcode);
7682
+}
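
is_branch_taken() is now split into 32- and 64-bit variants, but both answer the same question: if the tracked value window of dst_reg lies entirely on one side of the constant, the branch outcome is known and the dead path never has to be explored. A stripped-down userspace model of the unsigned BPF_JGT case, using the same 1 / 0 / -1 convention, is sketched below; the struct is a local stand-in, not the verifier's bpf_reg_state.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t umin, umax; };	/* stand-in for register bounds */

/* 1: branch always taken, 0: never taken, -1: unknown (explore both) */
static int jgt_taken(struct range r, uint64_t val)
{
	if (r.umin > val)
		return 1;
	if (r.umax <= val)
		return 0;
	return -1;
}

int main(void)
{
	struct range r = { 3, 10 };	/* verifier knows r is in [3, 10] */

	printf("if (r > 2)  -> %d\n", jgt_taken(r, 2));		/* 1 */
	printf("if (r > 10) -> %d\n", jgt_taken(r, 10));	/* 0 */
	printf("if (r > 5)  -> %d\n", jgt_taken(r, 5));		/* -1 */
	return 0;
}
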
7683
+
7684
+static int flip_opcode(u32 opcode)
7685
+{
7686
+ /* How can we transform "a <op> b" into "b <op> a"? */
7687
+ static const u8 opcode_flip[16] = {
7688
+ /* these stay the same */
7689
+ [BPF_JEQ >> 4] = BPF_JEQ,
7690
+ [BPF_JNE >> 4] = BPF_JNE,
7691
+ [BPF_JSET >> 4] = BPF_JSET,
7692
+ /* these swap "lesser" and "greater" (L and G in the opcodes) */
7693
+ [BPF_JGE >> 4] = BPF_JLE,
7694
+ [BPF_JGT >> 4] = BPF_JLT,
7695
+ [BPF_JLE >> 4] = BPF_JGE,
7696
+ [BPF_JLT >> 4] = BPF_JGT,
7697
+ [BPF_JSGE >> 4] = BPF_JSLE,
7698
+ [BPF_JSGT >> 4] = BPF_JSLT,
7699
+ [BPF_JSLE >> 4] = BPF_JSGE,
7700
+ [BPF_JSLT >> 4] = BPF_JSGT
7701
+ };
7702
+ return opcode_flip[opcode >> 4];
7703
+}
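
flip_opcode() lets the verifier rewrite "a <op> b" as "b <op'> a" so the dst-side bounds logic can be reused when the constant sits on the left or when the packet-end pointer is the destination. The standalone check below exercises the same table idea; the opcode values are copied from the eBPF instruction encoding only so the sketch is self-contained.

#include <assert.h>
#include <stdio.h>

/* Conditional-jump opcodes (upper nibble of the insn op field),
 * duplicated here to keep the sketch standalone.
 */
enum {
	JEQ = 0x10, JGT = 0x20, JGE = 0x30, JSET = 0x40, JNE = 0x50,
	JSGT = 0x60, JSGE = 0x70, JLT = 0xa0, JLE = 0xb0,
	JSLT = 0xc0, JSLE = 0xd0,
};

static int flip(unsigned int opcode)
{
	/* "a <op> b" becomes "b <flip(op)> a" */
	static const unsigned char opcode_flip[16] = {
		[JEQ >> 4] = JEQ, [JNE >> 4] = JNE, [JSET >> 4] = JSET,
		[JGE >> 4] = JLE, [JGT >> 4] = JLT,
		[JLE >> 4] = JGE, [JLT >> 4] = JGT,
		[JSGE >> 4] = JSLE, [JSGT >> 4] = JSLT,
		[JSLE >> 4] = JSGE, [JSLT >> 4] = JSGT,
	};
	return opcode_flip[opcode >> 4];
}

int main(void)
{
	const unsigned int ops[] = { JEQ, JNE, JSET, JGE, JGT, JLE, JLT,
				     JSGE, JSGT, JSLE, JSLT };

	for (unsigned int i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
		assert(flip(flip(ops[i])) == ops[i]);	/* flip is an involution */
	printf("opcode flip table is symmetric\n");
	return 0;
}
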
7704
+
7705
+static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
7706
+ struct bpf_reg_state *src_reg,
7707
+ u8 opcode)
7708
+{
7709
+ struct bpf_reg_state *pkt;
7710
+
7711
+ if (src_reg->type == PTR_TO_PACKET_END) {
7712
+ pkt = dst_reg;
7713
+ } else if (dst_reg->type == PTR_TO_PACKET_END) {
7714
+ pkt = src_reg;
7715
+ opcode = flip_opcode(opcode);
7716
+ } else {
7717
+ return -1;
7718
+ }
7719
+
7720
+ if (pkt->range >= 0)
7721
+ return -1;
7722
+
7723
+ switch (opcode) {
7724
+ case BPF_JLE:
7725
+ /* pkt <= pkt_end */
7726
+ fallthrough;
7727
+ case BPF_JGT:
7728
+ /* pkt > pkt_end */
7729
+ if (pkt->range == BEYOND_PKT_END)
7730
+			/* pkt has at least one extra byte beyond pkt_end */
7731
+ return opcode == BPF_JGT;
7732
+ break;
7733
+ case BPF_JLT:
7734
+ /* pkt < pkt_end */
7735
+ fallthrough;
7736
+ case BPF_JGE:
7737
+ /* pkt >= pkt_end */
7738
+ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
7739
+ return opcode == BPF_JGE;
7740
+ break;
7741
+ }
39007742 return -1;
39017743 }
39027744
....@@ -3906,9 +7748,17 @@
39067748 * In JEQ/JNE cases we also adjust the var_off values.
39077749 */
39087750 static void reg_set_min_max(struct bpf_reg_state *true_reg,
3909
- struct bpf_reg_state *false_reg, u64 val,
3910
- u8 opcode)
7751
+ struct bpf_reg_state *false_reg,
7752
+ u64 val, u32 val32,
7753
+ u8 opcode, bool is_jmp32)
39117754 {
7755
+ struct tnum false_32off = tnum_subreg(false_reg->var_off);
7756
+ struct tnum false_64off = false_reg->var_off;
7757
+ struct tnum true_32off = tnum_subreg(true_reg->var_off);
7758
+ struct tnum true_64off = true_reg->var_off;
7759
+ s64 sval = (s64)val;
7760
+ s32 sval32 = (s32)val32;
7761
+
39127762 /* If the dst_reg is a pointer, we can't learn anything about its
39137763 * variable offset from the compare (unless src_reg were a pointer into
39147764 * the same object, but we don't bother with that.
....@@ -3919,137 +7769,155 @@
39197769 return;
39207770
39217771 switch (opcode) {
7772
+ /* JEQ/JNE comparison doesn't change the register equivalence.
7773
+ *
7774
+ * r1 = r2;
7775
+ * if (r1 == 42) goto label;
7776
+ * ...
7777
+ * label: // here both r1 and r2 are known to be 42.
7778
+ *
7779
+	 * Hence when marking a register as known, preserve its ID.
7780
+ */
39227781 case BPF_JEQ:
3923
- /* If this is false then we know nothing Jon Snow, but if it is
3924
- * true then we know for sure.
3925
- */
3926
- __mark_reg_known(true_reg, val);
7782
+ if (is_jmp32) {
7783
+ __mark_reg32_known(true_reg, val32);
7784
+ true_32off = tnum_subreg(true_reg->var_off);
7785
+ } else {
7786
+ ___mark_reg_known(true_reg, val);
7787
+ true_64off = true_reg->var_off;
7788
+ }
39277789 break;
39287790 case BPF_JNE:
3929
- /* If this is true we know nothing Jon Snow, but if it is false
3930
- * we know the value for sure;
3931
- */
3932
- __mark_reg_known(false_reg, val);
7791
+ if (is_jmp32) {
7792
+ __mark_reg32_known(false_reg, val32);
7793
+ false_32off = tnum_subreg(false_reg->var_off);
7794
+ } else {
7795
+ ___mark_reg_known(false_reg, val);
7796
+ false_64off = false_reg->var_off;
7797
+ }
39337798 break;
3934
- case BPF_JGT:
3935
- false_reg->umax_value = min(false_reg->umax_value, val);
3936
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
3937
- break;
3938
- case BPF_JSGT:
3939
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
3940
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
3941
- break;
3942
- case BPF_JLT:
3943
- false_reg->umin_value = max(false_reg->umin_value, val);
3944
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
3945
- break;
3946
- case BPF_JSLT:
3947
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
3948
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
7799
+ case BPF_JSET:
7800
+ if (is_jmp32) {
7801
+ false_32off = tnum_and(false_32off, tnum_const(~val32));
7802
+ if (is_power_of_2(val32))
7803
+ true_32off = tnum_or(true_32off,
7804
+ tnum_const(val32));
7805
+ } else {
7806
+ false_64off = tnum_and(false_64off, tnum_const(~val));
7807
+ if (is_power_of_2(val))
7808
+ true_64off = tnum_or(true_64off,
7809
+ tnum_const(val));
7810
+ }
39497811 break;
39507812 case BPF_JGE:
3951
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
3952
- true_reg->umin_value = max(true_reg->umin_value, val);
3953
- break;
3954
- case BPF_JSGE:
3955
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
3956
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
3957
- break;
3958
- case BPF_JLE:
3959
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
3960
- true_reg->umax_value = min(true_reg->umax_value, val);
3961
- break;
3962
- case BPF_JSLE:
3963
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
3964
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
3965
- break;
3966
- default:
7813
+ case BPF_JGT:
7814
+ {
7815
+ if (is_jmp32) {
7816
+ u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1;
7817
+ u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
7818
+
7819
+ false_reg->u32_max_value = min(false_reg->u32_max_value,
7820
+ false_umax);
7821
+ true_reg->u32_min_value = max(true_reg->u32_min_value,
7822
+ true_umin);
7823
+ } else {
7824
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
7825
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
7826
+
7827
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
7828
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
7829
+ }
39677830 break;
39687831 }
7832
+ case BPF_JSGE:
7833
+ case BPF_JSGT:
7834
+ {
7835
+ if (is_jmp32) {
7836
+ s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1;
7837
+ s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
39697838
3970
- __reg_deduce_bounds(false_reg);
3971
- __reg_deduce_bounds(true_reg);
3972
- /* We might have learned some bits from the bounds. */
3973
- __reg_bound_offset(false_reg);
3974
- __reg_bound_offset(true_reg);
3975
- /* Intersecting with the old var_off might have improved our bounds
3976
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
3977
- * then new var_off is (0; 0x7f...fc) which improves our umax.
3978
- */
3979
- __update_reg_bounds(false_reg);
3980
- __update_reg_bounds(true_reg);
7839
+ false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
7840
+ true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
7841
+ } else {
7842
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
7843
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
7844
+
7845
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
7846
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
7847
+ }
7848
+ break;
7849
+ }
7850
+ case BPF_JLE:
7851
+ case BPF_JLT:
7852
+ {
7853
+ if (is_jmp32) {
7854
+ u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1;
7855
+ u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
7856
+
7857
+ false_reg->u32_min_value = max(false_reg->u32_min_value,
7858
+ false_umin);
7859
+ true_reg->u32_max_value = min(true_reg->u32_max_value,
7860
+ true_umax);
7861
+ } else {
7862
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
7863
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
7864
+
7865
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
7866
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
7867
+ }
7868
+ break;
7869
+ }
7870
+ case BPF_JSLE:
7871
+ case BPF_JSLT:
7872
+ {
7873
+ if (is_jmp32) {
7874
+ s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1;
7875
+ s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
7876
+
7877
+ false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
7878
+ true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
7879
+ } else {
7880
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
7881
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
7882
+
7883
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
7884
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
7885
+ }
7886
+ break;
7887
+ }
7888
+ default:
7889
+ return;
7890
+ }
7891
+
7892
+ if (is_jmp32) {
7893
+ false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
7894
+ tnum_subreg(false_32off));
7895
+ true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
7896
+ tnum_subreg(true_32off));
7897
+ __reg_combine_32_into_64(false_reg);
7898
+ __reg_combine_32_into_64(true_reg);
7899
+ } else {
7900
+ false_reg->var_off = false_64off;
7901
+ true_reg->var_off = true_64off;
7902
+ __reg_combine_64_into_32(false_reg);
7903
+ __reg_combine_64_into_32(true_reg);
7904
+ }
39817905 }
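
reg_set_min_max() now carries the 64-bit and 32-bit (subregister) bounds side by side and narrows whichever set the is_jmp32 flag selects, but the narrowing itself is the same interval arithmetic as before. The userspace model below covers just the unsigned JGE/JGT arm; the plain structs and the taken/not-taken split are local simplifications of the true_reg/false_reg pair.

#include <stdint.h>
#include <stdio.h>

struct bounds { uint64_t umin, umax; };

/* Model of the BPF_JGE/BPF_JGT arm: "if (reg >= / > val) goto ..."
 * splits one register state into a taken and a not-taken copy.
 */
static void split_jge_jgt(struct bounds reg, uint64_t val, int is_jgt,
			  struct bounds *taken, struct bounds *not_taken)
{
	uint64_t false_umax = is_jgt ? val : val - 1;
	uint64_t true_umin  = is_jgt ? val + 1 : val;

	*taken = *not_taken = reg;
	if (not_taken->umax > false_umax)
		not_taken->umax = false_umax;
	if (taken->umin < true_umin)
		taken->umin = true_umin;
}

int main(void)
{
	struct bounds reg = { 0, 100 }, t, f;

	split_jge_jgt(reg, 64, /*is_jgt=*/0, &t, &f);
	printf("if (r >= 64): taken [%llu,%llu], fallthrough [%llu,%llu]\n",
	       (unsigned long long)t.umin, (unsigned long long)t.umax,
	       (unsigned long long)f.umin, (unsigned long long)f.umax);
	return 0;
}
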
39827906
39837907 /* Same as above, but for the case that dst_reg holds a constant and src_reg is
39847908 * the variable reg.
39857909 */
39867910 static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
3987
- struct bpf_reg_state *false_reg, u64 val,
3988
- u8 opcode)
7911
+ struct bpf_reg_state *false_reg,
7912
+ u64 val, u32 val32,
7913
+ u8 opcode, bool is_jmp32)
39897914 {
3990
- if (__is_pointer_value(false, false_reg))
3991
- return;
3992
-
3993
- switch (opcode) {
3994
- case BPF_JEQ:
3995
- /* If this is false then we know nothing Jon Snow, but if it is
3996
- * true then we know for sure.
3997
- */
3998
- __mark_reg_known(true_reg, val);
3999
- break;
4000
- case BPF_JNE:
4001
- /* If this is true we know nothing Jon Snow, but if it is false
4002
- * we know the value for sure;
4003
- */
4004
- __mark_reg_known(false_reg, val);
4005
- break;
4006
- case BPF_JGT:
4007
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
4008
- false_reg->umin_value = max(false_reg->umin_value, val);
4009
- break;
4010
- case BPF_JSGT:
4011
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
4012
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
4013
- break;
4014
- case BPF_JLT:
4015
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
4016
- false_reg->umax_value = min(false_reg->umax_value, val);
4017
- break;
4018
- case BPF_JSLT:
4019
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
4020
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
4021
- break;
4022
- case BPF_JGE:
4023
- true_reg->umax_value = min(true_reg->umax_value, val);
4024
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
4025
- break;
4026
- case BPF_JSGE:
4027
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
4028
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
4029
- break;
4030
- case BPF_JLE:
4031
- true_reg->umin_value = max(true_reg->umin_value, val);
4032
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
4033
- break;
4034
- case BPF_JSLE:
4035
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
4036
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
4037
- break;
4038
- default:
4039
- break;
4040
- }
4041
-
4042
- __reg_deduce_bounds(false_reg);
4043
- __reg_deduce_bounds(true_reg);
4044
- /* We might have learned some bits from the bounds. */
4045
- __reg_bound_offset(false_reg);
4046
- __reg_bound_offset(true_reg);
4047
- /* Intersecting with the old var_off might have improved our bounds
4048
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
4049
- * then new var_off is (0; 0x7f...fc) which improves our umax.
7915
+ opcode = flip_opcode(opcode);
7916
+ /* This uses zero as "not present in table"; luckily the zero opcode,
7917
+ * BPF_JA, can't get here.
40507918 */
4051
- __update_reg_bounds(false_reg);
4052
- __update_reg_bounds(true_reg);
7919
+ if (opcode)
7920
+ reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
40537921 }
40547922
40557923 /* Regs are known to be equal, so intersect their min/max/var_off */
....@@ -4066,21 +7934,8 @@
40667934 dst_reg->smax_value);
40677935 src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
40687936 dst_reg->var_off);
4069
- /* We might have learned new bounds from the var_off. */
4070
- __update_reg_bounds(src_reg);
4071
- __update_reg_bounds(dst_reg);
4072
- /* We might have learned something about the sign bit. */
4073
- __reg_deduce_bounds(src_reg);
4074
- __reg_deduce_bounds(dst_reg);
4075
- /* We might have learned some bits from the bounds. */
4076
- __reg_bound_offset(src_reg);
4077
- __reg_bound_offset(dst_reg);
4078
- /* Intersecting with the old var_off might have improved our bounds
4079
- * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
4080
- * then new var_off is (0; 0x7f...fc) which improves our umax.
4081
- */
4082
- __update_reg_bounds(src_reg);
4083
- __update_reg_bounds(dst_reg);
7937
+ reg_bounds_sync(src_reg);
7938
+ reg_bounds_sync(dst_reg);
40847939 }
40857940
40867941 static void reg_combine_min_max(struct bpf_reg_state *true_src,
....@@ -4099,60 +7954,93 @@
40997954 }
41007955 }
41017956
4102
-static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
4103
- bool is_null)
7957
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
7958
+ struct bpf_reg_state *reg, u32 id,
7959
+ bool is_null)
41047960 {
4105
- struct bpf_reg_state *reg = &regs[regno];
4106
-
4107
- if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
4108
- /* Old offset (both fixed and variable parts) should
4109
- * have been known-zero, because we don't allow pointer
4110
- * arithmetic on pointers that might be NULL.
4111
- */
7961
+ if (reg_type_may_be_null(reg->type) && reg->id == id &&
7962
+ !WARN_ON_ONCE(!reg->id)) {
41127963 if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
41137964 !tnum_equals_const(reg->var_off, 0) ||
41147965 reg->off)) {
4115
- __mark_reg_known_zero(reg);
4116
- reg->off = 0;
7966
+ /* Old offset (both fixed and variable parts) should
7967
+ * have been known-zero, because we don't allow pointer
7968
+ * arithmetic on pointers that might be NULL. If we
7969
+ * see this happening, don't convert the register.
7970
+ */
7971
+ return;
41177972 }
41187973 if (is_null) {
41197974 reg->type = SCALAR_VALUE;
4120
- } else if (reg->map_ptr->inner_map_meta) {
4121
- reg->type = CONST_PTR_TO_MAP;
4122
- reg->map_ptr = reg->map_ptr->inner_map_meta;
4123
- } else {
4124
- reg->type = PTR_TO_MAP_VALUE;
7975
+ } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
7976
+ const struct bpf_map *map = reg->map_ptr;
7977
+
7978
+ if (map->inner_map_meta) {
7979
+ reg->type = CONST_PTR_TO_MAP;
7980
+ reg->map_ptr = map->inner_map_meta;
7981
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
7982
+ reg->type = PTR_TO_XDP_SOCK;
7983
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
7984
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
7985
+ reg->type = PTR_TO_SOCKET;
7986
+ } else {
7987
+ reg->type = PTR_TO_MAP_VALUE;
7988
+ }
7989
+ } else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
7990
+ reg->type = PTR_TO_SOCKET;
7991
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
7992
+ reg->type = PTR_TO_SOCK_COMMON;
7993
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
7994
+ reg->type = PTR_TO_TCP_SOCK;
7995
+ } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
7996
+ reg->type = PTR_TO_BTF_ID;
7997
+ } else if (reg->type == PTR_TO_MEM_OR_NULL) {
7998
+ reg->type = PTR_TO_MEM;
7999
+ } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
8000
+ reg->type = PTR_TO_RDONLY_BUF;
8001
+ } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
8002
+ reg->type = PTR_TO_RDWR_BUF;
41258003 }
4126
- /* We don't need id from this point onwards anymore, thus we
4127
- * should better reset it, so that state pruning has chances
4128
- * to take effect.
4129
- */
4130
- reg->id = 0;
8004
+ if (is_null) {
8005
+ /* We don't need id and ref_obj_id from this point
8006
+ * onwards anymore, thus we should better reset it,
8007
+ * so that state pruning has chances to take effect.
8008
+ */
8009
+ reg->id = 0;
8010
+ reg->ref_obj_id = 0;
8011
+ } else if (!reg_may_point_to_spin_lock(reg)) {
8012
+ /* For not-NULL ptr, reg->ref_obj_id will be reset
8013
+ * in release_reference().
8014
+ *
8015
+ * reg->id is still used by spin_lock ptr. Other
8016
+ * than spin_lock ptr type, reg->id can be reset.
8017
+ */
8018
+ reg->id = 0;
8019
+ }
41318020 }
41328021 }
41338022
41348023 /* The logic is similar to find_good_pkt_pointers(), both could eventually
41358024 * be folded together at some point.
41368025 */
4137
-static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
4138
- bool is_null)
8026
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
8027
+ bool is_null)
41398028 {
41408029 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4141
- struct bpf_reg_state *regs = state->regs;
8030
+ struct bpf_reg_state *regs = state->regs, *reg;
8031
+ u32 ref_obj_id = regs[regno].ref_obj_id;
41428032 u32 id = regs[regno].id;
4143
- int i, j;
41448033
4145
- for (i = 0; i < MAX_BPF_REG; i++)
4146
- mark_map_reg(regs, i, id, is_null);
8034
+ if (ref_obj_id && ref_obj_id == id && is_null)
8035
+ /* regs[regno] is in the " == NULL" branch.
8036
+ * No one could have freed the reference state before
8037
+ * doing the NULL check.
8038
+ */
8039
+ WARN_ON_ONCE(release_reference_state(state, id));
41478040
4148
- for (j = 0; j <= vstate->curframe; j++) {
4149
- state = vstate->frame[j];
4150
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
4151
- if (state->stack[i].slot_type[0] != STACK_SPILL)
4152
- continue;
4153
- mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
4154
- }
4155
- }
8041
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
8042
+ mark_ptr_or_null_reg(state, reg, id, is_null);
8043
+ }));
41568044 }
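
mark_ptr_or_null_regs() is what converts the *_OR_NULL pointer flavours into their checked forms once the program branches on NULL, and the map-lookup idiom below is the canonical pattern it serves. The snippet assumes the usual libbpf headers and a BTF-style map definition; it is a generic example, not code taken from this diff.

// SPDX-License-Identifier: GPL-2.0
/* bpf_map_lookup_elem() returns PTR_TO_MAP_VALUE_OR_NULL; only after the
 * "if (!val)" branch does the surviving path see PTR_TO_MAP_VALUE, which
 * is why the dereference below is accepted.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u64);
} counters SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int count_execve(void *ctx)
{
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&counters, &key);

	if (!val)			/* NULL branch: pointer becomes a scalar */
		return 0;
	__sync_fetch_and_add(val, 1);	/* non-NULL branch: safe to dereference */
	return 0;
}

char _license[] SEC("license") = "GPL";
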
41578045
41588046 static bool try_match_pkt_pointers(const struct bpf_insn *insn,
....@@ -4164,6 +8052,10 @@
41648052 if (BPF_SRC(insn->code) != BPF_X)
41658053 return false;
41668054
8055
+ /* Pointers are always 64-bit. */
8056
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
8057
+ return false;
8058
+
41678059 switch (BPF_OP(insn->code)) {
41688060 case BPF_JGT:
41698061 if ((dst_reg->type == PTR_TO_PACKET &&
....@@ -4173,6 +8065,7 @@
41738065 /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
41748066 find_good_pkt_pointers(this_branch, dst_reg,
41758067 dst_reg->type, false);
8068
+ mark_pkt_end(other_branch, insn->dst_reg, true);
41768069 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
41778070 src_reg->type == PTR_TO_PACKET) ||
41788071 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4180,6 +8073,7 @@
41808073 /* pkt_end > pkt_data', pkt_data > pkt_meta' */
41818074 find_good_pkt_pointers(other_branch, src_reg,
41828075 src_reg->type, true);
8076
+ mark_pkt_end(this_branch, insn->src_reg, false);
41838077 } else {
41848078 return false;
41858079 }
....@@ -4192,6 +8086,7 @@
41928086 /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
41938087 find_good_pkt_pointers(other_branch, dst_reg,
41948088 dst_reg->type, true);
8089
+ mark_pkt_end(this_branch, insn->dst_reg, false);
41958090 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
41968091 src_reg->type == PTR_TO_PACKET) ||
41978092 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4199,6 +8094,7 @@
41998094 /* pkt_end < pkt_data', pkt_data > pkt_meta' */
42008095 find_good_pkt_pointers(this_branch, src_reg,
42018096 src_reg->type, false);
8097
+ mark_pkt_end(other_branch, insn->src_reg, true);
42028098 } else {
42038099 return false;
42048100 }
....@@ -4211,6 +8107,7 @@
42118107 /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
42128108 find_good_pkt_pointers(this_branch, dst_reg,
42138109 dst_reg->type, true);
8110
+ mark_pkt_end(other_branch, insn->dst_reg, false);
42148111 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
42158112 src_reg->type == PTR_TO_PACKET) ||
42168113 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4218,6 +8115,7 @@
42188115 /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
42198116 find_good_pkt_pointers(other_branch, src_reg,
42208117 src_reg->type, false);
8118
+ mark_pkt_end(this_branch, insn->src_reg, true);
42218119 } else {
42228120 return false;
42238121 }
....@@ -4230,6 +8128,7 @@
42308128 /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
42318129 find_good_pkt_pointers(other_branch, dst_reg,
42328130 dst_reg->type, false);
8131
+ mark_pkt_end(this_branch, insn->dst_reg, true);
42338132 } else if ((dst_reg->type == PTR_TO_PACKET_END &&
42348133 src_reg->type == PTR_TO_PACKET) ||
42358134 (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
....@@ -4237,6 +8136,7 @@
42378136 /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
42388137 find_good_pkt_pointers(this_branch, src_reg,
42398138 src_reg->type, true);
8139
+ mark_pkt_end(other_branch, insn->src_reg, false);
42408140 } else {
42418141 return false;
42428142 }
....@@ -4248,6 +8148,18 @@
42488148 return true;
42498149 }
42508150
8151
+static void find_equal_scalars(struct bpf_verifier_state *vstate,
8152
+ struct bpf_reg_state *known_reg)
8153
+{
8154
+ struct bpf_func_state *state;
8155
+ struct bpf_reg_state *reg;
8156
+
8157
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
8158
+ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
8159
+ copy_register_state(reg, known_reg);
8160
+ }));
8161
+}
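
find_equal_scalars() is new in this hunk: when one scalar register was copied from another the two share an id, and bounds learned for either of them after a conditional are copied to every register carrying that id. The toy model below shows the sweep over a flat register file; the struct, field names and fixed size are local simplifications of the per-frame and per-stack-slot walk the kernel does.

#include <stdint.h>
#include <stdio.h>

struct reg { int id; uint64_t umin, umax; };

/* Copy the newly learned bounds of *known to every register sharing its id,
 * the way find_equal_scalars() walks all frames and stack slots.
 */
static void propagate(struct reg *regs, int nregs, const struct reg *known)
{
	for (int i = 0; i < nregs; i++)
		if (regs[i].id && regs[i].id == known->id)
			regs[i] = *known;
}

int main(void)
{
	/* r1 = r2: both get id 1 with unknown bounds */
	struct reg regs[3] = {
		{ .id = 1, .umin = 0, .umax = UINT64_MAX },	/* r1 */
		{ .id = 1, .umin = 0, .umax = UINT64_MAX },	/* r2 */
		{ .id = 0, .umin = 0, .umax = UINT64_MAX },	/* r3, unrelated */
	};
	/* "if (r1 < 16)" taken branch: r1 is now [0, 15] ... */
	struct reg known = { .id = 1, .umin = 0, .umax = 15 };

	propagate(regs, 3, &known);
	/* ... and so is r2, while r3 is untouched */
	for (int i = 0; i < 3; i++)
		printf("r%d: [%llu, %llu]\n", i + 1,
		       (unsigned long long)regs[i].umin,
		       (unsigned long long)regs[i].umax);
	return 0;
}
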
8162
+
42518163 static int check_cond_jmp_op(struct bpf_verifier_env *env,
42528164 struct bpf_insn *insn, int *insn_idx)
42538165 {
....@@ -4256,17 +8168,19 @@
42568168 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
42578169 struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
42588170 u8 opcode = BPF_OP(insn->code);
8171
+ bool is_jmp32;
42598172 int pred = -1;
42608173 int err;
42618174
4262
- if (opcode > BPF_JSLE) {
4263
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
8175
+ /* Only conditional jumps are expected to reach here. */
8176
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
8177
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
42648178 return -EINVAL;
42658179 }
42668180
42678181 if (BPF_SRC(insn->code) == BPF_X) {
42688182 if (insn->imm != 0) {
4269
- verbose(env, "BPF_JMP uses reserved fields\n");
8183
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
42708184 return -EINVAL;
42718185 }
42728186
....@@ -4283,7 +8197,7 @@
42838197 src_reg = &regs[insn->src_reg];
42848198 } else {
42858199 if (insn->src_reg != BPF_REG_0) {
4286
- verbose(env, "BPF_JMP uses reserved fields\n");
8200
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
42878201 return -EINVAL;
42888202 }
42898203 }
....@@ -4294,20 +8208,47 @@
42948208 return err;
42958209
42968210 dst_reg = &regs[insn->dst_reg];
8211
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
42978212
4298
- if (BPF_SRC(insn->code) == BPF_K)
4299
- pred = is_branch_taken(dst_reg, insn->imm, opcode);
4300
- else if (src_reg->type == SCALAR_VALUE &&
4301
- tnum_is_const(src_reg->var_off))
4302
- pred = is_branch_taken(dst_reg, src_reg->var_off.value,
4303
- opcode);
8213
+ if (BPF_SRC(insn->code) == BPF_K) {
8214
+ pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
8215
+ } else if (src_reg->type == SCALAR_VALUE &&
8216
+ is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
8217
+ pred = is_branch_taken(dst_reg,
8218
+ tnum_subreg(src_reg->var_off).value,
8219
+ opcode,
8220
+ is_jmp32);
8221
+ } else if (src_reg->type == SCALAR_VALUE &&
8222
+ !is_jmp32 && tnum_is_const(src_reg->var_off)) {
8223
+ pred = is_branch_taken(dst_reg,
8224
+ src_reg->var_off.value,
8225
+ opcode,
8226
+ is_jmp32);
8227
+ } else if (reg_is_pkt_pointer_any(dst_reg) &&
8228
+ reg_is_pkt_pointer_any(src_reg) &&
8229
+ !is_jmp32) {
8230
+ pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode);
8231
+ }
8232
+
8233
+ if (pred >= 0) {
8234
+ /* If we get here with a dst_reg pointer type it is because
8235
+ * above is_branch_taken() special cased the 0 comparison.
8236
+ */
8237
+ if (!__is_pointer_value(false, dst_reg))
8238
+ err = mark_chain_precision(env, insn->dst_reg);
8239
+ if (BPF_SRC(insn->code) == BPF_X && !err &&
8240
+ !__is_pointer_value(false, src_reg))
8241
+ err = mark_chain_precision(env, insn->src_reg);
8242
+ if (err)
8243
+ return err;
8244
+ }
43048245
43058246 if (pred == 1) {
43068247 /* Only follow the goto, ignore fall-through. If needed, push
43078248 * the fall-through branch for simulation under speculative
43088249 * execution.
43098250 */
4310
- if (!env->allow_ptr_leaks &&
8251
+ if (!env->bypass_spec_v1 &&
43118252 !sanitize_speculative_path(env, insn, *insn_idx + 1,
43128253 *insn_idx))
43138254 return -EFAULT;
....@@ -4318,7 +8259,7 @@
43188259 * program will go. If needed, push the goto branch for
43198260 * simulation under speculative execution.
43208261 */
4321
- if (!env->allow_ptr_leaks &&
8262
+ if (!env->bypass_spec_v1 &&
43228263 !sanitize_speculative_path(env, insn,
43238264 *insn_idx + insn->off + 1,
43248265 *insn_idx))
....@@ -4340,37 +8281,65 @@
43408281 * comparable.
43418282 */
43428283 if (BPF_SRC(insn->code) == BPF_X) {
8284
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
8285
+
43438286 if (dst_reg->type == SCALAR_VALUE &&
4344
- regs[insn->src_reg].type == SCALAR_VALUE) {
4345
- if (tnum_is_const(regs[insn->src_reg].var_off))
8287
+ src_reg->type == SCALAR_VALUE) {
8288
+ if (tnum_is_const(src_reg->var_off) ||
8289
+ (is_jmp32 &&
8290
+ tnum_is_const(tnum_subreg(src_reg->var_off))))
43468291 reg_set_min_max(&other_branch_regs[insn->dst_reg],
4347
- dst_reg, regs[insn->src_reg].var_off.value,
4348
- opcode);
4349
- else if (tnum_is_const(dst_reg->var_off))
8292
+ dst_reg,
8293
+ src_reg->var_off.value,
8294
+ tnum_subreg(src_reg->var_off).value,
8295
+ opcode, is_jmp32);
8296
+ else if (tnum_is_const(dst_reg->var_off) ||
8297
+ (is_jmp32 &&
8298
+ tnum_is_const(tnum_subreg(dst_reg->var_off))))
43508299 reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
4351
- &regs[insn->src_reg],
4352
- dst_reg->var_off.value, opcode);
4353
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
8300
+ src_reg,
8301
+ dst_reg->var_off.value,
8302
+ tnum_subreg(dst_reg->var_off).value,
8303
+ opcode, is_jmp32);
8304
+ else if (!is_jmp32 &&
8305
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
43548306 /* Comparing for equality, we can combine knowledge */
43558307 reg_combine_min_max(&other_branch_regs[insn->src_reg],
43568308 &other_branch_regs[insn->dst_reg],
4357
- &regs[insn->src_reg],
4358
- &regs[insn->dst_reg], opcode);
8309
+ src_reg, dst_reg, opcode);
8310
+ if (src_reg->id &&
8311
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
8312
+ find_equal_scalars(this_branch, src_reg);
8313
+ find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
8314
+ }
8315
+
43598316 }
43608317 } else if (dst_reg->type == SCALAR_VALUE) {
43618318 reg_set_min_max(&other_branch_regs[insn->dst_reg],
4362
- dst_reg, insn->imm, opcode);
8319
+ dst_reg, insn->imm, (u32)insn->imm,
8320
+ opcode, is_jmp32);
43638321 }
43648322
4365
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
4366
- if (BPF_SRC(insn->code) == BPF_K &&
8323
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
8324
+ !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
8325
+ find_equal_scalars(this_branch, dst_reg);
8326
+ find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
8327
+ }
8328
+
8329
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
8330
+	 * NOTE: these optimizations below are related to pointer comparisons,
8331
+ * which will never be JMP32.
8332
+ */
8333
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
43678334 insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
4368
- dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
4369
- /* Mark all identical map registers in each branch as either
8335
+ reg_type_may_be_null(dst_reg->type)) {
8336
+ /* Mark all identical registers in each branch as either
43708337 * safe or unknown depending R == 0 or R != 0 conditional.
43718338 */
4372
- mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
4373
- mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
8339
+ mark_ptr_or_null_regs(this_branch, insn->dst_reg,
8340
+ opcode == BPF_JNE);
8341
+ mark_ptr_or_null_regs(other_branch, insn->dst_reg,
8342
+ opcode == BPF_JEQ);
43748343 } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
43758344 this_branch, other_branch) &&
43768345 is_pointer_value(env, insn->dst_reg)) {
....@@ -4378,23 +8347,18 @@
43788347 insn->dst_reg);
43798348 return -EACCES;
43808349 }
4381
- if (env->log.level)
8350
+ if (env->log.level & BPF_LOG_LEVEL)
43828351 print_verifier_state(env, this_branch->frame[this_branch->curframe]);
43838352 return 0;
4384
-}
4385
-
4386
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
4387
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
4388
-{
4389
- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
4390
-
4391
- return (struct bpf_map *) (unsigned long) imm64;
43928353 }
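
The is_jmp32 handling threaded through check_cond_jmp_op() exists because a BPF_JMP32 comparison only inspects the low 32 bits, so only the subregister bounds may legitimately be narrowed. The ordinary C snippet below is a reminder of why deducing 64-bit bounds from a 32-bit test would be unsound; it is an illustration, not verifier code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r1 = 0x100000005ULL;	/* low 32 bits are 5 */

	/* 32-bit compare (BPF_JMP32 semantics): taken, yet it says
	 * nothing about the full 64-bit value ...
	 */
	if ((uint32_t)r1 < 10)
		printf("w1 < 10 taken, yet r1 = %#llx\n",
		       (unsigned long long)r1);

	/* ... so bounding the 64-bit register from that test would be
	 * unsound; a full-width BPF_JMP compare is needed for that.
	 */
	if (r1 < 10)
		printf("this branch is never taken\n");
	return 0;
}
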
43938354
43948355 /* verify BPF_LD_IMM64 instruction */
43958356 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
43968357 {
8358
+ struct bpf_insn_aux_data *aux = cur_aux(env);
43978359 struct bpf_reg_state *regs = cur_regs(env);
8360
+ struct bpf_reg_state *dst_reg;
8361
+ struct bpf_map *map;
43988362 int err;
43998363
44008364 if (BPF_SIZE(insn->code) != BPF_DW) {
....@@ -4410,19 +8374,50 @@
44108374 if (err)
44118375 return err;
44128376
8377
+ dst_reg = &regs[insn->dst_reg];
44138378 if (insn->src_reg == 0) {
44148379 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
44158380
4416
- regs[insn->dst_reg].type = SCALAR_VALUE;
8381
+ dst_reg->type = SCALAR_VALUE;
44178382 __mark_reg_known(&regs[insn->dst_reg], imm);
44188383 return 0;
44198384 }
44208385
4421
- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
4422
- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
8386
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
8387
+ mark_reg_known_zero(env, regs, insn->dst_reg);
44238388
4424
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
4425
- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
8389
+ dst_reg->type = aux->btf_var.reg_type;
8390
+ switch (dst_reg->type) {
8391
+ case PTR_TO_MEM:
8392
+ dst_reg->mem_size = aux->btf_var.mem_size;
8393
+ break;
8394
+ case PTR_TO_BTF_ID:
8395
+ case PTR_TO_PERCPU_BTF_ID:
8396
+ dst_reg->btf_id = aux->btf_var.btf_id;
8397
+ break;
8398
+ default:
8399
+ verbose(env, "bpf verifier is misconfigured\n");
8400
+ return -EFAULT;
8401
+ }
8402
+ return 0;
8403
+ }
8404
+
8405
+ map = env->used_maps[aux->map_index];
8406
+ mark_reg_known_zero(env, regs, insn->dst_reg);
8407
+ dst_reg->map_ptr = map;
8408
+
8409
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
8410
+ dst_reg->type = PTR_TO_MAP_VALUE;
8411
+ dst_reg->off = aux->map_off;
8412
+ if (map_value_has_spin_lock(map))
8413
+ dst_reg->id = ++env->id_gen;
8414
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
8415
+ dst_reg->type = CONST_PTR_TO_MAP;
8416
+ } else {
8417
+ verbose(env, "bpf verifier is misconfigured\n");
8418
+ return -EINVAL;
8419
+ }
8420
+
44268421 return 0;
44278422 }
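
check_ld_imm() now distinguishes a plain 64-bit immediate from the BPF_PSEUDO_MAP_FD, BPF_PSEUDO_MAP_VALUE and BPF_PSEUDO_BTF_ID forms. For the plain case the constant is split across the imm fields of the two-slot instruction, and the reassembly is just the expression modelled below; the struct is a stand-in for the relevant part of struct bpf_insn.

#include <stdint.h>
#include <stdio.h>

/* The two imm fields of a BPF_LD_IMM64 pair (insn and insn + 1). */
struct half { int32_t imm; };

static uint64_t ld_imm64(const struct half *insn)
{
	/* same reassembly as check_ld_imm(): low word from insn[0],
	 * high word from insn[1], both taken as unsigned 32-bit values
	 */
	return ((uint64_t)(insn + 1)->imm << 32) | (uint32_t)insn->imm;
}

int main(void)
{
	struct half pair[2] = { { (int32_t)0xdeadbeef }, { 0x12345678 } };

	printf("imm64 = %#llx\n", (unsigned long long)ld_imm64(pair));
	return 0;
}
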
44288423
....@@ -4460,25 +8455,13 @@
44608455 u8 mode = BPF_MODE(insn->code);
44618456 int i, err;
44628457
4463
- if (!may_access_skb(env->prog->type)) {
8458
+ if (!may_access_skb(resolve_prog_type(env->prog))) {
44648459 verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
44658460 return -EINVAL;
44668461 }
44678462
44688463 if (!env->ops->gen_ld_abs) {
44698464 verbose(env, "bpf verifier is misconfigured\n");
4470
- return -EINVAL;
4471
- }
4472
-
4473
- if (env->subprog_cnt > 1) {
4474
- /* when program has LD_ABS insn JITs and interpreter assume
4475
- * that r1 == ctx == skb which is not the case for callees
4476
- * that can have arbitrary arguments. It's problematic
4477
- * for main prog as well since JITs would need to analyze
4478
- * all functions in order to make proper register save/restore
4479
- * decisions in the main prog. Hence disallow LD_ABS with calls
4480
- */
4481
- verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
44828465 return -EINVAL;
44838466 }
44848467
....@@ -4493,6 +8476,21 @@
44938476 err = check_reg_arg(env, ctx_reg, SRC_OP);
44948477 if (err)
44958478 return err;
8479
+
8480
+ /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
8481
+ * gen_ld_abs() may terminate the program at runtime, leading to
8482
+ * reference leak.
8483
+ */
8484
+ err = check_reference_leak(env);
8485
+ if (err) {
8486
+ verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
8487
+ return err;
8488
+ }
8489
+
8490
+ if (env->cur_state->active_spin_lock) {
8491
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
8492
+ return -EINVAL;
8493
+ }
44968494
44978495 if (regs[ctx_reg].type != PTR_TO_CTX) {
44988496 verbose(env,
....@@ -4522,29 +8520,106 @@
45228520 * Already marked as written above.
45238521 */
45248522 mark_reg_unknown(env, regs, BPF_REG_0);
8523
+	/* ld_abs loads up to 32 bits of skb data. */
8524
+ regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
45258525 return 0;
45268526 }
45278527
45288528 static int check_return_code(struct bpf_verifier_env *env)
45298529 {
8530
+ struct tnum enforce_attach_type_range = tnum_unknown;
8531
+ const struct bpf_prog *prog = env->prog;
45308532 struct bpf_reg_state *reg;
45318533 struct tnum range = tnum_range(0, 1);
8534
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
8535
+ int err;
8536
+ const bool is_subprog = env->cur_state->frame[0]->subprogno;
45328537
4533
- switch (env->prog->type) {
8538
+ /* LSM and struct_ops func-ptr's return type could be "void" */
8539
+ if (!is_subprog &&
8540
+ (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
8541
+ prog_type == BPF_PROG_TYPE_LSM) &&
8542
+ !prog->aux->attach_func_proto->type)
8543
+ return 0;
8544
+
8545
+	/* eBPF calling convention is such that R0 is used
8546
+ * to return the value from eBPF program.
8547
+ * Make sure that it's readable at this time
8548
+ * of bpf_exit, which means that program wrote
8549
+ * something into it earlier
8550
+ */
8551
+ err = check_reg_arg(env, BPF_REG_0, SRC_OP);
8552
+ if (err)
8553
+ return err;
8554
+
8555
+ if (is_pointer_value(env, BPF_REG_0)) {
8556
+ verbose(env, "R0 leaks addr as return value\n");
8557
+ return -EACCES;
8558
+ }
8559
+
8560
+ reg = cur_regs(env) + BPF_REG_0;
8561
+ if (is_subprog) {
8562
+ if (reg->type != SCALAR_VALUE) {
8563
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
8564
+ reg_type_str[reg->type]);
8565
+ return -EINVAL;
8566
+ }
8567
+ return 0;
8568
+ }
8569
+
8570
+ switch (prog_type) {
45348571 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
45358572 if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
4536
- env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
8573
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
8574
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
8575
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
8576
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
8577
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
45378578 range = tnum_range(1, 1);
8579
+ break;
45388580 case BPF_PROG_TYPE_CGROUP_SKB:
8581
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
8582
+ range = tnum_range(0, 3);
8583
+ enforce_attach_type_range = tnum_range(2, 3);
8584
+ }
8585
+ break;
45398586 case BPF_PROG_TYPE_CGROUP_SOCK:
45408587 case BPF_PROG_TYPE_SOCK_OPS:
45418588 case BPF_PROG_TYPE_CGROUP_DEVICE:
8589
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
8590
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
45428591 break;
8592
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
8593
+ if (!env->prog->aux->attach_btf_id)
8594
+ return 0;
8595
+ range = tnum_const(0);
8596
+ break;
8597
+ case BPF_PROG_TYPE_TRACING:
8598
+ switch (env->prog->expected_attach_type) {
8599
+ case BPF_TRACE_FENTRY:
8600
+ case BPF_TRACE_FEXIT:
8601
+ range = tnum_const(0);
8602
+ break;
8603
+ case BPF_TRACE_RAW_TP:
8604
+ case BPF_MODIFY_RETURN:
8605
+ return 0;
8606
+ case BPF_TRACE_ITER:
8607
+ break;
8608
+ default:
8609
+ return -ENOTSUPP;
8610
+ }
8611
+ break;
8612
+ case BPF_PROG_TYPE_SK_LOOKUP:
8613
+ range = tnum_range(SK_DROP, SK_PASS);
8614
+ break;
8615
+ case BPF_PROG_TYPE_EXT:
8616
+ /* freplace program can return anything as its return value
8617
+ * depends on the to-be-replaced kernel func or bpf program.
8618
+ */
45438619 default:
45448620 return 0;
45458621 }
45468622
4547
- reg = cur_regs(env) + BPF_REG_0;
45488623 if (reg->type != SCALAR_VALUE) {
45498624 verbose(env, "At program exit the register R0 is not a known value (%s)\n",
45508625 reg_type_str[reg->type]);
....@@ -4565,6 +8640,10 @@
45658640 verbose(env, " should have been in %s\n", tn_buf);
45668641 return -EINVAL;
45678642 }
8643
+
8644
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
8645
+ tnum_in(enforce_attach_type_range, reg->var_off))
8646
+ env->prog->enforce_expected_attach_type = 1;
45688647 return 0;
45698648 }
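
check_return_code() now enforces a per-program-type range on R0 at BPF_EXIT, e.g. tnum_range(0, 1) for most cgroup attach points. The trivially conforming cgroup_skb program below is shown only as a reference point for what that range means in source form; it assumes the standard libbpf headers and is not part of this change.

// SPDX-License-Identifier: GPL-2.0
/* cgroup_skb programs must return 0 (drop) or 1 (allow); a return value the
 * verifier cannot prove to be in that range is rejected at load time by
 * check_return_code().
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/ingress")
int allow_all(struct __sk_buff *skb)
{
	return 1;	/* provably inside tnum_range(0, 1) */
}

char _license[] SEC("license") = "GPL";
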
45708649
....@@ -4608,19 +8687,37 @@
46088687 BRANCH = 2,
46098688 };
46108689
4611
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
8690
+static u32 state_htab_size(struct bpf_verifier_env *env)
8691
+{
8692
+ return env->prog->len;
8693
+}
46128694
4613
-static int *insn_stack; /* stack of insns to process */
4614
-static int cur_stack; /* current stack index */
4615
-static int *insn_state;
8695
+static struct bpf_verifier_state_list **explored_state(
8696
+ struct bpf_verifier_env *env,
8697
+ int idx)
8698
+{
8699
+ struct bpf_verifier_state *cur = env->cur_state;
8700
+ struct bpf_func_state *state = cur->frame[cur->curframe];
8701
+
8702
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
8703
+}
8704
+
8705
+static void init_explored_state(struct bpf_verifier_env *env, int idx)
8706
+{
8707
+ env->insn_aux_data[idx].prune_point = true;
8708
+}
46168709
46178710 /* t, w, e - match pseudo-code above:
46188711 * t - index of current instruction
46198712 * w - next instruction
46208713 * e - edge
46218714 */
4622
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
8715
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
8716
+ bool loop_ok)
46238717 {
8718
+ int *insn_stack = env->cfg.insn_stack;
8719
+ int *insn_state = env->cfg.insn_state;
8720
+
46248721 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
46258722 return 0;
46268723
....@@ -4628,23 +8725,28 @@
46288725 return 0;
46298726
46308727 if (w < 0 || w >= env->prog->len) {
8728
+ verbose_linfo(env, t, "%d: ", t);
46318729 verbose(env, "jump out of range from insn %d to %d\n", t, w);
46328730 return -EINVAL;
46338731 }
46348732
46358733 if (e == BRANCH)
46368734 /* mark branch target for state pruning */
4637
- env->explored_states[w] = STATE_LIST_MARK;
8735
+ init_explored_state(env, w);
46388736
46398737 if (insn_state[w] == 0) {
46408738 /* tree-edge */
46418739 insn_state[t] = DISCOVERED | e;
46428740 insn_state[w] = DISCOVERED;
4643
- if (cur_stack >= env->prog->len)
8741
+ if (env->cfg.cur_stack >= env->prog->len)
46448742 return -E2BIG;
4645
- insn_stack[cur_stack++] = w;
8743
+ insn_stack[env->cfg.cur_stack++] = w;
46468744 return 1;
46478745 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
8746
+ if (loop_ok && env->bpf_capable)
8747
+ return 0;
8748
+ verbose_linfo(env, t, "%d: ", t);
8749
+ verbose_linfo(env, w, "%d: ", w);
46488750 verbose(env, "back-edge from insn %d to %d\n", t, w);
46498751 return -EINVAL;
46508752 } else if (insn_state[w] == EXPLORED) {
....@@ -4664,48 +8766,47 @@
46648766 {
46658767 struct bpf_insn *insns = env->prog->insnsi;
46668768 int insn_cnt = env->prog->len;
8769
+ int *insn_stack, *insn_state;
46678770 int ret = 0;
46688771 int i, t;
46698772
4670
- ret = check_subprogs(env);
4671
- if (ret < 0)
4672
- return ret;
4673
-
4674
- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
8773
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
46758774 if (!insn_state)
46768775 return -ENOMEM;
46778776
4678
- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
8777
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
46798778 if (!insn_stack) {
4680
- kfree(insn_state);
8779
+ kvfree(insn_state);
46818780 return -ENOMEM;
46828781 }
46838782
46848783 insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
46858784 insn_stack[0] = 0; /* 0 is the first instruction */
4686
- cur_stack = 1;
8785
+ env->cfg.cur_stack = 1;
46878786
46888787 peek_stack:
4689
- if (cur_stack == 0)
8788
+ if (env->cfg.cur_stack == 0)
46908789 goto check_state;
4691
- t = insn_stack[cur_stack - 1];
8790
+ t = insn_stack[env->cfg.cur_stack - 1];
46928791
4693
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
8792
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
8793
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
46948794 u8 opcode = BPF_OP(insns[t].code);
46958795
46968796 if (opcode == BPF_EXIT) {
46978797 goto mark_explored;
46988798 } else if (opcode == BPF_CALL) {
4699
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8799
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
47008800 if (ret == 1)
47018801 goto peek_stack;
47028802 else if (ret < 0)
47038803 goto err_free;
47048804 if (t + 1 < insn_cnt)
4705
- env->explored_states[t + 1] = STATE_LIST_MARK;
8805
+ init_explored_state(env, t + 1);
47068806 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
4707
- env->explored_states[t] = STATE_LIST_MARK;
4708
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
8807
+ init_explored_state(env, t);
8808
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
8809
+ env, false);
47098810 if (ret == 1)
47108811 goto peek_stack;
47118812 else if (ret < 0)
....@@ -4718,26 +8819,31 @@
47188819 }
47198820 /* unconditional jump with single edge */
47208821 ret = push_insn(t, t + insns[t].off + 1,
4721
- FALLTHROUGH, env);
8822
+ FALLTHROUGH, env, true);
47228823 if (ret == 1)
47238824 goto peek_stack;
47248825 else if (ret < 0)
47258826 goto err_free;
8827
+ /* unconditional jmp is not a good pruning point,
8828
+ * but it's marked, since backtracking needs
8829
+ * to record jmp history in is_state_visited().
8830
+ */
8831
+ init_explored_state(env, t + insns[t].off + 1);
47268832 /* tell verifier to check for equivalent states
47278833 * after every call and jump
47288834 */
47298835 if (t + 1 < insn_cnt)
4730
- env->explored_states[t + 1] = STATE_LIST_MARK;
8836
+ init_explored_state(env, t + 1);
47318837 } else {
47328838 /* conditional jump with two edges */
4733
- env->explored_states[t] = STATE_LIST_MARK;
4734
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8839
+ init_explored_state(env, t);
8840
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
47358841 if (ret == 1)
47368842 goto peek_stack;
47378843 else if (ret < 0)
47388844 goto err_free;
47398845
4740
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
8846
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
47418847 if (ret == 1)
47428848 goto peek_stack;
47438849 else if (ret < 0)
....@@ -4747,7 +8853,7 @@
47478853 /* all other non-branch instructions with single
47488854 * fall-through edge
47498855 */
4750
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
8856
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
47518857 if (ret == 1)
47528858 goto peek_stack;
47538859 else if (ret < 0)
....@@ -4756,7 +8862,7 @@
47568862
47578863 mark_explored:
47588864 insn_state[t] = EXPLORED;
4759
- if (cur_stack-- <= 0) {
8865
+ if (env->cfg.cur_stack-- <= 0) {
47608866 verbose(env, "pop stack internal bug\n");
47618867 ret = -EFAULT;
47628868 goto err_free;
....@@ -4774,9 +8880,329 @@
47748880 ret = 0; /* cfg looks good */
47758881
47768882 err_free:
4777
- kfree(insn_state);
4778
- kfree(insn_stack);
8883
+ kvfree(insn_state);
8884
+ kvfree(insn_stack);
8885
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
47798886 return ret;
8887
+}
8888
+
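
The loop above is the tail of check_cfg(): an explicit-stack DFS over the instructions, where the extra true/false argument now passed to push_insn() says whether an edge that reaches a still-DISCOVERED instruction may be tolerated as a bounded loop instead of being rejected as a back-edge. A toy standalone model of just the back-edge detection (node count, edge-list representation and names are illustrative, not the verifier's):

#include <stdio.h>

/* Toy model of the iterative DFS used by check_cfg(): instructions are
 * nodes, fall-through/jump targets are edges, and an edge that leads back
 * to a node still in DISCOVERED state (i.e. still on the DFS stack) is a
 * back-edge, meaning the CFG contains a loop. The real verifier then either
 * allows it (bounded-loop handling for capable callers) or rejects it.
 */
enum { UNVISITED, DISCOVERED, EXPLORED };
#define MAX_NODES 64

static int has_back_edge(int (*edges)[2], int nedges)
{
	int state[MAX_NODES] = { [0] = DISCOVERED };
	int stack[MAX_NODES], top = 0, i;

	stack[top++] = 0;			/* node 0 is the entry */
	while (top) {
		int t = stack[top - 1], pushed = 0;

		for (i = 0; i < nedges; i++) {
			if (edges[i][0] != t)
				continue;
			if (state[edges[i][1]] == DISCOVERED)
				return 1;	/* back-edge: loop detected */
			if (state[edges[i][1]] == UNVISITED) {
				state[edges[i][1]] = DISCOVERED;
				stack[top++] = edges[i][1];
				pushed = 1;
				break;
			}
		}
		if (!pushed) {			/* all successors done */
			state[t] = EXPLORED;
			top--;
		}
	}
	return 0;
}

int main(void)
{
	int looping[][2]  = { {0, 1}, {1, 0} };	/* insn 1 jumps back to 0 */
	int straight[][2] = { {0, 1}, {1, 2} };	/* plain fall-through code */

	printf("%d %d\n", has_back_edge(looping, 2), has_back_edge(straight, 2));
	return 0;
}
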
8889
+static int check_abnormal_return(struct bpf_verifier_env *env)
8890
+{
8891
+ int i;
8892
+
8893
+ for (i = 1; i < env->subprog_cnt; i++) {
8894
+ if (env->subprog_info[i].has_ld_abs) {
8895
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
8896
+ return -EINVAL;
8897
+ }
8898
+ if (env->subprog_info[i].has_tail_call) {
8899
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
8900
+ return -EINVAL;
8901
+ }
8902
+ }
8903
+ return 0;
8904
+}
8905
+
8906
+/* The minimum supported BTF func info size */
8907
+#define MIN_BPF_FUNCINFO_SIZE 8
8908
+#define MAX_FUNCINFO_REC_SIZE 252
8909
+
8910
+static int check_btf_func(struct bpf_verifier_env *env,
8911
+ const union bpf_attr *attr,
8912
+ union bpf_attr __user *uattr)
8913
+{
8914
+ const struct btf_type *type, *func_proto, *ret_type;
8915
+ u32 i, nfuncs, urec_size, min_size;
8916
+ u32 krec_size = sizeof(struct bpf_func_info);
8917
+ struct bpf_func_info *krecord;
8918
+ struct bpf_func_info_aux *info_aux = NULL;
8919
+ struct bpf_prog *prog;
8920
+ const struct btf *btf;
8921
+ void __user *urecord;
8922
+ u32 prev_offset = 0;
8923
+ bool scalar_return;
8924
+ int ret = -ENOMEM;
8925
+
8926
+ nfuncs = attr->func_info_cnt;
8927
+ if (!nfuncs) {
8928
+ if (check_abnormal_return(env))
8929
+ return -EINVAL;
8930
+ return 0;
8931
+ }
8932
+
8933
+ if (nfuncs != env->subprog_cnt) {
8934
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
8935
+ return -EINVAL;
8936
+ }
8937
+
8938
+ urec_size = attr->func_info_rec_size;
8939
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
8940
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
8941
+ urec_size % sizeof(u32)) {
8942
+ verbose(env, "invalid func info rec size %u\n", urec_size);
8943
+ return -EINVAL;
8944
+ }
8945
+
8946
+ prog = env->prog;
8947
+ btf = prog->aux->btf;
8948
+
8949
+ urecord = u64_to_user_ptr(attr->func_info);
8950
+ min_size = min_t(u32, krec_size, urec_size);
8951
+
8952
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
8953
+ if (!krecord)
8954
+ return -ENOMEM;
8955
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
8956
+ if (!info_aux)
8957
+ goto err_free;
8958
+
8959
+ for (i = 0; i < nfuncs; i++) {
8960
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
8961
+ if (ret) {
8962
+ if (ret == -E2BIG) {
8963
+ verbose(env, "nonzero tailing record in func info");
8964
+ /* set the size kernel expects so loader can zero
8965
+ * out the rest of the record.
8966
+ */
8967
+ if (put_user(min_size, &uattr->func_info_rec_size))
8968
+ ret = -EFAULT;
8969
+ }
8970
+ goto err_free;
8971
+ }
8972
+
8973
+ if (copy_from_user(&krecord[i], urecord, min_size)) {
8974
+ ret = -EFAULT;
8975
+ goto err_free;
8976
+ }
8977
+
8978
+ /* check insn_off */
8979
+ ret = -EINVAL;
8980
+ if (i == 0) {
8981
+ if (krecord[i].insn_off) {
8982
+ verbose(env,
8983
+ "nonzero insn_off %u for the first func info record",
8984
+ krecord[i].insn_off);
8985
+ goto err_free;
8986
+ }
8987
+ } else if (krecord[i].insn_off <= prev_offset) {
8988
+ verbose(env,
8989
+ "same or smaller insn offset (%u) than previous func info record (%u)",
8990
+ krecord[i].insn_off, prev_offset);
8991
+ goto err_free;
8992
+ }
8993
+
8994
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
8995
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
8996
+ goto err_free;
8997
+ }
8998
+
8999
+ /* check type_id */
9000
+ type = btf_type_by_id(btf, krecord[i].type_id);
9001
+ if (!type || !btf_type_is_func(type)) {
9002
+ verbose(env, "invalid type id %d in func info",
9003
+ krecord[i].type_id);
9004
+ goto err_free;
9005
+ }
9006
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
9007
+
9008
+ func_proto = btf_type_by_id(btf, type->type);
9009
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
9010
+ /* btf_func_check() already verified it during BTF load */
9011
+ goto err_free;
9012
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
9013
+ scalar_return =
9014
+ btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
9015
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
9016
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
9017
+ goto err_free;
9018
+ }
9019
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
9020
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
9021
+ goto err_free;
9022
+ }
9023
+
9024
+ prev_offset = krecord[i].insn_off;
9025
+ urecord += urec_size;
9026
+ }
9027
+
9028
+ prog->aux->func_info = krecord;
9029
+ prog->aux->func_info_cnt = nfuncs;
9030
+ prog->aux->func_info_aux = info_aux;
9031
+ return 0;
9032
+
9033
+err_free:
9034
+ kvfree(krecord);
9035
+ kfree(info_aux);
9036
+ return ret;
9037
+}
9038
+
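
check_btf_func() stays compatible with both older and newer loaders by copying only min(krec_size, urec_size) bytes per record and, via bpf_check_uarg_tail_zero(), insisting that any bytes beyond what this kernel understands are zero. A small userspace-style model of that rule (the helper below is illustrative, not a kernel API):

#include <stddef.h>
#include <stdio.h>

/* Illustrative model: a record supplied by userspace is acceptable if it is
 * no larger than what the kernel knows about, or if every byte past the
 * kernel-known part is zero (so fields unknown to this kernel are unused).
 */
static int record_tail_is_zero(const unsigned char *rec, size_t kernel_size,
			       size_t user_size)
{
	size_t i;

	if (user_size <= kernel_size)
		return 1;			/* older or matching loader: fine */
	for (i = kernel_size; i < user_size; i++)
		if (rec[i])
			return 0;		/* unknown field is set: reject */
	return 1;
}

int main(void)
{
	unsigned char ok[12]  = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0 };
	unsigned char bad[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 9 };

	printf("%d %d\n", record_tail_is_zero(ok, 8, 12),
	       record_tail_is_zero(bad, 8, 12));	/* prints 1 0 */
	return 0;
}
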
9039
+static void adjust_btf_func(struct bpf_verifier_env *env)
9040
+{
9041
+ struct bpf_prog_aux *aux = env->prog->aux;
9042
+ int i;
9043
+
9044
+ if (!aux->func_info)
9045
+ return;
9046
+
9047
+ for (i = 0; i < env->subprog_cnt; i++)
9048
+ aux->func_info[i].insn_off = env->subprog_info[i].start;
9049
+}
9050
+
9051
+#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
9052
+ sizeof(((struct bpf_line_info *)(0))->line_col))
9053
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
9054
+
9055
+static int check_btf_line(struct bpf_verifier_env *env,
9056
+ const union bpf_attr *attr,
9057
+ union bpf_attr __user *uattr)
9058
+{
9059
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
9060
+ struct bpf_subprog_info *sub;
9061
+ struct bpf_line_info *linfo;
9062
+ struct bpf_prog *prog;
9063
+ const struct btf *btf;
9064
+ void __user *ulinfo;
9065
+ int err;
9066
+
9067
+ nr_linfo = attr->line_info_cnt;
9068
+ if (!nr_linfo)
9069
+ return 0;
9070
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
9071
+ return -EINVAL;
9072
+
9073
+ rec_size = attr->line_info_rec_size;
9074
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
9075
+ rec_size > MAX_LINEINFO_REC_SIZE ||
9076
+ rec_size & (sizeof(u32) - 1))
9077
+ return -EINVAL;
9078
+
9079
+ /* Need to zero it in case userspace passes in
9080
+ * a smaller bpf_line_info object.
9081
+ */
9082
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
9083
+ GFP_KERNEL | __GFP_NOWARN);
9084
+ if (!linfo)
9085
+ return -ENOMEM;
9086
+
9087
+ prog = env->prog;
9088
+ btf = prog->aux->btf;
9089
+
9090
+ s = 0;
9091
+ sub = env->subprog_info;
9092
+ ulinfo = u64_to_user_ptr(attr->line_info);
9093
+ expected_size = sizeof(struct bpf_line_info);
9094
+ ncopy = min_t(u32, expected_size, rec_size);
9095
+ for (i = 0; i < nr_linfo; i++) {
9096
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
9097
+ if (err) {
9098
+ if (err == -E2BIG) {
9099
+ verbose(env, "nonzero tailing record in line_info");
9100
+ if (put_user(expected_size,
9101
+ &uattr->line_info_rec_size))
9102
+ err = -EFAULT;
9103
+ }
9104
+ goto err_free;
9105
+ }
9106
+
9107
+ if (copy_from_user(&linfo[i], ulinfo, ncopy)) {
9108
+ err = -EFAULT;
9109
+ goto err_free;
9110
+ }
9111
+
9112
+ /*
9113
+ * Check insn_off to ensure
9114
+ * 1) strictly increasing AND
9115
+ * 2) bounded by prog->len
9116
+ *
9117
+ * The linfo[0].insn_off == 0 check logically falls into
9118
+ * the later "missing bpf_line_info for func..." case
9119
+ * because the first linfo[0].insn_off must be the
9120
+ * first sub also and the first sub must have
9121
+ * subprog_info[0].start == 0.
9122
+ */
9123
+ if ((i && linfo[i].insn_off <= prev_offset) ||
9124
+ linfo[i].insn_off >= prog->len) {
9125
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
9126
+ i, linfo[i].insn_off, prev_offset,
9127
+ prog->len);
9128
+ err = -EINVAL;
9129
+ goto err_free;
9130
+ }
9131
+
9132
+ if (!prog->insnsi[linfo[i].insn_off].code) {
9133
+ verbose(env,
9134
+ "Invalid insn code at line_info[%u].insn_off\n",
9135
+ i);
9136
+ err = -EINVAL;
9137
+ goto err_free;
9138
+ }
9139
+
9140
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
9141
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
9142
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
9143
+ err = -EINVAL;
9144
+ goto err_free;
9145
+ }
9146
+
9147
+ if (s != env->subprog_cnt) {
9148
+ if (linfo[i].insn_off == sub[s].start) {
9149
+ sub[s].linfo_idx = i;
9150
+ s++;
9151
+ } else if (sub[s].start < linfo[i].insn_off) {
9152
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
9153
+ err = -EINVAL;
9154
+ goto err_free;
9155
+ }
9156
+ }
9157
+
9158
+ prev_offset = linfo[i].insn_off;
9159
+ ulinfo += rec_size;
9160
+ }
9161
+
9162
+ if (s != env->subprog_cnt) {
9163
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
9164
+ env->subprog_cnt - s, s);
9165
+ err = -EINVAL;
9166
+ goto err_free;
9167
+ }
9168
+
9169
+ prog->aux->linfo = linfo;
9170
+ prog->aux->nr_linfo = nr_linfo;
9171
+
9172
+ return 0;
9173
+
9174
+err_free:
9175
+ kvfree(linfo);
9176
+ return err;
9177
+}
9178
+
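
The checks in check_btf_line() reduce to a few ordering rules on insn_off: it must stay inside the program, be strictly increasing, and every subprog start must be covered by a record. A compact sketch of the first two rules (function and parameter names are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: insn_off values of line-info records must stay below
 * prog_len and be strictly increasing from one record to the next.
 */
static bool line_offsets_ok(const unsigned int *insn_off, unsigned int n,
			    unsigned int prog_len)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (insn_off[i] >= prog_len)
			return false;		/* out of range */
		if (i && insn_off[i] <= insn_off[i - 1])
			return false;		/* not strictly increasing */
	}
	return true;
}

int main(void)
{
	unsigned int good[] = { 0, 3, 7 };
	unsigned int bad[]  = { 0, 7, 7 };	/* duplicate offset */

	printf("%d %d\n", line_offsets_ok(good, 3, 16),
	       line_offsets_ok(bad, 3, 16));	/* prints 1 0 */
	return 0;
}
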
9179
+static int check_btf_info(struct bpf_verifier_env *env,
9180
+ const union bpf_attr *attr,
9181
+ union bpf_attr __user *uattr)
9182
+{
9183
+ struct btf *btf;
9184
+ int err;
9185
+
9186
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
9187
+ if (check_abnormal_return(env))
9188
+ return -EINVAL;
9189
+ return 0;
9190
+ }
9191
+
9192
+ btf = btf_get_by_fd(attr->prog_btf_fd);
9193
+ if (IS_ERR(btf))
9194
+ return PTR_ERR(btf);
9195
+ env->prog->aux->btf = btf;
9196
+
9197
+ err = check_btf_func(env, attr, uattr);
9198
+ if (err)
9199
+ return err;
9200
+
9201
+ err = check_btf_line(env, attr, uattr);
9202
+ if (err)
9203
+ return err;
9204
+
9205
+ return 0;
47809206 }
47819207
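
check_btf_info() consumes the BTF-related attributes the loader passed to BPF_PROG_LOAD. A hedged sketch of the userspace side, showing only the BTF pieces of the request (values are illustrative; real loaders such as libbpf derive them from the object's .BTF/.BTF.ext sections and also fill in insns, license, prog_type and friends):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: prog_btf_fd refers to BTF previously loaded with BPF_BTF_LOAD;
 * func_info carries one record per subprog, line_info one record per
 * covered instruction range, each with its record size declared.
 */
static int prog_load_with_btf(union bpf_attr *attr, int btf_fd,
			      const struct bpf_func_info *func_info,
			      __u32 func_cnt,
			      const struct bpf_line_info *line_info,
			      __u32 line_cnt)
{
	attr->prog_btf_fd = btf_fd;

	attr->func_info = (__u64)(unsigned long)func_info;
	attr->func_info_cnt = func_cnt;
	attr->func_info_rec_size = sizeof(struct bpf_func_info);

	attr->line_info = (__u64)(unsigned long)line_info;
	attr->line_info_cnt = line_cnt;
	attr->line_info_rec_size = sizeof(struct bpf_line_info);

	return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(*attr));
}
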
47829208 /* check %cur's range satisfies %old's */
....@@ -4786,7 +9212,11 @@
47869212 return old->umin_value <= cur->umin_value &&
47879213 old->umax_value >= cur->umax_value &&
47889214 old->smin_value <= cur->smin_value &&
4789
- old->smax_value >= cur->smax_value;
9215
+ old->smax_value >= cur->smax_value &&
9216
+ old->u32_min_value <= cur->u32_min_value &&
9217
+ old->u32_max_value >= cur->u32_max_value &&
9218
+ old->s32_min_value <= cur->s32_min_value &&
9219
+ old->s32_max_value >= cur->s32_max_value;
47909220 }
47919221
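
range_within() now also has to hold for the separately tracked 32-bit sub-ranges. A toy illustration of the enclosure idea using only the 64-bit bounds (the struct below is a simplified stand-in for bpf_reg_state, not the real thing):

#include <stdbool.h>
#include <stdio.h>

struct bounds {		/* simplified stand-in for the scalar bounds in bpf_reg_state */
	long long smin, smax;
	unsigned long long umin, umax;
};

/* old is "safe" for cur only if every value cur may hold was already
 * covered when old was verified, i.e. old's range encloses cur's.
 */
static bool range_encloses(const struct bounds *old, const struct bounds *cur)
{
	return old->umin <= cur->umin && old->umax >= cur->umax &&
	       old->smin <= cur->smin && old->smax >= cur->smax;
}

int main(void)
{
	struct bounds old = { .smin = 0, .smax = 100, .umin = 0, .umax = 100 };
	struct bounds cur = { .smin = 5, .smax = 10,  .umin = 5, .umax = 10  };

	printf("prune ok: %d\n", range_encloses(&old, &cur));	/* prints 1 */
	return 0;
}
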
47929222 /* If in the old state two registers had the same id, then they need to have
....@@ -4816,6 +9246,102 @@
48169246 /* We ran out of idmap slots, which should be impossible */
48179247 WARN_ON_ONCE(1);
48189248 return false;
9249
+}
9250
+
9251
+static void clean_func_state(struct bpf_verifier_env *env,
9252
+ struct bpf_func_state *st)
9253
+{
9254
+ enum bpf_reg_liveness live;
9255
+ int i, j;
9256
+
9257
+ for (i = 0; i < BPF_REG_FP; i++) {
9258
+ live = st->regs[i].live;
9259
+ /* liveness must not touch this register anymore */
9260
+ st->regs[i].live |= REG_LIVE_DONE;
9261
+ if (!(live & REG_LIVE_READ))
9262
+ /* since the register is unused, clear its state
9263
+ * to make further comparison simpler
9264
+ */
9265
+ __mark_reg_not_init(env, &st->regs[i]);
9266
+ }
9267
+
9268
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
9269
+ live = st->stack[i].spilled_ptr.live;
9270
+ /* liveness must not touch this stack slot anymore */
9271
+ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
9272
+ if (!(live & REG_LIVE_READ)) {
9273
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
9274
+ for (j = 0; j < BPF_REG_SIZE; j++)
9275
+ st->stack[i].slot_type[j] = STACK_INVALID;
9276
+ }
9277
+ }
9278
+}
9279
+
9280
+static void clean_verifier_state(struct bpf_verifier_env *env,
9281
+ struct bpf_verifier_state *st)
9282
+{
9283
+ int i;
9284
+
9285
+ if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
9286
+ /* all regs in this state in all frames were already marked */
9287
+ return;
9288
+
9289
+ for (i = 0; i <= st->curframe; i++)
9290
+ clean_func_state(env, st->frame[i]);
9291
+}
9292
+
9293
+/* the parentage chains form a tree.
9294
+ * the verifier states are added to state lists at given insn and
9295
+ * pushed into state stack for future exploration.
9296
+ * when the verifier reaches bpf_exit insn some of the verifier states
9297
+ * stored in the state lists have their final liveness state already,
9298
+ * but a lot of states will get revised from liveness point of view when
9299
+ * the verifier explores other branches.
9300
+ * Example:
9301
+ * 1: r0 = 1
9302
+ * 2: if r1 == 100 goto pc+1
9303
+ * 3: r0 = 2
9304
+ * 4: exit
9305
+ * when the verifier reaches exit insn the register r0 in the state list of
9306
+ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch
9307
+ * of insn 2 and goes exploring further. At the insn 4 it will walk the
9308
+ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ.
9309
+ *
9310
+ * Since the verifier pushes the branch states as it sees them while exploring
9311
+ * the program, the condition of walking the branch instruction for the second
9312
+ * time means that all states below this branch were already explored and
9313
+ * their final liveness marks are already propagated.
9314
+ * Hence when the verifier completes the search of state list in is_state_visited()
9315
+ * we can call this clean_live_states() function to mark all liveness states
9316
+ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state'
9317
+ * will not be used.
9318
+ * This function also clears the registers and stack for states that !READ
9319
+ * to simplify state merging.
9320
+ *
9321
+ * Important note here that walking the same branch instruction in the callee
9322
+ * doesn't mean that the states are DONE. The verifier has to compare
9323
+ * the callsites
9324
+ */
9325
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
9326
+ struct bpf_verifier_state *cur)
9327
+{
9328
+ struct bpf_verifier_state_list *sl;
9329
+ int i;
9330
+
9331
+ sl = *explored_state(env, insn);
9332
+ while (sl) {
9333
+ if (sl->state.branches)
9334
+ goto next;
9335
+ if (sl->state.insn_idx != insn ||
9336
+ sl->state.curframe != cur->curframe)
9337
+ goto next;
9338
+ for (i = 0; i <= cur->curframe; i++)
9339
+ if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
9340
+ goto next;
9341
+ clean_verifier_state(env, &sl->state);
9342
+next:
9343
+ sl = sl->next;
9344
+ }
48199345 }
48209346
48219347 /* Returns true if (rold safe implies rcur safe) */
....@@ -4849,6 +9375,8 @@
48499375 if (env->explore_alu_limits)
48509376 return false;
48519377 if (rcur->type == SCALAR_VALUE) {
9378
+ if (!rold->precise)
9379
+ return true;
48529380 /* new val must satisfy old val knowledge */
48539381 return range_within(rold, rcur) &&
48549382 tnum_in(rold->var_off, rcur->var_off);
....@@ -4865,8 +9393,11 @@
48659393 case PTR_TO_MAP_VALUE:
48669394 /* If the new min/max/var_off satisfy the old ones and
48679395 * everything else matches, we are OK.
4868
- * We don't care about the 'id' value, because nothing
4869
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
9396
+ * 'id' is not compared, since it's only used for maps with
9397
+ * bpf_spin_lock inside map element and in such cases if
9398
+ * the rest of the prog is valid for one map element then
9399
+ * it's valid for all map elements regardless of the key
9400
+ * used in bpf_map_lookup()
48709401 */
48719402 return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
48729403 range_within(rold, rcur) &&
....@@ -4911,6 +9442,14 @@
49119442 case PTR_TO_CTX:
49129443 case CONST_PTR_TO_MAP:
49139444 case PTR_TO_PACKET_END:
9445
+ case PTR_TO_FLOW_KEYS:
9446
+ case PTR_TO_SOCKET:
9447
+ case PTR_TO_SOCKET_OR_NULL:
9448
+ case PTR_TO_SOCK_COMMON:
9449
+ case PTR_TO_SOCK_COMMON_OR_NULL:
9450
+ case PTR_TO_TCP_SOCK:
9451
+ case PTR_TO_TCP_SOCK_OR_NULL:
9452
+ case PTR_TO_XDP_SOCK:
49149453 /* Only valid matches are exact, which memcmp() above
49159454 * would have accepted
49169455 */
....@@ -4929,12 +9468,6 @@
49299468 {
49309469 int i, spi;
49319470
4932
- /* if explored stack has more populated slots than current stack
4933
- * such stacks are not equivalent
4934
- */
4935
- if (old->allocated_stack > cur->allocated_stack)
4936
- return false;
4937
-
49389471 /* walk slots of the explored stack and ignore any additional
49399472 * slots in the current stack, since explored(safe) state
49409473 * didn't use them
....@@ -4942,12 +9475,21 @@
49429475 for (i = 0; i < old->allocated_stack; i++) {
49439476 spi = i / BPF_REG_SIZE;
49449477
4945
- if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
9478
+ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
9479
+ i += BPF_REG_SIZE - 1;
49469480 /* explored state didn't use this */
49479481 continue;
9482
+ }
49489483
49499484 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
49509485 continue;
9486
+
9487
+ /* explored stack has more populated slots than current stack
9488
+ * and these slots were used
9489
+ */
9490
+ if (i >= cur->allocated_stack)
9491
+ return false;
9492
+
49519493 /* if old state was safe with misc data in the stack
49529494 * it will be safe with zero-initialized stack.
49539495 * The opposite is not true
....@@ -4958,14 +9500,14 @@
49589500 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
49599501 cur->stack[spi].slot_type[i % BPF_REG_SIZE])
49609502 /* Ex: old explored (safe) state has STACK_SPILL in
4961
- * this stack slot, but current has has STACK_MISC ->
9503
+ * this stack slot, but current has STACK_MISC ->
49629504 * this verifier states are not equivalent,
49639505 * return false to continue verification of this path
49649506 */
49659507 return false;
4966
- if (i % BPF_REG_SIZE)
9508
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
49679509 continue;
4968
- if (old->stack[spi].slot_type[0] != STACK_SPILL)
9510
+ if (!is_spilled_reg(&old->stack[spi]))
49699511 continue;
49709512 if (!regsafe(env, &old->stack[spi].spilled_ptr,
49719513 &cur->stack[spi].spilled_ptr, idmap))
....@@ -4982,6 +9524,14 @@
49829524 return false;
49839525 }
49849526 return true;
9527
+}
9528
+
9529
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
9530
+{
9531
+ if (old->acquired_refs != cur->acquired_refs)
9532
+ return false;
9533
+ return !memcmp(old->refs, cur->refs,
9534
+ sizeof(*old->refs) * old->acquired_refs);
49859535 }
49869536
49879537 /* compare two verifier states
....@@ -5024,6 +9574,9 @@
50249574 if (!stacksafe(env, old, cur, env->idmap_scratch))
50259575 return false;
50269576
9577
+ if (!refsafe(old, cur))
9578
+ return false;
9579
+
50279580 return true;
50289581 }
50299582
....@@ -5042,6 +9595,9 @@
50429595 if (old->speculative && !cur->speculative)
50439596 return false;
50449597
9598
+ if (old->active_spin_lock != cur->active_spin_lock)
9599
+ return false;
9600
+
50459601 /* for states to be equal callsites have to be the same
50469602 * and all frame states need to be equivalent
50479603 */
....@@ -5052,6 +9608,35 @@
50529608 return false;
50539609 }
50549610 return true;
9611
+}
9612
+
9613
+/* Return 0 if no propagation happened. Return negative error code if error
9614
+ * happened. Otherwise, return the propagated bit.
9615
+ */
9616
+static int propagate_liveness_reg(struct bpf_verifier_env *env,
9617
+ struct bpf_reg_state *reg,
9618
+ struct bpf_reg_state *parent_reg)
9619
+{
9620
+ u8 parent_flag = parent_reg->live & REG_LIVE_READ;
9621
+ u8 flag = reg->live & REG_LIVE_READ;
9622
+ int err;
9623
+
9624
+ /* By the time we get here, the read flags of PARENT_REG and REG can be any
9625
+ * of REG_LIVE_READ64, REG_LIVE_READ32 or REG_LIVE_NONE. There is no need
9626
+ * for propagation if PARENT_REG already has the strongest mark, REG_LIVE_READ64.
9627
+ */
9628
+ if (parent_flag == REG_LIVE_READ64 ||
9629
+ /* Or if there is no read flag from REG. */
9630
+ !flag ||
9631
+ /* Or if the read flag from REG is the same as PARENT_REG. */
9632
+ parent_flag == flag)
9633
+ return 0;
9634
+
9635
+ err = mark_reg_read(env, reg, parent_reg, flag);
9636
+ if (err)
9637
+ return err;
9638
+
9639
+ return flag;
50559640 }
50569641
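
propagate_liveness_reg() only forwards a read mark when it would actually tell the parent something new. A simplified standalone model of that decision, treating the marks as a three-level value instead of the kernel's REG_LIVE_* bit flags (purely illustrative):

#include <stdio.h>

/* Simplified model: 0 = no read, 1 = 32-bit read, 2 = 64-bit read.
 * Nothing is propagated when the parent already carries the strongest
 * mark, when the child read nothing, or when both marks are identical;
 * otherwise the child's mark is handed up the parentage chain.
 */
static int mark_to_propagate(int parent, int child)
{
	if (parent == 2)		/* parent already has the 64-bit mark */
		return 0;
	if (!child)			/* child never read the register */
		return 0;
	if (parent == child)		/* nothing new to record */
		return 0;
	return child;
}

int main(void)
{
	printf("%d\n", mark_to_propagate(1, 2));	/* 2: upgrade to 64-bit */
	printf("%d\n", mark_to_propagate(2, 1));	/* 0: already strongest */
	printf("%d\n", mark_to_propagate(0, 1));	/* 1: first read seen */
	return 0;
}
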
50579642 /* A write screens off any subsequent reads; but write marks come from the
....@@ -5065,8 +9650,9 @@
50659650 const struct bpf_verifier_state *vstate,
50669651 struct bpf_verifier_state *vparent)
50679652 {
5068
- int i, frame, err = 0;
9653
+ struct bpf_reg_state *state_reg, *parent_reg;
50699654 struct bpf_func_state *state, *parent;
9655
+ int i, frame, err = 0;
50709656
50719657 if (vparent->curframe != vstate->curframe) {
50729658 WARN(1, "propagate_live: parent frame %d current frame %d\n",
....@@ -5075,50 +9661,160 @@
50759661 }
50769662 /* Propagate read liveness of registers... */
50779663 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
5078
- /* We don't need to worry about FP liveness because it's read-only */
5079
- for (i = 0; i < BPF_REG_FP; i++) {
5080
- if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
5081
- continue;
5082
- if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
5083
- err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
5084
- &vparent->frame[vstate->curframe]->regs[i]);
5085
- if (err)
9664
+ for (frame = 0; frame <= vstate->curframe; frame++) {
9665
+ parent = vparent->frame[frame];
9666
+ state = vstate->frame[frame];
9667
+ parent_reg = parent->regs;
9668
+ state_reg = state->regs;
9669
+ /* We don't need to worry about FP liveness, it's read-only */
9670
+ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
9671
+ err = propagate_liveness_reg(env, &state_reg[i],
9672
+ &parent_reg[i]);
9673
+ if (err < 0)
9674
+ return err;
9675
+ if (err == REG_LIVE_READ64)
9676
+ mark_insn_zext(env, &parent_reg[i]);
9677
+ }
9678
+
9679
+ /* Propagate stack slots. */
9680
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
9681
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
9682
+ parent_reg = &parent->stack[i].spilled_ptr;
9683
+ state_reg = &state->stack[i].spilled_ptr;
9684
+ err = propagate_liveness_reg(env, state_reg,
9685
+ parent_reg);
9686
+ if (err < 0)
50869687 return err;
50879688 }
50889689 }
9690
+ return 0;
9691
+}
50899692
5090
- /* ... and stack slots */
5091
- for (frame = 0; frame <= vstate->curframe; frame++) {
5092
- state = vstate->frame[frame];
5093
- parent = vparent->frame[frame];
5094
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
5095
- i < parent->allocated_stack / BPF_REG_SIZE; i++) {
5096
- if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
9693
+/* find precise scalars in the previous equivalent state and
9694
+ * propagate them into the current state
9695
+ */
9696
+static int propagate_precision(struct bpf_verifier_env *env,
9697
+ const struct bpf_verifier_state *old)
9698
+{
9699
+ struct bpf_reg_state *state_reg;
9700
+ struct bpf_func_state *state;
9701
+ int i, err = 0, fr;
9702
+
9703
+ for (fr = old->curframe; fr >= 0; fr--) {
9704
+ state = old->frame[fr];
9705
+ state_reg = state->regs;
9706
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
9707
+ if (state_reg->type != SCALAR_VALUE ||
9708
+ !state_reg->precise ||
9709
+ !(state_reg->live & REG_LIVE_READ))
50979710 continue;
5098
- if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
5099
- mark_reg_read(env, &state->stack[i].spilled_ptr,
5100
- &parent->stack[i].spilled_ptr);
9711
+ if (env->log.level & BPF_LOG_LEVEL2)
9712
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
9713
+ err = mark_chain_precision_frame(env, fr, i);
9714
+ if (err < 0)
9715
+ return err;
9716
+ }
9717
+
9718
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
9719
+ if (!is_spilled_reg(&state->stack[i]))
9720
+ continue;
9721
+ state_reg = &state->stack[i].spilled_ptr;
9722
+ if (state_reg->type != SCALAR_VALUE ||
9723
+ !state_reg->precise ||
9724
+ !(state_reg->live & REG_LIVE_READ))
9725
+ continue;
9726
+ if (env->log.level & BPF_LOG_LEVEL2)
9727
+ verbose(env, "frame %d: propagating fp%d\n",
9728
+ fr, (-i - 1) * BPF_REG_SIZE);
9729
+ err = mark_chain_precision_stack_frame(env, fr, i);
9730
+ if (err < 0)
9731
+ return err;
51019732 }
51029733 }
5103
- return err;
9734
+ return 0;
51049735 }
9736
+
9737
+static bool states_maybe_looping(struct bpf_verifier_state *old,
9738
+ struct bpf_verifier_state *cur)
9739
+{
9740
+ struct bpf_func_state *fold, *fcur;
9741
+ int i, fr = cur->curframe;
9742
+
9743
+ if (old->curframe != fr)
9744
+ return false;
9745
+
9746
+ fold = old->frame[fr];
9747
+ fcur = cur->frame[fr];
9748
+ for (i = 0; i < MAX_BPF_REG; i++)
9749
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
9750
+ offsetof(struct bpf_reg_state, parent)))
9751
+ return false;
9752
+ return true;
9753
+}
9754
+
51059755
51069756 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
51079757 {
51089758 struct bpf_verifier_state_list *new_sl;
5109
- struct bpf_verifier_state_list *sl;
9759
+ struct bpf_verifier_state_list *sl, **pprev;
51109760 struct bpf_verifier_state *cur = env->cur_state, *new;
51119761 int i, j, err, states_cnt = 0;
9762
+ bool add_new_state = env->test_state_freq ? true : false;
51129763
5113
- sl = env->explored_states[insn_idx];
5114
- if (!sl)
9764
+ cur->last_insn_idx = env->prev_insn_idx;
9765
+ if (!env->insn_aux_data[insn_idx].prune_point)
51159766 /* this 'insn_idx' instruction wasn't marked, so we will not
51169767 * be doing state search here
51179768 */
51189769 return 0;
51199770
5120
- while (sl != STATE_LIST_MARK) {
9771
+ /* bpf progs typically have pruning point every 4 instructions
9772
+ * http://vger.kernel.org/bpfconf2019.html#session-1
9773
+ * Do not add new state for future pruning if the verifier hasn't seen
9774
+ * at least 2 jumps and at least 8 instructions.
9775
+ * This heuristic helps decrease the 'total_states' and 'peak_states' metrics.
9776
+ * In tests that amounts to up to a 50% reduction in total verifier
9777
+ * memory consumption and a 20% verifier time speedup.
9778
+ */
9779
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
9780
+ env->insn_processed - env->prev_insn_processed >= 8)
9781
+ add_new_state = true;
9782
+
9783
+ pprev = explored_state(env, insn_idx);
9784
+ sl = *pprev;
9785
+
9786
+ clean_live_states(env, insn_idx, cur);
9787
+
9788
+ while (sl) {
9789
+ states_cnt++;
9790
+ if (sl->state.insn_idx != insn_idx)
9791
+ goto next;
9792
+ if (sl->state.branches) {
9793
+ if (states_maybe_looping(&sl->state, cur) &&
9794
+ states_equal(env, &sl->state, cur)) {
9795
+ verbose_linfo(env, insn_idx, "; ");
9796
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
9797
+ return -EINVAL;
9798
+ }
9799
+ /* if the verifier is processing a loop, avoid adding new state
9800
+ * too often, since different loop iterations have distinct
9801
+ * states and may not help future pruning.
9802
+ * This threshold shouldn't be too low to make sure that
9803
+ * a loop with large bound will be rejected quickly.
9804
+ * The most abusive loop will be:
9805
+ * r1 += 1
9806
+ * if r1 < 1000000 goto pc-2
9807
+ * 1M insn_processed limit / 100 == 10k peak states.
9808
+ * This threshold shouldn't be too high either, since states
9809
+ * at the end of the loop are likely to be useful in pruning.
9810
+ */
9811
+ if (env->jmps_processed - env->prev_jmps_processed < 20 &&
9812
+ env->insn_processed - env->prev_insn_processed < 100)
9813
+ add_new_state = false;
9814
+ goto miss;
9815
+ }
51219816 if (states_equal(env, &sl->state, cur)) {
9817
+ sl->hit_cnt++;
51229818 /* reached equivalent register/stack state,
51239819 * prune the search.
51249820 * Registers read by the continuation are read by us.
....@@ -5130,27 +9826,91 @@
51309826 * this state and will pop a new one.
51319827 */
51329828 err = propagate_liveness(env, &sl->state, cur);
9829
+
9830
+ /* if previous state reached the exit with precision and
9831
+ * current state is equivalent to it (except precision marks)
9832
+ * the precision needs to be propagated back in
9833
+ * the current state.
9834
+ */
9835
+ err = err ? : push_jmp_history(env, cur);
9836
+ err = err ? : propagate_precision(env, &sl->state);
51339837 if (err)
51349838 return err;
51359839 return 1;
51369840 }
5137
- sl = sl->next;
5138
- states_cnt++;
9841
+miss:
9842
+ /* when a new state is not going to be added, do not increase the miss count.
9843
+ * Otherwise several loop iterations will remove the state
9844
+ * recorded earlier. The goal of these heuristics is to have
9845
+ * states from some iterations of the loop (some in the beginning
9846
+ * and some at the end) to help pruning.
9847
+ */
9848
+ if (add_new_state)
9849
+ sl->miss_cnt++;
9850
+ /* heuristic to determine whether this state is beneficial
9851
+ * to keep checking from state equivalence point of view.
9852
+ * Higher numbers increase max_states_per_insn and verification time,
9853
+ * but do not meaningfully decrease insn_processed.
9854
+ */
9855
+ if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
9856
+ /* the state is unlikely to be useful. Remove it to
9857
+ * speed up verification
9858
+ */
9859
+ *pprev = sl->next;
9860
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
9861
+ u32 br = sl->state.branches;
9862
+
9863
+ WARN_ONCE(br,
9864
+ "BUG live_done but branches_to_explore %d\n",
9865
+ br);
9866
+ free_verifier_state(&sl->state, false);
9867
+ kfree(sl);
9868
+ env->peak_states--;
9869
+ } else {
9870
+ /* cannot free this state, since parentage chain may
9871
+ * walk it later. Add it for free_list instead to
9872
+ * be freed at the end of verification
9873
+ */
9874
+ sl->next = env->free_list;
9875
+ env->free_list = sl;
9876
+ }
9877
+ sl = *pprev;
9878
+ continue;
9879
+ }
9880
+next:
9881
+ pprev = &sl->next;
9882
+ sl = *pprev;
51399883 }
51409884
5141
- if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
5142
- return 0;
9885
+ if (env->max_states_per_insn < states_cnt)
9886
+ env->max_states_per_insn = states_cnt;
51439887
5144
- /* there were no equivalent states, remember current one.
5145
- * technically the current state is not proven to be safe yet,
9888
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
9889
+ return push_jmp_history(env, cur);
9890
+
9891
+ if (!add_new_state)
9892
+ return push_jmp_history(env, cur);
9893
+
9894
+ /* There were no equivalent states, remember the current one.
9895
+ * Technically the current state is not proven to be safe yet,
51469896 * but it will either reach outer most bpf_exit (which means it's safe)
5147
- * or it will be rejected. Since there are no loops, we won't be
9897
+ * or it will be rejected. When there are no loops the verifier won't be
51489898 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
5149
- * again on the way to bpf_exit
9899
+ * again on the way to bpf_exit.
9900
+ * When looping the sl->state.branches will be > 0 and this state
9901
+ * will not be considered for equivalence until branches == 0.
51509902 */
51519903 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
51529904 if (!new_sl)
51539905 return -ENOMEM;
9906
+ env->total_states++;
9907
+ env->peak_states++;
9908
+ env->prev_jmps_processed = env->jmps_processed;
9909
+ env->prev_insn_processed = env->insn_processed;
9910
+
9911
+ /* forget precise markings we inherited, see __mark_chain_precision */
9912
+ if (env->bpf_capable)
9913
+ mark_all_scalars_imprecise(env, cur);
51549914
51559915 /* add new state to the head of linked list */
51569916 new = &new_sl->state;
....@@ -5160,19 +9920,34 @@
51609920 kfree(new_sl);
51619921 return err;
51629922 }
5163
- new_sl->next = env->explored_states[insn_idx];
5164
- env->explored_states[insn_idx] = new_sl;
5165
- /* connect new state to parentage chain */
5166
- for (i = 0; i < BPF_REG_FP; i++)
5167
- cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
9923
+ new->insn_idx = insn_idx;
9924
+ WARN_ONCE(new->branches != 1,
9925
+ "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
9926
+
9927
+ cur->parent = new;
9928
+ cur->first_insn_idx = insn_idx;
9929
+ clear_jmp_history(cur);
9930
+ new_sl->next = *explored_state(env, insn_idx);
9931
+ *explored_state(env, insn_idx) = new_sl;
9932
+ /* connect new state to parentage chain. Current frame needs all
9933
+ * registers connected. Only r6 - r9 of the callers are alive (pushed
9934
+ * to the stack implicitly by JITs) so in callers' frames connect just
9935
+ * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
9936
+ * the state of the call instruction (with WRITTEN set), and r0 comes
9937
+ * from callee with its full parentage chain, anyway.
9938
+ */
51689939 /* clear write marks in current state: the writes we did are not writes
51699940 * our child did, so they don't screen off its reads from us.
51709941 * (There are no read marks in current state, because reads always mark
51719942 * their parent and current state never has children yet. Only
51729943 * explored_states can get read marks.)
51739944 */
5174
- for (i = 0; i < BPF_REG_FP; i++)
5175
- cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
9945
+ for (j = 0; j <= cur->curframe; j++) {
9946
+ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
9947
+ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
9948
+ for (i = 0; i < BPF_REG_FP; i++)
9949
+ cur->frame[j]->regs[i].live = REG_LIVE_NONE;
9950
+ }
51769951
51779952 /* all stack frames are accessible from callee, clear them all */
51789953 for (j = 0; j <= cur->curframe; j++) {
....@@ -5188,36 +9963,60 @@
51889963 return 0;
51899964 }
51909965
9966
+/* Return true if it's OK to have the same insn return a different type. */
9967
+static bool reg_type_mismatch_ok(enum bpf_reg_type type)
9968
+{
9969
+ switch (type) {
9970
+ case PTR_TO_CTX:
9971
+ case PTR_TO_SOCKET:
9972
+ case PTR_TO_SOCKET_OR_NULL:
9973
+ case PTR_TO_SOCK_COMMON:
9974
+ case PTR_TO_SOCK_COMMON_OR_NULL:
9975
+ case PTR_TO_TCP_SOCK:
9976
+ case PTR_TO_TCP_SOCK_OR_NULL:
9977
+ case PTR_TO_XDP_SOCK:
9978
+ case PTR_TO_BTF_ID:
9979
+ case PTR_TO_BTF_ID_OR_NULL:
9980
+ return false;
9981
+ default:
9982
+ return true;
9983
+ }
9984
+}
9985
+
9986
+/* If an instruction was previously used with particular pointer types, then we
9987
+ * need to be careful to avoid cases such as the below, where it may be ok
9988
+ * for one branch accessing the pointer, but not ok for the other branch:
9989
+ *
9990
+ * R1 = sock_ptr
9991
+ * goto X;
9992
+ * ...
9993
+ * R1 = some_other_valid_ptr;
9994
+ * goto X;
9995
+ * ...
9996
+ * R2 = *(u32 *)(R1 + 0);
9997
+ */
9998
+static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
9999
+{
10000
+ return src != prev && (!reg_type_mismatch_ok(src) ||
10001
+ !reg_type_mismatch_ok(prev));
10002
+}
10003
+
519110004 static int do_check(struct bpf_verifier_env *env)
519210005 {
5193
- struct bpf_verifier_state *state;
10006
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
10007
+ struct bpf_verifier_state *state = env->cur_state;
519410008 struct bpf_insn *insns = env->prog->insnsi;
519510009 struct bpf_reg_state *regs;
5196
- int insn_cnt = env->prog->len, i;
5197
- int insn_processed = 0;
10010
+ int insn_cnt = env->prog->len;
519810011 bool do_print_state = false;
5199
-
5200
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
5201
- if (!state)
5202
- return -ENOMEM;
5203
- state->curframe = 0;
5204
- state->speculative = false;
5205
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
5206
- if (!state->frame[0]) {
5207
- kfree(state);
5208
- return -ENOMEM;
5209
- }
5210
- env->cur_state = state;
5211
- init_func_state(env, state->frame[0],
5212
- BPF_MAIN_FUNC /* callsite */,
5213
- 0 /* frameno */,
5214
- 0 /* subprogno, zero == main subprog */);
10012
+ int prev_insn_idx = -1;
521510013
521610014 for (;;) {
521710015 struct bpf_insn *insn;
521810016 u8 class;
521910017 int err;
522010018
10019
+ env->prev_insn_idx = prev_insn_idx;
522110020 if (env->insn_idx >= insn_cnt) {
522210021 verbose(env, "invalid insn idx %d insn_cnt %d\n",
522310022 env->insn_idx, insn_cnt);
....@@ -5227,10 +10026,10 @@
522710026 insn = &insns[env->insn_idx];
522810027 class = BPF_CLASS(insn->code);
522910028
5230
- if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
10029
+ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
523110030 verbose(env,
523210031 "BPF program is too large. Processed %d insn\n",
5233
- insn_processed);
10032
+ env->insn_processed);
523410033 return -E2BIG;
523510034 }
523610035
....@@ -5239,7 +10038,7 @@
523910038 return err;
524010039 if (err == 1) {
524110040 /* found equivalent state, can prune the search */
5242
- if (env->log.level) {
10041
+ if (env->log.level & BPF_LOG_LEVEL) {
524310042 if (do_print_state)
524410043 verbose(env, "\nfrom %d to %d%s: safe\n",
524510044 env->prev_insn_idx, env->insn_idx,
....@@ -5257,8 +10056,9 @@
525710056 if (need_resched())
525810057 cond_resched();
525910058
5260
- if (env->log.level > 1 || (env->log.level && do_print_state)) {
5261
- if (env->log.level > 1)
10059
+ if (env->log.level & BPF_LOG_LEVEL2 ||
10060
+ (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
10061
+ if (env->log.level & BPF_LOG_LEVEL2)
526210062 verbose(env, "%d:", env->insn_idx);
526310063 else
526410064 verbose(env, "\nfrom %d to %d%s:",
....@@ -5269,12 +10069,13 @@
526910069 do_print_state = false;
527010070 }
527110071
5272
- if (env->log.level) {
10072
+ if (env->log.level & BPF_LOG_LEVEL) {
527310073 const struct bpf_insn_cbs cbs = {
527410074 .cb_print = verbose,
527510075 .private_data = env,
527610076 };
527710077
10078
+ verbose_linfo(env, env->insn_idx, "; ");
527810079 verbose(env, "%d: ", env->insn_idx);
527910080 print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
528010081 }
....@@ -5288,6 +10089,7 @@
528810089
528910090 regs = cur_regs(env);
529010091 sanitize_mark_insn_seen(env);
10092
+ prev_insn_idx = env->insn_idx;
529110093
529210094 if (class == BPF_ALU || class == BPF_ALU64) {
529310095 err = check_alu_op(env, insn);
....@@ -5328,9 +10130,7 @@
532810130 */
532910131 *prev_src_type = src_reg_type;
533010132
5331
- } else if (src_reg_type != *prev_src_type &&
5332
- (src_reg_type == PTR_TO_CTX ||
5333
- *prev_src_type == PTR_TO_CTX)) {
10133
+ } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
533410134 /* Abuser program is trying to use the same insn
533510135 * dst_reg = *(u32*) (src_reg + off)
533610136 * with different pointer types:
....@@ -5375,9 +10175,7 @@
537510175
537610176 if (*prev_dst_type == NOT_INIT) {
537710177 *prev_dst_type = dst_reg_type;
5378
- } else if (dst_reg_type != *prev_dst_type &&
5379
- (dst_reg_type == PTR_TO_CTX ||
5380
- *prev_dst_type == PTR_TO_CTX)) {
10178
+ } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
538110179 verbose(env, "same insn cannot be used with different pointers\n");
538210180 return -EINVAL;
538310181 }
....@@ -5394,8 +10192,9 @@
539410192 return err;
539510193
539610194 if (is_ctx_reg(env, insn->dst_reg)) {
5397
- verbose(env, "BPF_ST stores into R%d context is not allowed\n",
5398
- insn->dst_reg);
10195
+ verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
10196
+ insn->dst_reg,
10197
+ reg_type_str[reg_state(env, insn->dst_reg)->type]);
539910198 return -EACCES;
540010199 }
540110200
....@@ -5406,19 +10205,27 @@
540610205 if (err)
540710206 return err;
540810207
5409
- } else if (class == BPF_JMP) {
10208
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
541010209 u8 opcode = BPF_OP(insn->code);
541110210
10211
+ env->jmps_processed++;
541210212 if (opcode == BPF_CALL) {
541310213 if (BPF_SRC(insn->code) != BPF_K ||
541410214 insn->off != 0 ||
541510215 (insn->src_reg != BPF_REG_0 &&
541610216 insn->src_reg != BPF_PSEUDO_CALL) ||
5417
- insn->dst_reg != BPF_REG_0) {
10217
+ insn->dst_reg != BPF_REG_0 ||
10218
+ class == BPF_JMP32) {
541810219 verbose(env, "BPF_CALL uses reserved fields\n");
541910220 return -EINVAL;
542010221 }
542110222
10223
+ if (env->cur_state->active_spin_lock &&
10224
+ (insn->src_reg == BPF_PSEUDO_CALL ||
10225
+ insn->imm != BPF_FUNC_spin_unlock)) {
10226
+ verbose(env, "function calls are not allowed while holding a lock\n");
10227
+ return -EINVAL;
10228
+ }
542210229 if (insn->src_reg == BPF_PSEUDO_CALL)
542310230 err = check_func_call(env, insn, &env->insn_idx);
542410231 else
....@@ -5430,7 +10237,8 @@
543010237 if (BPF_SRC(insn->code) != BPF_K ||
543110238 insn->imm != 0 ||
543210239 insn->src_reg != BPF_REG_0 ||
5433
- insn->dst_reg != BPF_REG_0) {
10240
+ insn->dst_reg != BPF_REG_0 ||
10241
+ class == BPF_JMP32) {
543410242 verbose(env, "BPF_JA uses reserved fields\n");
543510243 return -EINVAL;
543610244 }
....@@ -5442,14 +10250,19 @@
544210250 if (BPF_SRC(insn->code) != BPF_K ||
544310251 insn->imm != 0 ||
544410252 insn->src_reg != BPF_REG_0 ||
5445
- insn->dst_reg != BPF_REG_0) {
10253
+ insn->dst_reg != BPF_REG_0 ||
10254
+ class == BPF_JMP32) {
544610255 verbose(env, "BPF_EXIT uses reserved fields\n");
10256
+ return -EINVAL;
10257
+ }
10258
+
10259
+ if (env->cur_state->active_spin_lock) {
10260
+ verbose(env, "bpf_spin_unlock is missing\n");
544710261 return -EINVAL;
544810262 }
544910263
545010264 if (state->curframe) {
545110265 /* exit from nested function */
5452
- env->prev_insn_idx = env->insn_idx;
545310266 err = prepare_func_exit(env, &env->insn_idx);
545410267 if (err)
545510268 return err;
....@@ -5457,27 +10270,17 @@
545710270 continue;
545810271 }
545910272
5460
- /* eBPF calling convetion is such that R0 is used
5461
- * to return the value from eBPF program.
5462
- * Make sure that it's readable at this time
5463
- * of bpf_exit, which means that program wrote
5464
- * something into it earlier
5465
- */
5466
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
10273
+ err = check_reference_leak(env);
546710274 if (err)
546810275 return err;
5469
-
5470
- if (is_pointer_value(env, BPF_REG_0)) {
5471
- verbose(env, "R0 leaks addr as return value\n");
5472
- return -EACCES;
5473
- }
547410276
547510277 err = check_return_code(env);
547610278 if (err)
547710279 return err;
547810280 process_bpf_exit:
5479
- err = pop_stack(env, &env->prev_insn_idx,
5480
- &env->insn_idx);
10281
+ update_branch_counts(env, env->cur_state);
10282
+ err = pop_stack(env, &prev_insn_idx,
10283
+ &env->insn_idx, pop_log);
548110284 if (err < 0) {
548210285 if (err != -ENOENT)
548310286 return err;
....@@ -5518,17 +10321,93 @@
551810321 env->insn_idx++;
551910322 }
552010323
5521
- verbose(env, "processed %d insns (limit %d), stack depth ",
5522
- insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
5523
- for (i = 0; i < env->subprog_cnt; i++) {
5524
- u32 depth = env->subprog_info[i].stack_depth;
10324
+ return 0;
10325
+}
552510326
5526
- verbose(env, "%d", depth);
5527
- if (i + 1 < env->subprog_cnt)
5528
- verbose(env, "+");
10327
+/* replace pseudo btf_id with kernel symbol address */
10328
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
10329
+ struct bpf_insn *insn,
10330
+ struct bpf_insn_aux_data *aux)
10331
+{
10332
+ const struct btf_var_secinfo *vsi;
10333
+ const struct btf_type *datasec;
10334
+ const struct btf_type *t;
10335
+ const char *sym_name;
10336
+ bool percpu = false;
10337
+ u32 type, id = insn->imm;
10338
+ s32 datasec_id;
10339
+ u64 addr;
10340
+ int i;
10341
+
10342
+ if (!btf_vmlinux) {
10343
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
10344
+ return -EINVAL;
552910345 }
5530
- verbose(env, "\n");
5531
- env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
10346
+
10347
+ if (insn[1].imm != 0) {
10348
+ verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
10349
+ return -EINVAL;
10350
+ }
10351
+
10352
+ t = btf_type_by_id(btf_vmlinux, id);
10353
+ if (!t) {
10354
+ verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
10355
+ return -ENOENT;
10356
+ }
10357
+
10358
+ if (!btf_type_is_var(t)) {
10359
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
10360
+ id);
10361
+ return -EINVAL;
10362
+ }
10363
+
10364
+ sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
10365
+ addr = kallsyms_lookup_name(sym_name);
10366
+ if (!addr) {
10367
+ verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
10368
+ sym_name);
10369
+ return -ENOENT;
10370
+ }
10371
+
10372
+ datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
10373
+ BTF_KIND_DATASEC);
10374
+ if (datasec_id > 0) {
10375
+ datasec = btf_type_by_id(btf_vmlinux, datasec_id);
10376
+ for_each_vsi(i, datasec, vsi) {
10377
+ if (vsi->type == id) {
10378
+ percpu = true;
10379
+ break;
10380
+ }
10381
+ }
10382
+ }
10383
+
10384
+ insn[0].imm = (u32)addr;
10385
+ insn[1].imm = addr >> 32;
10386
+
10387
+ type = t->type;
10388
+ t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
10389
+ if (percpu) {
10390
+ aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
10391
+ aux->btf_var.btf_id = type;
10392
+ } else if (!btf_type_is_struct(t)) {
10393
+ const struct btf_type *ret;
10394
+ const char *tname;
10395
+ u32 tsize;
10396
+
10397
+ /* resolve the type size of ksym. */
10398
+ ret = btf_resolve_size(btf_vmlinux, t, &tsize);
10399
+ if (IS_ERR(ret)) {
10400
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
10401
+ verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
10402
+ tname, PTR_ERR(ret));
10403
+ return -EINVAL;
10404
+ }
10405
+ aux->btf_var.reg_type = PTR_TO_MEM;
10406
+ aux->btf_var.mem_size = tsize;
10407
+ } else {
10408
+ aux->btf_var.reg_type = PTR_TO_BTF_ID;
10409
+ aux->btf_var.btf_id = type;
10410
+ }
553210411 return 0;
553310412 }
553410413
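
On the program side, check_pseudo_btf_id() is what backs BTF-typed kernel symbols ("ksyms") as emitted by libbpf for __ksym externs. A hedged sketch (assumes a vmlinux.h generated from the running kernel's BTF and an SMP kernel for the cpu field; the symbol choice mirrors the selftests and is only an illustration):

/* prog.c - built with: clang -O2 -g -target bpf -c prog.c */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* libbpf turns this extern into a ld_imm64 with src_reg == BPF_PSEUDO_BTF_ID;
 * the verifier resolves it to the kernel address of 'runqueues' and, since it
 * lives in .data..percpu, types it as PTR_TO_PERCPU_BTF_ID.
 */
extern const struct rq runqueues __ksym;

SEC("raw_tp/sys_enter")
int dump_this_rq(const void *ctx)
{
	struct rq *rq = bpf_this_cpu_ptr(&runqueues);

	/* rq is now a PTR_TO_BTF_ID for this CPU's runqueue */
	bpf_printk("cpu %d nr_running %u", rq->cpu, rq->nr_running);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
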
....@@ -5540,26 +10419,69 @@
554010419 !(map->map_flags & BPF_F_NO_PREALLOC);
554110420 }
554210421
10422
+static bool is_tracing_prog_type(enum bpf_prog_type type)
10423
+{
10424
+ switch (type) {
10425
+ case BPF_PROG_TYPE_KPROBE:
10426
+ case BPF_PROG_TYPE_TRACEPOINT:
10427
+ case BPF_PROG_TYPE_PERF_EVENT:
10428
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
10429
+ return true;
10430
+ default:
10431
+ return false;
10432
+ }
10433
+}
10434
+
10435
+static bool is_preallocated_map(struct bpf_map *map)
10436
+{
10437
+ if (!check_map_prealloc(map))
10438
+ return false;
10439
+ if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
10440
+ return false;
10441
+ return true;
10442
+}
10443
+
554310444 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
554410445 struct bpf_map *map,
554510446 struct bpf_prog *prog)
554610447
554710448 {
5548
- /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
5549
- * preallocated hash maps, since doing memory allocation
5550
- * in overflow_handler can crash depending on where nmi got
5551
- * triggered.
10449
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
10450
+ /*
10451
+ * Validate that trace type programs use preallocated hash maps.
10452
+ *
10453
+ * For programs attached to PERF events this is mandatory as the
10454
+ * perf NMI can hit any arbitrary code sequence.
10455
+ *
10456
+ * All other trace types using preallocated hash maps are unsafe as
10457
+ * well because tracepoint or kprobes can be inside locked regions
10458
+ * of the memory allocator or at a place where a recursion into the
10459
+ * memory allocator would see inconsistent state.
10460
+ *
10461
+ * On RT enabled kernels run-time allocation of all trace type
10462
+ * programs is strictly prohibited due to lock type constraints. On
10463
+ * !RT kernels it is allowed for backwards compatibility reasons for
10464
+ * now, but warnings are emitted so developers are made aware of
10465
+ * the unsafety and can fix their programs before this is enforced.
555210466 */
5553
- if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
5554
- if (!check_map_prealloc(map)) {
10467
+ if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) {
10468
+ if (prog_type == BPF_PROG_TYPE_PERF_EVENT) {
555510469 verbose(env, "perf_event programs can only use preallocated hash map\n");
555610470 return -EINVAL;
555710471 }
5558
- if (map->inner_map_meta &&
5559
- !check_map_prealloc(map->inner_map_meta)) {
5560
- verbose(env, "perf_event programs can only use preallocated inner hash map\n");
10472
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
10473
+ verbose(env, "trace type programs can only use preallocated hash map\n");
556110474 return -EINVAL;
556210475 }
10476
+ WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
10477
+ verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
10478
+ }
10479
+
10480
+ if ((is_tracing_prog_type(prog_type) ||
10481
+ prog_type == BPF_PROG_TYPE_SOCKET_FILTER) &&
10482
+ map_value_has_spin_lock(map)) {
10483
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
10484
+ return -EINVAL;
556310485 }
556410486
556510487 if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
....@@ -5568,13 +10490,45 @@
556810490 return -EINVAL;
556910491 }
557010492
10493
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
10494
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
10495
+ return -EINVAL;
10496
+ }
10497
+
10498
+ if (prog->aux->sleepable)
10499
+ switch (map->map_type) {
10500
+ case BPF_MAP_TYPE_HASH:
10501
+ case BPF_MAP_TYPE_LRU_HASH:
10502
+ case BPF_MAP_TYPE_ARRAY:
10503
+ if (!is_preallocated_map(map)) {
10504
+ verbose(env,
10505
+ "Sleepable programs can only use preallocated hash maps\n");
10506
+ return -EINVAL;
10507
+ }
10508
+ break;
10509
+ default:
10510
+ verbose(env,
10511
+ "Sleepable programs can only use array and hash maps\n");
10512
+ return -EINVAL;
10513
+ }
10514
+
557110515 return 0;
557210516 }
557310517
5574
-/* look for pseudo eBPF instructions that access map FDs and
5575
- * replace them with actual map pointers
10518
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
10519
+{
10520
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
10521
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
10522
+}
10523
+
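
resolve_pseudo_ldimm64() below consumes the two-instruction ld_imm64 form that loaders emit to refer to a map. A hedged sketch of that encoding from the userspace side (the helper is illustrative; libbpf normally performs this relocation for you):

#include <linux/bpf.h>

/* Illustrative only: build the ld_imm64 pair that refers to a map by fd.
 * src_reg == BPF_PSEUDO_MAP_FD means insn[0].imm is a map fd that the
 * verifier replaces with the map pointer; with BPF_PSEUDO_MAP_VALUE the
 * second half additionally carries an offset for direct value access.
 */
static void emit_ld_map_fd(struct bpf_insn insn[2], __u8 dst_reg, int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = dst_reg,
		.src_reg = BPF_PSEUDO_MAP_FD,
		.imm     = map_fd,
	};
	/* second half of the 64-bit immediate: all fields zero for a map fd */
	insn[1] = (struct bpf_insn) { .code = 0 };
}
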
10524
+/* find and rewrite pseudo imm in ld_imm64 instructions:
10525
+ *
10526
+ * 1. if it accesses map FD, replace it with actual map pointer.
10527
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
10528
+ *
10529
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
557610530 */
5577
-static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
10531
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
557810532 {
557910533 struct bpf_insn *insn = env->prog->insnsi;
558010534 int insn_cnt = env->prog->len;
....@@ -5599,8 +10553,10 @@
559910553 }
560010554
560110555 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
10556
+ struct bpf_insn_aux_data *aux;
560210557 struct bpf_map *map;
560310558 struct fd f;
10559
+ u64 addr;
560410560
560510561 if (i == insn_cnt - 1 || insn[1].code != 0 ||
560610562 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
....@@ -5609,21 +10565,35 @@
560910565 return -EINVAL;
561010566 }
561110567
5612
- if (insn->src_reg == 0)
10568
+ if (insn[0].src_reg == 0)
561310569 /* valid generic load 64-bit imm */
561410570 goto next_insn;
561510571
5616
- if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
10572
+ if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
10573
+ aux = &env->insn_aux_data[i];
10574
+ err = check_pseudo_btf_id(env, insn, aux);
10575
+ if (err)
10576
+ return err;
10577
+ goto next_insn;
10578
+ }
10579
+
10580
+ /* In final convert_pseudo_ld_imm64() step, this is
10581
+ * converted into regular 64-bit imm load insn.
10582
+ */
10583
+ if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
10584
+ insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
10585
+ (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
10586
+ insn[1].imm != 0)) {
561710587 verbose(env,
561810588 "unrecognized bpf_ld_imm64 insn\n");
561910589 return -EINVAL;
562010590 }
562110591
5622
- f = fdget(insn->imm);
10592
+ f = fdget(insn[0].imm);
562310593 map = __bpf_map_get(f);
562410594 if (IS_ERR(map)) {
562510595 verbose(env, "fd %d is not pointing to valid bpf_map\n",
5626
- insn->imm);
10596
+ insn[0].imm);
562710597 return PTR_ERR(map);
562810598 }
562910599
....@@ -5633,16 +10603,47 @@
563310603 return err;
563410604 }
563510605
5636
- /* store map pointer inside BPF_LD_IMM64 instruction */
5637
- insn[0].imm = (u32) (unsigned long) map;
5638
- insn[1].imm = ((u64) (unsigned long) map) >> 32;
10606
+ aux = &env->insn_aux_data[i];
10607
+ if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
10608
+ addr = (unsigned long)map;
10609
+ } else {
10610
+ u32 off = insn[1].imm;
10611
+
10612
+ if (off >= BPF_MAX_VAR_OFF) {
10613
+ verbose(env, "direct value offset of %u is not allowed\n", off);
10614
+ fdput(f);
10615
+ return -EINVAL;
10616
+ }
10617
+
10618
+ if (!map->ops->map_direct_value_addr) {
10619
+ verbose(env, "no direct value access support for this map type\n");
10620
+ fdput(f);
10621
+ return -EINVAL;
10622
+ }
10623
+
10624
+ err = map->ops->map_direct_value_addr(map, &addr, off);
10625
+ if (err) {
10626
+ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
10627
+ map->value_size, off);
10628
+ fdput(f);
10629
+ return err;
10630
+ }
10631
+
10632
+ aux->map_off = off;
10633
+ addr += off;
10634
+ }
10635
+
10636
+ insn[0].imm = (u32)addr;
10637
+ insn[1].imm = addr >> 32;
563910638
564010639 /* check whether we recorded this map already */
5641
- for (j = 0; j < env->used_map_cnt; j++)
10640
+ for (j = 0; j < env->used_map_cnt; j++) {
564210641 if (env->used_maps[j] == map) {
10642
+ aux->map_index = j;
564310643 fdput(f);
564410644 goto next_insn;
564510645 }
10646
+ }
564610647
564710648 if (env->used_map_cnt >= MAX_USED_MAPS) {
564810649 fdput(f);
....@@ -5654,17 +10655,14 @@
565410655 * will be used by the valid program until it's unloaded
565510656 * and all maps are released in free_used_maps()
565610657 */
5657
- map = bpf_map_inc(map, false);
5658
- if (IS_ERR(map)) {
5659
- fdput(f);
5660
- return PTR_ERR(map);
5661
- }
10658
+ bpf_map_inc(map);
10659
+
10660
+ aux->map_index = env->used_map_cnt;
566210661 env->used_maps[env->used_map_cnt++] = map;
566310662
5664
- if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
5665
- bpf_cgroup_storage_assign(env->prog, map)) {
5666
- verbose(env,
5667
- "only one cgroup storage is allowed\n");
10663
+ if (bpf_map_is_cgroup_storage(map) &&
10664
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
10665
+ verbose(env, "only one cgroup storage of each type is allowed\n");
566810666 fdput(f);
566910667 return -EBUSY;
567010668 }
....@@ -5693,14 +10691,8 @@
569310691 /* drop refcnt of maps used by the rejected program */
569410692 static void release_maps(struct bpf_verifier_env *env)
569510693 {
5696
- int i;
5697
-
5698
- if (env->prog->aux->cgroup_storage)
5699
- bpf_cgroup_storage_release(env->prog,
5700
- env->prog->aux->cgroup_storage);
5701
-
5702
- for (i = 0; i < env->used_map_cnt; i++)
5703
- bpf_map_put(env->used_maps[i]);
10694
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
10695
+ env->used_map_cnt);
570410696 }
570510697
570610698 /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
....@@ -5719,29 +10711,36 @@
571910711 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
572010712 * [0, off) and [off, end) to new locations, so the patched range stays zero
572110713 */
5722
-static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
5723
- u32 off, u32 cnt)
10714
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
10715
+ struct bpf_insn_aux_data *new_data,
10716
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
572410717 {
5725
- struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
5726
- bool old_seen = old_data[off].seen;
10718
+ struct bpf_insn_aux_data *old_data = env->insn_aux_data;
10719
+ struct bpf_insn *insn = new_prog->insnsi;
10720
+ u32 old_seen = old_data[off].seen;
10721
+ u32 prog_len;
572710722 int i;
572810723
10724
+ /* aux info at OFF always needs adjustment, no matter fast path
10725
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
10726
+ * original insn at old prog.
10727
+ */
10728
+ old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
10729
+
572910730 if (cnt == 1)
5730
- return 0;
5731
- new_data = vzalloc(array_size(prog_len,
5732
- sizeof(struct bpf_insn_aux_data)));
5733
- if (!new_data)
5734
- return -ENOMEM;
10731
+ return;
10732
+ prog_len = new_prog->len;
10733
+
573510734 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
573610735 memcpy(new_data + off + cnt - 1, old_data + off,
573710736 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
573810737 for (i = off; i < off + cnt - 1; i++) {
573910738 /* Expand insni[off]'s seen count to the patched range. */
574010739 new_data[i].seen = old_seen;
10740
+ new_data[i].zext_dst = insn_has_def32(env, insn + i);
574110741 }
574210742 env->insn_aux_data = new_data;
574310743 vfree(old_data);
5744
- return 0;
574510744 }
574610745
574710746 static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
....@@ -5758,18 +10757,193 @@
575810757 }
575910758 }
576010759
10760
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
10761
+{
10762
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
10763
+ int i, sz = prog->aux->size_poke_tab;
10764
+ struct bpf_jit_poke_descriptor *desc;
10765
+
10766
+ for (i = 0; i < sz; i++) {
10767
+ desc = &tab[i];
10768
+ if (desc->insn_idx <= off)
10769
+ continue;
10770
+ desc->insn_idx += len - 1;
10771
+ }
10772
+}
10773
+
576110774 static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
576210775 const struct bpf_insn *patch, u32 len)
576310776 {
576410777 struct bpf_prog *new_prog;
10778
+ struct bpf_insn_aux_data *new_data = NULL;
10779
+
10780
+ if (len > 1) {
10781
+ new_data = vzalloc(array_size(env->prog->len + len - 1,
10782
+ sizeof(struct bpf_insn_aux_data)));
10783
+ if (!new_data)
10784
+ return NULL;
10785
+ }
576510786
576610787 new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
5767
- if (!new_prog)
10788
+ if (IS_ERR(new_prog)) {
10789
+ if (PTR_ERR(new_prog) == -ERANGE)
10790
+ verbose(env,
10791
+ "insn %d cannot be patched due to 16-bit range\n",
10792
+ env->insn_aux_data[off].orig_idx);
10793
+ vfree(new_data);
576810794 return NULL;
5769
- if (adjust_insn_aux_data(env, new_prog->len, off, len))
5770
- return NULL;
10795
+ }
10796
+ adjust_insn_aux_data(env, new_data, new_prog, off, len);
577110797 adjust_subprog_starts(env, off, len);
10798
+ adjust_poke_descs(new_prog, off, len);
577210799 return new_prog;
10800
+}
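Patching one instruction into a len-instruction sequence shifts every recorded index after the patch point by len - 1, which is all adjust_subprog_starts() and adjust_poke_descs() above have to do. A self-contained sketch of that arithmetic (adjust_idx() is an illustrative helper, not a kernel function):

#include <assert.h>
#include <stdio.h>

/* Replacing the single insn at 'off' with 'len' insns shifts every recorded
 * index strictly greater than 'off' by (len - 1).  Indices at or before
 * 'off' still point at the same (or the first patched) insn.
 */
static unsigned int adjust_idx(unsigned int idx, unsigned int off, unsigned int len)
{
	return idx <= off ? idx : idx + len - 1;
}

int main(void)
{
	/* one insn at index 3 patched into a 4-insn sequence */
	unsigned int off = 3, len = 4;

	assert(adjust_idx(2, off, len) == 2);	/* before the patch: unchanged   */
	assert(adjust_idx(3, off, len) == 3);	/* the patched slot itself       */
	assert(adjust_idx(4, off, len) == 7);	/* after the patch: shifted by 3 */
	printf("index 5 after patching -> %u\n", adjust_idx(5, off, len));
	return 0;
}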
10801
+
10802
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
10803
+ u32 off, u32 cnt)
10804
+{
10805
+ int i, j;
10806
+
10807
+ /* find first prog starting at or after off (first to remove) */
10808
+ for (i = 0; i < env->subprog_cnt; i++)
10809
+ if (env->subprog_info[i].start >= off)
10810
+ break;
10811
+ /* find first prog starting at or after off + cnt (first to stay) */
10812
+ for (j = i; j < env->subprog_cnt; j++)
10813
+ if (env->subprog_info[j].start >= off + cnt)
10814
+ break;
10815
+ /* if j doesn't start exactly at off + cnt, we are just removing
10816
+ * the front of previous prog
10817
+ */
10818
+ if (env->subprog_info[j].start != off + cnt)
10819
+ j--;
10820
+
10821
+ if (j > i) {
10822
+ struct bpf_prog_aux *aux = env->prog->aux;
10823
+ int move;
10824
+
10825
+ /* move fake 'exit' subprog as well */
10826
+ move = env->subprog_cnt + 1 - j;
10827
+
10828
+ memmove(env->subprog_info + i,
10829
+ env->subprog_info + j,
10830
+ sizeof(*env->subprog_info) * move);
10831
+ env->subprog_cnt -= j - i;
10832
+
10833
+ /* remove func_info */
10834
+ if (aux->func_info) {
10835
+ move = aux->func_info_cnt - j;
10836
+
10837
+ memmove(aux->func_info + i,
10838
+ aux->func_info + j,
10839
+ sizeof(*aux->func_info) * move);
10840
+ aux->func_info_cnt -= j - i;
10841
+ /* func_info->insn_off is set after all code rewrites,
10842
+ * in adjust_btf_func() - no need to adjust
10843
+ */
10844
+ }
10845
+ } else {
10846
+ /* convert i from "first prog to remove" to "first to adjust" */
10847
+ if (env->subprog_info[i].start == off)
10848
+ i++;
10849
+ }
10850
+
10851
+ /* update fake 'exit' subprog as well */
10852
+ for (; i <= env->subprog_cnt; i++)
10853
+ env->subprog_info[i].start -= cnt;
10854
+
10855
+ return 0;
10856
+}
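As a rough picture of the start fixup above when the window [off, off + cnt) is deleted: starts before the window stay put, a start inside the window collapses onto off (only the subprog's front was removed), and starts at or after the window move left by cnt. A toy model of just that arithmetic, ignoring the func_info bookkeeping and the removal of fully-deleted subprogs (adjust_start() is illustrative):

#include <stdio.h>

static unsigned int adjust_start(unsigned int start, unsigned int off, unsigned int cnt)
{
	if (start < off)
		return start;		/* untouched by the removal          */
	if (start < off + cnt)
		return off;		/* front of this subprog was removed */
	return start - cnt;		/* everything after shifts left      */
}

int main(void)
{
	unsigned int starts[] = { 0, 5, 12, 20 };	/* hypothetical subprog starts */
	unsigned int off = 10, cnt = 4;			/* remove insns [10, 14)       */

	for (unsigned int i = 0; i < 4; i++)
		printf("start %u -> %u\n", starts[i], adjust_start(starts[i], off, cnt));
	return 0;
}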
10857
+
10858
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
10859
+ u32 cnt)
10860
+{
10861
+ struct bpf_prog *prog = env->prog;
10862
+ u32 i, l_off, l_cnt, nr_linfo;
10863
+ struct bpf_line_info *linfo;
10864
+
10865
+ nr_linfo = prog->aux->nr_linfo;
10866
+ if (!nr_linfo)
10867
+ return 0;
10868
+
10869
+ linfo = prog->aux->linfo;
10870
+
10871
+ /* find first line info to remove, count lines to be removed */
10872
+ for (i = 0; i < nr_linfo; i++)
10873
+ if (linfo[i].insn_off >= off)
10874
+ break;
10875
+
10876
+ l_off = i;
10877
+ l_cnt = 0;
10878
+ for (; i < nr_linfo; i++)
10879
+ if (linfo[i].insn_off < off + cnt)
10880
+ l_cnt++;
10881
+ else
10882
+ break;
10883
+
10884
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
10885
+ * last removed linfo. prog is already modified, so prog->len == off
10886
+ * means no live instructions after (tail of the program was removed).
10887
+ */
10888
+ if (prog->len != off && l_cnt &&
10889
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
10890
+ l_cnt--;
10891
+ linfo[--i].insn_off = off + cnt;
10892
+ }
10893
+
10894
+ /* remove the line info which refer to the removed instructions */
10895
+ if (l_cnt) {
10896
+ memmove(linfo + l_off, linfo + i,
10897
+ sizeof(*linfo) * (nr_linfo - i));
10898
+
10899
+ prog->aux->nr_linfo -= l_cnt;
10900
+ nr_linfo = prog->aux->nr_linfo;
10901
+ }
10902
+
10903
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
10904
+ for (i = l_off; i < nr_linfo; i++)
10905
+ linfo[i].insn_off -= cnt;
10906
+
10907
+ /* fix up all subprogs (incl. 'exit') which start >= off */
10908
+ for (i = 0; i <= env->subprog_cnt; i++)
10909
+ if (env->subprog_info[i].linfo_idx > l_off) {
10910
+ /* program may have started in the removed region but
10911
+ * may not be fully removed
10912
+ */
10913
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
10914
+ env->subprog_info[i].linfo_idx -= l_cnt;
10915
+ else
10916
+ env->subprog_info[i].linfo_idx = l_off;
10917
+ }
10918
+
10919
+ return 0;
10920
+}
10921
+
10922
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
10923
+{
10924
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
10925
+ unsigned int orig_prog_len = env->prog->len;
10926
+ int err;
10927
+
10928
+ if (bpf_prog_is_dev_bound(env->prog->aux))
10929
+ bpf_prog_offload_remove_insns(env, off, cnt);
10930
+
10931
+ err = bpf_remove_insns(env->prog, off, cnt);
10932
+ if (err)
10933
+ return err;
10934
+
10935
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
10936
+ if (err)
10937
+ return err;
10938
+
10939
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
10940
+ if (err)
10941
+ return err;
10942
+
10943
+ memmove(aux_data + off, aux_data + off + cnt,
10944
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
10945
+
10946
+ return 0;
577310947 }
577410948
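verifier_remove_insns() keeps all the per-insn bookkeeping in step: notify the offload driver, drop the instructions, fix up subprog starts and line info, and finally close the gap in insn_aux_data with one memmove. A user-space sketch of that final splice (the int array stands in for the aux_data entries):

#include <stdio.h>
#include <string.h>

int main(void)
{
	int aux[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };	/* stand-in for insn_aux_data */
	unsigned int len = 8, off = 2, cnt = 3;		/* drop slots 2, 3 and 4      */

	/* same shape as the memmove at the end of verifier_remove_insns() */
	memmove(aux + off, aux + off + cnt, sizeof(*aux) * (len - off - cnt));
	len -= cnt;

	for (unsigned int i = 0; i < len; i++)
		printf("%d ", aux[i]);			/* prints: 0 1 5 6 7 */
	printf("\n");
	return 0;
}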
577510949 /* The verifier does more data flow analysis than llvm and will not
....@@ -5795,11 +10969,177 @@
579510969 if (aux_data[i].seen)
579610970 continue;
579710971 memcpy(insn + i, &trap, sizeof(trap));
10972
+ aux_data[i].zext_dst = false;
579810973 }
579910974 }
580010975
5801
-/* convert load instructions that access fields of 'struct __sk_buff'
5802
- * into sequence of instructions that access fields of 'struct sk_buff'
10976
+static bool insn_is_cond_jump(u8 code)
+{
+	u8 op;
+
+	if (BPF_CLASS(code) == BPF_JMP32)
+		return true;
+
+	if (BPF_CLASS(code) != BPF_JMP)
+		return false;
+
+	op = BPF_OP(code);
+	return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
10989
+
10990
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
10991
+{
10992
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
10993
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
10994
+ struct bpf_insn *insn = env->prog->insnsi;
10995
+ const int insn_cnt = env->prog->len;
10996
+ int i;
10997
+
10998
+ for (i = 0; i < insn_cnt; i++, insn++) {
10999
+ if (!insn_is_cond_jump(insn->code))
11000
+ continue;
11001
+
11002
+ if (!aux_data[i + 1].seen)
11003
+ ja.off = insn->off;
11004
+ else if (!aux_data[i + 1 + insn->off].seen)
11005
+ ja.off = 0;
11006
+ else
11007
+ continue;
11008
+
11009
+ if (bpf_prog_is_dev_bound(env->prog->aux))
11010
+ bpf_prog_offload_replace_insn(env, i, &ja);
11011
+
11012
+ memcpy(insn, &ja, sizeof(ja));
11013
+ }
11014
+}
11015
+
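The ja.off selection in opt_hard_wire_dead_code_branches() picks whichever successor the verifier actually visited. A compact model of that decision (hard_wire_off() is an illustrative name; the seen flags correspond to aux_data[].seen):

#include <stdbool.h>
#include <stdio.h>

/* A conditional jump at index i either falls through to i + 1 or jumps to
 * i + 1 + off.  If the verifier never reached one of those successors, the
 * insn can become an unconditional BPF_JA toward the live one.  Returns the
 * JA offset to use, or -1 when both successors are live and the insn stays.
 */
static int hard_wire_off(bool fallthrough_seen, bool target_seen, int jmp_off)
{
	if (!fallthrough_seen)
		return jmp_off;	/* always take the branch              */
	if (!target_seen)
		return 0;	/* always fall through (JA +0, a nop)  */
	return -1;		/* both live: keep the conditional jump */
}

int main(void)
{
	printf("dead fallthrough -> ja.off = %d\n", hard_wire_off(false, true, 5));
	printf("dead jump target -> ja.off = %d\n", hard_wire_off(true, false, 5));
	printf("both successors  -> keep   = %d\n", hard_wire_off(true, true, 5));
	return 0;
}

The JA +0 case is deliberately left behind as a nop; opt_remove_nops() below picks those up in a second pass.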
11016
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
11017
+{
11018
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
11019
+ int insn_cnt = env->prog->len;
11020
+ int i, err;
11021
+
11022
+ for (i = 0; i < insn_cnt; i++) {
11023
+ int j;
11024
+
11025
+ j = 0;
11026
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
11027
+ j++;
11028
+ if (!j)
11029
+ continue;
11030
+
11031
+ err = verifier_remove_insns(env, i, j);
11032
+ if (err)
11033
+ return err;
11034
+ insn_cnt = env->prog->len;
11035
+ }
11036
+
11037
+ return 0;
11038
+}
11039
+
11040
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+	const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int i, err;
+
+	for (i = 0; i < insn_cnt; i++) {
+		if (memcmp(&insn[i], &ja, sizeof(ja)))
+			continue;
+
+		err = verifier_remove_insns(env, i, 1);
+		if (err)
+			return err;
+		insn_cnt--;
+		i--;
+	}
+
+	return 0;
+}
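opt_remove_nops() treats any instruction that is bit-for-bit identical to BPF_JMP_IMM(BPF_JA, 0, 0, 0) as removable. A stand-alone illustration of that memcmp() test (struct toy_insn and TOY_JA are stand-ins, not kernel definitions):

#include <stdio.h>
#include <string.h>

struct toy_insn {
	unsigned char code;
	unsigned char regs;
	short off;
	int imm;
};

#define TOY_JA 0x05	/* stands in for BPF_JMP | BPF_JA */

int main(void)
{
	const struct toy_insn nop = { .code = TOY_JA, .off = 0 };
	struct toy_insn prog[] = {
		{ .code = TOY_JA, .off = 0 },	/* removable nop        */
		{ .code = TOY_JA, .off = 3 },	/* real jump, not a nop */
	};

	for (unsigned int i = 0; i < 2; i++)
		printf("insn %u is %sa nop\n", i,
		       memcmp(&prog[i], &nop, sizeof(nop)) ? "not " : "");
	return 0;
}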
11060
+
11061
+static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
11062
+ const union bpf_attr *attr)
11063
+{
11064
+ struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
11065
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
11066
+ int i, patch_len, delta = 0, len = env->prog->len;
11067
+ struct bpf_insn *insns = env->prog->insnsi;
11068
+ struct bpf_prog *new_prog;
11069
+ bool rnd_hi32;
11070
+
11071
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
11072
+ zext_patch[1] = BPF_ZEXT_REG(0);
11073
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
11074
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
11075
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
11076
+ for (i = 0; i < len; i++) {
11077
+ int adj_idx = i + delta;
11078
+ struct bpf_insn insn;
11079
+
11080
+ insn = insns[adj_idx];
11081
+ if (!aux[adj_idx].zext_dst) {
11082
+ u8 code, class;
11083
+ u32 imm_rnd;
11084
+
11085
+ if (!rnd_hi32)
11086
+ continue;
11087
+
11088
+ code = insn.code;
11089
+ class = BPF_CLASS(code);
11090
+ if (insn_no_def(&insn))
11091
+ continue;
11092
+
11093
+ /* NOTE: arg "reg" (the fourth one) is only used for
11094
+ * BPF_STX which has been ruled out in above
11095
+ * check, it is safe to pass NULL here.
11096
+ */
11097
+ if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
11098
+ if (class == BPF_LD &&
11099
+ BPF_MODE(code) == BPF_IMM)
11100
+ i++;
11101
+ continue;
11102
+ }
11103
+
11104
+ /* ctx load could be transformed into wider load. */
11105
+ if (class == BPF_LDX &&
11106
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
11107
+ continue;
11108
+
11109
+ imm_rnd = get_random_int();
11110
+ rnd_hi32_patch[0] = insn;
11111
+ rnd_hi32_patch[1].imm = imm_rnd;
11112
+ rnd_hi32_patch[3].dst_reg = insn.dst_reg;
11113
+ patch = rnd_hi32_patch;
11114
+ patch_len = 4;
11115
+ goto apply_patch_buffer;
11116
+ }
11117
+
11118
+ if (!bpf_jit_needs_zext())
11119
+ continue;
11120
+
11121
+ zext_patch[0] = insn;
11122
+ zext_patch[1].dst_reg = insn.dst_reg;
11123
+ zext_patch[1].src_reg = insn.dst_reg;
11124
+ patch = zext_patch;
11125
+ patch_len = 2;
11126
+apply_patch_buffer:
11127
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
11128
+ if (!new_prog)
11129
+ return -ENOMEM;
11130
+ env->prog = new_prog;
11131
+ insns = new_prog->insnsi;
11132
+ aux = env->insn_aux_data;
11133
+ delta += patch_len - 1;
11134
+ }
11135
+
11136
+ return 0;
11137
+}
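For readers skimming the two patch buffers built above: zext_patch forces the destination back to its low 32 bits when the JIT asks for explicit zero-extension, while the BPF_F_TEST_RND_HI32 variant deliberately dirties the upper half so latent assumptions about it being zero surface during testing. A rough user-space model of both effects (zext32() and poison_hi32() are illustrative names; the sample values are made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t zext32(uint64_t dst)
{
	return (uint32_t)dst;			/* what BPF_ZEXT_REG guarantees */
}

static uint64_t poison_hi32(uint64_t dst, uint32_t rnd)
{
	/* MOV ax, rnd; LSH ax, 32; OR dst, ax -- the rnd_hi32 patch */
	return dst | ((uint64_t)rnd << 32);
}

int main(void)
{
	uint64_t dst = 0x00000000deadbeefULL;	/* result of some 32-bit ALU op */

	printf("zext    : 0x%016" PRIx64 "\n", zext32(dst));
	printf("rnd hi32: 0x%016" PRIx64 "\n", poison_hi32(dst, 0x12345678u));
	return 0;
}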
11138
+
11139
+/* convert load instructions that access fields of a context type into a
11140
+ * sequence of instructions that access fields of the underlying structure:
11141
+ * struct __sk_buff -> struct sk_buff
11142
+ * struct bpf_sock_ops -> struct sock
580311143 */
580411144 static int convert_ctx_accesses(struct bpf_verifier_env *env)
580511145 {
....@@ -5812,7 +11152,11 @@
581211152 enum bpf_access_type type;
581311153 bool is_narrower_load;
581411154
5815
- if (ops->gen_prologue) {
11155
+ if (ops->gen_prologue || env->seen_direct_write) {
11156
+ if (!ops->gen_prologue) {
11157
+ verbose(env, "bpf verifier is misconfigured\n");
11158
+ return -EINVAL;
11159
+ }
581611160 cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
581711161 env->prog);
581811162 if (cnt >= ARRAY_SIZE(insn_buf)) {
....@@ -5828,12 +11172,13 @@
582811172 }
582911173 }
583011174
5831
- if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
11175
+ if (bpf_prog_is_dev_bound(env->prog->aux))
583211176 return 0;
583311177
583411178 insn = env->prog->insnsi + delta;
583511179
583611180 for (i = 0; i < insn_cnt; i++, insn++) {
11181
+ bpf_convert_ctx_access_t convert_ctx_access;
583711182 bool ctx_access;
583811183
583911184 if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
....@@ -5877,8 +11222,35 @@
587711222 if (!ctx_access)
587811223 continue;
587911224
5880
- if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
11225
+ switch (env->insn_aux_data[i + delta].ptr_type) {
11226
+ case PTR_TO_CTX:
11227
+ if (!ops->convert_ctx_access)
11228
+ continue;
11229
+ convert_ctx_access = ops->convert_ctx_access;
11230
+ break;
11231
+ case PTR_TO_SOCKET:
11232
+ case PTR_TO_SOCK_COMMON:
11233
+ convert_ctx_access = bpf_sock_convert_ctx_access;
11234
+ break;
11235
+ case PTR_TO_TCP_SOCK:
11236
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
11237
+ break;
11238
+ case PTR_TO_XDP_SOCK:
11239
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
11240
+ break;
11241
+ case PTR_TO_BTF_ID:
11242
+ if (type == BPF_READ) {
11243
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
11244
+ BPF_SIZE((insn)->code);
11245
+ env->prog->aux->num_exentries++;
11246
+ } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
11247
+ verbose(env, "Writes through BTF pointers are not allowed\n");
11248
+ return -EINVAL;
11249
+ }
588111250 continue;
11251
+ default:
11252
+ continue;
11253
+ }
588211254
588311255 ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
588411256 size = BPF_LDST_BYTES(insn);
....@@ -5910,8 +11282,8 @@
591011282 }
591111283
591211284 target_size = 0;
5913
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
5914
- &target_size);
11285
+ cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
11286
+ &target_size);
591511287 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
591611288 (ctx_field_size && !target_size)) {
591711289 verbose(env, "bpf verifier is misconfigured\n");
....@@ -5919,8 +11291,12 @@
591911291 }
592011292
592111293 if (is_narrower_load && size < target_size) {
5922
- u8 shift = (off & (size_default - 1)) * 8;
5923
-
11294
+ u8 shift = bpf_ctx_narrow_access_offset(
11295
+ off, size, size_default) * 8;
11296
+ if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
11297
+ verbose(env, "bpf verifier narrow ctx load misconfigured\n");
11298
+ return -EINVAL;
11299
+ }
592411300 if (ctx_field_size <= 4) {
592511301 if (shift)
592611302 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
....@@ -5933,7 +11309,7 @@
593311309 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
593411310 insn->dst_reg,
593511311 shift);
5936
- insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
11312
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
593711313 (1ULL << size * 8) - 1);
593811314 }
593911315 }
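The narrow-load rewrite above always loads the full context field and then shifts and masks the result down to the requested width. A little-endian user-space approximation of that arithmetic (narrow_load() is illustrative; bpf_ctx_narrow_access_offset() also covers the big-endian layout, which this sketch does not):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t narrow_load(uint64_t field, unsigned int off_in_field,
			    unsigned int size)
{
	unsigned int shift = off_in_field * 8;		/* little-endian only */
	uint64_t mask = (1ULL << (size * 8)) - 1;	/* size is 1, 2 or 4  */

	return (field >> shift) & mask;
}

int main(void)
{
	uint64_t field = 0x1122334455667788ULL;	/* full 8-byte ctx field */

	assert(narrow_load(field, 0, 4) == 0x55667788ULL);
	assert(narrow_load(field, 4, 2) == 0x3344ULL);
	printf("byte at offset 6: 0x%02llx\n",
	       (unsigned long long)narrow_load(field, 6, 1));
	return 0;
}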
....@@ -5956,9 +11332,10 @@
595611332 {
595711333 struct bpf_prog *prog = env->prog, **func, *tmp;
595811334 int i, j, subprog_start, subprog_end = 0, len, subprog;
11335
+ struct bpf_map *map_ptr;
595911336 struct bpf_insn *insn;
596011337 void *old_bpf_func;
5961
- int err = -ENOMEM;
11338
+ int err, num_exentries;
596211339
596311340 if (env->subprog_cnt <= 1)
596411341 return 0;
....@@ -5989,6 +11366,11 @@
598911366 insn->imm = 1;
599011367 }
599111368
11369
+ err = bpf_prog_alloc_jited_linfo(prog);
11370
+ if (err)
11371
+ goto out_undo_insn;
11372
+
11373
+ err = -ENOMEM;
599211374 func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
599311375 if (!func)
599411376 goto out_undo_insn;
....@@ -5998,7 +11380,12 @@
599811380 subprog_end = env->subprog_info[i + 1].start;
599911381
600011382 len = subprog_end - subprog_start;
6001
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
11383
+ /* BPF_PROG_RUN doesn't call subprogs directly,
11384
+ * hence main prog stats include the runtime of subprogs.
11385
+ * subprogs don't have IDs and not reachable via prog_get_next_id
11386
+ * func[i]->aux->stats will never be accessed and stays NULL
11387
+ */
11388
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
600211389 if (!func[i])
600311390 goto out_free;
600411391 memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
....@@ -6008,12 +11395,53 @@
600811395 if (bpf_prog_calc_tag(func[i]))
600911396 goto out_free;
601011397 func[i]->is_func = 1;
6011
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
6012
- * Long term would need debug info to populate names
6013
- */
11398
+ func[i]->aux->func_idx = i;
11399
+ /* the btf and func_info will be freed only at prog->aux */
11400
+ func[i]->aux->btf = prog->aux->btf;
11401
+ func[i]->aux->func_info = prog->aux->func_info;
11402
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
11403
+
11404
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
11405
+ u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
11406
+ int ret;
11407
+
11408
+ if (!(insn_idx >= subprog_start &&
11409
+ insn_idx <= subprog_end))
11410
+ continue;
11411
+
11412
+ ret = bpf_jit_add_poke_descriptor(func[i],
11413
+ &prog->aux->poke_tab[j]);
11414
+ if (ret < 0) {
11415
+ verbose(env, "adding tail call poke descriptor failed\n");
11416
+ goto out_free;
11417
+ }
11418
+
11419
+ func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
11420
+
11421
+ map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
11422
+ ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
11423
+ if (ret < 0) {
11424
+ verbose(env, "tracking tail call prog failed\n");
11425
+ goto out_free;
11426
+ }
11427
+ }
11428
+
601411429 func[i]->aux->name[0] = 'F';
601511430 func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
601611431 func[i]->jit_requested = 1;
11432
+ func[i]->aux->linfo = prog->aux->linfo;
11433
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
11434
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
11435
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
11436
+ num_exentries = 0;
11437
+ insn = func[i]->insnsi;
11438
+ for (j = 0; j < func[i]->len; j++, insn++) {
11439
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
11440
+ BPF_MODE(insn->code) == BPF_PROBE_MEM)
11441
+ num_exentries++;
11442
+ }
11443
+ func[i]->aux->num_exentries = num_exentries;
11444
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
601711445 func[i] = bpf_int_jit_compile(func[i]);
601811446 if (!func[i]->jited) {
601911447 err = -ENOTSUPP;
....@@ -6021,6 +11449,19 @@
602111449 }
602211450 cond_resched();
602311451 }
11452
+
11453
+ /* Untrack main program's aux structs so that during map_poke_run()
11454
+ * we will not stumble upon the unfilled poke descriptors; each
11455
+ * of the main program's poke descs got distributed across subprogs
11456
+ * and got tracked onto map, so we are sure that none of them will
11457
+ * be missed after the operation below
11458
+ */
11459
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
11460
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
11461
+
11462
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
11463
+ }
11464
+
602411465 /* at this point all bpf functions were successfully JITed
602511466 * now populate all bpf_calls with correct addresses and
602611467 * run last pass of JIT
....@@ -6032,9 +11473,8 @@
603211473 insn->src_reg != BPF_PSEUDO_CALL)
603311474 continue;
603411475 subprog = insn->off;
6035
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
6036
- func[subprog]->bpf_func -
6037
- __bpf_call_base;
11476
+ insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
11477
+ __bpf_call_base;
603811478 }
603911479
604011480 /* we use the aux data to keep a list of the start addresses
....@@ -6087,11 +11527,19 @@
608711527 prog->bpf_func = func[0]->bpf_func;
608811528 prog->aux->func = func;
608911529 prog->aux->func_cnt = env->subprog_cnt;
11530
+ bpf_prog_free_unused_jited_linfo(prog);
609011531 return 0;
609111532 out_free:
6092
- for (i = 0; i < env->subprog_cnt; i++)
6093
- if (func[i])
6094
- bpf_jit_free(func[i]);
11533
+ for (i = 0; i < env->subprog_cnt; i++) {
11534
+ if (!func[i])
11535
+ continue;
11536
+
11537
+ for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
11538
+ map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
11539
+ map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
11540
+ }
11541
+ bpf_jit_free(func[i]);
11542
+ }
609511543 kfree(func);
609611544 out_undo_insn:
609711545 /* cleanup main prog to be interpreted */
....@@ -6103,6 +11551,7 @@
610311551 insn->off = 0;
610411552 insn->imm = env->insn_aux_data[i].call_imm;
610511553 }
11554
+ bpf_prog_free_jited_linfo(prog);
610611555 return err;
610711556 }
610811557
....@@ -6113,10 +11562,10 @@
611311562 struct bpf_insn *insn = prog->insnsi;
611411563 int i, depth;
611511564 #endif
6116
- int err;
11565
+ int err = 0;
611711566
6118
- err = 0;
6119
- if (env->prog->jit_requested) {
11567
+ if (env->prog->jit_requested &&
11568
+ !bpf_prog_is_dev_bound(env->prog->aux)) {
612011569 err = jit_subprogs(env);
612111570 if (err == 0)
612211571 return 0;
....@@ -6124,6 +11573,13 @@
612411573 return err;
612511574 }
612611575 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
11576
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
11577
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
11578
+ * have to be rejected, since interpreter doesn't support them yet.
11579
+ */
11580
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
11581
+ return -EINVAL;
11582
+ }
612711583 for (i = 0; i < prog->len; i++, insn++) {
612811584 if (insn->code != (BPF_JMP | BPF_CALL) ||
612911585 insn->src_reg != BPF_PSEUDO_CALL)
....@@ -6146,6 +11602,7 @@
614611602 static int fixup_bpf_calls(struct bpf_verifier_env *env)
614711603 {
614811604 struct bpf_prog *prog = env->prog;
11605
+ bool expect_blinding = bpf_jit_blinding_enabled(prog);
614911606 struct bpf_insn *insn = prog->insnsi;
615011607 const struct bpf_func_proto *fn;
615111608 const int insn_cnt = prog->len;
....@@ -6154,7 +11611,7 @@
615411611 struct bpf_insn insn_buf[16];
615511612 struct bpf_prog *new_prog;
615611613 struct bpf_map *map_ptr;
6157
- int i, cnt, delta = 0;
11614
+ int i, ret, cnt, delta = 0;
615811615
615911616 for (i = 0; i < insn_cnt; i++, insn++) {
616011617 if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
....@@ -6162,31 +11619,30 @@
 		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
 			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-			struct bpf_insn mask_and_div[] = {
-				BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg),
+			bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+			struct bpf_insn *patchlet;
+			struct bpf_insn chk_and_div[] = {
 				/* [R,W]x div 0 -> 0 */
-				BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 2),
-				BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX),
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JNE | BPF_K, insn->src_reg,
+					     0, 2, 0),
+				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
 				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
-				BPF_ALU_REG(BPF_CLASS(insn->code), BPF_XOR, insn->dst_reg, insn->dst_reg),
+				*insn,
 			};
-			struct bpf_insn mask_and_mod[] = {
-				BPF_MOV_REG(BPF_CLASS(insn->code), BPF_REG_AX, insn->src_reg),
-				BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, 1 + (is64 ? 0 : 1)),
-				BPF_RAW_REG(*insn, insn->dst_reg, BPF_REG_AX),
+			struct bpf_insn chk_and_mod[] = {
+				/* [R,W]x mod 0 -> [R,W]x */
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JEQ | BPF_K, insn->src_reg,
+					     0, 1 + (is64 ? 0 : 1), 0),
+				*insn,
 				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
 				BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
 			};
-			struct bpf_insn *patchlet;
 
-			if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
-			    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-				patchlet = mask_and_div;
-				cnt = ARRAY_SIZE(mask_and_div);
-			} else {
-				patchlet = mask_and_mod;
-				cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 2 : 0);
-			}
+			patchlet = isdiv ? chk_and_div : chk_and_mod;
+			cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+			      ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
 
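In case it helps to see the end behaviour the two patchlets encode: eBPF defines x / 0 as 0 and x % 0 as x (with the 32-bit variants truncating the destination), exactly as the comments above spell out. A small user-space model of those semantics (bpf_div64() and bpf_mod32() are illustrative names, not kernel helpers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bpf_div64(uint64_t dst, uint64_t src)
{
	return src ? dst / src : 0;		/* [R,W]x div 0 -> 0 */
}

static uint64_t bpf_mod32(uint64_t dst, uint64_t src)
{
	uint32_t d = (uint32_t)dst, s = (uint32_t)src;

	return s ? d % s : d;	/* BPF_MOV32_REG truncates dst when src == 0 */
}

int main(void)
{
	assert(bpf_div64(42, 0) == 0);
	assert(bpf_div64(42, 5) == 8);
	assert(bpf_mod32(0x100000007ULL, 0) == 7);	/* upper half dropped */
	assert(bpf_mod32(42, 5) == 2);
	printf("div/mod-by-zero model OK\n");
	return 0;
}

The "- (is64 ? 2 : 0)" adjustment above simply drops the trailing JA/MOV32 pair, since a 64-bit mod by zero leaves the destination untouched and needs no truncation.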
619111647 new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
619211648 if (!new_prog)
....@@ -6288,7 +11744,9 @@
628811744 * the program array.
628911745 */
629011746 prog->cb_access = 1;
6291
- env->prog->aux->stack_depth = MAX_BPF_STACK;
11747
+ if (!allow_tail_call_in_subprogs(env))
11748
+ prog->aux->stack_depth = MAX_BPF_STACK;
11749
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
629211750
 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpreter for every normal
....@@ -6299,6 +11757,28 @@
629911757 insn->code = BPF_JMP | BPF_TAIL_CALL;
630011758
630111759 aux = &env->insn_aux_data[i + delta];
11760
+ if (env->bpf_capable && !expect_blinding &&
11761
+ prog->jit_requested &&
11762
+ !bpf_map_key_poisoned(aux) &&
11763
+ !bpf_map_ptr_poisoned(aux) &&
11764
+ !bpf_map_ptr_unpriv(aux)) {
11765
+ struct bpf_jit_poke_descriptor desc = {
11766
+ .reason = BPF_POKE_REASON_TAIL_CALL,
11767
+ .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
11768
+ .tail_call.key = bpf_map_key_immediate(aux),
11769
+ .insn_idx = i + delta,
11770
+ };
11771
+
11772
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
11773
+ if (ret < 0) {
11774
+ verbose(env, "adding tail call poke descriptor failed\n");
11775
+ return ret;
11776
+ }
11777
+
11778
+ insn->imm = ret + 1;
11779
+ continue;
11780
+ }
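A sketch of the imm convention used in the branch above for direct tail calls: the poke descriptor's slot number is stored plus one, so that imm == 0 keeps meaning "no descriptor, take the indirect path". struct toy_poke_desc, add_poke() and MAX_POKES below are illustrative stand-ins, not kernel definitions:

#include <assert.h>
#include <stdio.h>

struct toy_poke_desc {
	unsigned int insn_idx;
	unsigned int key;
};

#define MAX_POKES 4

static struct toy_poke_desc poke_tab[MAX_POKES];
static unsigned int poke_cnt;

/* returns the new slot index, mirroring bpf_jit_add_poke_descriptor() */
static int add_poke(struct toy_poke_desc desc)
{
	if (poke_cnt == MAX_POKES)
		return -1;
	poke_tab[poke_cnt] = desc;
	return (int)poke_cnt++;
}

int main(void)
{
	struct toy_poke_desc desc = { .insn_idx = 17, .key = 3 };
	int ret = add_poke(desc);
	int imm;

	assert(ret == 0);
	imm = ret + 1;				/* what fixup_bpf_calls() stores */
	assert(poke_tab[imm - 1].key == 3);	/* what the JIT reads back later */
	printf("insn->imm = %d -> poke slot %d\n", imm, imm - 1);
	return 0;
}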
11781
+
630211782 if (!bpf_map_ptr_unpriv(aux))
630311783 continue;
630411784
....@@ -6313,7 +11793,7 @@
631311793 return -EINVAL;
631411794 }
631511795
6316
- map_ptr = BPF_MAP_PTR(aux->map_state);
11796
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
631711797 insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
631811798 map_ptr->max_entries, 2);
631911799 insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
....@@ -6339,17 +11819,22 @@
633911819 if (prog->jit_requested && BITS_PER_LONG == 64 &&
634011820 (insn->imm == BPF_FUNC_map_lookup_elem ||
634111821 insn->imm == BPF_FUNC_map_update_elem ||
6342
- insn->imm == BPF_FUNC_map_delete_elem)) {
11822
+ insn->imm == BPF_FUNC_map_delete_elem ||
11823
+ insn->imm == BPF_FUNC_map_push_elem ||
11824
+ insn->imm == BPF_FUNC_map_pop_elem ||
11825
+ insn->imm == BPF_FUNC_map_peek_elem)) {
634311826 aux = &env->insn_aux_data[i + delta];
634411827 if (bpf_map_ptr_poisoned(aux))
634511828 goto patch_call_imm;
634611829
6347
- map_ptr = BPF_MAP_PTR(aux->map_state);
11830
+ map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
634811831 ops = map_ptr->ops;
634911832 if (insn->imm == BPF_FUNC_map_lookup_elem &&
635011833 ops->map_gen_lookup) {
635111834 cnt = ops->map_gen_lookup(map_ptr, insn_buf);
6352
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
11835
+ if (cnt == -EOPNOTSUPP)
11836
+ goto patch_map_ops_generic;
11837
+ if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
635311838 verbose(env, "bpf verifier is misconfigured\n");
635411839 return -EINVAL;
635511840 }
....@@ -6372,6 +11857,14 @@
637211857 BUILD_BUG_ON(!__same_type(ops->map_update_elem,
637311858 (int (*)(struct bpf_map *map, void *key, void *value,
637411859 u64 flags))NULL));
11860
+ BUILD_BUG_ON(!__same_type(ops->map_push_elem,
11861
+ (int (*)(struct bpf_map *map, void *value,
11862
+ u64 flags))NULL));
11863
+ BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
11864
+ (int (*)(struct bpf_map *map, void *value))NULL));
11865
+ BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
11866
+ (int (*)(struct bpf_map *map, void *value))NULL));
11867
+patch_map_ops_generic:
637511868 switch (insn->imm) {
637611869 case BPF_FUNC_map_lookup_elem:
637711870 insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
....@@ -6385,9 +11878,45 @@
638511878 insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
638611879 __bpf_call_base;
638711880 continue;
11881
+ case BPF_FUNC_map_push_elem:
11882
+ insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
11883
+ __bpf_call_base;
11884
+ continue;
11885
+ case BPF_FUNC_map_pop_elem:
11886
+ insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
11887
+ __bpf_call_base;
11888
+ continue;
11889
+ case BPF_FUNC_map_peek_elem:
11890
+ insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
11891
+ __bpf_call_base;
11892
+ continue;
638811893 }
638911894
639011895 goto patch_call_imm;
11896
+ }
11897
+
11898
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
11899
+ insn->imm == BPF_FUNC_jiffies64) {
11900
+ struct bpf_insn ld_jiffies_addr[2] = {
11901
+ BPF_LD_IMM64(BPF_REG_0,
11902
+ (unsigned long)&jiffies),
11903
+ };
11904
+
11905
+ insn_buf[0] = ld_jiffies_addr[0];
11906
+ insn_buf[1] = ld_jiffies_addr[1];
11907
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
11908
+ BPF_REG_0, 0);
11909
+ cnt = 3;
11910
+
11911
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
11912
+ cnt);
11913
+ if (!new_prog)
11914
+ return -ENOMEM;
11915
+
11916
+ delta += cnt - 1;
11917
+ env->prog = prog = new_prog;
11918
+ insn = new_prog->insnsi + i + delta;
11919
+ continue;
639111920 }
639211921
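The three-instruction buffer above replaces the bpf_jiffies64() helper call with an address load plus an 8-byte read. A user-space analogue of that transformation (the local jiffies variable and helper_jiffies64() are stand-ins for the kernel's):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static volatile uint64_t jiffies = 4294937296ULL;	/* arbitrary sample value */

static uint64_t helper_jiffies64(void)			/* the old, call-based path */
{
	return jiffies;
}

int main(void)
{
	const volatile uint64_t *p = &jiffies;		/* BPF_LD_IMM64(R0, &jiffies)     */
	uint64_t inlined = *p;				/* BPF_LDX_MEM(BPF_DW, R0, R0, 0) */

	printf("helper : %" PRIu64 "\n", helper_jiffies64());
	printf("inlined: %" PRIu64 "\n", inlined);
	return 0;
}

Skipping the call removes the per-invocation overhead for a helper that only reads one 64-bit counter, which is the point of this rewrite.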
639311922 patch_call_imm:
....@@ -6404,6 +11933,23 @@
640411933 insn->imm = fn->func - __bpf_call_base;
640511934 }
640611935
11936
+ /* Since poke tab is now finalized, publish aux to tracker. */
11937
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
11938
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
11939
+ if (!map_ptr->ops->map_poke_track ||
11940
+ !map_ptr->ops->map_poke_untrack ||
11941
+ !map_ptr->ops->map_poke_run) {
11942
+ verbose(env, "bpf verifier is misconfigured\n");
11943
+ return -EINVAL;
11944
+ }
11945
+
11946
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
11947
+ if (ret < 0) {
11948
+ verbose(env, "tracking tail call prog failed\n");
11949
+ return ret;
11950
+ }
11951
+ }
11952
+
640711953 return 0;
640811954 }
640911955
....@@ -6412,29 +11958,605 @@
641211958 struct bpf_verifier_state_list *sl, *sln;
641311959 int i;
641411960
11961
+ sl = env->free_list;
11962
+ while (sl) {
11963
+ sln = sl->next;
11964
+ free_verifier_state(&sl->state, false);
11965
+ kfree(sl);
11966
+ sl = sln;
11967
+ }
11968
+ env->free_list = NULL;
11969
+
641511970 if (!env->explored_states)
641611971 return;
641711972
6418
- for (i = 0; i < env->prog->len; i++) {
11973
+ for (i = 0; i < state_htab_size(env); i++) {
641911974 sl = env->explored_states[i];
642011975
6421
- if (sl)
6422
- while (sl != STATE_LIST_MARK) {
6423
- sln = sl->next;
6424
- free_verifier_state(&sl->state, false);
6425
- kfree(sl);
6426
- sl = sln;
6427
- }
11976
+ while (sl) {
11977
+ sln = sl->next;
11978
+ free_verifier_state(&sl->state, false);
11979
+ kfree(sl);
11980
+ sl = sln;
11981
+ }
11982
+ env->explored_states[i] = NULL;
642811983 }
6429
-
6430
- kfree(env->explored_states);
643111984 }
643211985
6433
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
11986
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
643411987 {
11988
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
11989
+ struct bpf_verifier_state *state;
11990
+ struct bpf_reg_state *regs;
11991
+ int ret, i;
11992
+
11993
+ env->prev_linfo = NULL;
11994
+ env->pass_cnt++;
11995
+
11996
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
11997
+ if (!state)
11998
+ return -ENOMEM;
11999
+ state->curframe = 0;
12000
+ state->speculative = false;
12001
+ state->branches = 1;
12002
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
12003
+ if (!state->frame[0]) {
12004
+ kfree(state);
12005
+ return -ENOMEM;
12006
+ }
12007
+ env->cur_state = state;
12008
+ init_func_state(env, state->frame[0],
12009
+ BPF_MAIN_FUNC /* callsite */,
12010
+ 0 /* frameno */,
12011
+ subprog);
12012
+
12013
+ state->first_insn_idx = env->subprog_info[subprog].start;
12014
+ state->last_insn_idx = -1;
12015
+
12016
+ regs = state->frame[state->curframe]->regs;
12017
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
12018
+ ret = btf_prepare_func_args(env, subprog, regs);
12019
+ if (ret)
12020
+ goto out;
12021
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
12022
+ if (regs[i].type == PTR_TO_CTX)
12023
+ mark_reg_known_zero(env, regs, i);
12024
+ else if (regs[i].type == SCALAR_VALUE)
12025
+ mark_reg_unknown(env, regs, i);
12026
+ }
12027
+ } else {
12028
+ /* 1st arg to a function */
12029
+ regs[BPF_REG_1].type = PTR_TO_CTX;
12030
+ mark_reg_known_zero(env, regs, BPF_REG_1);
12031
+ ret = btf_check_func_arg_match(env, subprog, regs);
12032
+ if (ret == -EFAULT)
12033
+ /* unlikely verifier bug. abort.
12034
+ * ret == 0 and ret < 0 are sadly acceptable for
12035
+ * main() function due to backward compatibility.
12036
+ * Like socket filter program may be written as:
12037
+ * int bpf_prog(struct pt_regs *ctx)
12038
+ * and never dereference that ctx in the program.
12039
+ * 'struct pt_regs' is a type mismatch for socket
12040
+ * filter that should be using 'struct __sk_buff'.
12041
+ */
12042
+ goto out;
12043
+ }
12044
+
12045
+ ret = do_check(env);
12046
+out:
12047
+ /* check for NULL is necessary, since cur_state can be freed inside
12048
+ * do_check() under memory pressure.
12049
+ */
12050
+ if (env->cur_state) {
12051
+ free_verifier_state(env->cur_state, true);
12052
+ env->cur_state = NULL;
12053
+ }
12054
+ while (!pop_stack(env, NULL, NULL, false));
12055
+ if (!ret && pop_log)
12056
+ bpf_vlog_reset(&env->log, 0);
12057
+ free_states(env);
12058
+ return ret;
12059
+}
12060
+
12061
+/* Verify all global functions in a BPF program one by one based on their BTF.
12062
+ * All global functions must pass verification. Otherwise the whole program is rejected.
12063
+ * Consider:
12064
+ * int bar(int);
12065
+ * int foo(int f)
12066
+ * {
12067
+ * return bar(f);
12068
+ * }
12069
+ * int bar(int b)
12070
+ * {
12071
+ * ...
12072
+ * }
12073
+ * foo() will be verified first for R1=any_scalar_value. During verification it
12074
+ * will be assumed that bar() already verified successfully and call to bar()
12075
+ * from foo() will be checked for type match only. Later bar() will be verified
12076
+ * independently to check that it's safe for R1=any_scalar_value.
12077
+ */
12078
+static int do_check_subprogs(struct bpf_verifier_env *env)
12079
+{
12080
+ struct bpf_prog_aux *aux = env->prog->aux;
12081
+ int i, ret;
12082
+
12083
+ if (!aux->func_info)
12084
+ return 0;
12085
+
12086
+ for (i = 1; i < env->subprog_cnt; i++) {
12087
+ if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
12088
+ continue;
12089
+ env->insn_idx = env->subprog_info[i].start;
12090
+ WARN_ON_ONCE(env->insn_idx == 0);
12091
+ ret = do_check_common(env, i);
12092
+ if (ret) {
12093
+ return ret;
12094
+ } else if (env->log.level & BPF_LOG_LEVEL) {
12095
+ verbose(env,
12096
+ "Func#%d is safe for any args that match its prototype\n",
12097
+ i);
12098
+ }
12099
+ }
12100
+ return 0;
12101
+}
12102
+
12103
+static int do_check_main(struct bpf_verifier_env *env)
12104
+{
12105
+ int ret;
12106
+
12107
+ env->insn_idx = 0;
12108
+ ret = do_check_common(env, 0);
12109
+ if (!ret)
12110
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
12111
+ return ret;
12112
+}
12113
+
12114
+
12115
+static void print_verification_stats(struct bpf_verifier_env *env)
12116
+{
12117
+ int i;
12118
+
12119
+ if (env->log.level & BPF_LOG_STATS) {
12120
+ verbose(env, "verification time %lld usec\n",
12121
+ div_u64(env->verification_time, 1000));
12122
+ verbose(env, "stack depth ");
12123
+ for (i = 0; i < env->subprog_cnt; i++) {
12124
+ u32 depth = env->subprog_info[i].stack_depth;
12125
+
12126
+ verbose(env, "%d", depth);
12127
+ if (i + 1 < env->subprog_cnt)
12128
+ verbose(env, "+");
12129
+ }
12130
+ verbose(env, "\n");
12131
+ }
12132
+ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
12133
+ "total_states %d peak_states %d mark_read %d\n",
12134
+ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
12135
+ env->max_states_per_insn, env->total_states,
12136
+ env->peak_states, env->longest_mark_read_walk);
12137
+}
12138
+
12139
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
12140
+{
12141
+ const struct btf_type *t, *func_proto;
12142
+ const struct bpf_struct_ops *st_ops;
12143
+ const struct btf_member *member;
12144
+ struct bpf_prog *prog = env->prog;
12145
+ u32 btf_id, member_idx;
12146
+ const char *mname;
12147
+
12148
+ if (!prog->gpl_compatible) {
12149
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
12150
+ return -EINVAL;
12151
+ }
12152
+
12153
+ btf_id = prog->aux->attach_btf_id;
12154
+ st_ops = bpf_struct_ops_find(btf_id);
12155
+ if (!st_ops) {
12156
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
12157
+ btf_id);
12158
+ return -ENOTSUPP;
12159
+ }
12160
+
12161
+ t = st_ops->type;
12162
+ member_idx = prog->expected_attach_type;
12163
+ if (member_idx >= btf_type_vlen(t)) {
12164
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
12165
+ member_idx, st_ops->name);
12166
+ return -EINVAL;
12167
+ }
12168
+
12169
+ member = &btf_type_member(t)[member_idx];
12170
+ mname = btf_name_by_offset(btf_vmlinux, member->name_off);
12171
+ func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
12172
+ NULL);
12173
+ if (!func_proto) {
12174
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
12175
+ mname, member_idx, st_ops->name);
12176
+ return -EINVAL;
12177
+ }
12178
+
12179
+ if (st_ops->check_member) {
12180
+ int err = st_ops->check_member(t, member);
12181
+
12182
+ if (err) {
12183
+ verbose(env, "attach to unsupported member %s of struct %s\n",
12184
+ mname, st_ops->name);
12185
+ return err;
12186
+ }
12187
+ }
12188
+
12189
+ prog->aux->attach_func_proto = func_proto;
12190
+ prog->aux->attach_func_name = mname;
12191
+ env->ops = st_ops->verifier_ops;
12192
+
12193
+ return 0;
12194
+}
12195
+#define SECURITY_PREFIX "security_"
12196
+
12197
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
12198
+{
12199
+ if (within_error_injection_list(addr) ||
12200
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
12201
+ return 0;
12202
+
12203
+ return -EINVAL;
12204
+}
12205
+
12206
+/* non exhaustive list of sleepable bpf_lsm_*() functions */
12207
+BTF_SET_START(btf_sleepable_lsm_hooks)
12208
+#ifdef CONFIG_BPF_LSM
12209
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
12210
+#else
12211
+BTF_ID_UNUSED
12212
+#endif
12213
+BTF_SET_END(btf_sleepable_lsm_hooks)
12214
+
12215
+static int check_sleepable_lsm_hook(u32 btf_id)
12216
+{
12217
+ return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
12218
+}
12219
+
12220
+/* list of non-sleepable functions that are otherwise on
12221
+ * ALLOW_ERROR_INJECTION list
12222
+ */
12223
+BTF_SET_START(btf_non_sleepable_error_inject)
12224
+/* Three functions below can be called from sleepable and non-sleepable context.
12225
+ * Assume non-sleepable from bpf safety point of view.
12226
+ */
12227
+BTF_ID(func, __add_to_page_cache_locked)
12228
+BTF_ID(func, should_fail_alloc_page)
12229
+BTF_ID(func, should_failslab)
12230
+BTF_SET_END(btf_non_sleepable_error_inject)
12231
+
12232
+static int check_non_sleepable_error_inject(u32 btf_id)
12233
+{
12234
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
12235
+}
12236
+
12237
+int bpf_check_attach_target(struct bpf_verifier_log *log,
12238
+ const struct bpf_prog *prog,
12239
+ const struct bpf_prog *tgt_prog,
12240
+ u32 btf_id,
12241
+ struct bpf_attach_target_info *tgt_info)
12242
+{
12243
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
12244
+ const char prefix[] = "btf_trace_";
12245
+ int ret = 0, subprog = -1, i;
12246
+ const struct btf_type *t;
12247
+ bool conservative = true;
12248
+ const char *tname;
12249
+ struct btf *btf;
12250
+ long addr = 0;
12251
+
12252
+ if (!btf_id) {
12253
+ bpf_log(log, "Tracing programs must provide btf_id\n");
12254
+ return -EINVAL;
12255
+ }
12256
+ btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux;
12257
+ if (!btf) {
12258
+ bpf_log(log,
12259
+ "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
12260
+ return -EINVAL;
12261
+ }
12262
+ t = btf_type_by_id(btf, btf_id);
12263
+ if (!t) {
12264
+ bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
12265
+ return -EINVAL;
12266
+ }
12267
+ tname = btf_name_by_offset(btf, t->name_off);
12268
+ if (!tname) {
12269
+ bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
12270
+ return -EINVAL;
12271
+ }
12272
+ if (tgt_prog) {
12273
+ struct bpf_prog_aux *aux = tgt_prog->aux;
12274
+
12275
+ for (i = 0; i < aux->func_info_cnt; i++)
12276
+ if (aux->func_info[i].type_id == btf_id) {
12277
+ subprog = i;
12278
+ break;
12279
+ }
12280
+ if (subprog == -1) {
12281
+ bpf_log(log, "Subprog %s doesn't exist\n", tname);
12282
+ return -EINVAL;
12283
+ }
12284
+ conservative = aux->func_info_aux[subprog].unreliable;
12285
+ if (prog_extension) {
12286
+ if (conservative) {
12287
+ bpf_log(log,
12288
+ "Cannot replace static functions\n");
12289
+ return -EINVAL;
12290
+ }
12291
+ if (!prog->jit_requested) {
12292
+ bpf_log(log,
12293
+ "Extension programs should be JITed\n");
12294
+ return -EINVAL;
12295
+ }
12296
+ }
12297
+ if (!tgt_prog->jited) {
12298
+ bpf_log(log, "Can attach to only JITed progs\n");
12299
+ return -EINVAL;
12300
+ }
12301
+ if (tgt_prog->type == prog->type) {
12302
+ /* Cannot fentry/fexit another fentry/fexit program.
12303
+ * Cannot attach program extension to another extension.
12304
+ * It's ok to attach fentry/fexit to extension program.
12305
+ */
12306
+ bpf_log(log, "Cannot recursively attach\n");
12307
+ return -EINVAL;
12308
+ }
12309
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
12310
+ prog_extension &&
12311
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
12312
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
12313
+ /* Program extensions can extend all program types
12314
+ * except fentry/fexit. The reason is the following.
12315
+ * The fentry/fexit programs are used for performance
12316
+ * analysis, stats and can be attached to any program
12317
+ * type except themselves. When extension program is
12318
+ * replacing XDP function it is necessary to allow
12319
+ * performance analysis of all functions. Both original
12320
+ * XDP program and its program extension. Hence
12321
+ * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
12322
+ * allowed. If extending of fentry/fexit was allowed it
12323
+ * would be possible to create long call chain
12324
+ * fentry->extension->fentry->extension beyond
12325
+ * reasonable stack size. Hence extending fentry is not
12326
+ * allowed.
12327
+ */
12328
+ bpf_log(log, "Cannot extend fentry/fexit\n");
12329
+ return -EINVAL;
12330
+ }
12331
+ } else {
12332
+ if (prog_extension) {
12333
+ bpf_log(log, "Cannot replace kernel functions\n");
12334
+ return -EINVAL;
12335
+ }
12336
+ }
12337
+
12338
+ switch (prog->expected_attach_type) {
12339
+ case BPF_TRACE_RAW_TP:
12340
+ if (tgt_prog) {
12341
+ bpf_log(log,
12342
+ "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
12343
+ return -EINVAL;
12344
+ }
12345
+ if (!btf_type_is_typedef(t)) {
12346
+ bpf_log(log, "attach_btf_id %u is not a typedef\n",
12347
+ btf_id);
12348
+ return -EINVAL;
12349
+ }
12350
+ if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
12351
+ bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
12352
+ btf_id, tname);
12353
+ return -EINVAL;
12354
+ }
12355
+ tname += sizeof(prefix) - 1;
12356
+ t = btf_type_by_id(btf, t->type);
12357
+ if (!btf_type_is_ptr(t))
12358
+ /* should never happen in valid vmlinux build */
12359
+ return -EINVAL;
12360
+ t = btf_type_by_id(btf, t->type);
12361
+ if (!btf_type_is_func_proto(t))
12362
+ /* should never happen in valid vmlinux build */
12363
+ return -EINVAL;
12364
+
12365
+ break;
12366
+ case BPF_TRACE_ITER:
12367
+ if (!btf_type_is_func(t)) {
12368
+ bpf_log(log, "attach_btf_id %u is not a function\n",
12369
+ btf_id);
12370
+ return -EINVAL;
12371
+ }
12372
+ t = btf_type_by_id(btf, t->type);
12373
+ if (!btf_type_is_func_proto(t))
12374
+ return -EINVAL;
12375
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
12376
+ if (ret)
12377
+ return ret;
12378
+ break;
12379
+ default:
12380
+ if (!prog_extension)
12381
+ return -EINVAL;
12382
+ fallthrough;
12383
+ case BPF_MODIFY_RETURN:
12384
+ case BPF_LSM_MAC:
12385
+ case BPF_TRACE_FENTRY:
12386
+ case BPF_TRACE_FEXIT:
12387
+ if (!btf_type_is_func(t)) {
12388
+ bpf_log(log, "attach_btf_id %u is not a function\n",
12389
+ btf_id);
12390
+ return -EINVAL;
12391
+ }
12392
+ if (prog_extension &&
12393
+ btf_check_type_match(log, prog, btf, t))
12394
+ return -EINVAL;
12395
+ t = btf_type_by_id(btf, t->type);
12396
+ if (!btf_type_is_func_proto(t))
12397
+ return -EINVAL;
12398
+
12399
+ if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
12400
+ (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
12401
+ prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
12402
+ return -EINVAL;
12403
+
12404
+ if (tgt_prog && conservative)
12405
+ t = NULL;
12406
+
12407
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
12408
+ if (ret < 0)
12409
+ return ret;
12410
+
12411
+ if (tgt_prog) {
12412
+ if (subprog == 0)
12413
+ addr = (long) tgt_prog->bpf_func;
12414
+ else
12415
+ addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
12416
+ } else {
12417
+ addr = kallsyms_lookup_name(tname);
12418
+ if (!addr) {
12419
+ bpf_log(log,
12420
+ "The address of function %s cannot be found\n",
12421
+ tname);
12422
+ return -ENOENT;
12423
+ }
12424
+ }
12425
+
12426
+ if (prog->aux->sleepable) {
12427
+ ret = -EINVAL;
12428
+ switch (prog->type) {
12429
+ case BPF_PROG_TYPE_TRACING:
12430
+ /* fentry/fexit/fmod_ret progs can be sleepable only if they are
12431
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
12432
+ */
12433
+ if (!check_non_sleepable_error_inject(btf_id) &&
12434
+ within_error_injection_list(addr))
12435
+ ret = 0;
12436
+ break;
12437
+ case BPF_PROG_TYPE_LSM:
12438
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
12439
+ * Only some of them are sleepable.
12440
+ */
12441
+ if (check_sleepable_lsm_hook(btf_id))
12442
+ ret = 0;
12443
+ break;
12444
+ default:
12445
+ break;
12446
+ }
12447
+ if (ret) {
12448
+ bpf_log(log, "%s is not sleepable\n", tname);
12449
+ return ret;
12450
+ }
12451
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
12452
+ if (tgt_prog) {
12453
+ bpf_log(log, "can't modify return codes of BPF programs\n");
12454
+ return -EINVAL;
12455
+ }
12456
+ ret = check_attach_modify_return(addr, tname);
12457
+ if (ret) {
12458
+ bpf_log(log, "%s() is not modifiable\n", tname);
12459
+ return ret;
12460
+ }
12461
+ }
12462
+
12463
+ break;
12464
+ }
12465
+ tgt_info->tgt_addr = addr;
12466
+ tgt_info->tgt_name = tname;
12467
+ tgt_info->tgt_type = t;
12468
+ return 0;
12469
+}
12470
+
12471
+static int check_attach_btf_id(struct bpf_verifier_env *env)
12472
+{
12473
+ struct bpf_prog *prog = env->prog;
12474
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
12475
+ struct bpf_attach_target_info tgt_info = {};
12476
+ u32 btf_id = prog->aux->attach_btf_id;
12477
+ struct bpf_trampoline *tr;
12478
+ int ret;
12479
+ u64 key;
12480
+
12481
+ if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
12482
+ prog->type != BPF_PROG_TYPE_LSM) {
12483
+ verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
12484
+ return -EINVAL;
12485
+ }
12486
+
12487
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
12488
+ return check_struct_ops_btf_id(env);
12489
+
12490
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
12491
+ prog->type != BPF_PROG_TYPE_LSM &&
12492
+ prog->type != BPF_PROG_TYPE_EXT)
12493
+ return 0;
12494
+
12495
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
12496
+ if (ret)
12497
+ return ret;
12498
+
12499
+ if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
12500
+ /* to make freplace equivalent to their targets, they need to
12501
+ * inherit env->ops and expected_attach_type for the rest of the
12502
+ * verification
12503
+ */
12504
+ env->ops = bpf_verifier_ops[tgt_prog->type];
12505
+ prog->expected_attach_type = tgt_prog->expected_attach_type;
12506
+ }
12507
+
12508
+ /* store info about the attachment target that will be used later */
12509
+ prog->aux->attach_func_proto = tgt_info.tgt_type;
12510
+ prog->aux->attach_func_name = tgt_info.tgt_name;
12511
+
12512
+ if (tgt_prog) {
12513
+ prog->aux->saved_dst_prog_type = tgt_prog->type;
12514
+ prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
12515
+ }
12516
+
12517
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
12518
+ prog->aux->attach_btf_trace = true;
12519
+ return 0;
12520
+ } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
12521
+ if (!bpf_iter_prog_supported(prog))
12522
+ return -EINVAL;
12523
+ return 0;
12524
+ }
12525
+
12526
+ if (prog->type == BPF_PROG_TYPE_LSM) {
12527
+ ret = bpf_lsm_verify_prog(&env->log, prog);
12528
+ if (ret < 0)
12529
+ return ret;
12530
+ }
12531
+
12532
+ key = bpf_trampoline_compute_key(tgt_prog, btf_id);
12533
+ tr = bpf_trampoline_get(key, &tgt_info);
12534
+ if (!tr)
12535
+ return -ENOMEM;
12536
+
12537
+ prog->aux->dst_trampoline = tr;
12538
+ return 0;
12539
+}
12540
+
12541
+struct btf *bpf_get_btf_vmlinux(void)
+{
+	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+		mutex_lock(&bpf_verifier_lock);
+		if (!btf_vmlinux)
+			btf_vmlinux = btf_parse_vmlinux();
+		mutex_unlock(&bpf_verifier_lock);
+	}
+	return btf_vmlinux;
+}
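bpf_get_btf_vmlinux() above is a classic check/lock/re-check lazy initializer. A user-space sketch of the same shape (expensive_parse() stands in for btf_parse_vmlinux(); error handling and the kernel's caching of ERR_PTR results are omitted):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached;

static void *expensive_parse(void)
{
	static int blob = 42;	/* pretend result of parsing vmlinux BTF */
	return &blob;
}

static void *get_cached(void)
{
	if (!cached) {			/* unlocked fast path, as in the kernel code */
		pthread_mutex_lock(&lock);
		if (!cached)		/* re-check: another thread may have won */
			cached = expensive_parse();
		pthread_mutex_unlock(&lock);
	}
	return cached;
}

int main(void)
{
	printf("first  call: %p\n", get_cached());
	printf("second call: %p\n", get_cached());	/* same pointer, no re-parse */
	return 0;
}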
12551
+
12552
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
12553
+ union bpf_attr __user *uattr)
12554
+{
12555
+ u64 start_time = ktime_get_ns();
643512556 struct bpf_verifier_env *env;
643612557 struct bpf_verifier_log *log;
6437
- int ret = -EINVAL;
12558
+ int i, len, ret = -EINVAL;
12559
+ bool is_priv;
643812560
643912561 /* no program is valid */
644012562 if (ARRAY_SIZE(bpf_verifier_ops) == 0)
....@@ -6448,17 +12570,23 @@
644812570 return -ENOMEM;
644912571 log = &env->log;
645012572
12573
+ len = (*prog)->len;
645112574 env->insn_aux_data =
6452
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
6453
- (*prog)->len));
12575
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
645412576 ret = -ENOMEM;
645512577 if (!env->insn_aux_data)
645612578 goto err_free_env;
12579
+ for (i = 0; i < len; i++)
12580
+ env->insn_aux_data[i].orig_idx = i;
645712581 env->prog = *prog;
645812582 env->ops = bpf_verifier_ops[env->prog->type];
12583
+ is_priv = bpf_capable();
12584
+
12585
+ bpf_get_btf_vmlinux();
645912586
646012587 /* grab the mutex to protect few globals used by verifier */
6461
- mutex_lock(&bpf_verifier_lock);
12588
+ if (!is_priv)
12589
+ mutex_lock(&bpf_verifier_lock);
646212590
646312591 if (attr->log_level || attr->log_buf || attr->log_size) {
646412592 /* user requested verbose verifier output
....@@ -6468,58 +12596,93 @@
646812596 log->ubuf = (char __user *) (unsigned long) attr->log_buf;
646912597 log->len_total = attr->log_size;
647012598
6471
- ret = -EINVAL;
647212599 /* log attributes have to be sane */
6473
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
6474
- !log->level || !log->ubuf)
12600
+ if (!bpf_verifier_log_attr_valid(log)) {
12601
+ ret = -EINVAL;
647512602 goto err_unlock;
12603
+ }
12604
+ }
12605
+
12606
+ if (IS_ERR(btf_vmlinux)) {
12607
+ /* Either gcc or pahole or kernel are broken. */
12608
+ verbose(env, "in-kernel BTF is malformed\n");
12609
+ ret = PTR_ERR(btf_vmlinux);
12610
+ goto skip_full_check;
647612611 }
647712612
647812613 env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
647912614 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
648012615 env->strict_alignment = true;
6481
-
648212616 if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
648312617 env->strict_alignment = false;
648412618
6485
- ret = replace_map_fd_with_map_ptr(env);
6486
- if (ret < 0)
6487
- goto skip_full_check;
12619
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
12620
+ env->allow_uninit_stack = bpf_allow_uninit_stack();
12621
+ env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
12622
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
12623
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
12624
+ env->bpf_capable = bpf_capable();
648812625
6489
- if (bpf_prog_is_dev_bound(env->prog->aux)) {
6490
- ret = bpf_prog_offload_verifier_prep(env);
6491
- if (ret)
6492
- goto skip_full_check;
6493
- }
12626
+ if (is_priv)
12627
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
649412628
6495
- env->explored_states = kcalloc(env->prog->len,
12629
+ env->explored_states = kvcalloc(state_htab_size(env),
649612630 sizeof(struct bpf_verifier_state_list *),
649712631 GFP_USER);
649812632 ret = -ENOMEM;
649912633 if (!env->explored_states)
650012634 goto skip_full_check;
650112635
6502
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
12636
+ ret = check_subprogs(env);
12637
+ if (ret < 0)
12638
+ goto skip_full_check;
12639
+
12640
+ ret = check_btf_info(env, attr, uattr);
12641
+ if (ret < 0)
12642
+ goto skip_full_check;
12643
+
12644
+ ret = check_attach_btf_id(env);
12645
+ if (ret)
12646
+ goto skip_full_check;
12647
+
12648
+ ret = resolve_pseudo_ldimm64(env);
12649
+ if (ret < 0)
12650
+ goto skip_full_check;
12651
+
12652
+ if (bpf_prog_is_dev_bound(env->prog->aux)) {
12653
+ ret = bpf_prog_offload_verifier_prep(env->prog);
12654
+ if (ret)
12655
+ goto skip_full_check;
12656
+ }
650312657
650412658 ret = check_cfg(env);
650512659 if (ret < 0)
650612660 goto skip_full_check;
650712661
6508
- ret = do_check(env);
6509
- if (env->cur_state) {
6510
- free_verifier_state(env->cur_state, true);
6511
- env->cur_state = NULL;
6512
- }
12662
+ ret = do_check_subprogs(env);
12663
+ ret = ret ?: do_check_main(env);
12664
+
12665
+ if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
12666
+ ret = bpf_prog_offload_finalize(env);
651312667
651412668 skip_full_check:
6515
- while (!pop_stack(env, NULL, NULL));
6516
- free_states(env);
6517
-
6518
- if (ret == 0)
6519
- sanitize_dead_code(env);
12669
+ kvfree(env->explored_states);
652012670
652112671 if (ret == 0)
652212672 ret = check_max_stack_depth(env);
12673
+
12674
+ /* instruction rewrites happen after this point */
12675
+ if (is_priv) {
12676
+ if (ret == 0)
12677
+ opt_hard_wire_dead_code_branches(env);
12678
+ if (ret == 0)
12679
+ ret = opt_remove_dead_code(env);
12680
+ if (ret == 0)
12681
+ ret = opt_remove_nops(env);
12682
+ } else {
12683
+ if (ret == 0)
12684
+ sanitize_dead_code(env);
12685
+ }
652312686
652412687 if (ret == 0)
652512688 /* program is valid, convert *(u32*)(ctx + off) accesses */
....@@ -6528,8 +12691,20 @@
652812691 if (ret == 0)
652912692 ret = fixup_bpf_calls(env);
653012693
12694
+ /* do 32-bit optimization after insn patching has done so those patched
12695
+ * insns could be handled correctly.
12696
+ */
12697
+ if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
12698
+ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
12699
+ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
12700
+ : false;
12701
+ }
12702
+
653112703 if (ret == 0)
653212704 ret = fixup_call_args(env);
12705
+
12706
+ env->verification_time = ktime_get_ns() - start_time;
12707
+ print_verification_stats(env);
653312708
653412709 if (log->level && bpf_verifier_log_full(log))
653512710 ret = -ENOSPC;
....@@ -6559,15 +12734,26 @@
655912734 convert_pseudo_ld_imm64(env);
656012735 }
656112736
12737
+ if (ret == 0)
12738
+ adjust_btf_func(env);
12739
+
656212740 err_release_maps:
656312741 if (!env->prog->aux->used_maps)
656412742 /* if we didn't copy map pointers into bpf_prog_info, release
656512743 * them now. Otherwise free_used_maps() will release them.
656612744 */
656712745 release_maps(env);
12746
+
12747
+ /* extension progs temporarily inherit the attach_type of their targets
12748
+ for verification purposes, so set it back to zero before returning
12749
+ */
12750
+ if (env->prog->type == BPF_PROG_TYPE_EXT)
12751
+ env->prog->expected_attach_type = 0;
12752
+
656812753 *prog = env->prog;
656912754 err_unlock:
6570
- mutex_unlock(&bpf_verifier_lock);
12755
+ if (!is_priv)
12756
+ mutex_unlock(&bpf_verifier_lock);
657112757 vfree(env->insn_aux_data);
657212758 err_free_env:
657312759 kfree(env);