2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/kernel/bpf/cgroup.c
@@ -1,46 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Functions to manage eBPF programs attached to cgroups
  *
  * Copyright (c) 2016 Daniel Mack
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
  */
 
 #include <linux/kernel.h>
 #include <linux/atomic.h>
 #include <linux/cgroup.h>
+#include <linux/filter.h>
 #include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
 
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
-/**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
- */
-void cgroup_bpf_put(struct cgroup *cgrp)
+void cgroup_bpf_offline(struct cgroup *cgrp)
 {
+	cgroup_get(cgrp);
+	percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
+static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
+{
+	enum bpf_cgroup_storage_type stype;
+
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_free(storages[stype]);
+}
+
+static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
+				     struct bpf_cgroup_storage *new_storages[],
+				     enum bpf_attach_type type,
+				     struct bpf_prog *prog,
+				     struct cgroup *cgrp)
+{
+	enum bpf_cgroup_storage_type stype;
+	struct bpf_cgroup_storage_key key;
+	struct bpf_map *map;
+
+	key.cgroup_inode_id = cgroup_id(cgrp);
+	key.attach_type = type;
+
+	for_each_cgroup_storage_type(stype) {
+		map = prog->aux->cgroup_storage[stype];
+		if (!map)
+			continue;
+
+		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+		if (storages[stype])
+			continue;
+
+		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storages[stype])) {
+			bpf_cgroup_storages_free(new_storages);
+			return -ENOMEM;
+		}
+
+		new_storages[stype] = storages[stype];
+	}
+
+	return 0;
+}
+
+static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
+				       struct bpf_cgroup_storage *src[])
+{
+	enum bpf_cgroup_storage_type stype;
+
+	for_each_cgroup_storage_type(stype)
+		dst[stype] = src[stype];
+}
+
+static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
+				     struct cgroup *cgrp,
+				     enum bpf_attach_type attach_type)
+{
+	enum bpf_cgroup_storage_type stype;
+
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
+}
+
+/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
+ * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
+ * doesn't free link memory, which will eventually be done by bpf_link's
+ * release() callback, when its last FD is closed.
+ */
+static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
+{
+	cgroup_put(link->cgroup);
+	link->cgroup = NULL;
+}
+
+/**
+ * cgroup_bpf_release() - put references of all bpf programs and
+ *                        release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
+ */
+static void cgroup_bpf_release(struct work_struct *work)
+{
+	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
+					       bpf.release_work);
+	struct bpf_prog_array *old_array;
+	struct list_head *storages = &cgrp->bpf.storages;
+	struct bpf_cgroup_storage *storage, *stmp;
+
 	unsigned int type;
+
+	mutex_lock(&cgroup_mutex);
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
 		struct list_head *progs = &cgrp->bpf.progs[type];
-		struct bpf_prog_list *pl, *tmp;
+		struct bpf_prog_list *pl, *pltmp;
 
-		list_for_each_entry_safe(pl, tmp, progs, node) {
+		list_for_each_entry_safe(pl, pltmp, progs, node) {
 			list_del(&pl->node);
-			bpf_prog_put(pl->prog);
-			bpf_cgroup_storage_unlink(pl->storage);
-			bpf_cgroup_storage_free(pl->storage);
+			if (pl->prog)
+				bpf_prog_put(pl->prog);
+			if (pl->link)
+				bpf_cgroup_link_auto_detach(pl->link);
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
-		bpf_prog_array_free(cgrp->bpf.effective[type]);
+		old_array = rcu_dereference_protected(
+				cgrp->bpf.effective[type],
+				lockdep_is_held(&cgroup_mutex));
+		bpf_prog_array_free(old_array);
 	}
+
+	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+		bpf_cgroup_storage_unlink(storage);
+		bpf_cgroup_storage_free(storage);
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+		cgroup_bpf_put(p);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+	cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ *                           of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+	queue_work(system_wq, &cgrp->bpf.release_work);
+}
+
+/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
+ * link or direct prog.
+ */
+static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
+{
+	if (pl->prog)
+		return pl->prog;
+	if (pl->link)
+		return pl->link->link.prog;
+	return NULL;
 }
 
 /* count number of elements in the list.
@@ -52,7 +184,7 @@
 	u32 cnt = 0;
 
 	list_for_each_entry(pl, head, node) {
-		if (!pl->prog)
+		if (!prog_list_prog(pl))
 			continue;
 		cnt++;
 	}
@@ -64,8 +196,7 @@
  * if parent has overridable or multi-prog, allow attaching
  */
 static bool hierarchy_allows_attach(struct cgroup *cgrp,
-				    enum bpf_attach_type type,
-				    u32 new_flags)
+				    enum bpf_attach_type type)
 {
 	struct cgroup *p;
 
@@ -95,8 +226,9 @@
  */
 static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
-				   struct bpf_prog_array __rcu **array)
+				   struct bpf_prog_array **array)
 {
+	struct bpf_prog_array_item *item;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
@@ -121,26 +253,27 @@
 			continue;
 
 		list_for_each_entry(pl, &p->bpf.progs[type], node) {
-			if (!pl->prog)
+			if (!prog_list_prog(pl))
 				continue;
 
-			progs->items[cnt].prog = pl->prog;
-			progs->items[cnt].cgroup_storage = pl->storage;
+			item = &progs->items[cnt];
+			item->prog = prog_list_prog(pl);
+			bpf_cgroup_storages_assign(item->cgroup_storage,
+						   pl->storage);
 			cnt++;
 		}
 	} while ((p = cgroup_parent(p)));
 
-	rcu_assign_pointer(*array, progs);
+	*array = progs;
 	return 0;
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
 				     enum bpf_attach_type type,
-				     struct bpf_prog_array __rcu *array)
+				     struct bpf_prog_array *old_array)
 {
-	struct bpf_prog_array __rcu *old_array;
-
-	old_array = xchg(&cgrp->bpf.effective[type], array);
+	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+					lockdep_is_held(&cgroup_mutex));
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
@@ -157,11 +290,22 @@
 	 * that array below is variable length
 	 */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
-	struct bpf_prog_array __rcu *arrays[NR] = {};
-	int i;
+	struct bpf_prog_array *arrays[NR] = {};
+	struct cgroup *p;
+	int ret, i;
+
+	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+			      GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+		cgroup_bpf_get(p);
 
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+
+	INIT_LIST_HEAD(&cgrp->bpf.storages);
 
 	for (i = 0; i < NR; i++)
 		if (compute_effective_progs(cgrp, i, &arrays[i]))
@@ -174,6 +318,12 @@
 cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);
+
+	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+		cgroup_bpf_put(p);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+
 	return -ENOMEM;
 }
 
@@ -187,6 +337,9 @@
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		if (percpu_ref_is_zero(&desc->bpf.refcnt))
+			continue;
+
 		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
 		if (err)
 			goto cleanup;
@@ -195,6 +348,14 @@
 	/* all allocations were successful. Activate all prog arrays */
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+			if (unlikely(desc->bpf.inactive)) {
+				bpf_prog_array_free(desc->bpf.inactive);
+				desc->bpf.inactive = NULL;
+			}
+			continue;
+		}
 
 		activate_effective_progs(desc, type, desc->bpf.inactive);
 		desc->bpf.inactive = NULL;
 	}
@@ -218,33 +379,85 @@
 
 #define BPF_CGROUP_MAX_PROGS 64
 
+static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+					       struct bpf_prog *prog,
+					       struct bpf_cgroup_link *link,
+					       struct bpf_prog *replace_prog,
+					       bool allow_multi)
+{
+	struct bpf_prog_list *pl;
+
+	/* single-attach case */
+	if (!allow_multi) {
+		if (list_empty(progs))
+			return NULL;
+		return list_first_entry(progs, typeof(*pl), node);
+	}
+
+	list_for_each_entry(pl, progs, node) {
+		if (prog && pl->prog == prog && prog != replace_prog)
+			/* disallow attaching the same prog twice */
+			return ERR_PTR(-EINVAL);
+		if (link && pl->link == link)
+			/* disallow attaching the same link twice */
+			return ERR_PTR(-EINVAL);
+	}
+
+	/* direct prog multi-attach w/ replacement case */
+	if (replace_prog) {
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog == replace_prog)
+				/* a match found */
+				return pl;
+		}
+		/* prog to replace not found for cgroup */
+		return ERR_PTR(-ENOENT);
+	}
+
+	return NULL;
+}
+
 /**
- * __cgroup_bpf_attach() - Attach the program to a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
  *                         propagate the change to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to attach
+ * @link: A link to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 * @type: Type of attach operation
+ * @flags: Option flags
 *
+ * Exactly one of @prog or @link can be non-null.
 * Must be called with cgroup_mutex held.
 */
-int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+int __cgroup_bpf_attach(struct cgroup *cgrp,
+			struct bpf_prog *prog, struct bpf_prog *replace_prog,
+			struct bpf_cgroup_link *link,
 			enum bpf_attach_type type, u32 flags)
 {
+	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
-	struct bpf_cgroup_storage *storage, *old_storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 	struct bpf_prog_list *pl;
-	bool pl_was_allocated;
 	int err;
 
-	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 		/* invalid combination */
 		return -EINVAL;
+	if (link && (prog || replace_prog))
+		/* only either link or prog/replace_prog can be specified */
+		return -EINVAL;
+	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
+		/* replace_prog implies BPF_F_REPLACE, and vice versa */
+		return -EINVAL;
 
-	if (!hierarchy_allows_attach(cgrp, type, flags))
+	if (!hierarchy_allows_attach(cgrp, type))
 		return -EPERM;
 
-	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
 		/* Disallow attaching non-overridable on top
 		 * of existing overridable in this cgroup.
 		 * Disallow attaching multi-prog if overridable or none
254467 if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
255468 return -E2BIG;
256469
257
- storage = bpf_cgroup_storage_alloc(prog);
258
- if (IS_ERR(storage))
470
+ pl = find_attach_entry(progs, prog, link, replace_prog,
471
+ flags & BPF_F_ALLOW_MULTI);
472
+ if (IS_ERR(pl))
473
+ return PTR_ERR(pl);
474
+
475
+ if (bpf_cgroup_storages_alloc(storage, new_storage, type,
476
+ prog ? : link->link.prog, cgrp))
259477 return -ENOMEM;
260478
261
- if (flags & BPF_F_ALLOW_MULTI) {
262
- list_for_each_entry(pl, progs, node) {
263
- if (pl->prog == prog) {
264
- /* disallow attaching the same prog twice */
265
- bpf_cgroup_storage_free(storage);
266
- return -EINVAL;
267
- }
268
- }
269
-
479
+ if (pl) {
480
+ old_prog = pl->prog;
481
+ } else {
270482 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
271483 if (!pl) {
272
- bpf_cgroup_storage_free(storage);
484
+ bpf_cgroup_storages_free(new_storage);
273485 return -ENOMEM;
274486 }
275
-
276
- pl_was_allocated = true;
277
- pl->prog = prog;
278
- pl->storage = storage;
279487 list_add_tail(&pl->node, progs);
280
- } else {
281
- if (list_empty(progs)) {
282
- pl = kmalloc(sizeof(*pl), GFP_KERNEL);
283
- if (!pl) {
284
- bpf_cgroup_storage_free(storage);
285
- return -ENOMEM;
286
- }
287
- pl_was_allocated = true;
288
- list_add_tail(&pl->node, progs);
289
- } else {
290
- pl = list_first_entry(progs, typeof(*pl), node);
291
- old_prog = pl->prog;
292
- old_storage = pl->storage;
293
- bpf_cgroup_storage_unlink(old_storage);
294
- pl_was_allocated = false;
295
- }
296
- pl->prog = prog;
297
- pl->storage = storage;
298488 }
299489
300
- cgrp->bpf.flags[type] = flags;
490
+ pl->prog = prog;
491
+ pl->link = link;
492
+ bpf_cgroup_storages_assign(pl->storage, storage);
493
+ cgrp->bpf.flags[type] = saved_flags;
301494
302495 err = update_effective_progs(cgrp, type);
303496 if (err)
304497 goto cleanup;
305498
306
- static_branch_inc(&cgroup_bpf_enabled_key);
307
- if (old_storage)
308
- bpf_cgroup_storage_free(old_storage);
309
- if (old_prog) {
499
+ if (old_prog)
310500 bpf_prog_put(old_prog);
311
- static_branch_dec(&cgroup_bpf_enabled_key);
312
- }
313
- bpf_cgroup_storage_link(storage, cgrp, type);
501
+ else
502
+ static_branch_inc(&cgroup_bpf_enabled_key);
503
+ bpf_cgroup_storages_link(new_storage, cgrp, type);
314504 return 0;
315505
316506 cleanup:
317
- /* and cleanup the prog list */
318
- pl->prog = old_prog;
319
- bpf_cgroup_storage_free(pl->storage);
320
- pl->storage = old_storage;
321
- bpf_cgroup_storage_link(old_storage, cgrp, type);
322
- if (pl_was_allocated) {
507
+ if (old_prog) {
508
+ pl->prog = old_prog;
509
+ pl->link = NULL;
510
+ }
511
+ bpf_cgroup_storages_free(new_storage);
512
+ if (!old_prog) {
323513 list_del(&pl->node);
324514 kfree(pl);
325515 }
326516 return err;
327517 }
328518
519
+/* Swap updated BPF program for given link in effective program arrays across
520
+ * all descendant cgroups. This function is guaranteed to succeed.
521
+ */
522
+static void replace_effective_prog(struct cgroup *cgrp,
523
+ enum bpf_attach_type type,
524
+ struct bpf_cgroup_link *link)
525
+{
526
+ struct bpf_prog_array_item *item;
527
+ struct cgroup_subsys_state *css;
528
+ struct bpf_prog_array *progs;
529
+ struct bpf_prog_list *pl;
530
+ struct list_head *head;
531
+ struct cgroup *cg;
532
+ int pos;
533
+
534
+ css_for_each_descendant_pre(css, &cgrp->self) {
535
+ struct cgroup *desc = container_of(css, struct cgroup, self);
536
+
537
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
538
+ continue;
539
+
540
+ /* find position of link in effective progs array */
541
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
542
+ if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
543
+ continue;
544
+
545
+ head = &cg->bpf.progs[type];
546
+ list_for_each_entry(pl, head, node) {
547
+ if (!prog_list_prog(pl))
548
+ continue;
549
+ if (pl->link == link)
550
+ goto found;
551
+ pos++;
552
+ }
553
+ }
554
+found:
555
+ BUG_ON(!cg);
556
+ progs = rcu_dereference_protected(
557
+ desc->bpf.effective[type],
558
+ lockdep_is_held(&cgroup_mutex));
559
+ item = &progs->items[pos];
560
+ WRITE_ONCE(item->prog, link->link.prog);
561
+ }
562
+}
563
+
329564 /**
330
- * __cgroup_bpf_detach() - Detach the program from a cgroup, and
331
- * propagate the change to descendants
565
+ * __cgroup_bpf_replace() - Replace link's program and propagate the change
566
+ * to descendants
332567 * @cgrp: The cgroup which descendants to traverse
333
- * @prog: A program to detach or NULL
334
- * @type: Type of detach operation
568
+ * @link: A link for which to replace BPF program
569
+ * @type: Type of attach operation
335570 *
336571 * Must be called with cgroup_mutex held.
337572 */
573
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
574
+ struct bpf_cgroup_link *link,
575
+ struct bpf_prog *new_prog)
576
+{
577
+ struct list_head *progs = &cgrp->bpf.progs[link->type];
578
+ struct bpf_prog *old_prog;
579
+ struct bpf_prog_list *pl;
580
+ bool found = false;
581
+
582
+ if (link->link.prog->type != new_prog->type)
583
+ return -EINVAL;
584
+
585
+ list_for_each_entry(pl, progs, node) {
586
+ if (pl->link == link) {
587
+ found = true;
588
+ break;
589
+ }
590
+ }
591
+ if (!found)
592
+ return -ENOENT;
593
+
594
+ old_prog = xchg(&link->link.prog, new_prog);
595
+ replace_effective_prog(cgrp, link->type, link);
596
+ bpf_prog_put(old_prog);
597
+ return 0;
598
+}
599
+
600
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
601
+ struct bpf_prog *old_prog)
602
+{
603
+ struct bpf_cgroup_link *cg_link;
604
+ int ret;
605
+
606
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
607
+
608
+ mutex_lock(&cgroup_mutex);
609
+ /* link might have been auto-released by dying cgroup, so fail */
610
+ if (!cg_link->cgroup) {
611
+ ret = -ENOLINK;
612
+ goto out_unlock;
613
+ }
614
+ if (old_prog && link->prog != old_prog) {
615
+ ret = -EPERM;
616
+ goto out_unlock;
617
+ }
618
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
619
+out_unlock:
620
+ mutex_unlock(&cgroup_mutex);
621
+ return ret;
622
+}
623
+
624
+static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
625
+ struct bpf_prog *prog,
626
+ struct bpf_cgroup_link *link,
627
+ bool allow_multi)
628
+{
629
+ struct bpf_prog_list *pl;
630
+
631
+ if (!allow_multi) {
632
+ if (list_empty(progs))
633
+ /* report error when trying to detach and nothing is attached */
634
+ return ERR_PTR(-ENOENT);
635
+
636
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
637
+ * allow detaching with invalid FD (prog==NULL) in legacy mode
638
+ */
639
+ return list_first_entry(progs, typeof(*pl), node);
640
+ }
641
+
642
+ if (!prog && !link)
643
+ /* to detach MULTI prog the user has to specify valid FD
644
+ * of the program or link to be detached
645
+ */
646
+ return ERR_PTR(-EINVAL);
647
+
648
+ /* find the prog or link and detach it */
649
+ list_for_each_entry(pl, progs, node) {
650
+ if (pl->prog == prog && pl->link == link)
651
+ return pl;
652
+ }
653
+ return ERR_PTR(-ENOENT);
654
+}
655
+
656
+/**
657
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
658
+ * cgrp->bpf.inactive table we can recover by
659
+ * recomputing the array in place.
660
+ *
661
+ * @cgrp: The cgroup which descendants to travers
662
+ * @prog: A program to detach or NULL
663
+ * @link: A link to detach or NULL
664
+ * @type: Type of detach operation
665
+ */
666
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
667
+ struct bpf_cgroup_link *link,
668
+ enum bpf_attach_type type)
669
+{
670
+ struct cgroup_subsys_state *css;
671
+ struct bpf_prog_array *progs;
672
+ struct bpf_prog_list *pl;
673
+ struct list_head *head;
674
+ struct cgroup *cg;
675
+ int pos;
676
+
677
+ /* recompute effective prog array in place */
678
+ css_for_each_descendant_pre(css, &cgrp->self) {
679
+ struct cgroup *desc = container_of(css, struct cgroup, self);
680
+
681
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
682
+ continue;
683
+
684
+ /* find position of link or prog in effective progs array */
685
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
686
+ if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI))
687
+ continue;
688
+
689
+ head = &cg->bpf.progs[type];
690
+ list_for_each_entry(pl, head, node) {
691
+ if (!prog_list_prog(pl))
692
+ continue;
693
+ if (pl->prog == prog && pl->link == link)
694
+ goto found;
695
+ pos++;
696
+ }
697
+ }
698
+
699
+ /* no link or prog match, skip the cgroup of this layer */
700
+ continue;
701
+found:
702
+ progs = rcu_dereference_protected(
703
+ desc->bpf.effective[type],
704
+ lockdep_is_held(&cgroup_mutex));
705
+
706
+ /* Remove the program from the array */
707
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
708
+ "Failed to purge a prog from array at index %d", pos);
709
+ }
710
+}
711
+
712
+/**
713
+ * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
714
+ * propagate the change to descendants
715
+ * @cgrp: The cgroup which descendants to traverse
716
+ * @prog: A program to detach or NULL
717
+ * @prog: A link to detach or NULL
718
+ * @type: Type of detach operation
719
+ *
720
+ * At most one of @prog or @link can be non-NULL.
721
+ * Must be called with cgroup_mutex held.
722
+ */
338723 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
339
- enum bpf_attach_type type, u32 unused_flags)
724
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
340725 {
341726 struct list_head *progs = &cgrp->bpf.progs[type];
342727 u32 flags = cgrp->bpf.flags[type];
343
- struct bpf_prog *old_prog = NULL;
344728 struct bpf_prog_list *pl;
345
- int err;
729
+ struct bpf_prog *old_prog;
346730
347
- if (flags & BPF_F_ALLOW_MULTI) {
348
- if (!prog)
349
- /* to detach MULTI prog the user has to specify valid FD
350
- * of the program to be detached
351
- */
352
- return -EINVAL;
353
- } else {
354
- if (list_empty(progs))
355
- /* report error when trying to detach and nothing is attached */
356
- return -ENOENT;
731
+ if (prog && link)
732
+ /* only one of prog or link can be specified */
733
+ return -EINVAL;
734
+
735
+ pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
736
+ if (IS_ERR(pl))
737
+ return PTR_ERR(pl);
738
+
739
+ /* mark it deleted, so it's ignored while recomputing effective */
740
+ old_prog = pl->prog;
741
+ pl->prog = NULL;
742
+ pl->link = NULL;
743
+
744
+ if (update_effective_progs(cgrp, type)) {
745
+ /* if update effective array failed replace the prog with a dummy prog*/
746
+ pl->prog = old_prog;
747
+ pl->link = link;
748
+ purge_effective_progs(cgrp, old_prog, link, type);
357749 }
358
-
359
- if (flags & BPF_F_ALLOW_MULTI) {
360
- /* find the prog and detach it */
361
- list_for_each_entry(pl, progs, node) {
362
- if (pl->prog != prog)
363
- continue;
364
- old_prog = prog;
365
- /* mark it deleted, so it's ignored while
366
- * recomputing effective
367
- */
368
- pl->prog = NULL;
369
- break;
370
- }
371
- if (!old_prog)
372
- return -ENOENT;
373
- } else {
374
- /* to maintain backward compatibility NONE and OVERRIDE cgroups
375
- * allow detaching with invalid FD (prog==NULL)
376
- */
377
- pl = list_first_entry(progs, typeof(*pl), node);
378
- old_prog = pl->prog;
379
- pl->prog = NULL;
380
- }
381
-
382
- err = update_effective_progs(cgrp, type);
383
- if (err)
384
- goto cleanup;
385750
386751 /* now can actually delete it from this cgroup list */
387752 list_del(&pl->node);
388
- bpf_cgroup_storage_unlink(pl->storage);
389
- bpf_cgroup_storage_free(pl->storage);
390753 kfree(pl);
391754 if (list_empty(progs))
392755 /* last program was detached, reset flags to zero */
393756 cgrp->bpf.flags[type] = 0;
394
-
395
- bpf_prog_put(old_prog);
757
+ if (old_prog)
758
+ bpf_prog_put(old_prog);
396759 static_branch_dec(&cgroup_bpf_enabled_key);
397760 return 0;
398
-
399
-cleanup:
400
- /* and restore back old_prog */
401
- pl->prog = old_prog;
402
- return err;
403761 }
404762
405763 /* Must be called with cgroup_mutex held to avoid races. */
....@@ -410,10 +768,15 @@
410768 enum bpf_attach_type type = attr->query.attach_type;
411769 struct list_head *progs = &cgrp->bpf.progs[type];
412770 u32 flags = cgrp->bpf.flags[type];
771
+ struct bpf_prog_array *effective;
772
+ struct bpf_prog *prog;
413773 int cnt, ret = 0, i;
414774
775
+ effective = rcu_dereference_protected(cgrp->bpf.effective[type],
776
+ lockdep_is_held(&cgroup_mutex));
777
+
415778 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
416
- cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
779
+ cnt = bpf_prog_array_length(effective);
417780 else
418781 cnt = prog_list_length(progs);
419782
....@@ -430,15 +793,15 @@
430793 }
431794
432795 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
433
- return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
434
- prog_ids, cnt);
796
+ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
435797 } else {
436798 struct bpf_prog_list *pl;
437799 u32 id;
438800
439801 i = 0;
440802 list_for_each_entry(pl, progs, node) {
441
- id = pl->prog->aux->id;
803
+ prog = prog_list_prog(pl);
804
+ id = prog->aux->id;
442805 if (copy_to_user(prog_ids + i, &id, sizeof(id)))
443806 return -EFAULT;
444807 if (++i == cnt)
....@@ -451,6 +814,7 @@
451814 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
452815 enum bpf_prog_type ptype, struct bpf_prog *prog)
453816 {
817
+ struct bpf_prog *replace_prog = NULL;
454818 struct cgroup *cgrp;
455819 int ret;
456820
....@@ -458,8 +822,20 @@
458822 if (IS_ERR(cgrp))
459823 return PTR_ERR(cgrp);
460824
461
- ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
462
- attr->attach_flags);
825
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
826
+ (attr->attach_flags & BPF_F_REPLACE)) {
827
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
828
+ if (IS_ERR(replace_prog)) {
829
+ cgroup_put(cgrp);
830
+ return PTR_ERR(replace_prog);
831
+ }
832
+ }
833
+
834
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
835
+ attr->attach_type, attr->attach_flags);
836
+
837
+ if (replace_prog)
838
+ bpf_prog_put(replace_prog);
463839 cgroup_put(cgrp);
464840 return ret;
465841 }
....@@ -478,12 +854,147 @@
478854 if (IS_ERR(prog))
479855 prog = NULL;
480856
481
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
857
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
482858 if (prog)
483859 bpf_prog_put(prog);
484860
485861 cgroup_put(cgrp);
486862 return ret;
863
+}
864
+
865
+static void bpf_cgroup_link_release(struct bpf_link *link)
866
+{
867
+ struct bpf_cgroup_link *cg_link =
868
+ container_of(link, struct bpf_cgroup_link, link);
869
+ struct cgroup *cg;
870
+
871
+ /* link might have been auto-detached by dying cgroup already,
872
+ * in that case our work is done here
873
+ */
874
+ if (!cg_link->cgroup)
875
+ return;
876
+
877
+ mutex_lock(&cgroup_mutex);
878
+
879
+ /* re-check cgroup under lock again */
880
+ if (!cg_link->cgroup) {
881
+ mutex_unlock(&cgroup_mutex);
882
+ return;
883
+ }
884
+
885
+ WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
886
+ cg_link->type));
887
+
888
+ cg = cg_link->cgroup;
889
+ cg_link->cgroup = NULL;
890
+
891
+ mutex_unlock(&cgroup_mutex);
892
+
893
+ cgroup_put(cg);
894
+}
895
+
896
+static void bpf_cgroup_link_dealloc(struct bpf_link *link)
897
+{
898
+ struct bpf_cgroup_link *cg_link =
899
+ container_of(link, struct bpf_cgroup_link, link);
900
+
901
+ kfree(cg_link);
902
+}
903
+
904
+static int bpf_cgroup_link_detach(struct bpf_link *link)
905
+{
906
+ bpf_cgroup_link_release(link);
907
+
908
+ return 0;
909
+}
910
+
911
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
912
+ struct seq_file *seq)
913
+{
914
+ struct bpf_cgroup_link *cg_link =
915
+ container_of(link, struct bpf_cgroup_link, link);
916
+ u64 cg_id = 0;
917
+
918
+ mutex_lock(&cgroup_mutex);
919
+ if (cg_link->cgroup)
920
+ cg_id = cgroup_id(cg_link->cgroup);
921
+ mutex_unlock(&cgroup_mutex);
922
+
923
+ seq_printf(seq,
924
+ "cgroup_id:\t%llu\n"
925
+ "attach_type:\t%d\n",
926
+ cg_id,
927
+ cg_link->type);
928
+}
929
+
930
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
931
+ struct bpf_link_info *info)
932
+{
933
+ struct bpf_cgroup_link *cg_link =
934
+ container_of(link, struct bpf_cgroup_link, link);
935
+ u64 cg_id = 0;
936
+
937
+ mutex_lock(&cgroup_mutex);
938
+ if (cg_link->cgroup)
939
+ cg_id = cgroup_id(cg_link->cgroup);
940
+ mutex_unlock(&cgroup_mutex);
941
+
942
+ info->cgroup.cgroup_id = cg_id;
943
+ info->cgroup.attach_type = cg_link->type;
944
+ return 0;
945
+}
946
+
947
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
948
+ .release = bpf_cgroup_link_release,
949
+ .dealloc = bpf_cgroup_link_dealloc,
950
+ .detach = bpf_cgroup_link_detach,
951
+ .update_prog = cgroup_bpf_replace,
952
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
953
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
954
+};
955
+
956
+int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
957
+{
958
+ struct bpf_link_primer link_primer;
959
+ struct bpf_cgroup_link *link;
960
+ struct cgroup *cgrp;
961
+ int err;
962
+
963
+ if (attr->link_create.flags)
964
+ return -EINVAL;
965
+
966
+ cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
967
+ if (IS_ERR(cgrp))
968
+ return PTR_ERR(cgrp);
969
+
970
+ link = kzalloc(sizeof(*link), GFP_USER);
971
+ if (!link) {
972
+ err = -ENOMEM;
973
+ goto out_put_cgroup;
974
+ }
975
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
976
+ prog);
977
+ link->cgroup = cgrp;
978
+ link->type = attr->link_create.attach_type;
979
+
980
+ err = bpf_link_prime(&link->link, &link_primer);
981
+ if (err) {
982
+ kfree(link);
983
+ goto out_put_cgroup;
984
+ }
985
+
986
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type,
987
+ BPF_F_ALLOW_MULTI);
988
+ if (err) {
989
+ bpf_link_cleanup(&link_primer);
990
+ goto out_put_cgroup;
991
+ }
992
+
993
+ return bpf_link_settle(&link_primer);
994
+
995
+out_put_cgroup:
996
+ cgroup_put(cgrp);
997
+ return err;
487998 }
488999
4891000 int cgroup_bpf_prog_query(const union bpf_attr *attr,
....@@ -514,8 +1025,16 @@
5141025 * The program type passed in via @type must be suitable for network
5151026 * filtering. No further check is performed to assert that.
5161027 *
517
- * This function will return %-EPERM if any if an attached program was found
518
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
1028
+ * For egress packets, this function can return:
1029
+ * NET_XMIT_SUCCESS (0) - continue with packet output
1030
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
1031
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
1032
+ * to call cwr
1033
+ * -EPERM - drop packet
1034
+ *
1035
+ * For ingress packets, this function will return -EPERM if any
1036
+ * attached program was found and if it returned != 1 during execution.
1037
+ * Otherwise 0 is returned.
5191038 */
5201039 int __cgroup_bpf_run_filter_skb(struct sock *sk,
5211040 struct sk_buff *skb,
....@@ -523,6 +1042,7 @@
5231042 {
5241043 unsigned int offset = skb->data - skb_network_header(skb);
5251044 struct sock *save_sk;
1045
+ void *saved_data_end;
5261046 struct cgroup *cgrp;
5271047 int ret;
5281048
....@@ -536,11 +1056,23 @@
5361056 save_sk = skb->sk;
5371057 skb->sk = sk;
5381058 __skb_push(skb, offset);
539
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
540
- bpf_prog_run_save_cb);
1059
+
1060
+ /* compute pointers for the bpf prog */
1061
+ bpf_compute_and_save_data_end(skb, &saved_data_end);
1062
+
1063
+ if (type == BPF_CGROUP_INET_EGRESS) {
1064
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
1065
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
1066
+ } else {
1067
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
1068
+ __bpf_prog_run_save_cb);
1069
+ ret = (ret == 1 ? 0 : -EPERM);
1070
+ }
1071
+ bpf_restore_data_end(skb, saved_data_end);
5411072 __skb_pull(skb, offset);
5421073 skb->sk = save_sk;
543
- return ret == 1 ? 0 : -EPERM;
1074
+
1075
+ return ret;
5441076 }
5451077 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
5461078
@@ -661,28 +1193,28 @@
 
 	return !allow;
 }
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
 
 static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
-	case BPF_FUNC_map_lookup_elem:
-		return &bpf_map_lookup_elem_proto;
-	case BPF_FUNC_map_update_elem:
-		return &bpf_map_update_elem_proto;
-	case BPF_FUNC_map_delete_elem:
-		return &bpf_map_delete_elem_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
-	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
-			return bpf_get_trace_printk_proto();
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_event_output_data_proto;
 	default:
-		return NULL;
+		return bpf_base_func_proto(func_id);
 	}
+}
+
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return cgroup_base_func_proto(func_id, prog);
 }
 
 static bool cgroup_dev_is_valid_access(int off, int size,
@@ -722,3 +1254,715 @@
 	.get_func_proto		= cgroup_dev_func_proto,
 	.is_valid_access	= cgroup_dev_is_valid_access,
 };
+
+/**
+ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
+ *
+ * @head: sysctl table header
+ * @table: sysctl table
+ * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer (in and out)
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ *	result is size of @new_buf if program set new value, initial value
+ *	otherwise
+ * @ppos: value-result argument: value is position at which read from or write
+ *	to sysctl is happening, result is new position if program overrode it,
+ *	initial value otherwise
+ * @type: type of program to be executed
+ *
+ * Program is run when sysctl is being accessed, either read or written, and
+ * can allow or deny such access.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+				   struct ctl_table *table, int write,
+				   char **buf, size_t *pcount, loff_t *ppos,
+				   enum bpf_attach_type type)
+{
+	struct bpf_sysctl_kern ctx = {
+		.head = head,
+		.table = table,
+		.write = write,
+		.ppos = ppos,
+		.cur_val = NULL,
+		.cur_len = PAGE_SIZE,
+		.new_val = NULL,
+		.new_len = 0,
+		.new_updated = 0,
+	};
+	struct cgroup *cgrp;
+	loff_t pos = 0;
+	int ret;
+
+	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
+	if (!ctx.cur_val ||
+	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
+		/* Let BPF program decide how to proceed. */
+		ctx.cur_len = 0;
+	}
+
+	if (write && *buf && *pcount) {
+		/* BPF program should be able to override new value with a
+		 * buffer bigger than provided by user.
+		 */
+		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
+		if (ctx.new_val) {
+			memcpy(ctx.new_val, *buf, ctx.new_len);
+		} else {
+			/* Let BPF program decide how to proceed. */
+			ctx.new_len = 0;
+		}
+	}
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	kfree(ctx.cur_val);
+
+	if (ret == 1 && ctx.new_updated) {
+		kfree(*buf);
+		*buf = ctx.new_val;
+		*pcount = ctx.new_len;
+	} else {
+		kfree(ctx.new_val);
+	}
+
+	return ret == 1 ? 0 : -EPERM;
+}
+
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+					     enum bpf_attach_type attach_type)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+	empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+
+	return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+	if (unlikely(max_optlen < 0))
+		return -EINVAL;
+
+	if (unlikely(max_optlen > PAGE_SIZE)) {
+		/* We don't expose optvals that are greater than PAGE_SIZE
+		 * to the BPF program.
+		 */
+		max_optlen = PAGE_SIZE;
+	}
+
+	ctx->optval = kzalloc(max_optlen, GFP_USER);
+	if (!ctx->optval)
+		return -ENOMEM;
+
+	ctx->optval_end = ctx->optval + max_optlen;
+
+	return max_optlen;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+	kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = *level,
+		.optname = *optname,
+	};
+	int ret, max_optlen;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+		return 0;
+
+	/* Allocate a bit more than the initial user buffer for
+	 * BPF program. The canonical use case is overriding
+	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+	 */
+	max_optlen = max_t(int, 16, *optlen);
+
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	if (max_optlen < 0)
+		return max_optlen;
+
+	ctx.optlen = *optlen;
+
+	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen == -1) {
+		/* optlen set to -1, bypass kernel */
+		ret = 1;
+	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
+		/* optlen is out of bounds */
+		ret = -EFAULT;
+	} else {
+		/* optlen within bounds, run kernel handler */
+		ret = 0;
+
+		/* export any potential modifications */
+		*level = ctx.level;
+		*optname = ctx.optname;
+
+		/* optlen == 0 from BPF indicates that we should
+		 * use original userspace data.
+		 */
+		if (ctx.optlen != 0) {
+			*optlen = ctx.optlen;
+			*kernel_optval = ctx.optval;
+			/* export and don't free sockopt buf */
+			return 0;
+		}
+	}
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+		return retval;
+
+	ctx.optlen = max_optlen;
+
+	max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+	if (max_optlen < 0)
+		return max_optlen;
+
+	if (!retval) {
+		/* If kernel getsockopt finished successfully,
+		 * copy whatever was returned to the user back
+		 * into our temporary buffer. Set optlen to the
+		 * one that kernel returned as well to let
+		 * BPF programs inspect the value.
+		 */
+
+		if (get_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (ctx.optlen < 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (copy_from_user(ctx.optval, optval,
+				   min(ctx.optlen, max_optlen)) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ctx.optlen != 0) {
+		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (put_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	ret = ctx.retval;
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+#endif
+
+static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+			      size_t *lenp)
+{
+	ssize_t tmp_ret = 0, ret;
+
+	if (dir->header.parent) {
+		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
+		if (tmp_ret < 0)
+			return tmp_ret;
+	}
+
+	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
+	if (ret < 0)
+		return ret;
+	*bufp += ret;
+	*lenp -= ret;
+	ret += tmp_ret;
+
+	/* Avoid leading slash. */
+	if (!ret)
+		return ret;
+
+	tmp_ret = strscpy(*bufp, "/", *lenp);
+	if (tmp_ret < 0)
+		return tmp_ret;
+	*bufp += tmp_ret;
+	*lenp -= tmp_ret;
+
+	return ret + tmp_ret;
+}
+
+BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
+	   size_t, buf_len, u64, flags)
+{
+	ssize_t tmp_ret = 0, ret;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
+		if (!ctx->head)
+			return -EINVAL;
+		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
+		if (tmp_ret < 0)
+			return tmp_ret;
+	}
+
+	ret = strscpy(buf, ctx->table->procname, buf_len);
+
+	return ret < 0 ? ret : tmp_ret + ret;
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
+	.func		= bpf_sysctl_get_name,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
+			     size_t src_len)
+{
+	if (!dst)
+		return -EINVAL;
+
+	if (!dst_len)
+		return -E2BIG;
+
+	if (!src || !src_len) {
+		memset(dst, 0, dst_len);
+		return -EINVAL;
+	}
+
+	memcpy(dst, src, min(dst_len, src_len));
+
+	if (dst_len > src_len) {
+		memset(dst + src_len, '\0', dst_len - src_len);
+		return src_len;
+	}
+
+	dst[dst_len - 1] = '\0';
+
+	return -E2BIG;
+}
+
+BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
+	   char *, buf, size_t, buf_len)
+{
+	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
+	.func		= bpf_sysctl_get_current_value,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+	   size_t, buf_len)
+{
+	if (!ctx->write) {
+		if (buf && buf_len)
+			memset(buf, '\0', buf_len);
+		return -EINVAL;
+	}
+	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+	.func		= bpf_sysctl_get_new_value,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+	   const char *, buf, size_t, buf_len)
+{
+	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+		return -EINVAL;
+
+	if (buf_len > PAGE_SIZE - 1)
+		return -E2BIG;
+
+	memcpy(ctx->new_val, buf, buf_len);
+	ctx->new_len = buf_len;
+	ctx->new_updated = 1;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+	.func		= bpf_sysctl_set_new_value,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_strtol:
+		return &bpf_strtol_proto;
+	case BPF_FUNC_strtoul:
+		return &bpf_strtoul_proto;
+	case BPF_FUNC_sysctl_get_name:
+		return &bpf_sysctl_get_name_proto;
+	case BPF_FUNC_sysctl_get_current_value:
+		return &bpf_sysctl_get_current_value_proto;
+	case BPF_FUNC_sysctl_get_new_value:
+		return &bpf_sysctl_get_new_value_proto;
+	case BPF_FUNC_sysctl_set_new_value:
+		return &bpf_sysctl_set_new_value_proto;
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
+				   const struct bpf_prog *prog,
+				   struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sysctl, write):
+		if (type != BPF_READ)
+			return false;
+		bpf_ctx_record_field_size(info, size_default);
+		return bpf_ctx_narrow_access_ok(off, size, size_default);
+	case bpf_ctx_range(struct bpf_sysctl, file_pos):
+		if (type == BPF_READ) {
+			bpf_ctx_record_field_size(info, size_default);
+			return bpf_ctx_narrow_access_ok(off, size, size_default);
+		} else {
+			return size == size_default;
+		}
+	default:
+		return false;
+	}
+}
+
+static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
+				     const struct bpf_insn *si,
+				     struct bpf_insn *insn_buf,
+				     struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+	u32 read_size;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sysctl, write):
+		*insn++ = BPF_LDX_MEM(
+			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+			bpf_target_off(struct bpf_sysctl_kern, write,
+				       sizeof_field(struct bpf_sysctl_kern,
+						    write),
+				       target_size));
+		break;
+	case offsetof(struct bpf_sysctl, file_pos):
+		/* ppos is a pointer so it should be accessed via indirect
+		 * loads and stores. Also for stores additional temporary
+		 * register is used since neither src_reg nor dst_reg can be
+		 * overridden.
+		 */
+		if (type == BPF_WRITE) {
+			int treg = BPF_REG_9;
+
+			if (si->src_reg == treg || si->dst_reg == treg)
+				--treg;
+			if (si->src_reg == treg || si->dst_reg == treg)
+				--treg;
+			*insn++ = BPF_STX_MEM(
+				BPF_DW, si->dst_reg, treg,
+				offsetof(struct bpf_sysctl_kern, tmp_reg));
+			*insn++ = BPF_LDX_MEM(
+				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+				treg, si->dst_reg,
+				offsetof(struct bpf_sysctl_kern, ppos));
+			*insn++ = BPF_STX_MEM(
+				BPF_SIZEOF(u32), treg, si->src_reg,
+				bpf_ctx_narrow_access_offset(
+					0, sizeof(u32), sizeof(loff_t)));
+			*insn++ = BPF_LDX_MEM(
+				BPF_DW, treg, si->dst_reg,
+				offsetof(struct bpf_sysctl_kern, tmp_reg));
+		} else {
+			*insn++ = BPF_LDX_MEM(
+				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+				si->dst_reg, si->src_reg,
+				offsetof(struct bpf_sysctl_kern, ppos));
+			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
+			*insn++ = BPF_LDX_MEM(
+				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+				bpf_ctx_narrow_access_offset(
+					0, read_size, sizeof(loff_t)));
+		}
+		*target_size = sizeof(u32);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
+	.get_func_proto		= sysctl_func_proto,
+	.is_valid_access	= sysctl_is_valid_access,
+	.convert_ctx_access	= sysctl_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sysctl_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+#ifdef CONFIG_NET
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sockopt))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sockopt, retval):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_GETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optname):
+			fallthrough;
+		case offsetof(struct bpf_sockopt, level):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_SETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optlen):
+			return size == size_default;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct bpf_sockopt, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (size != size_default)
+			return false;
+		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	default:
+		if (size != size_default)
+			return false;
+		break;
+	}
+	return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
+	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
+	  si->dst_reg, si->src_reg,					\
+	  offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sockopt, sk):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+		break;
+	case offsetof(struct bpf_sockopt, level):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+		break;
+	case offsetof(struct bpf_sockopt, optname):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+		break;
+	case offsetof(struct bpf_sockopt, optlen):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+				   bool direct_write,
+				   const struct bpf_prog *prog)
+{
+	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
+	 */
+	return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+	.get_func_proto		= cg_sockopt_func_proto,
+	.is_valid_access	= cg_sockopt_is_valid_access,
+	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
+	.gen_prologue		= cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
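
Illustrative usage note (not part of the patch): the two userspace-visible attach paths this change touches are the legacy BPF_PROG_ATTACH with the BPF_F_REPLACE flag (handled in cgroup_bpf_prog_attach() above via attr->replace_bpf_fd) and the bpf_link-based BPF_LINK_CREATE (handled in cgroup_bpf_link_attach()). A minimal sketch of both with the raw bpf(2) syscall follows; prog_fd, new_prog_fd and cgroup_fd are assumed to be valid FDs obtained elsewhere (e.g. BPF_PROG_LOAD and open() on a cgroup v2 directory).

/* Illustrative only -- not part of the patch. */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* Legacy attach with in-place replacement (BPF_F_REPLACE, kernel >= 5.6).
 * Atomically swaps old_prog_fd for new_prog_fd in the cgroup's multi-prog
 * list, ending up in __cgroup_bpf_attach() with replace_prog set.
 */
static int attach_replace(int cgroup_fd, int old_prog_fd, int new_prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd      = cgroup_fd;
	attr.attach_bpf_fd  = new_prog_fd;
	attr.attach_type    = BPF_CGROUP_INET_EGRESS;
	attr.attach_flags   = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
	attr.replace_bpf_fd = old_prog_fd;
	return sys_bpf(BPF_PROG_ATTACH, &attr);
}

/* Link-based attach (BPF_LINK_CREATE, kernel >= 5.7). The program stays
 * attached for as long as the returned link FD is held; closing the last
 * FD detaches it via bpf_cgroup_link_release() above.
 */
static int link_attach(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd     = prog_fd;
	attr.link_create.target_fd   = cgroup_fd;
	attr.link_create.attach_type = BPF_CGROUP_INET_EGRESS;
	return sys_bpf(BPF_LINK_CREATE, &attr); /* returns link FD */
}

A link FD obtained this way can later be retargeted atomically with BPF_LINK_UPDATE (attr.link_update.link_fd / .new_prog_fd), which reaches cgroup_bpf_replace() in this file; if the owning cgroup has died, the update fails with -ENOLINK as implemented above.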