2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/seccomp.c
@@ -13,6 +13,7 @@
  * Mode 2 allows user-defined system call filters in the form
  * of Berkeley Packet Filters/Linux Socket Filters.
  */
+#define pr_fmt(fmt) "seccomp: " fmt
 
 #include <linux/refcount.h>
 #include <linux/audit.h>
@@ -28,31 +29,175 @@
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
 
-/* Not exposed in headers: strictly internal use only. */
-#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
-
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
 #include <asm/syscall.h>
 #endif
 
 #ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/capability.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR	SECCOMP_IOR(2, __u64)
+
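
The `_WRONG_DIR` alias is cheap to keep because an ioctl number encodes its direction separately from everything else: on most architectures, bits 0-7 hold the command number, bits 8-15 the magic ('!' for seccomp), bits 16-29 the argument size, and bits 30-31 the direction. SECCOMP_IOR(2, __u64) and the corrected SECCOMP_IOW(2, __u64) therefore differ only in the top two bits, which is why both can be matched in seccomp_notify_ioctl() further down. A minimal userspace sketch of the two encodings (the SECCOMP_IOC_MAGIC value mirrors the UAPI header):

    #include <stdio.h>
    #include <linux/ioctl.h>
    #include <linux/types.h>

    /* Mirrors include/uapi/linux/seccomp.h */
    #define SECCOMP_IOC_MAGIC	'!'
    #define SECCOMP_IOR(nr, type)	_IOR(SECCOMP_IOC_MAGIC, nr, type)
    #define SECCOMP_IOW(nr, type)	_IOW(SECCOMP_IOC_MAGIC, nr, type)

    int main(void)
    {
    	/* Same magic, command number and size; only bits 30-31 differ. */
    	printf("broken encoding: %#x\n", (unsigned int)SECCOMP_IOR(2, __u64));
    	printf("fixed  encoding: %#x\n", (unsigned int)SECCOMP_IOW(2, __u64));
    	return 0;
    }
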
+enum notify_state {
+	SECCOMP_NOTIFY_INIT,
+	SECCOMP_NOTIFY_SENT,
+	SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+	/* The struct pid of the task whose filter triggered the notification */
+	struct task_struct *task;
+
+	/* The "cookie" for this request; this is unique for this filter. */
+	u64 id;
+
+	/*
+	 * The seccomp data. This pointer is valid the entire time this
+	 * notification is active, since it comes from __seccomp_filter which
+	 * eclipses the entire lifecycle here.
+	 */
+	const struct seccomp_data *data;
+
+	/*
+	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+	 * struct seccomp_knotif is created and starts out in INIT. Once the
+	 * handler reads the notification off of an FD, it transitions to SENT.
+	 * If a signal is received the state transitions back to INIT and
+	 * another message is sent. When the userspace handler replies, state
+	 * transitions to REPLIED.
+	 */
+	enum notify_state state;
+
+	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+	int error;
+	long val;
+	u32 flags;
+
+	/*
+	 * Signals when this has changed states, such as the listener
+	 * dying, a new seccomp addfd message, or changing to REPLIED
+	 */
+	struct completion ready;
+
+	struct list_head list;
+
+	/* outstanding addfd requests */
+	struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ *      installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ *         is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ *       upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ *              installation, or gone away (either due to successful
+ *              reply, or signal)
+ */
+struct seccomp_kaddfd {
+	struct file *file;
+	int fd;
+	unsigned int flags;
+
+	/* To only be set on reply */
+	int ret;
+	struct completion completion;
+	struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @request: A semaphore that users of this notification can wait on for
+ *           changes. Actual reads and writes are still controlled with
+ *           filter->notify_lock.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ */
+struct notification {
+	struct semaphore request;
+	u64 next_id;
+	struct list_head notifications;
+};
+
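
struct seccomp_knotif and struct notification are kernel-internal bookkeeping; what actually crosses the listener fd are fixed-layout UAPI structures. For reference, their counterparts from include/uapi/linux/seccomp.h (the field comments are editorial, not from the header):

    struct seccomp_notif {
    	__u64 id;			/* seccomp_knotif.id, the reply cookie */
    	__u32 pid;			/* pid of the blocked task */
    	__u32 flags;			/* currently unused, always zero */
    	struct seccomp_data data;	/* nr, arch, instruction_pointer, args[6] */
    };

    struct seccomp_notif_resp {
    	__u64 id;			/* cookie being replied to */
    	__s64 val;			/* syscall return value */
    	__s32 error;			/* 0, or a negative errno */
    	__u32 flags;			/* e.g. SECCOMP_USER_NOTIF_FLAG_CONTINUE */
    };
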
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ *		  filter will always allow the syscall, for the
+ *		  native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ *		  filter will always allow the syscall, for the
+ *		  compat architecture.
+ */
+struct action_cache {
+	DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+	DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+					     const struct seccomp_data *sd)
+{
+	return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
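
The cache holds one bit per (architecture, syscall number) pair: a set bit means every filter attached so far provably allows that syscall regardless of its arguments, so the BPF programs can be skipped altogether. A standalone model of the lookup, with hypothetical names (the real seccomp_cache_check_allow_bitmap() below additionally clamps the index with array_index_nospec() so the bounds check is safe under speculation):

    #include <stdbool.h>
    #include <stdint.h>

    #define MODEL_NR_SYSCALLS 512	/* stand-in for SECCOMP_ARCH_*_NR */

    struct model_cache {
    	uint64_t allow[MODEL_NR_SYSCALLS / 64];
    };

    static bool model_check_allow(const struct model_cache *c, int nr)
    {
    	/* Out-of-range numbers miss the cache and fall back to the filters. */
    	if (nr < 0 || nr >= MODEL_NR_SYSCALLS)
    		return false;
    	return c->allow[nr / 64] & (1ULL << (nr % 64));
    }
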
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
  *
- * @usage: reference count to manage the object lifetime.
- *         get/put helpers should be used when accessing an instance
- *         outside of a lifetime-guarded section. In general, this
- *         is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ *        A filter's reference count is incremented for each directly
+ *        attached task, once for the dependent filter, and if
+ *        requested for the user notifier. When @refs reaches zero,
+ *        the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ *         attached task (filter installation, fork(), thread_sync),
+ *         and once for the dependent filter (tracked in filter->prev).
+ *         When it reaches zero it indicates that no direct or indirect
+ *         users of that filter exist. No new tasks can get associated with
+ *         this filter after reaching 0. The @users count is always less
+ *         than or equal to @refs. Hence, reaching 0 for @users does not
+ *         mean the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
  *
  * seccomp_filter objects are organized in a tree linked via the @prev
  * pointer. For any task, it appears to be a singly-linked list starting
@@ -62,13 +207,18 @@
  * how namespaces work.
  *
  * seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
  */
 struct seccomp_filter {
-	refcount_t usage;
+	refcount_t refs;
+	refcount_t users;
 	bool log;
+	struct action_cache cache;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
+	struct notification *notif;
+	struct mutex notify_lock;
+	wait_queue_head_t wqh;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -80,13 +230,17 @@
  */
 static void populate_seccomp_data(struct seccomp_data *sd)
 {
+	/*
+	 * Instead of using current_pt_regs(), we're already doing the work
+	 * to safely fetch "current", so just use "task" everywhere below.
+	 */
 	struct task_struct *task = current;
 	struct pt_regs *regs = task_pt_regs(task);
 	unsigned long args[6];
 
 	sd->nr = syscall_get_nr(task, regs);
-	sd->arch = syscall_get_arch();
-	syscall_get_arguments(task, regs, 0, 6, args);
+	sd->arch = syscall_get_arch(task);
+	syscall_get_arguments(task, regs, args);
 	sd->args[0] = args[0];
 	sd->args[1] = args[1];
 	sd->args[2] = args[2];
@@ -178,6 +332,52 @@
 	return 0;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+						    size_t bitmap_size,
+						    int syscall_nr)
+{
+	if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+		return false;
+	syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+	return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+					     const struct seccomp_data *sd)
+{
+	int syscall_nr = sd->nr;
+	const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+	/* A native-only architecture doesn't need to check sd->arch. */
+	return seccomp_cache_check_allow_bitmap(cache->allow_native,
+						SECCOMP_ARCH_NATIVE_NR,
+						syscall_nr);
+#else
+	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+		return seccomp_cache_check_allow_bitmap(cache->allow_native,
+							SECCOMP_ARCH_NATIVE_NR,
+							syscall_nr);
+	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+							SECCOMP_ARCH_COMPAT_NR,
+							syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+	WARN_ON_ONCE(true);
+	return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_run_filters - evaluates all seccomp filters against @sd
  * @sd: optional seccomp data to be passed to filters
@@ -191,27 +391,24 @@
 static u32 seccomp_run_filters(const struct seccomp_data *sd,
 			       struct seccomp_filter **match)
 {
-	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	struct seccomp_filter *f =
 			READ_ONCE(current->seccomp.filter);
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
-	if (unlikely(WARN_ON(f == NULL)))
+	if (WARN_ON(f == NULL))
 		return SECCOMP_RET_KILL_PROCESS;
 
-	if (!sd) {
-		populate_seccomp_data(&sd_local);
-		sd = &sd_local;
-	}
+	if (seccomp_cache_check_allow(f, sd))
+		return SECCOMP_RET_ALLOW;
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
+		u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
 
 		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
 			ret = cur_ret;
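
The "lowest value takes priority" rule compares only the action bits, reinterpreted as a signed 32-bit integer: ACTION_ONLY() in this file is (s32)(ret & SECCOMP_RET_ACTION_FULL). The signed cast is what lets SECCOMP_RET_KILL_PROCESS (0x80000000, negative as an s32) outrank SECCOMP_RET_KILL_THREAD (0x00000000) even though it is the larger unsigned value. The UAPI action values, for reference:

    /* From include/uapi/linux/seccomp.h; the strictest action wins. */
    #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the whole process */
    #define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill just the thread */
    #define SECCOMP_RET_TRAP         0x00030000U /* deliver SIGSYS */
    #define SECCOMP_RET_ERRNO        0x00050000U /* return an errno */
    #define SECCOMP_RET_USER_NOTIF   0x7fc00000U /* notify the supervisor */
    #define SECCOMP_RET_TRACE        0x7ff00000U /* pass to a ptrace tracer */
    #define SECCOMP_RET_LOG          0x7ffc0000U /* allow after logging */
    #define SECCOMP_RET_ALLOW        0x7fff0000U /* allow */

    /* Masks for the return value sections. */
    #define SECCOMP_RET_ACTION_FULL  0xffff0000U
    #define SECCOMP_RET_DATA         0x0000ffffU
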
@@ -272,7 +469,7 @@
  * Expects sighand and cred_guard_mutex locks to be held.
  *
  * Returns 0 on success, -ve on error, or the pid of a thread which was
- * either not in the correct seccomp mode or it did not have an ancestral
+ * either not in the correct seccomp mode or did not have an ancestral
  * seccomp filter.
  */
 static inline pid_t seccomp_can_sync_threads(void)
@@ -300,12 +497,65 @@
 		/* Return the first thread that cannot be synchronized. */
 		failed = task_pid_vnr(thread);
 		/* If the pid cannot be resolved, then return -ESRCH */
-		if (unlikely(WARN_ON(failed == 0)))
+		if (WARN_ON(failed == 0))
 			failed = -ESRCH;
 		return failed;
 	}
 
 	return 0;
+}
+
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+	if (filter) {
+		bpf_prog_destroy(filter->prog);
+		kfree(filter);
+	}
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+	while (orig && refcount_dec_and_test(&orig->users)) {
+		if (waitqueue_active(&orig->wqh))
+			wake_up_poll(&orig->wqh, EPOLLHUP);
+		orig = orig->prev;
+	}
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+	/* Clean up single-reference branches iteratively. */
+	while (orig && refcount_dec_and_test(&orig->refs)) {
+		struct seccomp_filter *freeme = orig;
+		orig = orig->prev;
+		seccomp_filter_free(freeme);
+	}
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+	/* Notify about any unused filters in the task's former filter tree. */
+	__seccomp_filter_orphan(orig);
+	/* Finally drop all references to the task's former tree. */
+	__put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ *			    drop its reference count, and notify
+ *			    about unused filters
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. As such, READ_ONCE() and
+ * barriers are not needed here, as would normally be needed.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+	struct seccomp_filter *orig = tsk->seccomp.filter;
+
+	/* Detach task from its filter tree. */
+	tsk->seccomp.filter = NULL;
+	__seccomp_filter_release(orig);
 }
 
 /**
@@ -332,14 +582,19 @@
 
 	/* Get a task reference for the new leaf node. */
 	get_seccomp_filter(caller);
+
 	/*
 	 * Drop the task reference to the shared ancestor since
 	 * current's path will hold a reference. (This also
	 * allows a put before the assignment.)
 	 */
-	put_seccomp_filter(thread);
+	__seccomp_filter_release(thread->seccomp.filter);
+
+	/* Make our new filter tree visible. */
 	smp_store_release(&thread->seccomp.filter,
 			  caller->seccomp.filter);
+	atomic_set(&thread->seccomp.filter_count,
+		   atomic_read(&caller->seccomp.filter_count));
 
 	/*
 	 * Don't let an unprivileged task work around
@@ -372,7 +627,12 @@
 {
 	struct seccomp_filter *sfilter;
 	int ret;
-	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+	const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+		true;
+#else
+		false;
+#endif
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -394,6 +654,7 @@
 	if (!sfilter)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&sfilter->notify_lock);
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
 					seccomp_check_filter, save_orig);
 	if (ret < 0) {
@@ -401,7 +662,9 @@
 		return ERR_PTR(ret);
 	}
 
-	refcount_set(&sfilter->usage, 1);
+	refcount_set(&sfilter->refs, 1);
+	refcount_set(&sfilter->users, 1);
+	init_waitqueue_head(&sfilter->wqh);
 
 	return sfilter;
 }
@@ -434,6 +697,148 @@
 	return filter;
 }
 
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * seccomp_is_const_allow - check if filter is constant allow with given data
+ * @fprog: The BPF program
+ * @sd: The seccomp data to check against, only syscall number and arch
+ *      number are considered constant.
+ */
+static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
+				   struct seccomp_data *sd)
+{
+	unsigned int reg_value = 0;
+	unsigned int pc;
+	bool op_res;
+
+	if (WARN_ON_ONCE(!fprog))
+		return false;
+
+	for (pc = 0; pc < fprog->len; pc++) {
+		struct sock_filter *insn = &fprog->filter[pc];
+		u16 code = insn->code;
+		u32 k = insn->k;
+
+		switch (code) {
+		case BPF_LD | BPF_W | BPF_ABS:
+			switch (k) {
+			case offsetof(struct seccomp_data, nr):
+				reg_value = sd->nr;
+				break;
+			case offsetof(struct seccomp_data, arch):
+				reg_value = sd->arch;
+				break;
+			default:
+				/* can't optimize (non-constant value load) */
+				return false;
+			}
+			break;
+		case BPF_RET | BPF_K:
+			/* reached return with constant values only, check allow */
+			return k == SECCOMP_RET_ALLOW;
+		case BPF_JMP | BPF_JA:
+			pc += insn->k;
+			break;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JSET | BPF_K:
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				op_res = reg_value == k;
+				break;
+			case BPF_JGE:
+				op_res = reg_value >= k;
+				break;
+			case BPF_JGT:
+				op_res = reg_value > k;
+				break;
+			case BPF_JSET:
+				op_res = !!(reg_value & k);
+				break;
+			default:
+				/* can't optimize (unknown jump) */
+				return false;
+			}
+
+			pc += op_res ? insn->jt : insn->jf;
+			break;
+		case BPF_ALU | BPF_AND | BPF_K:
+			reg_value &= k;
+			break;
+		default:
+			/* can't optimize (unknown insn) */
+			return false;
+		}
+	}
+
+	/* ran off the end of the filter?! */
+	WARN_ON(1);
+	return false;
+}
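
seccomp_is_const_allow() is a tiny, side-effect-free cBPF interpreter that treats sd->nr and sd->arch as the only inputs; any instruction outside this constant subset (argument loads, scratch memory, unrecognized ALU or jump ops) aborts the emulation and the syscall stays uncached. A conventional allowlist is fully covered. A hedged userspace sketch of such a filter, using the standard BPF_STMT/BPF_JUMP macros from <linux/filter.h> (__NR_getpid is just an example; a production filter would also check sd->arch):

    #include <stddef.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>
    #include <sys/syscall.h>

    struct sock_filter allowlist[] = {
    	/* reg_value = sd->nr (a constant during emulation) */
    	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
    		 offsetof(struct seccomp_data, nr)),
    	/* if (nr == __NR_getpid) goto allow; else goto kill; */
    	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
    	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    };

Emulated with sd.nr == __NR_getpid, this reaches BPF_RET with SECCOMP_RET_ALLOW using constants only, so the getpid bit survives; every other nr reaches the kill return and has its bit cleared by seccomp_cache_prepare_bitmap() below.
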
+
+static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
+					 void *bitmap, const void *bitmap_prev,
+					 size_t bitmap_size, int arch)
+{
+	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+	struct seccomp_data sd;
+	int nr;
+
+	if (bitmap_prev) {
+		/* The new filter must be as restrictive as the last. */
+		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
+	} else {
+		/* Before any filters, all syscalls are always allowed. */
+		bitmap_fill(bitmap, bitmap_size);
+	}
+
+	for (nr = 0; nr < bitmap_size; nr++) {
+		/* No bitmap change: not a cacheable action. */
+		if (!test_bit(nr, bitmap))
+			continue;
+
+		sd.nr = nr;
+		sd.arch = arch;
+
+		/* No bitmap change: continue to always allow. */
+		if (seccomp_is_const_allow(fprog, &sd))
+			continue;
+
+		/*
+		 * Not a cacheable action: always run filters.
+		 * atomic clear_bit() not needed, filter not visible yet.
+		 */
+		__clear_bit(nr, bitmap);
+	}
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
+ * @sfilter: The seccomp filter
+ */
+static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+	struct action_cache *cache = &sfilter->cache;
+	const struct action_cache *cache_prev =
+		sfilter->prev ? &sfilter->prev->cache : NULL;
+
+	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
+				     cache_prev ? cache_prev->allow_native : NULL,
+				     SECCOMP_ARCH_NATIVE_NR,
+				     SECCOMP_ARCH_NATIVE);
+
+#ifdef SECCOMP_ARCH_COMPAT
+	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
+				     cache_prev ? cache_prev->allow_compat : NULL,
+				     SECCOMP_ARCH_COMPAT_NR,
+				     SECCOMP_ARCH_COMPAT);
+#endif /* SECCOMP_ARCH_COMPAT */
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
 /**
  * seccomp_attach_filter: validate and attach filter
  * @flags:  flags to change filter behavior
@@ -441,7 +846,10 @@
  *
  * Caller must be holding current->sighand->siglock lock.
  *
- * Returns 0 on success, -ve on error.
+ * Returns 0 on success, -ve on error, or
+ *   - in TSYNC mode: the pid of a thread which was either not in the correct
+ *     seccomp mode or did not have an ancestral seccomp filter
+ *   - in NEW_LISTENER mode: the fd of the new listener
  */
 static long seccomp_attach_filter(unsigned int flags,
 				  struct seccomp_filter *filter)
@@ -463,8 +871,12 @@
 		int ret;
 
 		ret = seccomp_can_sync_threads();
-		if (ret)
-			return ret;
+		if (ret) {
+			if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
+				return -ESRCH;
+			else
+				return ret;
+		}
 	}
 
 	/* Set log flag, if present. */
@@ -476,7 +888,9 @@
 	 * task reference.
 	 */
 	filter->prev = current->seccomp.filter;
+	seccomp_cache_prepare(filter);
 	current->seccomp.filter = filter;
+	atomic_inc(&current->seccomp.filter_count);
 
 	/* Now that the new filter is in place, synchronize to all threads. */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -487,8 +901,7 @@
 
 static void __get_seccomp_filter(struct seccomp_filter *filter)
 {
-	/* Reference count is bounded by the number of total processes. */
-	refcount_inc(&filter->usage);
+	refcount_inc(&filter->refs);
 }
 
 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -498,40 +911,17 @@
 	if (!orig)
 		return;
 	__get_seccomp_filter(orig);
+	refcount_inc(&orig->users);
 }
 
-static inline void seccomp_filter_free(struct seccomp_filter *filter)
-{
-	if (filter) {
-		bpf_prog_destroy(filter->prog);
-		kfree(filter);
-	}
-}
-
-static void __put_seccomp_filter(struct seccomp_filter *orig)
-{
-	/* Clean up single-reference branches iteratively. */
-	while (orig && refcount_dec_and_test(&orig->usage)) {
-		struct seccomp_filter *freeme = orig;
-		orig = orig->prev;
-		seccomp_filter_free(freeme);
-	}
-}
-
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
-{
-	__put_seccomp_filter(tsk->seccomp.filter);
-}
-
-static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
+static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
 {
 	clear_siginfo(info);
 	info->si_signo = SIGSYS;
 	info->si_code = SYS_SECCOMP;
 	info->si_call_addr = (void __user *)KSTK_EIP(current);
 	info->si_errno = reason;
-	info->si_arch = syscall_get_arch();
+	info->si_arch = syscall_get_arch(current);
 	info->si_syscall = syscall;
 }
 
@@ -544,9 +934,9 @@
  */
 static void seccomp_send_sigsys(int syscall, int reason)
 {
-	struct siginfo info;
+	struct kernel_siginfo info;
 	seccomp_init_siginfo(&info, syscall, reason);
-	force_sig_info(SIGSYS, &info, current);
+	force_sig_info(&info);
 }
 #endif	/* CONFIG_SECCOMP_FILTER */
 
@@ -558,11 +948,13 @@
 #define SECCOMP_LOG_TRACE		(1 << 4)
 #define SECCOMP_LOG_LOG			(1 << 5)
 #define SECCOMP_LOG_ALLOW		(1 << 6)
+#define SECCOMP_LOG_USER_NOTIF		(1 << 7)
 
 static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
 				    SECCOMP_LOG_KILL_THREAD  |
 				    SECCOMP_LOG_TRAP  |
 				    SECCOMP_LOG_ERRNO |
+				    SECCOMP_LOG_USER_NOTIF |
 				    SECCOMP_LOG_TRACE |
 				    SECCOMP_LOG_LOG;
 
@@ -582,6 +974,9 @@
 		break;
 	case SECCOMP_RET_TRACE:
 		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
+		break;
+	case SECCOMP_RET_USER_NOTIF:
+		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
 		break;
 	case SECCOMP_RET_LOG:
 		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
@@ -613,25 +1008,24 @@
  */
 static const int mode1_syscalls[] = {
 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
-	0, /* null terminated */
+	-1, /* negative terminated */
 };
 
 static void __secure_computing_strict(int this_syscall)
 {
-	const int *syscall_whitelist = mode1_syscalls;
+	const int *allowed_syscalls = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		syscall_whitelist = get_compat_mode1_syscalls();
+		allowed_syscalls = get_compat_mode1_syscalls();
 #endif
 	do {
-		if (*syscall_whitelist == this_syscall)
+		if (*allowed_syscalls == this_syscall)
 			return;
-	} while (*++syscall_whitelist);
+	} while (*++allowed_syscalls != -1);
 
 #ifdef SECCOMP_DEBUG
 	dump_stack();
 #endif
-	current->seccomp.mode = SECCOMP_MODE_DEAD;
 	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
 	do_exit(SIGKILL);
 }
@@ -655,18 +1049,126 @@
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
+static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
+{
+	/*
+	 * Note: overflow is ok here, the id just needs to be unique per
+	 * filter.
+	 */
+	lockdep_assert_held(&filter->notify_lock);
+	return filter->notif->next_id++;
+}
+
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+	/*
+	 * Remove the notification, and reset the list pointers, indicating
+	 * that it has been handled.
+	 */
+	list_del_init(&addfd->list);
+	addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+	complete(&addfd->completion);
+}
+
+static int seccomp_do_user_notification(int this_syscall,
+					struct seccomp_filter *match,
+					const struct seccomp_data *sd)
+{
+	int err;
+	u32 flags = 0;
+	long ret = 0;
+	struct seccomp_knotif n = {};
+	struct seccomp_kaddfd *addfd, *tmp;
+
+	mutex_lock(&match->notify_lock);
+	err = -ENOSYS;
+	if (!match->notif)
+		goto out;
+
+	n.task = current;
+	n.state = SECCOMP_NOTIFY_INIT;
+	n.data = sd;
+	n.id = seccomp_next_notify_id(match);
+	init_completion(&n.ready);
+	list_add(&n.list, &match->notif->notifications);
+	INIT_LIST_HEAD(&n.addfd);
+
+	up(&match->notif->request);
+	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+
+	/*
+	 * This is where we wait for a reply from userspace.
+	 */
+	do {
+		mutex_unlock(&match->notify_lock);
+		err = wait_for_completion_interruptible(&n.ready);
+		mutex_lock(&match->notify_lock);
+		if (err != 0)
+			goto interrupted;
+
+		addfd = list_first_entry_or_null(&n.addfd,
+						 struct seccomp_kaddfd, list);
+		/* Check if we were woken up by an addfd message */
+		if (addfd)
+			seccomp_handle_addfd(addfd);
+
+	} while (n.state != SECCOMP_NOTIFY_REPLIED);
+
+	ret = n.val;
+	err = n.error;
+	flags = n.flags;
+
+interrupted:
+	/* If there were any pending addfd calls, clear them out */
+	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+		/* The process went away before we got a chance to handle it */
+		addfd->ret = -ESRCH;
+		list_del_init(&addfd->list);
+		complete(&addfd->completion);
+	}
+
+	/*
+	 * Note that it's possible the listener died in between the time when
+	 * we were notified of a response (or a signal) and when we were able to
+	 * re-acquire the lock, so only delete from the list if the
+	 * notification actually exists.
+	 *
+	 * Also note that this test is only valid because there's no way to
+	 * *reattach* to a notifier right now. If one is added, we'll need to
+	 * keep track of the notif itself and make sure they match here.
+	 */
+	if (match->notif)
+		list_del(&n.list);
+out:
+	mutex_unlock(&match->notify_lock);
+
+	/* Userspace requests to continue the syscall. */
+	if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return 0;
+
+	syscall_set_return_value(current, current_pt_regs(),
				 err, ret);
+	return -1;
+}
+
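
The blocked task sits in the wait loop above until the listener answers. On the other end of the fd, a supervisor follows a recv/validate/send cycle. A minimal sketch (error handling trimmed; the ioctls and structs are the real UAPI interface, the function itself is illustrative, and robust buffer sizing via SECCOMP_GET_NOTIF_SIZES is shown further down):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/seccomp.h>

    /* One round of supervision on a listener fd obtained with
     * SECCOMP_FILTER_FLAG_NEW_LISTENER. Returns 0 on success. */
    static int supervise_once(int listener)
    {
    	struct seccomp_notif req;
    	struct seccomp_notif_resp resp;

    	memset(&req, 0, sizeof(req));	/* RECV demands a zeroed buffer */
    	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
    		return -1;

    	/*
    	 * After inspecting the target (e.g. reading its memory via
    	 * /proc/<req.pid>/mem), re-check the cookie so a died-and-reused
    	 * notification is not acted upon.
    	 */
    	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id) < 0)
    		return -1;

    	memset(&resp, 0, sizeof(resp));
    	resp.id = req.id;
    	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;	/* run the syscall */
    	/* seccomp_notify_send() enforces error == 0 && val == 0 here. */
    	return ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
    }
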
 static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 			    const bool recheck_after_trace)
 {
 	u32 filter_ret, action;
 	struct seccomp_filter *match = NULL;
 	int data;
+	struct seccomp_data sd_local;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
 	 * been seen after TIF_SECCOMP was seen.
 	 */
 	rmb();
+
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	filter_ret = seccomp_run_filters(sd, &match);
 	data = filter_ret & SECCOMP_RET_DATA;
@@ -677,13 +1179,13 @@
 		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
 		if (data > MAX_ERRNO)
 			data = MAX_ERRNO;
-		syscall_set_return_value(current, task_pt_regs(current),
+		syscall_set_return_value(current, current_pt_regs(),
 					 -data, 0);
 		goto skip;
 
 	case SECCOMP_RET_TRAP:
 		/* Show the handler the original registers. */
-		syscall_rollback(current, task_pt_regs(current));
+		syscall_rollback(current, current_pt_regs());
 		/* Let the filter pass back 16 bits of data. */
 		seccomp_send_sigsys(this_syscall, data);
 		goto skip;
@@ -696,7 +1198,7 @@
 		/* ENOSYS these calls if there is no tracer attached. */
 		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
 			syscall_set_return_value(current,
-						 task_pt_regs(current),
+						 current_pt_regs(),
 						 -ENOSYS, 0);
 			goto skip;
 		}
@@ -716,7 +1218,7 @@
 		if (fatal_signal_pending(current))
 			goto skip;
 		/* Check if the tracer forced the syscall to be skipped. */
-		this_syscall = syscall_get_nr(current, task_pt_regs(current));
+		this_syscall = syscall_get_nr(current, current_pt_regs());
 		if (this_syscall < 0)
 			goto skip;
 
@@ -728,6 +1230,12 @@
 		 */
 		if (__seccomp_filter(this_syscall, NULL, true))
 			return -1;
+
+		return 0;
+
+	case SECCOMP_RET_USER_NOTIF:
+		if (seccomp_do_user_notification(this_syscall, match, sd))
+			goto skip;
 
 		return 0;
 
@@ -746,23 +1254,22 @@
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_KILL_PROCESS:
 	default:
-		current->seccomp.mode = SECCOMP_MODE_DEAD;
 		seccomp_log(this_syscall, SIGSYS, action, true);
 		/* Dump core only if this is the last remaining thread. */
-		if (action == SECCOMP_RET_KILL_PROCESS ||
+		if (action != SECCOMP_RET_KILL_THREAD ||
 		    get_nr_threads(current) == 1) {
-			siginfo_t info;
+			kernel_siginfo_t info;
 
 			/* Show the original registers in the dump. */
-			syscall_rollback(current, task_pt_regs(current));
+			syscall_rollback(current, current_pt_regs());
 			/* Trigger a manual coredump since do_exit skips it. */
 			seccomp_init_siginfo(&info, this_syscall, data);
 			do_coredump(&info);
 		}
-		if (action == SECCOMP_RET_KILL_PROCESS)
-			do_group_exit(SIGSYS);
-		else
+		if (action == SECCOMP_RET_KILL_THREAD)
 			do_exit(SIGSYS);
+		else
+			do_group_exit(SIGSYS);
 	}
 
 	unreachable();
@@ -791,7 +1298,7 @@
 		return 0;
 
 	this_syscall = sd ? sd->nr :
-		syscall_get_nr(current, task_pt_regs(current));
+		syscall_get_nr(current, current_pt_regs());
 
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
@@ -799,11 +1306,6 @@
 		return 0;
 	case SECCOMP_MODE_FILTER:
 		return __seccomp_filter(this_syscall, sd, false);
-	/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
-	case SECCOMP_MODE_DEAD:
-		WARN_ON_ONCE(1);
-		do_exit(SIGKILL);
-		return -1;
 	default:
 		BUG();
 	}
@@ -845,6 +1347,420 @@
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
+static void seccomp_notify_free(struct seccomp_filter *filter)
+{
+	kfree(filter->notif);
+	filter->notif = NULL;
+}
+
+static void seccomp_notify_detach(struct seccomp_filter *filter)
+{
+	struct seccomp_knotif *knotif;
+
+	if (!filter)
+		return;
+
+	mutex_lock(&filter->notify_lock);
+
+	/*
+	 * If this file is being closed because e.g. the task that owned it
+	 * died, let's wake everyone up who was waiting on us.
+	 */
+	list_for_each_entry(knotif, &filter->notif->notifications, list) {
+		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
+			continue;
+
+		knotif->state = SECCOMP_NOTIFY_REPLIED;
+		knotif->error = -ENOSYS;
+		knotif->val = 0;
+
+		/*
+		 * We do not need to wake up any pending addfd messages, as
+		 * the notifier will do that for us, as this just looks
+		 * like a standard reply.
+		 */
+		complete(&knotif->ready);
+	}
+
+	seccomp_notify_free(filter);
+	mutex_unlock(&filter->notify_lock);
+}
+
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+	struct seccomp_filter *filter = file->private_data;
+
+	seccomp_notify_detach(filter);
+	__put_seccomp_filter(filter);
+	return 0;
+}
+
+/* must be called with notif_lock held */
+static inline struct seccomp_knotif *
+find_notification(struct seccomp_filter *filter, u64 id)
+{
+	struct seccomp_knotif *cur;
+
+	lockdep_assert_held(&filter->notify_lock);
+
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == id)
+			return cur;
+	}
+
+	return NULL;
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif = NULL, *cur;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	/* Verify that we're not given garbage to keep struct extensible. */
+	ret = check_zeroed_user(buf, sizeof(unotif));
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -EINVAL;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	ret = down_interruptible(&filter->notif->request);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->state == SECCOMP_NOTIFY_INIT) {
+			knotif = cur;
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't find a notification, it could be that the task was
+	 * interrupted by a fatal signal between the time we were woken and
+	 * when we were able to acquire the lock.
+	 */
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	unotif.id = knotif->id;
+	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.data = *(knotif->data);
+
+	knotif->state = SECCOMP_NOTIFY_SENT;
+	wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
+	ret = 0;
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
+		ret = -EFAULT;
+
+		/*
+		 * Userspace screwed up. To make sure that we keep this
+		 * notification alive, let's reset it back to INIT. It
+		 * may have died when we released the lock, so we need to make
+		 * sure it's still around.
+		 */
+		mutex_lock(&filter->notify_lock);
+		knotif = find_notification(filter, unotif.id);
+		if (knotif) {
+			knotif->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+		}
+		mutex_unlock(&filter->notify_lock);
+	}
+
+	return ret;
+}
+
+static long seccomp_notify_send(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_knotif *knotif;
+	long ret;
+
+	if (copy_from_user(&resp, buf, sizeof(resp)))
+		return -EFAULT;
+
+	if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+		return -EINVAL;
+
+	if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+	    (resp.error || resp.val))
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	knotif = find_notification(filter, resp.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* Allow exactly one reply. */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+
+	ret = 0;
+	knotif->state = SECCOMP_NOTIFY_REPLIED;
+	knotif->error = resp.error;
+	knotif->val = resp.val;
+	knotif->flags = resp.flags;
+	complete(&knotif->ready);
+out:
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_id_valid(struct seccomp_filter *filter,
+				    void __user *buf)
+{
+	struct seccomp_knotif *knotif;
+	u64 id;
+	long ret;
+
+	if (copy_from_user(&id, buf, sizeof(id)))
+		return -EFAULT;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	knotif = find_notification(filter, id);
+	if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
+		ret = 0;
+	else
+		ret = -ENOENT;
+
+	mutex_unlock(&filter->notify_lock);
+	return ret;
+}
+
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+				 struct seccomp_notif_addfd __user *uaddfd,
+				 unsigned int size)
+{
+	struct seccomp_notif_addfd addfd;
+	struct seccomp_knotif *knotif;
+	struct seccomp_kaddfd kaddfd;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);
+
+	if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+	if (ret)
+		return ret;
+
+	if (addfd.newfd_flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+		return -EINVAL;
+
+	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+		return -EINVAL;
+
+	kaddfd.file = fget(addfd.srcfd);
+	if (!kaddfd.file)
+		return -EBADF;
+
+	kaddfd.flags = addfd.newfd_flags;
+	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+		    addfd.newfd : -1;
+	init_completion(&kaddfd.completion);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		goto out;
+
+	knotif = find_notification(filter, addfd.id);
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/*
+	 * We do not want to allow for FD injection to occur before the
+	 * notification has been picked up by a userspace handler, or after
+	 * the notification has been replied to.
+	 */
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out_unlock;
+	}
+
+	list_add(&kaddfd.list, &knotif->addfd);
+	complete(&knotif->ready);
+	mutex_unlock(&filter->notify_lock);
+
+	/* Now we wait for it to be processed or be interrupted */
+	ret = wait_for_completion_interruptible(&kaddfd.completion);
+	if (ret == 0) {
+		/*
+		 * We had a successful completion. The other side has already
+		 * removed us from the addfd queue, and
+		 * wait_for_completion_interruptible has a memory barrier upon
+		 * success that lets us read this value directly without
+		 * locking.
+		 */
+		ret = kaddfd.ret;
+		goto out;
+	}
+
+	mutex_lock(&filter->notify_lock);
+	/*
+	 * Even though we were woken up by a signal and not a successful
+	 * completion, a completion may have happened in the meantime.
+	 *
+	 * We need to check again if the addfd request has been handled,
+	 * and if not, we will remove it from the queue.
+	 */
+	if (list_empty(&kaddfd.list))
+		ret = kaddfd.ret;
+	else
+		list_del(&kaddfd.list);
+
+out_unlock:
+	mutex_unlock(&filter->notify_lock);
+out:
+	fput(kaddfd.file);
+
+	return ret;
+}
+
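
Seen from the supervisor, ADDFD installs one of its own file descriptors into the blocked task while the notification is in the SENT state. A hedged sketch (inject_fd is illustrative; `id` is the cookie from a received seccomp_notif and `srcfd` any supervisor-side descriptor):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/seccomp.h>

    static int inject_fd(int listener, __u64 id, int srcfd)
    {
    	struct seccomp_notif_addfd addfd;

    	memset(&addfd, 0, sizeof(addfd));
    	addfd.id = id;			/* cookie of the blocked syscall */
    	addfd.srcfd = srcfd;		/* supervisor-side fd to copy over */
    	addfd.newfd = 0;		/* without SETFD: lowest free fd */
    	addfd.flags = 0;		/* or SECCOMP_ADDFD_FLAG_SETFD */
    	addfd.newfd_flags = O_CLOEXEC;	/* only O_CLOEXEC is accepted */

    	/* On success, returns the fd number as seen inside the target. */
    	return ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
    }
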
+static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct seccomp_filter *filter = file->private_data;
+	void __user *buf = (void __user *)arg;
+
+	/* Fixed-size ioctls */
+	switch (cmd) {
+	case SECCOMP_IOCTL_NOTIF_RECV:
+		return seccomp_notify_recv(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SEND:
+		return seccomp_notify_send(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
+	case SECCOMP_IOCTL_NOTIF_ID_VALID:
+		return seccomp_notify_id_valid(filter, buf);
+	}
+
+	/* Extensible Argument ioctls */
+#define EA_IOCTL(cmd)	((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
+	switch (EA_IOCTL(cmd)) {
+	case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
+		return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
+	default:
+		return -EINVAL;
+	}
+}
+
+static __poll_t seccomp_notify_poll(struct file *file,
1676
+ struct poll_table_struct *poll_tab)
1677
+{
1678
+ struct seccomp_filter *filter = file->private_data;
1679
+ __poll_t ret = 0;
1680
+ struct seccomp_knotif *cur;
1681
+
1682
+ poll_wait(file, &filter->wqh, poll_tab);
1683
+
1684
+ if (mutex_lock_interruptible(&filter->notify_lock) < 0)
1685
+ return EPOLLERR;
1686
+
1687
+ list_for_each_entry(cur, &filter->notif->notifications, list) {
1688
+ if (cur->state == SECCOMP_NOTIFY_INIT)
1689
+ ret |= EPOLLIN | EPOLLRDNORM;
1690
+ if (cur->state == SECCOMP_NOTIFY_SENT)
1691
+ ret |= EPOLLOUT | EPOLLWRNORM;
1692
+ if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1693
+ break;
1694
+ }
1695
+
1696
+ mutex_unlock(&filter->notify_lock);
1697
+
1698
+ if (refcount_read(&filter->users) == 0)
1699
+ ret |= EPOLLHUP;
1700
+
1701
+ return ret;
1702
+}
1703
+
1704
+static const struct file_operations seccomp_notify_ops = {
1705
+ .poll = seccomp_notify_poll,
1706
+ .release = seccomp_notify_release,
1707
+ .unlocked_ioctl = seccomp_notify_ioctl,
1708
+ .compat_ioctl = seccomp_notify_ioctl,
1709
+};
1710
+
1711
+static struct file *init_listener(struct seccomp_filter *filter)
1712
+{
1713
+ struct file *ret;
1714
+
1715
+ ret = ERR_PTR(-ENOMEM);
1716
+ filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1717
+ if (!filter->notif)
1718
+ goto out;
1719
+
1720
+ sema_init(&filter->notif->request, 0);
1721
+ filter->notif->next_id = get_random_u64();
1722
+ INIT_LIST_HEAD(&filter->notif->notifications);
1723
+
1724
+ ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
1725
+ filter, O_RDWR);
1726
+ if (IS_ERR(ret))
1727
+ goto out_notif;
1728
+
1729
+ /* The file has a reference to it now */
1730
+ __get_seccomp_filter(filter);
1731
+
1732
+out_notif:
1733
+ if (IS_ERR(ret))
1734
+ seccomp_notify_free(filter);
1735
+out:
1736
+ return ret;
1737
+}
1738
+
1739
+/*
1740
+ * Does @new_child have a listener while an ancestor also has a listener?
1741
+ * If so, we'll want to reject this filter.
1742
+ * This only has to be tested for the current process, even in the TSYNC case,
1743
+ * because TSYNC installs @child with the same parent on all threads.
1744
+ * Note that @new_child is not hooked up to its parent at this point yet, so
1745
+ * we use current->seccomp.filter.
1746
+ */
1747
+static bool has_duplicate_listener(struct seccomp_filter *new_child)
1748
+{
1749
+ struct seccomp_filter *cur;
1750
+
1751
+ /* must be protected against concurrent TSYNC */
1752
+ lockdep_assert_held(&current->sighand->siglock);
1753
+
1754
+ if (!new_child->notif)
1755
+ return false;
1756
+ for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1757
+ if (cur->notif)
1758
+ return true;
1759
+ }
1760
+
1761
+ return false;
1762
+}
1763
+
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
  * @flags:  flags to change filter behavior
@@ -864,9 +1780,23 @@
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
+	int listener = -1;
+	struct file *listener_f = NULL;
 
 	/* Validate flags. */
 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+		return -EINVAL;
+
+	/*
+	 * In the successful case, NEW_LISTENER returns the new listener fd.
+	 * But in the failure case, TSYNC returns the thread that died. If you
+	 * combine these two flags, there's no way to tell whether something
+	 * succeeded or failed. So, let's disallow this combination if the user
+	 * has not explicitly requested no errors from TSYNC.
+	 */
+	if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
+	    (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
+	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
 		return -EINVAL;
 
 	/* Prepare the new filter before holding any locks. */
@@ -874,18 +1804,38 @@
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		listener = get_unused_fd_flags(O_CLOEXEC);
+		if (listener < 0) {
+			ret = listener;
+			goto out_free;
+		}
+
+		listener_f = init_listener(prepared);
+		if (IS_ERR(listener_f)) {
+			put_unused_fd(listener);
+			ret = PTR_ERR(listener_f);
+			goto out_free;
+		}
+	}
+
 	/*
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
 	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_free;
+		goto out_put_fd;
 
 	spin_lock_irq(&current->sighand->siglock);
 
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
+
+	if (has_duplicate_listener(prepared)) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	ret = seccomp_attach_filter(flags, prepared);
 	if (ret)
@@ -898,6 +1848,18 @@
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
 		mutex_unlock(&current->signal->cred_guard_mutex);
+out_put_fd:
+	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
+		if (ret) {
+			listener_f->private_data = NULL;
+			fput(listener_f);
+			put_unused_fd(listener);
+			seccomp_notify_detach(prepared);
+		} else {
+			fd_install(listener, listener_f);
+			ret = listener;
+		}
+	}
 out_free:
 	seccomp_filter_free(prepared);
 	return ret;
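
From userspace, the pieces connect in a single seccomp() call: set no_new_privs (or hold CAP_SYS_ADMIN), then install a filter whose return value, on success, is the listener fd itself. A minimal sketch (glibc has no seccomp() wrapper, hence syscall(); the filter is assumed to return SECCOMP_RET_USER_NOTIF for the syscalls to be forwarded):

    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    static int install_with_listener(struct sock_filter *insns,
    				 unsigned short len)
    {
    	struct sock_fprog prog = { .len = len, .filter = insns };

    	/* Required so an unprivileged task cannot use the filter to
    	 * confuse a more-privileged program it execs later. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
    		return -1;

    	/* On success, the return value *is* the listener fd (O_CLOEXEC). */
    	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
    		       SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
    }
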
@@ -922,6 +1884,7 @@
 	case SECCOMP_RET_KILL_THREAD:
 	case SECCOMP_RET_TRAP:
 	case SECCOMP_RET_ERRNO:
+	case SECCOMP_RET_USER_NOTIF:
 	case SECCOMP_RET_TRACE:
 	case SECCOMP_RET_LOG:
 	case SECCOMP_RET_ALLOW:
@@ -933,9 +1896,23 @@
 	return 0;
 }
 
+static long seccomp_get_notif_sizes(void __user *usizes)
+{
+	struct seccomp_notif_sizes sizes = {
+		.seccomp_notif = sizeof(struct seccomp_notif),
+		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
+		.seccomp_data = sizeof(struct seccomp_data),
+	};
+
+	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
+		return -EFAULT;
+
+	return 0;
+}
+
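
Because the notification structures are allowed to grow, a robust listener asks the running kernel for their sizes instead of trusting its compile-time headers, and allocates accordingly. A hedged sketch (sized_buffers is illustrative):

    #include <stdlib.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/seccomp.h>

    static int sized_buffers(void **req, void **resp)
    {
    	struct seccomp_notif_sizes sizes;

    	if (syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes))
    		return -1;

    	/* May exceed sizeof(struct seccomp_notif) in this program's
    	 * headers if the kernel is newer; calloc also satisfies RECV's
    	 * zeroed-buffer requirement. */
    	*req = calloc(1, sizes.seccomp_notif);
    	*resp = calloc(1, sizes.seccomp_notif_resp);
    	return (*req && *resp) ? 0 : -1;
    }
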
 /* Common entry point for both prctl and syscall. */
 static long do_seccomp(unsigned int op, unsigned int flags,
-		       const char __user *uargs)
+		       void __user *uargs)
 {
 	switch (op) {
 	case SECCOMP_SET_MODE_STRICT:
@@ -949,13 +1926,18 @@
 			return -EINVAL;
 
 		return seccomp_get_action_avail(uargs);
+	case SECCOMP_GET_NOTIF_SIZES:
+		if (flags != 0)
+			return -EINVAL;
+
+		return seccomp_get_notif_sizes(uargs);
 	default:
 		return -EINVAL;
 	}
 }
 
 SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
-			 const char __user *, uargs)
+			 void __user *, uargs)
 {
 	return do_seccomp(op, flags, uargs);
 }
@@ -967,10 +1949,10 @@
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
 {
 	unsigned int op;
-	char __user *uargs;
+	void __user *uargs;
 
 	switch (seccomp_mode) {
 	case SECCOMP_MODE_STRICT:
@@ -1122,6 +2104,7 @@
 #define SECCOMP_RET_KILL_THREAD_NAME	"kill_thread"
 #define SECCOMP_RET_TRAP_NAME		"trap"
 #define SECCOMP_RET_ERRNO_NAME		"errno"
+#define SECCOMP_RET_USER_NOTIF_NAME	"user_notif"
 #define SECCOMP_RET_TRACE_NAME		"trace"
 #define SECCOMP_RET_LOG_NAME		"log"
 #define SECCOMP_RET_ALLOW_NAME		"allow"
@@ -1131,6 +2114,7 @@
 				SECCOMP_RET_KILL_THREAD_NAME	" "
 				SECCOMP_RET_TRAP_NAME		" "
 				SECCOMP_RET_ERRNO_NAME		" "
+				SECCOMP_RET_USER_NOTIF_NAME	" "
 				SECCOMP_RET_TRACE_NAME		" "
 				SECCOMP_RET_LOG_NAME		" "
 				SECCOMP_RET_ALLOW_NAME;
@@ -1145,6 +2129,7 @@
 	{ SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
 	{ SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
 	{ SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+	{ SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
 	{ SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
 	{ SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
 	{ SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
@@ -1217,7 +2202,7 @@
 	return true;
 }
 
-static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
 			       size_t *lenp, loff_t *ppos)
 {
 	char names[sizeof(seccomp_actions_avail)];
@@ -1235,7 +2220,7 @@
 	return proc_dostring(&table, 0, buffer, lenp, ppos);
 }
 
-static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
 				size_t *lenp, loff_t *ppos, u32 *actions_logged)
 {
 	char names[sizeof(seccomp_actions_avail)];
@@ -1297,7 +2282,7 @@
 }
 
 static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
-					  void __user *buffer, size_t *lenp,
+					  void *buffer, size_t *lenp,
 					  loff_t *ppos)
 {
 	int ret;
@@ -1343,7 +2328,7 @@
 
 	hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
 	if (!hdr)
-		pr_warn("seccomp: sysctl registration failed\n");
+		pr_warn("sysctl registration failed\n");
 	else
 		kmemleak_not_leak(hdr);