~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,10 +1,11 @@
	1	+// SPDX-License-Identifier: GPL-2.0-only
1	2	/*
2	3	* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3		- * Use of this source code is governed by the GPLv2 license.
4	4	*
5	5	* Test code for seccomp bpf.
6	6	*/
7	7
	8	+#define _GNU_SOURCE
8	9	#include <sys/types.h>
9	10
10	11	/*
..	..	@@ -34,18 +35,29 @@
34	35	#include <stdbool.h>
35	36	#include <string.h>
36	37	#include <time.h>
	38	+#include <limits.h>
37	39	#include <linux/elf.h>
38	40	#include <sys/uio.h>
39	41	#include <sys/utsname.h>
40	42	#include <sys/fcntl.h>
41	43	#include <sys/mman.h>
42	44	#include <sys/times.h>
	45	+#include <sys/socket.h>
	46	+#include <sys/ioctl.h>
	47	+#include <linux/kcmp.h>
	48	+#include <sys/resource.h>
43	49
44		-#define _GNU_SOURCE
45	50	#include <unistd.h>
46	51	#include <sys/syscall.h>
	52	+#include <poll.h>
47	53
48	54	#include "../kselftest_harness.h"
	55	+#include "../clone3/clone3_selftests.h"
	56	+
	57	+/* Attempt to de-conflict with the selftests tree. */
	58	+#ifndef SKIP
	59	+#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
	60	+#endif
49	61
50	62	#ifndef PR_SET_PTRACER
51	63	# define PR_SET_PTRACER 0x59616d61
..	..	@@ -109,12 +121,20 @@
109	121	# define __NR_seccomp 383
110	122	# elif defined(__aarch64__)
111	123	# define __NR_seccomp 277
	124	+# elif defined(__riscv)
	125	+# define __NR_seccomp 277
	126	+# elif defined(__csky__)
	127	+# define __NR_seccomp 277
112	128	# elif defined(__hppa__)
113	129	# define __NR_seccomp 338
114	130	# elif defined(__powerpc__)
115	131	# define __NR_seccomp 358
116	132	# elif defined(__s390__)
117	133	# define __NR_seccomp 348
	134	+# elif defined(__xtensa__)
	135	+# define __NR_seccomp 337
	136	+# elif defined(__sh__)
	137	+# define __NR_seccomp 372
118	138	# else
119	139	# warning "seccomp syscall number unknown for this architecture"
120	140	# define __NR_seccomp 0xffff
..	..	@@ -131,6 +151,10 @@
131	151
132	152	#ifndef SECCOMP_GET_ACTION_AVAIL
133	153	#define SECCOMP_GET_ACTION_AVAIL 2
	154	+#endif
	155	+
	156	+#ifndef SECCOMP_GET_NOTIF_SIZES
	157	+#define SECCOMP_GET_NOTIF_SIZES 3
134	158	#endif
135	159
136	160	#ifndef SECCOMP_FILTER_FLAG_TSYNC
..	..	@@ -154,6 +178,92 @@
154	178	};
155	179	#endif
156	180
	181	+#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
	182	+#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
	183	+#endif
	184	+
	185	+#ifndef SECCOMP_RET_USER_NOTIF
	186	+#define SECCOMP_RET_USER_NOTIF 0x7fc00000U
	187	+
	188	+#define SECCOMP_IOC_MAGIC '!'
	189	+#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
	190	+#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type)
	191	+#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type)
	192	+#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
	193	+
	194	+/* Flags for seccomp notification fd ioctl. */
	195	+#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
	196	+#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \
	197	+ struct seccomp_notif_resp)
	198	+#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64)
	199	+
	200	+struct seccomp_notif {
	201	+ __u64 id;
	202	+ __u32 pid;
	203	+ __u32 flags;
	204	+ struct seccomp_data data;
	205	+};
	206	+
	207	+struct seccomp_notif_resp {
	208	+ __u64 id;
	209	+ __s64 val;
	210	+ __s32 error;
	211	+ __u32 flags;
	212	+};
	213	+
	214	+struct seccomp_notif_sizes {
	215	+ __u16 seccomp_notif;
	216	+ __u16 seccomp_notif_resp;
	217	+ __u16 seccomp_data;
	218	+};
	219	+#endif
	220	+
	221	+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
	222	+/* On success, the return value is the remote process's added fd number */
	223	+#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
	224	+ struct seccomp_notif_addfd)
	225	+
	226	+/* valid flags for seccomp_notif_addfd */
	227	+#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
	228	+
	229	+struct seccomp_notif_addfd {
	230	+ __u64 id;
	231	+ __u32 flags;
	232	+ __u32 srcfd;
	233	+ __u32 newfd;
	234	+ __u32 newfd_flags;
	235	+};
	236	+#endif
	237	+
	238	+struct seccomp_notif_addfd_small {
	239	+ __u64 id;
	240	+ char weird[4];
	241	+};
	242	+#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \
	243	+ SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
	244	+
	245	+struct seccomp_notif_addfd_big {
	246	+ union {
	247	+ struct seccomp_notif_addfd addfd;
	248	+ char buf[sizeof(struct seccomp_notif_addfd) + 8];
	249	+ };
	250	+};
	251	+#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG \
	252	+ SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
	253	+
	254	+#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
	255	+#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1
	256	+#define PTRACE_EVENTMSG_SYSCALL_EXIT 2
	257	+#endif
	258	+
	259	+#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
	260	+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
	261	+#endif
	262	+
	263	+#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
	264	+#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
	265	+#endif
	266	+
157	267	#ifndef seccomp
158	268	int seccomp(unsigned int op, unsigned int flags, void *args)
159	269	{
..	..	@@ -173,6 +283,40 @@
173	283	#define SIBLING_EXIT_UNKILLED 0xbadbeef
174	284	#define SIBLING_EXIT_FAILURE 0xbadface
175	285	#define SIBLING_EXIT_NEWPRIVS 0xbadfeed
	286	+
	287	+static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
	288	+{
	289	+#ifdef __NR_kcmp
	290	+ errno = 0;
	291	+ return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
	292	+#else
	293	+ errno = ENOSYS;
	294	+ return -1;
	295	+#endif
	296	+}
	297	+
	298	+/* Have TH_LOG report actual location filecmp() is used. */
	299	+#define filecmp(pid1, pid2, fd1, fd2) ({ \
	300	+ int _ret; \
	301	+ \
	302	+ _ret = __filecmp(pid1, pid2, fd1, fd2); \
	303	+ if (_ret != 0) { \
	304	+ if (_ret < 0 && errno == ENOSYS) { \
	305	+ TH_LOG("kcmp() syscall missing (test is less accurate)");\
	306	+ _ret = 0; \
	307	+ } \
	308	+ } \
	309	+ _ret; })
	310	+
	311	+TEST(kcmp)
	312	+{
	313	+ int ret;
	314	+
	315	+ ret = __filecmp(getpid(), getpid(), 1, 1);
	316	+ EXPECT_EQ(ret, 0);
	317	+ if (ret != 0 && errno == ENOSYS)
	318	+ SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
	319	+}
176	320
177	321	TEST(mode_strict_support)
178	322	{
..	..	@@ -630,8 +774,15 @@
630	774	return (void *)SIBLING_EXIT_UNKILLED;
631	775	}
632	776
	777	+enum kill_t {
	778	+ KILL_THREAD,
	779	+ KILL_PROCESS,
	780	+ RET_UNKNOWN
	781	+};
	782	+
633	783	/* Prepare a thread that will kill itself or both of us. */
634		-void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
	784	+void kill_thread_or_group(struct __test_metadata *_metadata,
	785	+ enum kill_t kill_how)
635	786	{
636	787	pthread_t thread;
637	788	void *status;
..	..	@@ -647,11 +798,12 @@
647	798	.len = (unsigned short)ARRAY_SIZE(filter_thread),
648	799	.filter = filter_thread,
649	800	};
	801	+ int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
650	802	struct sock_filter filter_process[] = {
651	803	BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
652	804	offsetof(struct seccomp_data, nr)),
653	805	BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
654		- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
	806	+ BPF_STMT(BPF_RET|BPF_K, kill),
655	807	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
656	808	};
657	809	struct sock_fprog prog_process = {
..	..	@@ -664,13 +816,15 @@
664	816	}
665	817
666	818	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
667		- kill_process ? &prog_process : &prog_thread));
	819	+ kill_how == KILL_THREAD ? &prog_thread
	820	+ : &prog_process));
668	821
669	822	/*
670	823	* Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
671	824	* flag cannot be downgraded by a new filter.
672	825	*/
673		- ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
	826	+ if (kill_how == KILL_PROCESS)
	827	+ ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
674	828
675	829	/* Start a thread that will exit immediately. */
676	830	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
..	..	@@ -698,7 +852,7 @@
698	852	child_pid = fork();
699	853	ASSERT_LE(0, child_pid);
700	854	if (child_pid == 0) {
701		- kill_thread_or_group(_metadata, false);
	855	+ kill_thread_or_group(_metadata, KILL_THREAD);
702	856	_exit(38);
703	857	}
704	858
..	..	@@ -717,7 +871,7 @@
717	871	child_pid = fork();
718	872	ASSERT_LE(0, child_pid);
719	873	if (child_pid == 0) {
720		- kill_thread_or_group(_metadata, true);
	874	+ kill_thread_or_group(_metadata, KILL_PROCESS);
721	875	_exit(38);
722	876	}
723	877
..	..	@@ -725,6 +879,27 @@
725	879
726	880	/* If the entire process was killed, we'll see SIGSYS. */
727	881	ASSERT_TRUE(WIFSIGNALED(status));
	882	+ ASSERT_EQ(SIGSYS, WTERMSIG(status));
	883	+}
	884	+
	885	+TEST(KILL_unknown)
	886	+{
	887	+ int status;
	888	+ pid_t child_pid;
	889	+
	890	+ child_pid = fork();
	891	+ ASSERT_LE(0, child_pid);
	892	+ if (child_pid == 0) {
	893	+ kill_thread_or_group(_metadata, RET_UNKNOWN);
	894	+ _exit(38);
	895	+ }
	896	+
	897	+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
	898	+
	899	+ /* If the entire process was killed, we'll see SIGSYS. */
	900	+ EXPECT_TRUE(WIFSIGNALED(status)) {
	901	+ TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
	902	+ }
728	903	ASSERT_EQ(SIGSYS, WTERMSIG(status));
729	904	}
730	905
..	..	@@ -776,7 +951,7 @@
776	951	ASSERT_EQ(0, ret);
777	952
778	953	EXPECT_EQ(parent, syscall(__NR_getppid));
779		- EXPECT_EQ(-1, read(0, NULL, 0));
	954	+ EXPECT_EQ(-1, read(-1, NULL, 0));
780	955	EXPECT_EQ(E2BIG, errno);
781	956	}
782	957
..	..	@@ -795,7 +970,7 @@
795	970
796	971	EXPECT_EQ(parent, syscall(__NR_getppid));
797	972	/* "errno" of 0 is ok. */
798		- EXPECT_EQ(0, read(0, NULL, 0));
	973	+ EXPECT_EQ(0, read(-1, NULL, 0));
799	974	}
800	975
801	976	/*
..	..	@@ -816,7 +991,7 @@
816	991	ASSERT_EQ(0, ret);
817	992
818	993	EXPECT_EQ(parent, syscall(__NR_getppid));
819		- EXPECT_EQ(-1, read(0, NULL, 0));
	994	+ EXPECT_EQ(-1, read(-1, NULL, 0));
820	995	EXPECT_EQ(4095, errno);
821	996	}
822	997
..	..	@@ -847,11 +1022,11 @@
847	1022	ASSERT_EQ(0, ret);
848	1023
849	1024	EXPECT_EQ(parent, syscall(__NR_getppid));
850		- EXPECT_EQ(-1, read(0, NULL, 0));
	1025	+ EXPECT_EQ(-1, read(-1, NULL, 0));
851	1026	EXPECT_EQ(12, errno);
852	1027	}
853	1028
854		-FIXTURE_DATA(TRAP) {
	1029	+FIXTURE(TRAP) {
855	1030	struct sock_fprog prog;
856	1031	};
857	1032
..	..	@@ -962,7 +1137,7 @@
962	1137	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
963	1138	}
964	1139
965		-FIXTURE_DATA(precedence) {
	1140	+FIXTURE(precedence) {
966	1141	struct sock_fprog allow;
967	1142	struct sock_fprog log;
968	1143	struct sock_fprog trace;
..	..	@@ -1408,6 +1583,7 @@
1408	1583
1409	1584	return tracer_pid;
1410	1585	}
	1586	+
1411	1587	void teardown_trace_fixture(struct __test_metadata *_metadata,
1412	1588	pid_t tracer)
1413	1589	{
..	..	@@ -1451,7 +1627,7 @@
1451	1627	EXPECT_EQ(0, ret);
1452	1628	}
1453	1629
1454		-FIXTURE_DATA(TRACE_poke) {
	1630	+FIXTURE(TRACE_poke) {
1455	1631	struct sock_fprog prog;
1456	1632	pid_t tracer;
1457	1633	long poked;
..	..	@@ -1522,45 +1698,157 @@
1522	1698	}
1523	1699
1524	1700	#if defined(__x86_64__)
1525		-# define ARCH_REGS struct user_regs_struct
1526		-# define SYSCALL_NUM orig_rax
1527		-# define SYSCALL_RET rax
	1701	+# define ARCH_REGS struct user_regs_struct
	1702	+# define SYSCALL_NUM(_regs) (_regs).orig_rax
	1703	+# define SYSCALL_RET(_regs) (_regs).rax
1528	1704	#elif defined(__i386__)
1529		-# define ARCH_REGS struct user_regs_struct
1530		-# define SYSCALL_NUM orig_eax
1531		-# define SYSCALL_RET eax
	1705	+# define ARCH_REGS struct user_regs_struct
	1706	+# define SYSCALL_NUM(_regs) (_regs).orig_eax
	1707	+# define SYSCALL_RET(_regs) (_regs).eax
1532	1708	#elif defined(__arm__)
1533		-# define ARCH_REGS struct pt_regs
1534		-# define SYSCALL_NUM ARM_r7
1535		-# define SYSCALL_RET ARM_r0
	1709	+# define ARCH_REGS struct pt_regs
	1710	+# define SYSCALL_NUM(_regs) (_regs).ARM_r7
	1711	+# ifndef PTRACE_SET_SYSCALL
	1712	+# define PTRACE_SET_SYSCALL 23
	1713	+# endif
	1714	+# define SYSCALL_NUM_SET(_regs, _nr) \
	1715	+ EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
	1716	+# define SYSCALL_RET(_regs) (_regs).ARM_r0
1536	1717	#elif defined(__aarch64__)
1537		-# define ARCH_REGS struct user_pt_regs
1538		-# define SYSCALL_NUM regs[8]
1539		-# define SYSCALL_RET regs[0]
	1718	+# define ARCH_REGS struct user_pt_regs
	1719	+# define SYSCALL_NUM(_regs) (_regs).regs[8]
	1720	+# ifndef NT_ARM_SYSTEM_CALL
	1721	+# define NT_ARM_SYSTEM_CALL 0x404
	1722	+# endif
	1723	+# define SYSCALL_NUM_SET(_regs, _nr) \
	1724	+ do { \
	1725	+ struct iovec __v; \
	1726	+ typeof(_nr) __nr = (_nr); \
	1727	+ __v.iov_base = &__nr; \
	1728	+ __v.iov_len = sizeof(__nr); \
	1729	+ EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee, \
	1730	+ NT_ARM_SYSTEM_CALL, &__v)); \
	1731	+ } while (0)
	1732	+# define SYSCALL_RET(_regs) (_regs).regs[0]
	1733	+#elif defined(__riscv) && __riscv_xlen == 64
	1734	+# define ARCH_REGS struct user_regs_struct
	1735	+# define SYSCALL_NUM(_regs) (_regs).a7
	1736	+# define SYSCALL_RET(_regs) (_regs).a0
	1737	+#elif defined(__csky__)
	1738	+# define ARCH_REGS struct pt_regs
	1739	+# if defined(__CSKYABIV2__)
	1740	+# define SYSCALL_NUM(_regs) (_regs).regs[3]
	1741	+# else
	1742	+# define SYSCALL_NUM(_regs) (_regs).regs[9]
	1743	+# endif
	1744	+# define SYSCALL_RET(_regs) (_regs).a0
1540	1745	#elif defined(__hppa__)
1541		-# define ARCH_REGS struct user_regs_struct
1542		-# define SYSCALL_NUM gr[20]
1543		-# define SYSCALL_RET gr[28]
	1746	+# define ARCH_REGS struct user_regs_struct
	1747	+# define SYSCALL_NUM(_regs) (_regs).gr[20]
	1748	+# define SYSCALL_RET(_regs) (_regs).gr[28]
1544	1749	#elif defined(__powerpc__)
1545		-# define ARCH_REGS struct pt_regs
1546		-# define SYSCALL_NUM gpr[0]
1547		-# define SYSCALL_RET gpr[3]
	1750	+# define ARCH_REGS struct pt_regs
	1751	+# define SYSCALL_NUM(_regs) (_regs).gpr[0]
	1752	+# define SYSCALL_RET(_regs) (_regs).gpr[3]
	1753	+# define SYSCALL_RET_SET(_regs, _val) \
	1754	+ do { \
	1755	+ typeof(_val) _result = (_val); \
	1756	+ if ((_regs.trap & 0xfff0) == 0x3000) { \
	1757	+ /* \
	1758	+ * scv 0 system call uses -ve result \
	1759	+ * for error, so no need to adjust. \
	1760	+ */ \
	1761	+ SYSCALL_RET(_regs) = _result; \
	1762	+ } else { \
	1763	+ /* \
	1764	+ * A syscall error is signaled by the \
	1765	+ * CR0 SO bit and the code is stored as \
	1766	+ * a positive value. \
	1767	+ */ \
	1768	+ if (_result < 0) { \
	1769	+ SYSCALL_RET(_regs) = -_result; \
	1770	+ (_regs).ccr |= 0x10000000; \
	1771	+ } else { \
	1772	+ SYSCALL_RET(_regs) = _result; \
	1773	+ (_regs).ccr &= ~0x10000000; \
	1774	+ } \
	1775	+ } \
	1776	+ } while (0)
	1777	+# define SYSCALL_RET_SET_ON_PTRACE_EXIT
1548	1778	#elif defined(__s390__)
1549		-# define ARCH_REGS s390_regs
1550		-# define SYSCALL_NUM gprs[2]
1551		-# define SYSCALL_RET gprs[2]
	1779	+# define ARCH_REGS s390_regs
	1780	+# define SYSCALL_NUM(_regs) (_regs).gprs[2]
	1781	+# define SYSCALL_RET_SET(_regs, _val) \
	1782	+ TH_LOG("Can't modify syscall return on this architecture")
1552	1783	#elif defined(__mips__)
1553		-# define ARCH_REGS struct pt_regs
1554		-# define SYSCALL_NUM regs[2]
1555		-# define SYSCALL_SYSCALL_NUM regs[4]
1556		-# define SYSCALL_RET regs[2]
1557		-# define SYSCALL_NUM_RET_SHARE_REG
	1784	+# include <asm/unistd_nr_n32.h>
	1785	+# include <asm/unistd_nr_n64.h>
	1786	+# include <asm/unistd_nr_o32.h>
	1787	+# define ARCH_REGS struct pt_regs
	1788	+# define SYSCALL_NUM(_regs) \
	1789	+ ({ \
	1790	+ typeof((_regs).regs[2]) _nr; \
	1791	+ if ((_regs).regs[2] == __NR_O32_Linux) \
	1792	+ _nr = (_regs).regs[4]; \
	1793	+ else \
	1794	+ _nr = (_regs).regs[2]; \
	1795	+ _nr; \
	1796	+ })
	1797	+# define SYSCALL_NUM_SET(_regs, _nr) \
	1798	+ do { \
	1799	+ if ((_regs).regs[2] == __NR_O32_Linux) \
	1800	+ (_regs).regs[4] = _nr; \
	1801	+ else \
	1802	+ (_regs).regs[2] = _nr; \
	1803	+ } while (0)
	1804	+# define SYSCALL_RET_SET(_regs, _val) \
	1805	+ TH_LOG("Can't modify syscall return on this architecture")
	1806	+#elif defined(__xtensa__)
	1807	+# define ARCH_REGS struct user_pt_regs
	1808	+# define SYSCALL_NUM(_regs) (_regs).syscall
	1809	+/*
	1810	+ * On xtensa syscall return value is in the register
	1811	+ * a2 of the current window which is not fixed.
	1812	+ */
	1813	+#define SYSCALL_RET(_regs) (_regs).a[(_regs).windowbase * 4 + 2]
	1814	+#elif defined(__sh__)
	1815	+# define ARCH_REGS struct pt_regs
	1816	+# define SYSCALL_NUM(_regs) (_regs).regs[3]
	1817	+# define SYSCALL_RET(_regs) (_regs).regs[0]
1558	1818	#else
1559	1819	# error "Do not know how to find your architecture's registers and syscalls"
1560	1820	#endif
1561	1821
	1822	+/*
	1823	+ * Most architectures can change the syscall by just updating the
	1824	+ * associated register. This is the default if not defined above.
	1825	+ */
	1826	+#ifndef SYSCALL_NUM_SET
	1827	+# define SYSCALL_NUM_SET(_regs, _nr) \
	1828	+ do { \
	1829	+ SYSCALL_NUM(_regs) = (_nr); \
	1830	+ } while (0)
	1831	+#endif
	1832	+/*
	1833	+ * Most architectures can change the syscall return value by just
	1834	+ * writing to the SYSCALL_RET register. This is the default if not
	1835	+ * defined above. If an architecture cannot set the return value
	1836	+ * (for example when the syscall and return value register is
	1837	+ * shared), report it with TH_LOG() in an arch-specific definition
	1838	+ * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
	1839	+ */
	1840	+#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
	1841	+# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
	1842	+#endif
	1843	+#ifndef SYSCALL_RET_SET
	1844	+# define SYSCALL_RET_SET(_regs, _val) \
	1845	+ do { \
	1846	+ SYSCALL_RET(_regs) = (_val); \
	1847	+ } while (0)
	1848	+#endif
	1849	+
1562	1850	/* When the syscall return can't be changed, stub out the tests for it. */
1563		-#ifdef SYSCALL_NUM_RET_SHARE_REG
	1851	+#ifndef SYSCALL_RET
1564	1852	# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(-1, action)
1565	1853	#else
1566	1854	# define EXPECT_SYSCALL_RETURN(val, action) \
..	..	@@ -1575,115 +1863,95 @@
1575	1863	} while (0)
1576	1864	#endif
1577	1865
1578		-/* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
	1866	+/*
	1867	+ * Some architectures (e.g. powerpc) can only set syscall
	1868	+ * return values on syscall exit during ptrace.
	1869	+ */
	1870	+const bool ptrace_entry_set_syscall_nr = true;
	1871	+const bool ptrace_entry_set_syscall_ret =
	1872	+#ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
	1873	+ true;
	1874	+#else
	1875	+ false;
	1876	+#endif
	1877	+
	1878	+/*
	1879	+ * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1579	1880	* architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1580	1881	*/
1581	1882	#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
1582		-#define HAVE_GETREGS
	1883	+# define ARCH_GETREGS(_regs) ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
	1884	+# define ARCH_SETREGS(_regs) ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
	1885	+#else
	1886	+# define ARCH_GETREGS(_regs) ({ \
	1887	+ struct iovec __v; \
	1888	+ __v.iov_base = &(_regs); \
	1889	+ __v.iov_len = sizeof(_regs); \
	1890	+ ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v); \
	1891	+ })
	1892	+# define ARCH_SETREGS(_regs) ({ \
	1893	+ struct iovec __v; \
	1894	+ __v.iov_base = &(_regs); \
	1895	+ __v.iov_len = sizeof(_regs); \
	1896	+ ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v); \
	1897	+ })
1583	1898	#endif
1584	1899
1585	1900	/* Architecture-specific syscall fetching routine. */
1586	1901	int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1587	1902	{
1588	1903	ARCH_REGS regs;
1589		-#ifdef HAVE_GETREGS
1590		- EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, &regs)) {
1591		- TH_LOG("PTRACE_GETREGS failed");
	1904	+
	1905	+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1592	1906	return -1;
1593	1907	}
1594		-#else
1595		- struct iovec iov;
1596	1908
1597		- iov.iov_base = &regs;
1598		- iov.iov_len = sizeof(regs);
1599		- EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
1600		- TH_LOG("PTRACE_GETREGSET failed");
1601		- return -1;
1602		- }
1603		-#endif
1604		-
1605		-#if defined(__mips__)
1606		- if (regs.SYSCALL_NUM == __NR_O32_Linux)
1607		- return regs.SYSCALL_SYSCALL_NUM;
1608		-#endif
1609		- return regs.SYSCALL_NUM;
	1909	+ return SYSCALL_NUM(regs);
1610	1910	}
1611	1911
1612	1912	/* Architecture-specific syscall changing routine. */
1613		-void change_syscall(struct __test_metadata *_metadata,
1614		- pid_t tracee, int syscall, int result)
	1913	+void __change_syscall(struct __test_metadata *_metadata,
	1914	+ pid_t tracee, long *syscall, long *ret)
1615	1915	{
1616		- int ret;
1617		- ARCH_REGS regs;
1618		-#ifdef HAVE_GETREGS
1619		- ret = ptrace(PTRACE_GETREGS, tracee, 0, &regs);
1620		-#else
1621		- struct iovec iov;
1622		- iov.iov_base = &regs;
1623		- iov.iov_len = sizeof(regs);
1624		- ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
1625		-#endif
1626		- EXPECT_EQ(0, ret) {}
	1916	+ ARCH_REGS orig, regs;
1627	1917
1628		-#if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
1629		- defined(__s390__) || defined(__hppa__)
1630		- {
1631		- regs.SYSCALL_NUM = syscall;
	1918	+ /* Do not get/set registers if we have nothing to do. */
	1919	+ if (!syscall && !ret)
	1920	+ return;
	1921	+
	1922	+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
	1923	+ return;
1632	1924	}
1633		-#elif defined(__mips__)
1634		- {
1635		- if (regs.SYSCALL_NUM == __NR_O32_Linux)
1636		- regs.SYSCALL_SYSCALL_NUM = syscall;
1637		- else
1638		- regs.SYSCALL_NUM = syscall;
1639		- }
	1925	+ orig = regs;
1640	1926
1641		-#elif defined(__arm__)
1642		-# ifndef PTRACE_SET_SYSCALL
1643		-# define PTRACE_SET_SYSCALL 23
1644		-# endif
1645		- {
1646		- ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
1647		- EXPECT_EQ(0, ret);
1648		- }
	1927	+ if (syscall)
	1928	+ SYSCALL_NUM_SET(regs, *syscall);
1649	1929
1650		-#elif defined(__aarch64__)
1651		-# ifndef NT_ARM_SYSTEM_CALL
1652		-# define NT_ARM_SYSTEM_CALL 0x404
1653		-# endif
1654		- {
1655		- iov.iov_base = &syscall;
1656		- iov.iov_len = sizeof(syscall);
1657		- ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL,
1658		- &iov);
1659		- EXPECT_EQ(0, ret);
1660		- }
	1930	+ if (ret)
	1931	+ SYSCALL_RET_SET(regs, *ret);
1661	1932
1662		-#else
1663		- ASSERT_EQ(1, 0) {
1664		- TH_LOG("How is the syscall changed on this architecture?");
1665		- }
1666		-#endif
1667		-
1668		- /* If syscall is skipped, change return value. */
1669		- if (syscall == -1)
1670		-#ifdef SYSCALL_NUM_RET_SHARE_REG
1671		- TH_LOG("Can't modify syscall return on this architecture");
1672		-#else
1673		- regs.SYSCALL_RET = result;
1674		-#endif
1675		-
1676		-#ifdef HAVE_GETREGS
1677		- ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
1678		-#else
1679		- iov.iov_base = &regs;
1680		- iov.iov_len = sizeof(regs);
1681		- ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
1682		-#endif
1683		- EXPECT_EQ(0, ret);
	1933	+ /* Flush any register changes made. */
	1934	+ if (memcmp(&orig, &regs, sizeof(orig)) != 0)
	1935	+ EXPECT_EQ(0, ARCH_SETREGS(regs));
1684	1936	}
1685	1937
1686		-void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee,
	1938	+/* Change only syscall number. */
	1939	+void change_syscall_nr(struct __test_metadata *_metadata,
	1940	+ pid_t tracee, long syscall)
	1941	+{
	1942	+ __change_syscall(_metadata, tracee, &syscall, NULL);
	1943	+}
	1944	+
	1945	+/* Change syscall return value (and set syscall number to -1). */
	1946	+void change_syscall_ret(struct __test_metadata *_metadata,
	1947	+ pid_t tracee, long ret)
	1948	+{
	1949	+ long syscall = -1;
	1950	+
	1951	+ __change_syscall(_metadata, tracee, &syscall, &ret);
	1952	+}
	1953	+
	1954	+void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1687	1955	int status, void *args)
1688	1956	{
1689	1957	int ret;
..	..	@@ -1698,17 +1966,17 @@
1698	1966	case 0x1002:
1699	1967	/* change getpid to getppid. */
1700	1968	EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1701		- change_syscall(_metadata, tracee, __NR_getppid, 0);
	1969	+ change_syscall_nr(_metadata, tracee, __NR_getppid);
1702	1970	break;
1703	1971	case 0x1003:
1704	1972	/* skip gettid with valid return code. */
1705	1973	EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1706		- change_syscall(_metadata, tracee, -1, 45000);
	1974	+ change_syscall_ret(_metadata, tracee, 45000);
1707	1975	break;
1708	1976	case 0x1004:
1709	1977	/* skip openat with error. */
1710	1978	EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1711		- change_syscall(_metadata, tracee, -1, -ESRCH);
	1979	+ change_syscall_ret(_metadata, tracee, -ESRCH);
1712	1980	break;
1713	1981	case 0x1005:
1714	1982	/* do nothing (allow getppid) */
..	..	@@ -1723,36 +1991,92 @@
1723	1991
1724	1992	}
1725	1993
	1994	+FIXTURE(TRACE_syscall) {
	1995	+ struct sock_fprog prog;
	1996	+ pid_t tracer, mytid, mypid, parent;
	1997	+ long syscall_nr;
	1998	+};
	1999	+
1726	2000	void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
1727	2001	int status, void *args)
1728	2002	{
1729		- int ret, nr;
	2003	+ int ret;
1730	2004	unsigned long msg;
1731	2005	static bool entry;
	2006	+ long syscall_nr_val, syscall_ret_val;
	2007	+ long *syscall_nr = NULL, *syscall_ret = NULL;
	2008	+ FIXTURE_DATA(TRACE_syscall) *self = args;
1732	2009
1733		- /* Make sure we got an empty message. */
	2010	+ /*
	2011	+ * The traditional way to tell PTRACE_SYSCALL entry/exit
	2012	+ * is by counting.
	2013	+ */
	2014	+ entry = !entry;
	2015	+
	2016	+ /* Make sure we got an appropriate message. */
1734	2017	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1735	2018	EXPECT_EQ(0, ret);
1736		- EXPECT_EQ(0, msg);
	2019	+ EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
	2020	+ : PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
1737	2021
1738		- /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */
1739		- entry = !entry;
1740		- if (!entry)
	2022	+ /*
	2023	+ * Some architectures only support setting return values during
	2024	+ * syscall exit under ptrace, and on exit the syscall number may
	2025	+ * no longer be available. Therefore, save the initial syscall
	2026	+ * number here, so it can be examined during both entry and exit
	2027	+ * phases.
	2028	+ */
	2029	+ if (entry)
	2030	+ self->syscall_nr = get_syscall(_metadata, tracee);
	2031	+
	2032	+ /*
	2033	+ * Depending on the architecture's syscall setting abilities, we
	2034	+ * pick which things to set during this phase (entry or exit).
	2035	+ */
	2036	+ if (entry == ptrace_entry_set_syscall_nr)
	2037	+ syscall_nr = &syscall_nr_val;
	2038	+ if (entry == ptrace_entry_set_syscall_ret)
	2039	+ syscall_ret = &syscall_ret_val;
	2040	+
	2041	+ /* Now handle the actual rewriting cases. */
	2042	+ switch (self->syscall_nr) {
	2043	+ case __NR_getpid:
	2044	+ syscall_nr_val = __NR_getppid;
	2045	+ /* Never change syscall return for this case. */
	2046	+ syscall_ret = NULL;
	2047	+ break;
	2048	+ case __NR_gettid:
	2049	+ syscall_nr_val = -1;
	2050	+ syscall_ret_val = 45000;
	2051	+ break;
	2052	+ case __NR_openat:
	2053	+ syscall_nr_val = -1;
	2054	+ syscall_ret_val = -ESRCH;
	2055	+ break;
	2056	+ default:
	2057	+ /* Unhandled, do nothing. */
1741	2058	return;
	2059	+ }
1742	2060
1743		- nr = get_syscall(_metadata, tracee);
1744		-
1745		- if (nr == __NR_getpid)
1746		- change_syscall(_metadata, tracee, __NR_getppid, 0);
1747		- if (nr == __NR_gettid)
1748		- change_syscall(_metadata, tracee, -1, 45000);
1749		- if (nr == __NR_openat)
1750		- change_syscall(_metadata, tracee, -1, -ESRCH);
	2061	+ __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
1751	2062	}
1752	2063
1753		-FIXTURE_DATA(TRACE_syscall) {
1754		- struct sock_fprog prog;
1755		- pid_t tracer, mytid, mypid, parent;
	2064	+FIXTURE_VARIANT(TRACE_syscall) {
	2065	+ /*
	2066	+ * All of the SECCOMP_RET_TRACE behaviors can be tested with either
	2067	+ * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
	2068	+ * This indicates if we should use SECCOMP_RET_TRACE (false), or
	2069	+ * ptrace (true).
	2070	+ */
	2071	+ bool use_ptrace;
	2072	+};
	2073	+
	2074	+FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
	2075	+ .use_ptrace = true,
	2076	+};
	2077	+
	2078	+FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
	2079	+ .use_ptrace = false,
1756	2080	};
1757	2081
1758	2082	FIXTURE_SETUP(TRACE_syscall)
..	..	@@ -1770,12 +2094,11 @@
1770	2094	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
1771	2095	BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1772	2096	};
1773		-
1774		- memset(&self->prog, 0, sizeof(self->prog));
1775		- self->prog.filter = malloc(sizeof(filter));
1776		- ASSERT_NE(NULL, self->prog.filter);
1777		- memcpy(self->prog.filter, filter, sizeof(filter));
1778		- self->prog.len = (unsigned short)ARRAY_SIZE(filter);
	2097	+ struct sock_fprog prog = {
	2098	+ .len = (unsigned short)ARRAY_SIZE(filter),
	2099	+ .filter = filter,
	2100	+ };
	2101	+ long ret;
1779	2102
1780	2103	/* Prepare some testable syscall results. */
1781	2104	self->mytid = syscall(__NR_gettid);
..	..	@@ -1793,60 +2116,48 @@
1793	2116	ASSERT_NE(self->parent, self->mypid);
1794	2117
1795	2118	/* Launch tracer. */
1796		- self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL,
1797		- false);
	2119	+ self->tracer = setup_trace_fixture(_metadata,
	2120	+ variant->use_ptrace ? tracer_ptrace
	2121	+ : tracer_seccomp,
	2122	+ self, variant->use_ptrace);
	2123	+
	2124	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	2125	+ ASSERT_EQ(0, ret);
	2126	+
	2127	+ if (variant->use_ptrace)
	2128	+ return;
	2129	+
	2130	+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
	2131	+ ASSERT_EQ(0, ret);
1798	2132	}
1799	2133
1800	2134	FIXTURE_TEARDOWN(TRACE_syscall)
1801	2135	{
1802	2136	teardown_trace_fixture(_metadata, self->tracer);
1803		- if (self->prog.filter)
1804		- free(self->prog.filter);
1805	2137	}
1806	2138
1807		-TEST_F(TRACE_syscall, ptrace_syscall_redirected)
	2139	+TEST(negative_ENOSYS)
1808	2140	{
1809		- /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1810		- teardown_trace_fixture(_metadata, self->tracer);
1811		- self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1812		- true);
1813		-
1814		- /* Tracer will redirect getpid to getppid. */
1815		- EXPECT_NE(self->mypid, syscall(__NR_getpid));
	2141	+ /*
	2142	+ * There should be no difference between an "internal" skip
	2143	+ * and userspace asking for syscall "-1".
	2144	+ */
	2145	+ errno = 0;
	2146	+ EXPECT_EQ(-1, syscall(-1));
	2147	+ EXPECT_EQ(errno, ENOSYS);
	2148	+ /* And no difference for "still not valid but not -1". */
	2149	+ errno = 0;
	2150	+ EXPECT_EQ(-1, syscall(-101));
	2151	+ EXPECT_EQ(errno, ENOSYS);
1816	2152	}
1817	2153
1818		-TEST_F(TRACE_syscall, ptrace_syscall_errno)
	2154	+TEST_F(TRACE_syscall, negative_ENOSYS)
1819	2155	{
1820		- /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1821		- teardown_trace_fixture(_metadata, self->tracer);
1822		- self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1823		- true);
1824		-
1825		- /* Tracer should skip the open syscall, resulting in ESRCH. */
1826		- EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
1827		-}
1828		-
1829		-TEST_F(TRACE_syscall, ptrace_syscall_faked)
1830		-{
1831		- /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1832		- teardown_trace_fixture(_metadata, self->tracer);
1833		- self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1834		- true);
1835		-
1836		- /* Tracer should skip the gettid syscall, resulting fake pid. */
1837		- EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
	2156	+ negative_ENOSYS(_metadata);
1838	2157	}
1839	2158
1840	2159	TEST_F(TRACE_syscall, syscall_allowed)
1841	2160	{
1842		- long ret;
1843		-
1844		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1845		- ASSERT_EQ(0, ret);
1846		-
1847		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1848		- ASSERT_EQ(0, ret);
1849		-
1850	2161	/* getppid works as expected (no changes). */
1851	2162	EXPECT_EQ(self->parent, syscall(__NR_getppid));
1852	2163	EXPECT_NE(self->mypid, syscall(__NR_getppid));
..	..	@@ -1854,14 +2165,6 @@
1854	2165
1855	2166	TEST_F(TRACE_syscall, syscall_redirected)
1856	2167	{
1857		- long ret;
1858		-
1859		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1860		- ASSERT_EQ(0, ret);
1861		-
1862		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1863		- ASSERT_EQ(0, ret);
1864		-
1865	2168	/* getpid has been redirected to getppid as expected. */
1866	2169	EXPECT_EQ(self->parent, syscall(__NR_getpid));
1867	2170	EXPECT_NE(self->mypid, syscall(__NR_getpid));
..	..	@@ -1869,33 +2172,17 @@
1869	2172
1870	2173	TEST_F(TRACE_syscall, syscall_errno)
1871	2174	{
1872		- long ret;
1873		-
1874		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1875		- ASSERT_EQ(0, ret);
1876		-
1877		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1878		- ASSERT_EQ(0, ret);
1879		-
1880		- /* openat has been skipped and an errno return. */
	2175	+ /* Tracer should skip the openat syscall, resulting in ESRCH. */
1881	2176	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
1882	2177	}
1883	2178
1884	2179	TEST_F(TRACE_syscall, syscall_faked)
1885	2180	{
1886		- long ret;
1887		-
1888		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1889		- ASSERT_EQ(0, ret);
1890		-
1891		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1892		- ASSERT_EQ(0, ret);
1893		-
1894		- /* gettid has been skipped and an altered return value stored. */
	2181	+ /* Tracer skips the gettid syscall and stores an altered return value (45000). */
1895	2182	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
1896	2183	}
1897	2184
1898		-TEST_F(TRACE_syscall, skip_after_RET_TRACE)
	2185	+TEST_F(TRACE_syscall, skip_after)
1899	2186	{
1900	2187	struct sock_filter filter[] = {
1901	2188	BPF_STMT(BPF_LD\|BPF_W\|BPF_ABS,
..	..	@@ -1910,14 +2197,7 @@
1910	2197	};
1911	2198	long ret;
1912	2199
1913		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1914		- ASSERT_EQ(0, ret);
1915		-
1916		- /* Install fixture filter. */
1917		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1918		- ASSERT_EQ(0, ret);
1919		-
1920		- /* Install "errno on getppid" filter. */
	2200	+ /* Install additional "errno on getppid" filter. */
1921	2201	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
1922	2202	ASSERT_EQ(0, ret);
1923	2203
..	..	@@ -1927,7 +2207,7 @@
1927	2207	EXPECT_EQ(EPERM, errno);
1928	2208	}
1929	2209
1930		-TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS)
	2210	+TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
1931	2211	{
1932	2212	struct sock_filter filter[] = {
1933	2213	BPF_STMT(BPF_LD\|BPF_W\|BPF_ABS,
..	..	@@ -1942,77 +2222,7 @@
1942	2222	};
1943	2223	long ret;
1944	2224
1945		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1946		- ASSERT_EQ(0, ret);
1947		-
1948		- /* Install fixture filter. */
1949		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1950		- ASSERT_EQ(0, ret);
1951		-
1952		- /* Install "death on getppid" filter. */
1953		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
1954		- ASSERT_EQ(0, ret);
1955		-
1956		- /* Tracer will redirect getpid to getppid, and we should die. */
1957		- EXPECT_NE(self->mypid, syscall(__NR_getpid));
1958		-}
1959		-
1960		-TEST_F(TRACE_syscall, skip_after_ptrace)
1961		-{
1962		- struct sock_filter filter[] = {
1963		- BPF_STMT(BPF_LD\|BPF_W\|BPF_ABS,
1964		- offsetof(struct seccomp_data, nr)),
1965		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_getppid, 0, 1),
1966		- BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_ERRNO \| EPERM),
1967		- BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_ALLOW),
1968		- };
1969		- struct sock_fprog prog = {
1970		- .len = (unsigned short)ARRAY_SIZE(filter),
1971		- .filter = filter,
1972		- };
1973		- long ret;
1974		-
1975		- /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
1976		- teardown_trace_fixture(_metadata, self->tracer);
1977		- self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
1978		- true);
1979		-
1980		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1981		- ASSERT_EQ(0, ret);
1982		-
1983		- /* Install "errno on getppid" filter. */
1984		- ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
1985		- ASSERT_EQ(0, ret);
1986		-
1987		- /* Tracer will redirect getpid to getppid, and we should see EPERM. */
1988		- EXPECT_EQ(-1, syscall(__NR_getpid));
1989		- EXPECT_EQ(EPERM, errno);
1990		-}
1991		-
1992		-TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
1993		-{
1994		- struct sock_filter filter[] = {
1995		- BPF_STMT(BPF_LD\|BPF_W\|BPF_ABS,
1996		- offsetof(struct seccomp_data, nr)),
1997		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_getppid, 0, 1),
1998		- BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_KILL),
1999		- BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_ALLOW),
2000		- };
2001		- struct sock_fprog prog = {
2002		- .len = (unsigned short)ARRAY_SIZE(filter),
2003		- .filter = filter,
2004		- };
2005		- long ret;
2006		-
2007		- /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
2008		- teardown_trace_fixture(_metadata, self->tracer);
2009		- self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
2010		- true);
2011		-
2012		- ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2013		- ASSERT_EQ(0, ret);
2014		-
2015		- /* Install "death on getppid" filter. */
	2225	+ /* Install additional "death on getppid" filter. */
2016	2226	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2017	2227	ASSERT_EQ(0, ret);
2018	2228
..	..	@@ -2119,12 +2329,17 @@
2119	2329	{
2120	2330	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2121	2331	SECCOMP_FILTER_FLAG_LOG,
2122		- SECCOMP_FILTER_FLAG_SPEC_ALLOW };
2123		- unsigned int flag, all_flags;
	2332	+ SECCOMP_FILTER_FLAG_SPEC_ALLOW,
	2333	+ SECCOMP_FILTER_FLAG_NEW_LISTENER,
	2334	+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
	2335	+ unsigned int exclusive[] = {
	2336	+ SECCOMP_FILTER_FLAG_TSYNC,
	2337	+ SECCOMP_FILTER_FLAG_NEW_LISTENER };
	2338	+ unsigned int flag, all_flags, exclusive_mask;
2124	2339	int i;
2125	2340	long ret;
2126	2341
2127		- /* Test detection of known-good filter flags */
	2342	+ /* Test detection of individual known-good filter flags */
2128	2343	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2129	2344	int bits = 0;
2130	2345
..	..	@@ -2151,16 +2366,29 @@
2151	2366	all_flags \|= flag;
2152	2367	}
2153	2368
2154		- /* Test detection of all known-good filter flags */
2155		- ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL);
2156		- EXPECT_EQ(-1, ret);
2157		- EXPECT_EQ(EFAULT, errno) {
2158		- TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2159		- all_flags);
	2369	+ /*
	2370	+ * Test detection of all known-good filter flags combined. But
	2371	+ * for the exclusive flags we need to mask them out and try them
	2372	+ * individually for the "all flags" testing.
	2373	+ */
	2374	+ exclusive_mask = 0;
	2375	+ for (i = 0; i < ARRAY_SIZE(exclusive); i++)
	2376	+ exclusive_mask \|= exclusive[i];
	2377	+ for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
	2378	+ flag = all_flags & ~exclusive_mask;
	2379	+ flag \|= exclusive[i];
	2380	+
	2381	+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
	2382	+ EXPECT_EQ(-1, ret);
	2383	+ EXPECT_EQ(EFAULT, errno) {
	2384	+ TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
	2385	+ flag);
	2386	+ }
2160	2387	}
2161	2388
2162		- /* Test detection of an unknown filter flag */
	2389	+ /* Test detection of an unknown filter flags, without exclusives. */
2163	2390	flag = -1;
	2391	+ flag &= ~exclusive_mask;
2164	2392	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2165	2393	EXPECT_EQ(-1, ret);
2166	2394	EXPECT_EQ(EINVAL, errno) {
..	..	@@ -2237,7 +2465,7 @@
2237	2465	} \
2238	2466	} while (0)
2239	2467
2240		-FIXTURE_DATA(TSYNC) {
	2468	+FIXTURE(TSYNC) {
2241	2469	struct sock_fprog root_prog, apply_prog;
2242	2470	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2243	2471	sem_t started;
..	..	@@ -2347,7 +2575,7 @@
2347	2575	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2348	2576	if (!ret)
2349	2577	return (void *)SIBLING_EXIT_NEWPRIVS;
2350		- read(0, NULL, 0);
	2578	+ read(-1, NULL, 0);
2351	2579	return (void *)SIBLING_EXIT_UNKILLED;
2352	2580	}
2353	2581
..	..	@@ -2561,10 +2789,60 @@
2561	2789	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2562	2790	}
2563	2791
	2792	+TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err) /* TSYNC plus TSYNC_ESRCH must fail with -1/ESRCH when one sibling has diverged. */
	2793	+{
	2794	+ long ret, flags;
	2795	+ void *status;
	2796	+
	2797	+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
	2798	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	2799	+ }
	2800	+
	2801	+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); /* root filter goes in before any sibling is started */
	2802	+ ASSERT_NE(ENOSYS, errno) {
	2803	+ TH_LOG("Kernel does not support seccomp syscall!");
	2804	+ }
	2805	+ ASSERT_EQ(0, ret) {
	2806	+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
	2807	+ }
	2808	+ self->sibling[0].diverge = 1; /* only the first sibling is flagged to diverge */
	2809	+ tsync_start_sibling(&self->sibling[0]);
	2810	+ tsync_start_sibling(&self->sibling[1]);
	2811	+
	2812	+ while (self->sibling_count < TSYNC_SIBLINGS) { /* one sem_wait per sibling until all are counted */
	2813	+ sem_wait(&self->started);
	2814	+ self->sibling_count++;
	2815	+ }
	2816	+
	2817	+ flags = SECCOMP_FILTER_FLAG_TSYNC \| \
	2818	+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
	2819	+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
	2820	+ ASSERT_EQ(ESRCH, errno) { /* errno is checked before ret; both must hold */
	2821	+ TH_LOG("Did not return ESRCH for diverged sibling.");
	2822	+ }
	2823	+ ASSERT_EQ(-1, ret) {
	2824	+ TH_LOG("Did not fail on diverged sibling.");
	2825	+ }
	2826	+
	2827	+ /* Wake the threads */
	2828	+ pthread_mutex_lock(&self->mutex); /* broadcast is issued with the mutex held */
	2829	+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
	2830	+ TH_LOG("cond broadcast non-zero");
	2831	+ }
	2832	+ pthread_mutex_unlock(&self->mutex);
	2833	+
	2834	+ /* Ensure they are both unkilled. */
	2835	+ PTHREAD_JOIN(self->sibling[0].tid, &status);
	2836	+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	2837	+ PTHREAD_JOIN(self->sibling[1].tid, &status);
	2838	+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
	2839	+}
	2840	+
2564	2841	TEST_F(TSYNC, two_siblings_not_under_filter)
2565	2842	{
2566	2843	long ret, sib;
2567	2844	void *status;
	2845	+ struct timespec delay = { .tv_nsec = 100000000 };
2568	2846
2569	2847	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2570	2848	TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
..	..	@@ -2618,7 +2896,7 @@
2618	2896	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2619	2897	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2620	2898	while (!kill(self->sibling[sib].system_tid, 0))
2621		- sleep(0.1);
	2899	+ nanosleep(&delay, NULL);
2622	2900	/* Switch to the remaining sibling */
2623	2901	sib = !sib;
2624	2902
..	..	@@ -2643,7 +2921,7 @@
2643	2921	EXPECT_EQ(0, (long)status);
2644	2922	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2645	2923	while (!kill(self->sibling[sib].system_tid, 0))
2646		- sleep(0.1);
	2924	+ nanosleep(&delay, NULL);
2647	2925
2648	2926	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2649	2927	&self->apply_prog);
..	..	@@ -2664,12 +2942,13 @@
2664	2942	offsetof(struct seccomp_data, nr)),
2665	2943
2666	2944	#ifdef __NR_sigreturn
2667		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_sigreturn, 6, 0),
	2945	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_sigreturn, 7, 0),
2668	2946	#endif
2669		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_read, 5, 0),
2670		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_exit, 4, 0),
2671		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_rt_sigreturn, 3, 0),
2672		- BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_nanosleep, 4, 0),
	2947	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_read, 6, 0),
	2948	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_exit, 5, 0),
	2949	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_rt_sigreturn, 4, 0),
	2950	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_nanosleep, 5, 0),
	2951	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_clock_nanosleep, 4, 0),
2673	2952	BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, __NR_restart_syscall, 4, 0),
2674	2953
2675	2954	/* Allow __NR_write for easy logging. */
..	..	@@ -2756,7 +3035,8 @@
2756	3035	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
2757	3036	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
2758	3037	ASSERT_EQ(0x100, msg);
2759		- EXPECT_EQ(__NR_nanosleep, get_syscall(_metadata, child_pid));
	3038	+ ret = get_syscall(_metadata, child_pid);
	3039	+ EXPECT_TRUE(ret == __NR_nanosleep \|\| ret == __NR_clock_nanosleep);
2760	3040
2761	3041	/* Might as well check siginfo for sanity while we're here. */
2762	3042	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
..	..	@@ -2773,9 +3053,14 @@
2773	3053	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
2774	3054	ASSERT_EQ(true, WIFSTOPPED(status));
2775	3055	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
2776		- /* Verify signal delivery came from parent now. */
2777	3056	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
2778		- EXPECT_EQ(getpid(), info.si_pid);
	3057	+ /*
	3058	+ * There is no siginfo on SIGSTOP any more, so we can't verify
	3059	+ * signal delivery came from parent now (getpid() == info.si_pid).
	3060	+ * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
	3061	+ * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
	3062	+ */
	3063	+ EXPECT_EQ(SIGSTOP, info.si_signo);
2779	3064
2780	3065	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
2781	3066	ASSERT_EQ(0, kill(child_pid, SIGCONT));
..	..	@@ -2922,7 +3207,7 @@
2922	3207
2923	3208	/* Only real root can get metadata. */
2924	3209	if (geteuid()) {
2925		- XFAIL(return, "get_metadata requires real root");
	3210	+ SKIP(return, "get_metadata requires real root");
2926	3211	return;
2927	3212	}
2928	3213
..	..	@@ -2940,11 +3225,11 @@
2940	3225	};
2941	3226
2942	3227	/* one with log, one without */
2943		- ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
	3228	+ EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
2944	3229	SECCOMP_FILTER_FLAG_LOG, &prog));
2945		- ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
	3230	+ EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
2946	3231
2947		- ASSERT_EQ(0, close(pipefd[0]));
	3232	+ EXPECT_EQ(0, close(pipefd[0]));
2948	3233	ASSERT_EQ(1, write(pipefd[1], "1", 1));
2949	3234	ASSERT_EQ(0, close(pipefd[1]));
2950	3235
..	..	@@ -2965,7 +3250,7 @@
2965	3250	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
2966	3251	EXPECT_EQ(sizeof(md), ret) {
2967	3252	if (errno == EINVAL)
2968		- XFAIL(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
	3253	+ SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
2969	3254	}
2970	3255
2971	3256	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
..	..	@@ -2981,9 +3266,890 @@
2981	3266	ASSERT_EQ(0, kill(pid, SIGKILL));
2982	3267	}
2983	3268
	3269	+static int user_notif_syscall(int nr, unsigned int flags) /* Install a filter that sends syscall nr to user notification and allows the rest; returns seccomp()'s result. */
	3270	+{
	3271	+ struct sock_filter filter[] = {
	3272	+ BPF_STMT(BPF_LD\|BPF_W\|BPF_ABS,
	3273	+ offsetof(struct seccomp_data, nr)), /* load the syscall number */
	3274	+ BPF_JUMP(BPF_JMP\|BPF_JEQ\|BPF_K, nr, 0, 1), /* match: next insn; no match: skip one */
	3275	+ BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_USER_NOTIF),
	3276	+ BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_ALLOW),
	3277	+ };
	3278	+
	3279	+ struct sock_fprog prog = {
	3280	+ .len = (unsigned short)ARRAY_SIZE(filter),
	3281	+ .filter = filter,
	3282	+ };
	3283	+
	3284	+ return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); /* callers passing NEW_LISTENER treat a non-negative result as the listener fd */
	3285	+}
	3286	+
	3287	+#define USER_NOTIF_MAGIC INT_MAX /* fake return value the listener injects via resp.val */
	3288	+TEST(user_notification_basic) /* Walks one getppid notification through RECV and SEND, including the rejected-input cases. */
	3289	+{
	3290	+ pid_t pid;
	3291	+ long ret;
	3292	+ int status, listener;
	3293	+ struct seccomp_notif req = {};
	3294	+ struct seccomp_notif_resp resp = {};
	3295	+ struct pollfd pollfd;
	3296	+
	3297	+ struct sock_filter filter[] = {
	3298	+ BPF_STMT(BPF_RET\|BPF_K, SECCOMP_RET_ALLOW),
	3299	+ };
	3300	+ struct sock_fprog prog = {
	3301	+ .len = (unsigned short)ARRAY_SIZE(filter),
	3302	+ .filter = filter,
	3303	+ };
	3304	+
	3305	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3306	+ ASSERT_EQ(0, ret) {
	3307	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3308	+ }
	3309	+
	3310	+ pid = fork();
	3311	+ ASSERT_GE(pid, 0);
	3312	+
	3313	+ /* Check that we get -ENOSYS with no listener attached */
	3314	+ if (pid == 0) {
	3315	+ if (user_notif_syscall(__NR_getppid, 0) < 0) /* flags == 0: no listener fd is requested */
	3316	+ exit(1);
	3317	+ ret = syscall(__NR_getppid);
	3318	+ exit(ret >= 0 \|\| errno != ENOSYS); /* exit status 0 only for a negative return with ENOSYS */
	3319	+ }
	3320	+
	3321	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3322	+ EXPECT_EQ(true, WIFEXITED(status));
	3323	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3324	+
	3325	+ /* Add some no-op filters for grins. */
	3326	+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	3327	+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	3328	+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	3329	+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
	3330	+
	3331	+ /* Check that the basic notification machinery works */
	3332	+ listener = user_notif_syscall(__NR_getppid,
	3333	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3334	+ ASSERT_GE(listener, 0);
	3335	+
	3336	+ /* Installing a second listener in the chain should EBUSY */
	3337	+ EXPECT_EQ(user_notif_syscall(__NR_getppid,
	3338	+ SECCOMP_FILTER_FLAG_NEW_LISTENER),
	3339	+ -1);
	3340	+ EXPECT_EQ(errno, EBUSY);
	3341	+
	3342	+ pid = fork();
	3343	+ ASSERT_GE(pid, 0);
	3344	+
	3345	+ if (pid == 0) {
	3346	+ ret = syscall(__NR_getppid);
	3347	+ exit(ret != USER_NOTIF_MAGIC); /* exit status 0 only if the listener's fake value came back */
	3348	+ }
	3349	+
	3350	+ pollfd.fd = listener;
	3351	+ pollfd.events = POLLIN \| POLLOUT;
	3352	+
	3353	+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
	3354	+ EXPECT_EQ(pollfd.revents, POLLIN); /* before RECV only POLLIN is expected */
	3355	+
	3356	+ /* Test that we can't pass garbage to the kernel. */
	3357	+ memset(&req, 0, sizeof(req));
	3358	+ req.pid = -1; /* the one non-zero field that makes the request "garbage" */
	3359	+ errno = 0;
	3360	+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
	3361	+ EXPECT_EQ(-1, ret);
	3362	+ EXPECT_EQ(EINVAL, errno);
	3363	+
	3364	+ if (ret) { /* retry with a clean request only if the garbage one was rejected */
	3365	+ req.pid = 0;
	3366	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3367	+ }
	3368	+
	3369	+ pollfd.fd = listener;
	3370	+ pollfd.events = POLLIN \| POLLOUT;
	3371	+
	3372	+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
	3373	+ EXPECT_EQ(pollfd.revents, POLLOUT); /* after RECV only POLLOUT is expected */
	3374	+
	3375	+ EXPECT_EQ(req.data.nr, __NR_getppid);
	3376	+
	3377	+ resp.id = req.id; /* a response is tied to its request through the id */
	3378	+ resp.error = 0;
	3379	+ resp.val = USER_NOTIF_MAGIC;
	3380	+
	3381	+ /* check that we make sure flags == 0 */
	3382	+ resp.flags = 1;
	3383	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	3384	+ EXPECT_EQ(errno, EINVAL);
	3385	+
	3386	+ resp.flags = 0;
	3387	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	3388	+
	3389	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3390	+ EXPECT_EQ(true, WIFEXITED(status));
	3391	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3392	+}
	3393	+
	3394	+TEST(user_notification_with_tsync) /* NEW_LISTENER with TSYNC is rejected unless TSYNC_ESRCH is also set. */
	3395	+{
	3396	+ int ret;
	3397	+ unsigned int flags;
	3398	+
	3399	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3400	+ ASSERT_EQ(0, ret) {
	3401	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3402	+ }
	3403	+
	3404	+ /* these were exclusive */
	3405	+ flags = SECCOMP_FILTER_FLAG_NEW_LISTENER \|
	3406	+ SECCOMP_FILTER_FLAG_TSYNC;
	3407	+ ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
	3408	+ ASSERT_EQ(EINVAL, errno);
	3409	+
	3410	+ /* but now they're not */
	3411	+ flags \|= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
	3412	+ ret = user_notif_syscall(__NR_getppid, flags);
	3413	+ ASSERT_LE(0, ret); /* validate the descriptor before using it */
	3414	+ close(ret); /* only reached with a valid listener fd */
	3415	+}
	3416	+
	3417	+TEST(user_notification_kill_in_middle) /* A notification must become invalid once its blocked task is killed. */
	3418	+{
	3419	+ pid_t pid;
	3420	+ long ret;
	3421	+ int listener;
	3422	+ struct seccomp_notif req = {};
	3423	+ struct seccomp_notif_resp resp = {};
	3424	+
	3425	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3426	+ ASSERT_EQ(0, ret) {
	3427	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3428	+ }
	3429	+
	3430	+ listener = user_notif_syscall(__NR_getppid,
	3431	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3432	+ ASSERT_GE(listener, 0);
	3433	+
	3434	+ /*
	3435	+ * Check that nothing bad happens when we kill the task in the middle
	3436	+ * of a syscall.
	3437	+ */
	3438	+ pid = fork();
	3439	+ ASSERT_GE(pid, 0);
	3440	+
	3441	+ if (pid == 0) {
	3442	+ ret = syscall(__NR_getppid);
	3443	+ exit(ret != USER_NOTIF_MAGIC); /* never answered by the parent; the child is killed instead */
	3444	+ }
	3445	+
	3446	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3447	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0); /* id is valid before the kill */
	3448	+
	3449	+ EXPECT_EQ(kill(pid, SIGKILL), 0);
	3450	+ EXPECT_EQ(waitpid(pid, NULL, 0), pid); /* reap first so the id check below runs after the child is gone */
	3451	+
	3452	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1); /* same id is now rejected */
	3453	+
	3454	+ resp.id = req.id;
	3455	+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp); /* answering the dead notification must fail too */
	3456	+ EXPECT_EQ(ret, -1);
	3457	+ EXPECT_EQ(errno, ENOENT);
	3458	+}
	3459	+
	3460	+static int handled = -1; /* fd the handler writes to; set to a socketpair end by the child in user_notification_signal */
	3461	+
	3462	+static void signal_handler(int signal) /* Reports that the signal ran by writing one byte to `handled`. */
	3463	+{
	3464	+ if (write(handled, "c", 1) != 1)
	3465	+ perror("write from signal"); /* NOTE(review): perror() is not on the POSIX async-signal-safe list */
	3466	+}
	3467	+
	3468	+TEST(user_notification_signal) /* A signal invalidates a pending notification; an ERESTARTSYS-valued response is passed through. */
	3469	+{
	3470	+ pid_t pid;
	3471	+ long ret;
	3472	+ int status, listener, sk_pair[2];
	3473	+ struct seccomp_notif req = {};
	3474	+ struct seccomp_notif_resp resp = {};
	3475	+ char c;
	3476	+
	3477	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3478	+ ASSERT_EQ(0, ret) {
	3479	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3480	+ }
	3481	+
	3482	+ ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
	3483	+
	3484	+ listener = user_notif_syscall(__NR_gettid,
	3485	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3486	+ ASSERT_GE(listener, 0);
	3487	+
	3488	+ pid = fork();
	3489	+ ASSERT_GE(pid, 0);
	3490	+
	3491	+ if (pid == 0) {
	3492	+ close(sk_pair[0]);
	3493	+ handled = sk_pair[1]; /* the handler writes its byte to the child's end of the pair */
	3494	+ if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
	3495	+ perror("signal");
	3496	+ exit(1);
	3497	+ }
	3498	+ /*
	3499	+ * ERESTARTSYS behavior is a bit hard to test, because we need
	3500	+ * to rely on a signal that has not yet been handled. Let's at
	3501	+ * least check that the error code gets propagated through, and
	3502	+ * hope that it doesn't break when there is actually a signal :)
	3503	+ */
	3504	+ ret = syscall(__NR_gettid);
	3505	+ exit(!(ret == -1 && errno == 512)); /* 512 is ERESTARTSYS, matching the -512 the parent sends */
	3506	+ }
	3507	+
	3508	+ close(sk_pair[1]);
	3509	+
	3510	+ memset(&req, 0, sizeof(req));
	3511	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3512	+
	3513	+ EXPECT_EQ(kill(pid, SIGUSR1), 0);
	3514	+
	3515	+ /*
	3516	+ * Make sure the signal really is delivered, which means we're not
	3517	+ * stuck in the user notification code any more and the notification
	3518	+ * should be dead.
	3519	+ */
	3520	+ EXPECT_EQ(read(sk_pair[0], &c, 1), 1); /* blocks until the handler has written its byte */
	3521	+
	3522	+ resp.id = req.id;
	3523	+ resp.error = -EPERM;
	3524	+ resp.val = 0;
	3525	+
	3526	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1); /* the first notification can no longer be answered */
	3527	+ EXPECT_EQ(errno, ENOENT);
	3528	+
	3529	+ memset(&req, 0, sizeof(req));
	3530	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); /* presumably the gettid restarted after the handler ran - TODO confirm */
	3531	+
	3532	+ resp.id = req.id;
	3533	+ resp.error = -512; /* -ERESTARTSYS */
	3534	+ resp.val = 0;
	3535	+
	3536	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	3537	+
	3538	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3539	+ EXPECT_EQ(true, WIFEXITED(status));
	3540	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3541	+}
	3542	+
	3543	+TEST(user_notification_closed_listener) /* With every listener fd closed, the filtered syscall must fail with ENOSYS. */
	3544	+{
	3545	+ pid_t pid;
	3546	+ long ret;
	3547	+ int status, listener;
	3548	+
	3549	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3550	+ ASSERT_EQ(0, ret) {
	3551	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3552	+ }
	3553	+
	3554	+ listener = user_notif_syscall(__NR_getppid,
	3555	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3556	+ ASSERT_GE(listener, 0);
	3557	+
	3558	+ /*
	3559	+ * Check that we get an ENOSYS when the listener is closed.
	3560	+ */
	3561	+ pid = fork();
	3562	+ ASSERT_GE(pid, 0);
	3563	+ if (pid == 0) {
	3564	+ close(listener); /* drop the child's inherited copy of the listener */
	3565	+ ret = syscall(__NR_getppid);
	3566	+ exit(!(ret == -1 && errno == ENOSYS)); /* exit 0 only for exactly -1 with ENOSYS; the old && of negations passed on either half */
	3567	+ }
	3568	+
	3569	+ close(listener); /* and the parent's copy */
	3570	+
	3571	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3572	+ EXPECT_EQ(true, WIFEXITED(status));
	3573	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3574	+}
	3575	+
	3576	+/*
	3577	+ * Check that a pid in a child namespace still shows up as valid in ours.
	3578	+ */
	3579	+TEST(user_notification_child_pid_ns)
	3580	+{
	3581	+ pid_t pid;
	3582	+ int status, listener;
	3583	+ struct seccomp_notif req = {};
	3584	+ struct seccomp_notif_resp resp = {};
	3585	+
	3586	+ ASSERT_EQ(unshare(CLONE_NEWUSER \| CLONE_NEWPID), 0) {
	3587	+ if (errno == EINVAL) /* only EINVAL is turned into a skip; other errors fail the assert */
	3588	+ SKIP(return, "kernel missing CLONE_NEWUSER support");
	3589	+ };
	3590	+
	3591	+ listener = user_notif_syscall(__NR_getppid,
	3592	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3593	+ ASSERT_GE(listener, 0);
	3594	+
	3595	+ pid = fork();
	3596	+ ASSERT_GE(pid, 0);
	3597	+
	3598	+ if (pid == 0)
	3599	+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); /* exit 0 only if the fake value was returned */
	3600	+
	3601	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3602	+ EXPECT_EQ(req.pid, pid); /* reported pid must equal the pid fork() gave us */
	3603	+
	3604	+ resp.id = req.id;
	3605	+ resp.error = 0;
	3606	+ resp.val = USER_NOTIF_MAGIC;
	3607	+
	3608	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	3609	+
	3610	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3611	+ EXPECT_EQ(true, WIFEXITED(status));
	3612	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3613	+ close(listener);
	3614	+}
	3615	+
	3616	+/*
	3617	+ * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
	3618	+ * invalid.
	3619	+ */
	3620	+TEST(user_notification_sibling_pid_ns)
	3621	+{
	3622	+ pid_t pid, pid2;
	3623	+ int status, listener;
	3624	+ struct seccomp_notif req = {};
	3625	+ struct seccomp_notif_resp resp = {};
	3626	+
	3627	+ ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
	3628	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3629	+ }
	3630	+
	3631	+ listener = user_notif_syscall(__NR_getppid,
	3632	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3633	+ ASSERT_GE(listener, 0);
	3634	+
	3635	+ pid = fork();
	3636	+ ASSERT_GE(pid, 0);
	3637	+
	3638	+ if (pid == 0) {
	3639	+ ASSERT_EQ(unshare(CLONE_NEWPID), 0); /* first new pid namespace, for the task that makes the syscall */
	3640	+
	3641	+ pid2 = fork();
	3642	+ ASSERT_GE(pid2, 0);
	3643	+
	3644	+ if (pid2 == 0)
	3645	+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
	3646	+
	3647	+ EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
	3648	+ EXPECT_EQ(true, WIFEXITED(status));
	3649	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3650	+ exit(WEXITSTATUS(status)); /* pass the grandchild's exit status up to the test process */
	3651	+ }
	3652	+
	3653	+ /* Create the sibling ns, and sibling in it. */
	3654	+ ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
	3655	+ if (errno == EPERM)
	3656	+ SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
	3657	+ }
	3658	+ ASSERT_EQ(errno, 0); /* NOTE(review): a successful call does not reset errno, so this relies on nothing earlier having set it - confirm */
	3659	+
	3660	+ pid2 = fork();
	3661	+ ASSERT_GE(pid2, 0);
	3662	+
	3663	+ if (pid2 == 0) {
	3664	+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); /* the listener is served from the second namespace */
	3665	+ /*
	3666	+ * The pid should be 0, i.e. the task is in some namespace that
	3667	+ * we can't "see".
	3668	+ */
	3669	+ EXPECT_EQ(req.pid, 0);
	3670	+
	3671	+ resp.id = req.id;
	3672	+ resp.error = 0;
	3673	+ resp.val = USER_NOTIF_MAGIC;
	3674	+
	3675	+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	3676	+ exit(0);
	3677	+ }
	3678	+
	3679	+ close(listener); /* the parent's copy is not used past this point */
	3680	+
	3681	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3682	+ EXPECT_EQ(true, WIFEXITED(status));
	3683	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3684	+
	3685	+ EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
	3686	+ EXPECT_EQ(true, WIFEXITED(status));
	3687	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3688	+}
	3689	+
	3690	+TEST(user_notification_fault_recv) /* A RECV that faults on its buffer must leave the notification receivable. */
	3691	+{
	3692	+ pid_t pid;
	3693	+ int status, listener;
	3694	+ struct seccomp_notif req = {};
	3695	+ struct seccomp_notif_resp resp = {};
	3696	+
	3697	+ ASSERT_EQ(unshare(CLONE_NEWUSER), 0); /* no skip path here: any unshare failure fails the test */
	3698	+
	3699	+ listener = user_notif_syscall(__NR_getppid,
	3700	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3701	+ ASSERT_GE(listener, 0);
	3702	+
	3703	+ pid = fork();
	3704	+ ASSERT_GE(pid, 0);
	3705	+
	3706	+ if (pid == 0)
	3707	+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
	3708	+
	3709	+ /* Do a bad recv() */
	3710	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1); /* NULL buffer is the deliberate fault */
	3711	+ EXPECT_EQ(errno, EFAULT);
	3712	+
	3713	+ /* We should still be able to receive this notification, though. */
	3714	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3715	+ EXPECT_EQ(req.pid, pid);
	3716	+
	3717	+ resp.id = req.id;
	3718	+ resp.error = 0;
	3719	+ resp.val = USER_NOTIF_MAGIC;
	3720	+
	3721	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	3722	+
	3723	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3724	+ EXPECT_EQ(true, WIFEXITED(status));
	3725	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3726	+}
	3727	+
	3728	+TEST(seccomp_get_notif_sizes) /* Sizes reported by SECCOMP_GET_NOTIF_SIZES must match the structs this test was built with. */
	3729	+{
	3730	+ struct seccomp_notif_sizes sizes;
	3731	+
	3732	+ ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
	3733	+ EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
	3734	+ EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
	3735	+}
	3736	+
	3737	+TEST(user_notification_continue) /* SECCOMP_USER_NOTIF_FLAG_CONTINUE is accepted only when error and val are both 0. */
	3738	+{
	3739	+ pid_t pid;
	3740	+ long ret;
	3741	+ int status, listener;
	3742	+ struct seccomp_notif req = {};
	3743	+ struct seccomp_notif_resp resp = {};
	3744	+ struct pollfd pollfd;
	3745	+
	3746	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3747	+ ASSERT_EQ(0, ret) {
	3748	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3749	+ }
	3750	+
	3751	+ listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3752	+ ASSERT_GE(listener, 0);
	3753	+
	3754	+ pid = fork();
	3755	+ ASSERT_GE(pid, 0);
	3756	+
	3757	+ if (pid == 0) {
	3758	+ int dup_fd, pipe_fds[2];
	3759	+ pid_t self;
	3760	+
	3761	+ ASSERT_GE(pipe(pipe_fds), 0);
	3762	+
	3763	+ dup_fd = dup(pipe_fds[0]); /* expected to raise the __NR_dup notification the parent answers */
	3764	+ ASSERT_GE(dup_fd, 0);
	3765	+ EXPECT_NE(pipe_fds[0], dup_fd);
	3766	+
	3767	+ self = getpid();
	3768	+ ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0); /* both fds are expected to compare equal under filecmp() */
	3769	+ exit(0);
	3770	+ }
	3771	+
	3772	+ pollfd.fd = listener;
	3773	+ pollfd.events = POLLIN \| POLLOUT;
	3774	+
	3775	+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
	3776	+ EXPECT_EQ(pollfd.revents, POLLIN);
	3777	+
	3778	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3779	+
	3780	+ pollfd.fd = listener;
	3781	+ pollfd.events = POLLIN \| POLLOUT;
	3782	+
	3783	+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
	3784	+ EXPECT_EQ(pollfd.revents, POLLOUT);
	3785	+
	3786	+ EXPECT_EQ(req.data.nr, __NR_dup);
	3787	+
	3788	+ resp.id = req.id;
	3789	+ resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; /* stays set for all three SEND attempts below */
	3790	+
	3791	+ /*
	3792	+ * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
	3793	+ * args be set to 0.
	3794	+ */
	3795	+ resp.error = 0;
	3796	+ resp.val = USER_NOTIF_MAGIC; /* case 1: non-zero val */
	3797	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	3798	+ EXPECT_EQ(errno, EINVAL);
	3799	+
	3800	+ resp.error = USER_NOTIF_MAGIC; /* case 2: non-zero error */
	3801	+ resp.val = 0;
	3802	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
	3803	+ EXPECT_EQ(errno, EINVAL);
	3804	+
	3805	+ resp.error = 0; /* case 3: both zero, the only form expected to succeed */
	3806	+ resp.val = 0;
	3807	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
	3808	+ if (errno == EINVAL)
	3809	+ SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
	3810	+ }
	3811	+
	3812	+skip:
	3813	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3814	+ EXPECT_EQ(true, WIFEXITED(status));
	3815	+ EXPECT_EQ(0, WEXITSTATUS(status)) {
	3816	+ if (WEXITSTATUS(status) == 2) { /* NOTE(review): no exit(2) is visible in the child above - confirm where status 2 comes from */
	3817	+ SKIP(return, "Kernel does not support kcmp() syscall");
	3818	+ return;
	3819	+ }
	3820	+ }
	3821	+}
	3822	+
	3823	+TEST(user_notification_filter_empty) /* The listener fd must report POLLHUP once the only task using the filter has exited. */
	3824	+{
	3825	+ pid_t pid;
	3826	+ long ret;
	3827	+ int status;
	3828	+ struct pollfd pollfd;
	3829	+ struct __clone_args args = {
	3830	+ .flags = CLONE_FILES, /* child shares the fd table, so fd 200 below is visible to the parent */
	3831	+ .exit_signal = SIGCHLD,
	3832	+ };
	3833	+
	3834	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3835	+ ASSERT_EQ(0, ret) {
	3836	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3837	+ }
	3838	+
	3839	+ pid = sys_clone3(&args, sizeof(args));
	3840	+ ASSERT_GE(pid, 0);
	3841	+
	3842	+ if (pid == 0) {
	3843	+ int listener;
	3844	+
	3845	+ listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER); /* the filter is installed in the child only */
	3846	+ if (listener < 0)
	3847	+ _exit(EXIT_FAILURE);
	3848	+
	3849	+ if (dup2(listener, 200) != 200) /* park the listener at a fixed number the parent can poll */
	3850	+ _exit(EXIT_FAILURE);
	3851	+
	3852	+ close(listener);
	3853	+
	3854	+ _exit(EXIT_SUCCESS);
	3855	+ }
	3856	+
	3857	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3858	+ EXPECT_EQ(true, WIFEXITED(status));
	3859	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3860	+
	3861	+ /*
	3862	+ * The seccomp filter has become unused so we should be notified once
	3863	+ * the kernel gets around to cleaning up task struct.
	3864	+ */
	3865	+ pollfd.fd = 200;
	3866	+ pollfd.events = POLLHUP;
	3867	+
	3868	+ EXPECT_GT(poll(&pollfd, 1, 2000), 0); /* waits at most 2000 ms */
	3869	+ EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); /* GNU ?: form; passes only when POLLHUP is set */
	3870	+}
	3871	+
	3872	+static void do_thread(void data)
	3873	+{
	3874	+ return NULL;
	3875	+}
	3876	+
	3877	+TEST(user_notification_filter_empty_threaded)
	3878	+{
	3879	+ pid_t pid;
	3880	+ long ret;
	3881	+ int status;
	3882	+ struct pollfd pollfd;
	3883	+ struct __clone_args args = {
	3884	+ .flags = CLONE_FILES,
	3885	+ .exit_signal = SIGCHLD,
	3886	+ };
	3887	+
	3888	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3889	+ ASSERT_EQ(0, ret) {
	3890	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3891	+ }
	3892	+
	3893	+ pid = sys_clone3(&args, sizeof(args));
	3894	+ ASSERT_GE(pid, 0);
	3895	+
	3896	+ if (pid == 0) {
	3897	+ pid_t pid1, pid2;
	3898	+ int listener, status;
	3899	+ pthread_t thread;
	3900	+
	3901	+ listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3902	+ if (listener < 0)
	3903	+ _exit(EXIT_FAILURE);
	3904	+
	3905	+ if (dup2(listener, 200) != 200)
	3906	+ _exit(EXIT_FAILURE);
	3907	+
	3908	+ close(listener);
	3909	+
	3910	+ pid1 = fork();
	3911	+ if (pid1 < 0)
	3912	+ _exit(EXIT_FAILURE);
	3913	+
	3914	+ if (pid1 == 0)
	3915	+ _exit(EXIT_SUCCESS);
	3916	+
	3917	+ pid2 = fork();
	3918	+ if (pid2 < 0)
	3919	+ _exit(EXIT_FAILURE);
	3920	+
	3921	+ if (pid2 == 0)
	3922	+ _exit(EXIT_SUCCESS);
	3923	+
	3924	+ if (pthread_create(&thread, NULL, do_thread, NULL) \|\|
	3925	+ pthread_join(thread, NULL))
	3926	+ _exit(EXIT_FAILURE);
	3927	+
	3928	+ if (pthread_create(&thread, NULL, do_thread, NULL) \|\|
	3929	+ pthread_join(thread, NULL))
	3930	+ _exit(EXIT_FAILURE);
	3931	+
	3932	+ if (waitpid(pid1, &status, 0) != pid1 \|\| !WIFEXITED(status) \|\|
	3933	+ WEXITSTATUS(status))
	3934	+ _exit(EXIT_FAILURE);
	3935	+
	3936	+ if (waitpid(pid2, &status, 0) != pid2 \|\| !WIFEXITED(status) \|\|
	3937	+ WEXITSTATUS(status))
	3938	+ _exit(EXIT_FAILURE);
	3939	+
	3940	+ exit(EXIT_SUCCESS);
	3941	+ }
	3942	+
	3943	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	3944	+ EXPECT_EQ(true, WIFEXITED(status));
	3945	+ EXPECT_EQ(0, WEXITSTATUS(status));
	3946	+
	3947	+ /*
	3948	+ * The seccomp filter has become unused so we should be notified once
	3949	+ * the kernel gets around to cleaning up task struct.
	3950	+ */
	3951	+ pollfd.fd = 200;
	3952	+ pollfd.events = POLLHUP;
	3953	+
	3954	+ EXPECT_GT(poll(&pollfd, 1, 2000), 0);
	3955	+ EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
	3956	+}
	3957	+
	3958	+TEST(user_notification_addfd)
	3959	+{
	3960	+ pid_t pid;
	3961	+ long ret;
	3962	+ int status, listener, memfd, fd;
	3963	+ struct seccomp_notif_addfd addfd = {};
	3964	+ struct seccomp_notif_addfd_small small = {};
	3965	+ struct seccomp_notif_addfd_big big = {};
	3966	+ struct seccomp_notif req = {};
	3967	+ struct seccomp_notif_resp resp = {};
	3968	+ /* 100 ms */
	3969	+ struct timespec delay = { .tv_nsec = 100000000 };
	3970	+
	3971	+ memfd = memfd_create("test", 0);
	3972	+ ASSERT_GE(memfd, 0);
	3973	+
	3974	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	3975	+ ASSERT_EQ(0, ret) {
	3976	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	3977	+ }
	3978	+
	3979	+ /* Check that the basic notification machinery works */
	3980	+ listener = user_notif_syscall(__NR_getppid,
	3981	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	3982	+ ASSERT_GE(listener, 0);
	3983	+
	3984	+ pid = fork();
	3985	+ ASSERT_GE(pid, 0);
	3986	+
	3987	+ if (pid == 0) {
	3988	+ if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
	3989	+ exit(1);
	3990	+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
	3991	+ }
	3992	+
	3993	+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	3994	+
	3995	+ addfd.srcfd = memfd;
	3996	+ addfd.newfd = 0;
	3997	+ addfd.id = req.id;
	3998	+ addfd.flags = 0x0;
	3999	+
	4000	+ /* Verify bad newfd_flags cannot be set */
	4001	+ addfd.newfd_flags = ~O_CLOEXEC;
	4002	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	4003	+ EXPECT_EQ(errno, EINVAL);
	4004	+ addfd.newfd_flags = O_CLOEXEC;
	4005	+
	4006	+ /* Verify bad flags cannot be set */
	4007	+ addfd.flags = 0xff;
	4008	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	4009	+ EXPECT_EQ(errno, EINVAL);
	4010	+ addfd.flags = 0;
	4011	+
	4012	+ /* Verify that remote_fd cannot be set without setting flags */
	4013	+ addfd.newfd = 1;
	4014	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	4015	+ EXPECT_EQ(errno, EINVAL);
	4016	+ addfd.newfd = 0;
	4017	+
	4018	+ /* Verify small size cannot be set */
	4019	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
	4020	+ EXPECT_EQ(errno, EINVAL);
	4021	+
	4022	+ /* Verify we can't send bits filled in unknown buffer area */
	4023	+ memset(&big, 0xAA, sizeof(big));
	4024	+ big.addfd = addfd;
	4025	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
	4026	+ EXPECT_EQ(errno, E2BIG);
	4027	+
	4028	+
	4029	+ /* Verify we can set an arbitrary remote fd */
	4030	+ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	4031	+ /*
	4032	+ * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd),
	4033	+ * 4(listener), so the newly allocated fd should be 5.
	4034	+ */
	4035	+ EXPECT_EQ(fd, 5);
	4036	+ EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
	4037	+
	4038	+ /* Verify we can set an arbitrary remote fd with large size */
	4039	+ memset(&big, 0x0, sizeof(big));
	4040	+ big.addfd = addfd;
	4041	+ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
	4042	+ EXPECT_EQ(fd, 6);
	4043	+
	4044	+ /* Verify we can set a specific remote fd */
	4045	+ addfd.newfd = 42;
	4046	+ addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
	4047	+ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
	4048	+ EXPECT_EQ(fd, 42);
	4049	+ EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
	4050	+
	4051	+ /* Resume syscall */
	4052	+ resp.id = req.id;
	4053	+ resp.error = 0;
	4054	+ resp.val = USER_NOTIF_MAGIC;
	4055	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	4056	+
	4057	+ /*
	4058	+ * This sets the ID of the ADD FD to the last request plus 1. The
	4059	+ * notification ID increments 1 per notification.
	4060	+ */
	4061	+ addfd.id = req.id + 1;
	4062	+
	4063	+ /* This spins until the underlying notification is generated */
	4064	+ while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
	4065	+ errno != -EINPROGRESS)
	4066	+ nanosleep(&delay, NULL);
	4067	+
	4068	+ memset(&req, 0, sizeof(req));
	4069	+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	4070	+ ASSERT_EQ(addfd.id, req.id);
	4071	+
	4072	+ resp.id = req.id;
	4073	+ resp.error = 0;
	4074	+ resp.val = USER_NOTIF_MAGIC;
	4075	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	4076	+
	4077	+ /* Wait for child to finish. */
	4078	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	4079	+ EXPECT_EQ(true, WIFEXITED(status));
	4080	+ EXPECT_EQ(0, WEXITSTATUS(status));
	4081	+
	4082	+ close(memfd);
	4083	+}
	4084	+
	4085	+TEST(user_notification_addfd_rlimit)
	4086	+{
	4087	+ pid_t pid;
	4088	+ long ret;
	4089	+ int status, listener, memfd;
	4090	+ struct seccomp_notif_addfd addfd = {};
	4091	+ struct seccomp_notif req = {};
	4092	+ struct seccomp_notif_resp resp = {};
	4093	+ const struct rlimit lim = {
	4094	+ .rlim_cur = 0,
	4095	+ .rlim_max = 0,
	4096	+ };
	4097	+
	4098	+ memfd = memfd_create("test", 0);
	4099	+ ASSERT_GE(memfd, 0);
	4100	+
	4101	+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	4102	+ ASSERT_EQ(0, ret) {
	4103	+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	4104	+ }
	4105	+
	4106	+ /* Check that the basic notification machinery works */
	4107	+ listener = user_notif_syscall(__NR_getppid,
	4108	+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
	4109	+ ASSERT_GE(listener, 0);
	4110	+
	4111	+ pid = fork();
	4112	+ ASSERT_GE(pid, 0);
	4113	+
	4114	+ if (pid == 0)
	4115	+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
	4116	+
	4117	+
	4118	+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
	4119	+
	4120	+ ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
	4121	+
	4122	+ addfd.srcfd = memfd;
	4123	+ addfd.newfd_flags = O_CLOEXEC;
	4124	+ addfd.newfd = 0;
	4125	+ addfd.id = req.id;
	4126	+ addfd.flags = 0;
	4127	+
	4128	+ /* Should probably spot check /proc/sys/fs/file-nr */
	4129	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	4130	+ EXPECT_EQ(errno, EMFILE);
	4131	+
	4132	+ addfd.newfd = 100;
	4133	+ addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
	4134	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
	4135	+ EXPECT_EQ(errno, EBADF);
	4136	+
	4137	+ resp.id = req.id;
	4138	+ resp.error = 0;
	4139	+ resp.val = USER_NOTIF_MAGIC;
	4140	+
	4141	+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
	4142	+
	4143	+ /* Wait for child to finish. */
	4144	+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
	4145	+ EXPECT_EQ(true, WIFEXITED(status));
	4146	+ EXPECT_EQ(0, WEXITSTATUS(status));
	4147	+
	4148	+ close(memfd);
	4149	+}
	4150	+
2984	4151	/*
2985	4152	* TODO:
2986		- * - add microbenchmarks
2987	4153	* - expand NNP testing
2988	4154	* - better arch-specific TRACE and TRAP handlers.
2989	4155	* - endianness checking when appropriate
..	..	@@ -2991,7 +4157,6 @@
2991	4157	* - arch value testing (x86 modes especially)
2992	4158	* - verify that FILTER_FLAG_LOG filters generate log messages
2993	4159	* - verify that RET_LOG generates log messages
2994		- * - ...
2995	4160	*/
2996	4161
2997	4162	TEST_HARNESS_MAIN