hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/exec.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/fs/exec.c
34 *
....@@ -22,6 +23,7 @@
2223 * formats.
2324 */
2425
26
+#include <linux/kernel_read_file.h>
2527 #include <linux/slab.h>
2628 #include <linux/file.h>
2729 #include <linux/fdtable.h>
....@@ -58,10 +60,10 @@
5860 #include <linux/kmod.h>
5961 #include <linux/fsnotify.h>
6062 #include <linux/fs_struct.h>
61
-#include <linux/pipe_fs_i.h>
6263 #include <linux/oom.h>
6364 #include <linux/compat.h>
6465 #include <linux/vmalloc.h>
66
+#include <linux/io_uring.h>
6567
6668 #include <linux/uaccess.h>
6769 #include <asm/mmu_context.h>
....@@ -71,6 +73,10 @@
7173 #include "internal.h"
7274
7375 #include <trace/events/sched.h>
76
+
77
+EXPORT_TRACEPOINT_SYMBOL_GPL(task_rename);
78
+
79
+static int bprm_creds_from_file(struct linux_binprm *bprm);
7480
7581 int suid_dumpable = 0;
7682
....@@ -139,12 +145,14 @@
139145 if (IS_ERR(file))
140146 goto out;
141147
142
- error = -EINVAL;
143
- if (!S_ISREG(file_inode(file)->i_mode))
144
- goto exit;
145
-
148
+ /*
149
+ * may_open() has already checked for this, so it should be
150
+ * impossible to trip now. But we need to be extra cautious
151
+ * and check again at the very end too.
152
+ */
146153 error = -EACCES;
147
- if (path_noexec(&file->f_path))
154
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
155
+ path_noexec(&file->f_path)))
148156 goto exit;
149157
150158 fsnotify_open(file);
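
The hunk above folds do_open_execat()'s separate "regular file" and "noexec mount" checks into a single WARN_ON_ONCE() returning -EACCES, since may_open() now rejects such opens earlier. A minimal userspace sketch of the visible behaviour (the /tmp path is just an illustrative non-regular file; the errno is printed rather than asserted):

/* Exec of something that is not a regular file is refused; print errno
 * rather than asserting a specific value. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "tmp", NULL };
	char *envp[] = { NULL };

	execve("/tmp", argv, envp);	/* a directory, not a regular file */
	printf("execve(\"/tmp\"): %s (errno=%d)\n", strerror(errno), errno);
	return 0;
}
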
....@@ -213,65 +221,20 @@
213221 * We are doing an exec(). 'current' is the process
214222 * doing the exec and bprm->mm is the new process's mm.
215223 */
216
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
224
+ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
217225 &page, NULL, NULL);
218226 if (ret <= 0)
219227 return NULL;
220228
221
- if (write) {
222
- unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
223
- unsigned long ptr_size, limit;
224
-
225
- /*
226
- * Since the stack will hold pointers to the strings, we
227
- * must account for them as well.
228
- *
229
- * The size calculation is the entire vma while each arg page is
230
- * built, so each time we get here it's calculating how far it
231
- * is currently (rather than each call being just the newly
232
- * added size from the arg page). As a result, we need to
233
- * always add the entire size of the pointers, so that on the
234
- * last call to get_arg_page() we'll actually have the entire
235
- * correct size.
236
- */
237
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
238
- if (ptr_size > ULONG_MAX - size)
239
- goto fail;
240
- size += ptr_size;
241
-
242
- acct_arg_size(bprm, size / PAGE_SIZE);
243
-
244
- /*
245
- * We've historically supported up to 32 pages (ARG_MAX)
246
- * of argument strings even with small stacks
247
- */
248
- if (size <= ARG_MAX)
249
- return page;
250
-
251
- /*
252
- * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
253
- * (whichever is smaller) for the argv+env strings.
254
- * This ensures that:
255
- * - the remaining binfmt code will not run out of stack space,
256
- * - the program will have a reasonable amount of stack left
257
- * to work from.
258
- */
259
- limit = _STK_LIM / 4 * 3;
260
- limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
261
- if (size > limit)
262
- goto fail;
263
- }
229
+ if (write)
230
+ acct_arg_size(bprm, vma_pages(bprm->vma));
264231
265232 return page;
266
-
267
-fail:
268
- put_page(page);
269
- return NULL;
270233 }
271234
272235 static void put_arg_page(struct page *page)
273236 {
274
- put_page(page);
237
+ put_user_page(page);
275238 }
276239
277240 static void free_arg_pages(struct linux_binprm *bprm)
....@@ -295,7 +258,7 @@
295258 return -ENOMEM;
296259 vma_set_anonymous(vma);
297260
298
- if (down_write_killable(&mm->mmap_sem)) {
261
+ if (mmap_write_lock_killable(mm)) {
299262 err = -EINTR;
300263 goto err_free;
301264 }
....@@ -317,12 +280,11 @@
317280 goto err;
318281
319282 mm->stack_vm = mm->total_vm = 1;
320
- arch_bprm_mm_init(mm, vma);
321
- up_write(&mm->mmap_sem);
283
+ mmap_write_unlock(mm);
322284 bprm->p = vma->vm_end - sizeof(void *);
323285 return 0;
324286 err:
325
- up_write(&mm->mmap_sem);
287
+ mmap_write_unlock(mm);
326288 err_free:
327289 bprm->vma = NULL;
328290 vm_area_free(vma);
....@@ -492,6 +454,64 @@
492454 return i;
493455 }
494456
457
+static int count_strings_kernel(const char *const *argv)
458
+{
459
+ int i;
460
+
461
+ if (!argv)
462
+ return 0;
463
+
464
+ for (i = 0; argv[i]; ++i) {
465
+ if (i >= MAX_ARG_STRINGS)
466
+ return -E2BIG;
467
+ if (fatal_signal_pending(current))
468
+ return -ERESTARTNOHAND;
469
+ cond_resched();
470
+ }
471
+ return i;
472
+}
473
+
474
+static int bprm_stack_limits(struct linux_binprm *bprm)
475
+{
476
+ unsigned long limit, ptr_size;
477
+
478
+ /*
479
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
480
+ * (whichever is smaller) for the argv+env strings.
481
+ * This ensures that:
482
+ * - the remaining binfmt code will not run out of stack space,
483
+ * - the program will have a reasonable amount of stack left
484
+ * to work from.
485
+ */
486
+ limit = _STK_LIM / 4 * 3;
487
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
488
+ /*
489
+ * We've historically supported up to 32 pages (ARG_MAX)
490
+ * of argument strings even with small stacks
491
+ */
492
+ limit = max_t(unsigned long, limit, ARG_MAX);
493
+ /*
494
+ * We must account for the size of all the argv and envp pointers to
495
+ * the argv and envp strings, since they will also take up space in
496
+ * the stack. They aren't stored until much later when we can't
497
+ * signal to the parent that the child has run out of stack space.
498
+ * Instead, calculate it here so it's possible to fail gracefully.
499
+ *
500
+	 * In the case of argc = 0, make sure there is space for adding an
501
+ * empty string (which will bump argc to 1), to ensure confused
502
+ * userspace programs don't start processing from argv[1], thinking
503
+ * argc can never be 0, to keep them from walking envp by accident.
504
+ * See do_execveat_common().
505
+ */
506
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
507
+ if (limit <= ptr_size)
508
+ return -E2BIG;
509
+ limit -= ptr_size;
510
+
511
+ bprm->argmin = bprm->p - limit;
512
+ return 0;
513
+}
514
+
495515 /*
496516 * 'copy_strings()' copies argument/environment strings from the old
497517 * process's memory to the new process's stack. The call to get_user_pages()
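
bprm_stack_limits(), added above, caps the argv+envp strings at min(3/4 of _STK_LIM, 1/4 of RLIMIT_STACK), never below ARG_MAX, and then reserves room for the argv/envp pointer array (treating argc as at least 1). A userspace sketch of the arithmetic; the 8 MiB _STK_LIM, 128 KiB ARG_MAX, stack rlimit and argc/envc values are illustrative assumptions:

/* Userspace sketch mirroring the bprm_stack_limits() arithmetic; _STK_LIM,
 * ARG_MAX, the stack rlimit and argc/envc below are assumed example values. */
#include <stdio.h>

int main(void)
{
	unsigned long stk_lim = 8UL << 20;	/* assumed _STK_LIM: 8 MiB */
	unsigned long arg_max = 128UL << 10;	/* assumed ARG_MAX: 32 pages * 4 KiB */
	unsigned long rlim_cur = 8UL << 20;	/* assumed RLIMIT_STACK soft limit */
	unsigned long argc = 2, envc = 30;

	unsigned long limit = stk_lim / 4 * 3;		/* 6 MiB */
	if (rlim_cur / 4 < limit)
		limit = rlim_cur / 4;			/* 2 MiB */
	if (limit < arg_max)
		limit = arg_max;			/* still 2 MiB */

	unsigned long ptr_size = ((argc > 1 ? argc : 1) + envc) * sizeof(void *);
	if (limit <= ptr_size)
		return 1;				/* kernel would return -E2BIG */
	limit -= ptr_size;

	printf("argv+envp strings may use up to %lu bytes\n", limit);
	return 0;
}
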
....@@ -527,6 +547,10 @@
527547 pos = bprm->p;
528548 str += len;
529549 bprm->p -= len;
550
+#ifdef CONFIG_MMU
551
+ if (bprm->p < bprm->argmin)
552
+ goto out;
553
+#endif
530554
531555 while (len > 0) {
532556 int offset, bytes_to_copy;
....@@ -586,24 +610,62 @@
586610 }
587611
588612 /*
589
- * Like copy_strings, but get argv and its values from kernel memory.
613
+ * Copy an argument/environment string from the kernel to the process's stack.
590614 */
591
-int copy_strings_kernel(int argc, const char *const *__argv,
592
- struct linux_binprm *bprm)
615
+int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
593616 {
594
- int r;
595
- mm_segment_t oldfs = get_fs();
596
- struct user_arg_ptr argv = {
597
- .ptr.native = (const char __user *const __user *)__argv,
598
- };
617
+ int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
618
+ unsigned long pos = bprm->p;
599619
600
- set_fs(KERNEL_DS);
601
- r = copy_strings(argc, argv, bprm);
602
- set_fs(oldfs);
620
+ if (len == 0)
621
+ return -EFAULT;
622
+ if (!valid_arg_len(bprm, len))
623
+ return -E2BIG;
603624
604
- return r;
625
+ /* We're going to work our way backwards. */
626
+ arg += len;
627
+ bprm->p -= len;
628
+ if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
629
+ return -E2BIG;
630
+
631
+ while (len > 0) {
632
+ unsigned int bytes_to_copy = min_t(unsigned int, len,
633
+ min_not_zero(offset_in_page(pos), PAGE_SIZE));
634
+ struct page *page;
635
+ char *kaddr;
636
+
637
+ pos -= bytes_to_copy;
638
+ arg -= bytes_to_copy;
639
+ len -= bytes_to_copy;
640
+
641
+ page = get_arg_page(bprm, pos, 1);
642
+ if (!page)
643
+ return -E2BIG;
644
+ kaddr = kmap_atomic(page);
645
+ flush_arg_page(bprm, pos & PAGE_MASK, page);
646
+ memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
647
+ flush_kernel_dcache_page(page);
648
+ kunmap_atomic(kaddr);
649
+ put_arg_page(page);
650
+ }
651
+
652
+ return 0;
605653 }
606
-EXPORT_SYMBOL(copy_strings_kernel);
654
+EXPORT_SYMBOL(copy_string_kernel);
655
+
656
+static int copy_strings_kernel(int argc, const char *const *argv,
657
+ struct linux_binprm *bprm)
658
+{
659
+ while (argc-- > 0) {
660
+ int ret = copy_string_kernel(argv[argc], bprm);
661
+ if (ret < 0)
662
+ return ret;
663
+ if (fatal_signal_pending(current))
664
+ return -ERESTARTNOHAND;
665
+ cond_resched();
666
+ }
667
+ return 0;
668
+}
607669
608670 #ifdef CONFIG_MMU
609671
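
copy_string_kernel(), added above, copies the string backwards in pieces bounded by min_not_zero(offset_in_page(pos), PAGE_SIZE), so no piece crosses a page boundary. A standalone userspace sketch of that chunking, assuming a 4 KiB page size:

/* Userspace sketch of the backwards, page-bounded copy loop in
 * copy_string_kernel(); PAGE_SIZE is assumed to be 4096 here. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static void copy_backwards(char *dst_top, const char *arg, size_t len)
{
	char *pos = dst_top;		/* one past where the string will end */
	const char *src = arg + len;	/* work our way backwards */

	while (len > 0) {
		size_t chunk = (unsigned long)pos & (PAGE_SIZE - 1);
		if (chunk == 0)		/* min_not_zero(): pos is page-aligned */
			chunk = PAGE_SIZE;
		if (chunk > len)
			chunk = len;
		pos -= chunk;
		src -= chunk;
		len -= chunk;
		memcpy(pos, src, chunk);	/* one page-bounded piece at a time */
	}
}

int main(void)
{
	const char *arg = "example-argument";
	size_t len = strlen(arg) + 1;		/* include terminating NUL */
	char *buf = malloc(2 * PAGE_SIZE);

	copy_backwards(buf + 2 * PAGE_SIZE, arg, len);
	printf("copied: %s\n", buf + 2 * PAGE_SIZE - len);
	free(buf);
	return 0;
}
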
....@@ -735,7 +797,7 @@
735797 bprm->loader -= stack_shift;
736798 bprm->exec -= stack_shift;
737799
738
- if (down_write_killable(&mm->mmap_sem))
800
+ if (mmap_write_lock_killable(mm))
739801 return -EINTR;
740802
741803 vm_flags = VM_STACK_FLAGS;
....@@ -757,6 +819,11 @@
757819 if (ret)
758820 goto out_unlock;
759821 BUG_ON(prev != vma);
822
+
823
+ if (unlikely(vm_flags & VM_EXEC)) {
824
+ pr_warn_once("process '%pD4' started with executable stack\n",
825
+ bprm->file);
826
+ }
760827
761828 /* Move stack pages down in memory. */
762829 if (stack_shift) {
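
The hunk above makes setup_arg_pages() warn once when a process ends up with an executable stack. Whether that happens is driven by the binary's PT_GNU_STACK program header; a small userspace check of that header for the running binary (glibc's dl_iterate_phdr is assumed to be available):

/* Report whether this binary's PT_GNU_STACK program header asks for an
 * executable stack (the condition the warning above fires on). */
#define _GNU_SOURCE
#include <elf.h>
#include <link.h>
#include <stdio.h>

static int cb(struct dl_phdr_info *info, size_t size, void *data)
{
	for (int i = 0; i < info->dlpi_phnum; i++) {
		if (info->dlpi_phdr[i].p_type == PT_GNU_STACK)
			printf("%s: GNU_STACK %s executable\n",
			       info->dlpi_name[0] ? info->dlpi_name : "main binary",
			       (info->dlpi_phdr[i].p_flags & PF_X) ? "is" : "is not");
	}
	return 0;
}

int main(void)
{
	dl_iterate_phdr(cb, NULL);
	return 0;
}
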
....@@ -792,7 +859,7 @@
792859 ret = -EFAULT;
793860
794861 out_unlock:
795
- up_write(&mm->mmap_sem);
862
+ mmap_write_unlock(mm);
796863 return ret;
797864 }
798865 EXPORT_SYMBOL(setup_arg_pages);
....@@ -854,11 +921,14 @@
854921 if (IS_ERR(file))
855922 goto out;
856923
924
+ /*
925
+ * may_open() has already checked for this, so it should be
926
+ * impossible to trip now. But we need to be extra cautious
927
+ * and check again at the very end too.
928
+ */
857929 err = -EACCES;
858
- if (!S_ISREG(file_inode(file)->i_mode))
859
- goto exit;
860
-
861
- if (path_noexec(&file->f_path))
930
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
931
+ path_noexec(&file->f_path)))
862932 goto exit;
863933
864934 err = deny_write_access(file);
....@@ -889,146 +959,57 @@
889959 }
890960 EXPORT_SYMBOL(open_exec);
891961
892
-int kernel_read_file(struct file *file, void **buf, loff_t *size,
893
- loff_t max_size, enum kernel_read_file_id id)
894
-{
895
- loff_t i_size, pos;
896
- ssize_t bytes = 0;
897
- int ret;
898
-
899
- if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
900
- return -EINVAL;
901
-
902
- ret = deny_write_access(file);
903
- if (ret)
904
- return ret;
905
-
906
- ret = security_kernel_read_file(file, id);
907
- if (ret)
908
- goto out;
909
-
910
- i_size = i_size_read(file_inode(file));
911
- if (max_size > 0 && i_size > max_size) {
912
- ret = -EFBIG;
913
- goto out;
914
- }
915
- if (i_size <= 0) {
916
- ret = -EINVAL;
917
- goto out;
918
- }
919
-
920
- if (id != READING_FIRMWARE_PREALLOC_BUFFER)
921
- *buf = vmalloc(i_size);
922
- if (!*buf) {
923
- ret = -ENOMEM;
924
- goto out;
925
- }
926
-
927
- pos = 0;
928
- while (pos < i_size) {
929
- bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
930
- if (bytes < 0) {
931
- ret = bytes;
932
- goto out_free;
933
- }
934
-
935
- if (bytes == 0)
936
- break;
937
- }
938
-
939
- if (pos != i_size) {
940
- ret = -EIO;
941
- goto out_free;
942
- }
943
-
944
- ret = security_kernel_post_read_file(file, *buf, i_size, id);
945
- if (!ret)
946
- *size = pos;
947
-
948
-out_free:
949
- if (ret < 0) {
950
- if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
951
- vfree(*buf);
952
- *buf = NULL;
953
- }
954
- }
955
-
956
-out:
957
- allow_write_access(file);
958
- return ret;
959
-}
960
-EXPORT_SYMBOL_GPL(kernel_read_file);
961
-
962
-int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
963
- loff_t max_size, enum kernel_read_file_id id)
964
-{
965
- struct file *file;
966
- int ret;
967
-
968
- if (!path || !*path)
969
- return -EINVAL;
970
-
971
- file = filp_open(path, O_RDONLY, 0);
972
- if (IS_ERR(file))
973
- return PTR_ERR(file);
974
-
975
- ret = kernel_read_file(file, buf, size, max_size, id);
976
- fput(file);
977
- return ret;
978
-}
979
-EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
980
-
981
-int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
982
- enum kernel_read_file_id id)
983
-{
984
- struct fd f = fdget(fd);
985
- int ret = -EBADF;
986
-
987
- if (!f.file || !(f.file->f_mode & FMODE_READ))
988
- goto out;
989
-
990
- ret = kernel_read_file(f.file, buf, size, max_size, id);
991
-out:
992
- fdput(f);
993
- return ret;
994
-}
995
-EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
996
-
962
+#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
963
+ defined(CONFIG_BINFMT_ELF_FDPIC)
997964 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
998965 {
999966 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1000967 if (res > 0)
1001
- flush_icache_range(addr, addr + len);
968
+ flush_icache_user_range(addr, addr + len);
1002969 return res;
1003970 }
1004971 EXPORT_SYMBOL(read_code);
972
+#endif
1005973
974
+/*
975
+ * Maps the mm_struct mm into the current task struct.
976
+ * On success, this function returns with exec_update_lock
977
+ * held for writing.
978
+ */
1006979 static int exec_mmap(struct mm_struct *mm)
1007980 {
1008981 struct task_struct *tsk;
1009982 struct mm_struct *old_mm, *active_mm;
983
+ int ret;
1010984
1011985 /* Notify parent that we're no longer interested in the old VM */
1012986 tsk = current;
1013987 old_mm = current->mm;
1014988 exec_mm_release(tsk, old_mm);
989
+ if (old_mm)
990
+ sync_mm_rss(old_mm);
991
+
992
+ ret = down_write_killable(&tsk->signal->exec_update_lock);
993
+ if (ret)
994
+ return ret;
1015995
1016996 if (old_mm) {
1017
- sync_mm_rss(old_mm);
1018997 /*
1019998 * Make sure that if there is a core dump in progress
1020999 * for the old mm, we get out and die instead of going
1021
- * through with the exec. We must hold mmap_sem around
1000
+ * through with the exec. We must hold mmap_lock around
10221001 * checking core_state and changing tsk->mm.
10231002 */
1024
- down_read(&old_mm->mmap_sem);
1003
+ mmap_read_lock(old_mm);
10251004 if (unlikely(old_mm->core_state)) {
1026
- up_read(&old_mm->mmap_sem);
1005
+ mmap_read_unlock(old_mm);
1006
+ up_write(&tsk->signal->exec_update_lock);
10271007 return -EINTR;
10281008 }
10291009 }
1010
+
10301011 task_lock(tsk);
1031
- preempt_disable_rt();
1012
+ membarrier_exec_mmap(mm);
10321013
10331014 local_irq_disable();
10341015 active_mm = tsk->active_mm;
....@@ -1048,10 +1029,9 @@
10481029 local_irq_enable();
10491030 tsk->mm->vmacache_seqnum = 0;
10501031 vmacache_flush(tsk);
1051
- preempt_enable_rt();
10521032 task_unlock(tsk);
10531033 if (old_mm) {
1054
- up_read(&old_mm->mmap_sem);
1034
+ mmap_read_unlock(old_mm);
10551035 BUG_ON(active_mm != old_mm);
10561036 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
10571037 mm_update_next_owner(old_mm);
....@@ -1062,12 +1042,6 @@
10621042 return 0;
10631043 }
10641044
1065
-/*
1066
- * This function makes sure the current process has its own signal table,
1067
- * so that flush_signal_handlers can later reset the handlers without
1068
- * disturbing other processes. (Other processes might share the signal
1069
- * table via the CLONE_SIGHAND option to clone().)
1070
- */
10711045 static int de_thread(struct task_struct *tsk)
10721046 {
10731047 struct signal_struct *sig = tsk->signal;
....@@ -1099,7 +1073,7 @@
10991073 __set_current_state(TASK_KILLABLE);
11001074 spin_unlock_irq(lock);
11011075 schedule();
1102
- if (unlikely(__fatal_signal_pending(tsk)))
1076
+ if (__fatal_signal_pending(tsk))
11031077 goto killed;
11041078 spin_lock_irq(lock);
11051079 }
....@@ -1127,7 +1101,7 @@
11271101 write_unlock_irq(&tasklist_lock);
11281102 cgroup_threadgroup_change_end(tsk);
11291103 schedule();
1130
- if (unlikely(__fatal_signal_pending(tsk)))
1104
+ if (__fatal_signal_pending(tsk))
11311105 goto killed;
11321106 }
11331107
....@@ -1142,10 +1116,9 @@
11421116 * also take its birthdate (always earlier than our own).
11431117 */
11441118 tsk->start_time = leader->start_time;
1145
- tsk->real_start_time = leader->real_start_time;
1119
+ tsk->start_boottime = leader->start_boottime;
11461120
11471121 BUG_ON(!same_thread_group(leader, tsk));
1148
- BUG_ON(has_group_leader_pid(tsk));
11491122 /*
11501123 * An exec() starts a new thread group with the
11511124 * TGID of the previous thread group. Rehash the
....@@ -1155,11 +1128,8 @@
11551128
11561129 /* Become a process group leader with the old leader's pid.
11571130 * The old leader becomes a thread of the this thread group.
1158
- * Note: The old leader also uses this pid until release_task
1159
- * is called. Odd but simple and correct.
11601131 */
1161
- tsk->pid = leader->pid;
1162
- change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1132
+ exchange_tids(tsk, leader);
11631133 transfer_pid(leader, tsk, PIDTYPE_TGID);
11641134 transfer_pid(leader, tsk, PIDTYPE_PGID);
11651135 transfer_pid(leader, tsk, PIDTYPE_SID);
....@@ -1196,34 +1166,6 @@
11961166 /* we have changed execution domain */
11971167 tsk->exit_signal = SIGCHLD;
11981168
1199
-#ifdef CONFIG_POSIX_TIMERS
1200
- exit_itimers(sig);
1201
- flush_itimer_signals();
1202
-#endif
1203
-
1204
- if (atomic_read(&oldsighand->count) != 1) {
1205
- struct sighand_struct *newsighand;
1206
- /*
1207
- * This ->sighand is shared with the CLONE_SIGHAND
1208
- * but not CLONE_THREAD task, switch to the new one.
1209
- */
1210
- newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1211
- if (!newsighand)
1212
- return -ENOMEM;
1213
-
1214
- atomic_set(&newsighand->count, 1);
1215
- memcpy(newsighand->action, oldsighand->action,
1216
- sizeof(newsighand->action));
1217
-
1218
- write_lock_irq(&tasklist_lock);
1219
- spin_lock(&oldsighand->siglock);
1220
- rcu_assign_pointer(tsk->sighand, newsighand);
1221
- spin_unlock(&oldsighand->siglock);
1222
- write_unlock_irq(&tasklist_lock);
1223
-
1224
- __cleanup_sighand(oldsighand);
1225
- }
1226
-
12271169 BUG_ON(!thread_group_leader(tsk));
12281170 return 0;
12291171
....@@ -1234,6 +1176,42 @@
12341176 sig->notify_count = 0;
12351177 read_unlock(&tasklist_lock);
12361178 return -EAGAIN;
1179
+}
1180
+
1181
+
1182
+/*
1183
+ * This function makes sure the current process has its own signal table,
1184
+ * so that flush_signal_handlers can later reset the handlers without
1185
+ * disturbing other processes. (Other processes might share the signal
1186
+ * table via the CLONE_SIGHAND option to clone().)
1187
+ */
1188
+static int unshare_sighand(struct task_struct *me)
1189
+{
1190
+ struct sighand_struct *oldsighand = me->sighand;
1191
+
1192
+ if (refcount_read(&oldsighand->count) != 1) {
1193
+ struct sighand_struct *newsighand;
1194
+ /*
1195
+ * This ->sighand is shared with the CLONE_SIGHAND
1196
+ * but not CLONE_THREAD task, switch to the new one.
1197
+ */
1198
+ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1199
+ if (!newsighand)
1200
+ return -ENOMEM;
1201
+
1202
+ refcount_set(&newsighand->count, 1);
1203
+
1204
+ write_lock_irq(&tasklist_lock);
1205
+ spin_lock(&oldsighand->siglock);
1206
+ memcpy(newsighand->action, oldsighand->action,
1207
+ sizeof(newsighand->action));
1208
+ rcu_assign_pointer(me->sighand, newsighand);
1209
+ spin_unlock(&oldsighand->siglock);
1210
+ write_unlock_irq(&tasklist_lock);
1211
+
1212
+ __cleanup_sighand(oldsighand);
1213
+ }
1214
+ return 0;
12371215 }
12381216
12391217 char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
....@@ -1263,17 +1241,27 @@
12631241 * Calling this is the point of no return. None of the failures will be
12641242 * seen by userspace since either the process is already taking a fatal
12651243 * signal (via de_thread() or coredump), or will have SEGV raised
1266
- * (after exec_mmap()) by search_binary_handlers (see below).
1244
+ * (after exec_mmap()) by search_binary_handler (see below).
12671245 */
1268
-int flush_old_exec(struct linux_binprm * bprm)
1246
+int begin_new_exec(struct linux_binprm * bprm)
12691247 {
1248
+ struct task_struct *me = current;
12701249 int retval;
12711250
1251
+ /* Once we are committed compute the creds */
1252
+ retval = bprm_creds_from_file(bprm);
1253
+ if (retval)
1254
+ return retval;
1255
+
12721256 /*
1273
- * Make sure we have a private signal table and that
1274
- * we are unassociated from the previous thread group.
1257
+ * Ensure all future errors are fatal.
12751258 */
1276
- retval = de_thread(current);
1259
+ bprm->point_of_no_return = true;
1260
+
1261
+ /*
1262
+ * Make this the only thread in the thread group.
1263
+ */
1264
+ retval = de_thread(me);
12771265 if (retval)
12781266 goto out;
12791267
....@@ -1284,7 +1272,10 @@
12841272 */
12851273 set_mm_exe_file(bprm->mm, bprm->file);
12861274
1275
+ /* If the binary is not readable then enforce mm->dumpable=0 */
12871276 would_dump(bprm, bprm->file);
1277
+ if (bprm->have_execfd)
1278
+ would_dump(bprm, bprm->executable);
12881279
12891280 /*
12901281 * Release all of the old mmap stuff
....@@ -1294,19 +1285,33 @@
12941285 if (retval)
12951286 goto out;
12961287
1297
- /*
1298
- * After clearing bprm->mm (to mark that current is using the
1299
- * prepared mm now), we have nothing left of the original
1300
- * process. If anything from here on returns an error, the check
1301
- * in search_binary_handler() will SEGV current.
1302
- */
13031288 bprm->mm = NULL;
13041289
1305
- set_fs(USER_DS);
1306
- current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1290
+#ifdef CONFIG_POSIX_TIMERS
1291
+ spin_lock_irq(&me->sighand->siglock);
1292
+ posix_cpu_timers_exit(me);
1293
+ spin_unlock_irq(&me->sighand->siglock);
1294
+ exit_itimers(me);
1295
+ flush_itimer_signals();
1296
+#endif
1297
+
1298
+ /*
1299
+ * Make the signal table private.
1300
+ */
1301
+ retval = unshare_sighand(me);
1302
+ if (retval)
1303
+ goto out_unlock;
1304
+
1305
+ /*
1306
+ * Ensure that the uaccess routines can actually operate on userspace
1307
+ * pointers:
1308
+ */
1309
+ force_uaccess_begin();
1310
+
1311
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
13071312 PF_NOFREEZE | PF_NO_SETAFFINITY);
13081313 flush_thread();
1309
- current->personality &= ~bprm->per_clear;
1314
+ me->personality &= ~bprm->per_clear;
13101315
13111316 /*
13121317 * We have to apply CLOEXEC before we change whether the process is
....@@ -1314,18 +1319,90 @@
13141319 * trying to access the should-be-closed file descriptors of a process
13151320 * undergoing exec(2).
13161321 */
1317
- do_close_on_exec(current->files);
1322
+ do_close_on_exec(me->files);
1323
+
1324
+ if (bprm->secureexec) {
1325
+ /* Make sure parent cannot signal privileged process. */
1326
+ me->pdeath_signal = 0;
1327
+
1328
+ /*
1329
+ * For secureexec, reset the stack limit to sane default to
1330
+ * avoid bad behavior from the prior rlimits. This has to
1331
+ * happen before arch_pick_mmap_layout(), which examines
1332
+ * RLIMIT_STACK, but after the point of no return to avoid
1333
+ * needing to clean up the change on failure.
1334
+ */
1335
+ if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1336
+ bprm->rlim_stack.rlim_cur = _STK_LIM;
1337
+ }
1338
+
1339
+ me->sas_ss_sp = me->sas_ss_size = 0;
1340
+
1341
+ /*
1342
+ * Figure out dumpability. Note that this checking only of current
1343
+ * is wrong, but userspace depends on it. This should be testing
1344
+ * bprm->secureexec instead.
1345
+ */
1346
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1347
+ !(uid_eq(current_euid(), current_uid()) &&
1348
+ gid_eq(current_egid(), current_gid())))
1349
+ set_dumpable(current->mm, suid_dumpable);
1350
+ else
1351
+ set_dumpable(current->mm, SUID_DUMP_USER);
1352
+
1353
+ perf_event_exec();
1354
+ __set_task_comm(me, kbasename(bprm->filename), true);
1355
+
1356
+ /* An exec changes our domain. We are no longer part of the thread
1357
+ group */
1358
+ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
1359
+ flush_signal_handlers(me, 0);
1360
+
1361
+ /*
1362
+ * install the new credentials for this executable
1363
+ */
1364
+ security_bprm_committing_creds(bprm);
1365
+
1366
+ commit_creds(bprm->cred);
1367
+ bprm->cred = NULL;
1368
+
1369
+ /*
1370
+ * Disable monitoring for regular users
1371
+ * when executing setuid binaries. Must
1372
+ * wait until new credentials are committed
1373
+ * by commit_creds() above
1374
+ */
1375
+ if (get_dumpable(me->mm) != SUID_DUMP_USER)
1376
+ perf_event_exit_task(me);
1377
+ /*
1378
+ * cred_guard_mutex must be held at least to this point to prevent
1379
+ * ptrace_attach() from altering our determination of the task's
1380
+ * credentials; any time after this it may be unlocked.
1381
+ */
1382
+ security_bprm_committed_creds(bprm);
1383
+
1384
+ /* Pass the opened binary to the interpreter. */
1385
+ if (bprm->have_execfd) {
1386
+ retval = get_unused_fd_flags(0);
1387
+ if (retval < 0)
1388
+ goto out_unlock;
1389
+ fd_install(retval, bprm->executable);
1390
+ bprm->executable = NULL;
1391
+ bprm->execfd = retval;
1392
+ }
13181393 return 0;
13191394
1395
+out_unlock:
1396
+ up_write(&me->signal->exec_update_lock);
13201397 out:
13211398 return retval;
13221399 }
1323
-EXPORT_SYMBOL(flush_old_exec);
1400
+EXPORT_SYMBOL(begin_new_exec);
13241401
13251402 void would_dump(struct linux_binprm *bprm, struct file *file)
13261403 {
13271404 struct inode *inode = file_inode(file);
1328
- if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
1405
+ if (inode_permission(inode, MAY_READ) < 0) {
13291406 struct user_namespace *old, *user_ns;
13301407 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
13311408
....@@ -1345,58 +1422,20 @@
13451422
13461423 void setup_new_exec(struct linux_binprm * bprm)
13471424 {
1348
- /*
1349
- * Once here, prepare_binrpm() will not be called any more, so
1350
- * the final state of setuid/setgid/fscaps can be merged into the
1351
- * secureexec flag.
1352
- */
1353
- bprm->secureexec |= bprm->cap_elevated;
1425
+ /* Setup things that can depend upon the personality */
1426
+ struct task_struct *me = current;
13541427
1355
- if (bprm->secureexec) {
1356
- /* Make sure parent cannot signal privileged process. */
1357
- current->pdeath_signal = 0;
1358
-
1359
- /*
1360
- * For secureexec, reset the stack limit to sane default to
1361
- * avoid bad behavior from the prior rlimits. This has to
1362
- * happen before arch_pick_mmap_layout(), which examines
1363
- * RLIMIT_STACK, but after the point of no return to avoid
1364
- * needing to clean up the change on failure.
1365
- */
1366
- if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1367
- bprm->rlim_stack.rlim_cur = _STK_LIM;
1368
- }
1369
-
1370
- arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1371
-
1372
- current->sas_ss_sp = current->sas_ss_size = 0;
1373
-
1374
- /*
1375
- * Figure out dumpability. Note that this checking only of current
1376
- * is wrong, but userspace depends on it. This should be testing
1377
- * bprm->secureexec instead.
1378
- */
1379
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1380
- !(uid_eq(current_euid(), current_uid()) &&
1381
- gid_eq(current_egid(), current_gid())))
1382
- set_dumpable(current->mm, suid_dumpable);
1383
- else
1384
- set_dumpable(current->mm, SUID_DUMP_USER);
1428
+ arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
13851429
13861430 arch_setup_new_exec();
1387
- perf_event_exec();
1388
- __set_task_comm(current, kbasename(bprm->filename), true);
13891431
13901432 /* Set the new mm task size. We have to do that late because it may
13911433 * depend on TIF_32BIT which is only updated in flush_thread() on
13921434 * some architectures like powerpc
13931435 */
1394
- current->mm->task_size = TASK_SIZE;
1395
-
1396
- /* An exec changes our domain. We are no longer part of the thread
1397
- group */
1398
- WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
1399
- flush_signal_handlers(current, 0);
1436
+ me->mm->task_size = TASK_SIZE;
1437
+ up_write(&me->signal->exec_update_lock);
1438
+ mutex_unlock(&me->signal->cred_guard_mutex);
14001439 }
14011440 EXPORT_SYMBOL(setup_new_exec);
14021441
....@@ -1412,11 +1451,11 @@
14121451
14131452 /*
14141453 * Prepare credentials and lock ->cred_guard_mutex.
1415
- * install_exec_creds() commits the new creds and drops the lock.
1454
+ * setup_new_exec() commits the new creds and drops the lock.
14161455 * Or, if exec fails before, free_bprm() should release ->cred and
14171456 * and unlock.
14181457 */
1419
-int prepare_bprm_creds(struct linux_binprm *bprm)
1458
+static int prepare_bprm_creds(struct linux_binprm *bprm)
14201459 {
14211460 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
14221461 return -ERESTARTNOINTR;
....@@ -1431,6 +1470,10 @@
14311470
14321471 static void free_bprm(struct linux_binprm *bprm)
14331472 {
1473
+ if (bprm->mm) {
1474
+ acct_arg_size(bprm, 0);
1475
+ mmput(bprm->mm);
1476
+ }
14341477 free_arg_pages(bprm);
14351478 if (bprm->cred) {
14361479 mutex_unlock(&current->signal->cred_guard_mutex);
....@@ -1440,10 +1483,46 @@
14401483 allow_write_access(bprm->file);
14411484 fput(bprm->file);
14421485 }
1486
+ if (bprm->executable)
1487
+ fput(bprm->executable);
14431488 /* If a binfmt changed the interp, free it. */
14441489 if (bprm->interp != bprm->filename)
14451490 kfree(bprm->interp);
1491
+ kfree(bprm->fdpath);
14461492 kfree(bprm);
1493
+}
1494
+
1495
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
1496
+{
1497
+ struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1498
+ int retval = -ENOMEM;
1499
+ if (!bprm)
1500
+ goto out;
1501
+
1502
+ if (fd == AT_FDCWD || filename->name[0] == '/') {
1503
+ bprm->filename = filename->name;
1504
+ } else {
1505
+ if (filename->name[0] == '\0')
1506
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1507
+ else
1508
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1509
+ fd, filename->name);
1510
+ if (!bprm->fdpath)
1511
+ goto out_free;
1512
+
1513
+ bprm->filename = bprm->fdpath;
1514
+ }
1515
+ bprm->interp = bprm->filename;
1516
+
1517
+ retval = bprm_mm_init(bprm);
1518
+ if (retval)
1519
+ goto out_free;
1520
+ return bprm;
1521
+
1522
+out_free:
1523
+ free_bprm(bprm);
1524
+out:
1525
+ return ERR_PTR(retval);
14471526 }
14481527
14491528 int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
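
alloc_bprm(), added above, synthesizes bprm->filename as "/dev/fd/<fd>" (or "/dev/fd/<fd>/<name>") when the binary is reached through a file descriptor rather than a path, which is what execveat(2) with AT_EMPTY_PATH produces. A small userspace illustration; /bin/true as the target is an assumption, and the raw syscall is used to avoid depending on a glibc wrapper:

/* Exec a program via an fd with AT_EMPTY_PATH; the kernel names it
 * /dev/fd/<N> internally.  /bin/true is an assumed example target. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/bin/true", O_PATH | O_CLOEXEC);
	char *argv[] = { "true", NULL };
	char *envp[] = { NULL };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* AT_EMPTY_PATH: pathname is "", the fd itself names the binary */
	syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	perror("execveat");	/* only reached on failure */
	return 1;
}
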
....@@ -1457,34 +1536,6 @@
14571536 return 0;
14581537 }
14591538 EXPORT_SYMBOL(bprm_change_interp);
1460
-
1461
-/*
1462
- * install the new credentials for this executable
1463
- */
1464
-void install_exec_creds(struct linux_binprm *bprm)
1465
-{
1466
- security_bprm_committing_creds(bprm);
1467
-
1468
- commit_creds(bprm->cred);
1469
- bprm->cred = NULL;
1470
-
1471
- /*
1472
- * Disable monitoring for regular users
1473
- * when executing setuid binaries. Must
1474
- * wait until new credentials are committed
1475
- * by commit_creds() above
1476
- */
1477
- if (get_dumpable(current->mm) != SUID_DUMP_USER)
1478
- perf_event_exit_task(current);
1479
- /*
1480
- * cred_guard_mutex must be held at least to this point to prevent
1481
- * ptrace_attach() from altering our determination of the task's
1482
- * credentials; any time after this it may be unlocked.
1483
- */
1484
- security_bprm_committed_creds(bprm);
1485
- mutex_unlock(&current->signal->cred_guard_mutex);
1486
-}
1487
-EXPORT_SYMBOL(install_exec_creds);
14881539
14891540 /*
14901541 * determine how safe it is to execute the proposed program
....@@ -1523,29 +1574,21 @@
15231574 spin_unlock(&p->fs->lock);
15241575 }
15251576
1526
-static void bprm_fill_uid(struct linux_binprm *bprm)
1577
+static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
15271578 {
1579
+ /* Handle suid and sgid on files */
15281580 struct inode *inode;
15291581 unsigned int mode;
15301582 kuid_t uid;
15311583 kgid_t gid;
15321584
1533
- /*
1534
- * Since this can be called multiple times (via prepare_binprm),
1535
- * we must clear any previous work done when setting set[ug]id
1536
- * bits from any earlier bprm->file uses (for example when run
1537
- * first for a setuid script then again for its interpreter).
1538
- */
1539
- bprm->cred->euid = current_euid();
1540
- bprm->cred->egid = current_egid();
1541
-
1542
- if (!mnt_may_suid(bprm->file->f_path.mnt))
1585
+ if (!mnt_may_suid(file->f_path.mnt))
15431586 return;
15441587
15451588 if (task_no_new_privs(current))
15461589 return;
15471590
1548
- inode = bprm->file->f_path.dentry->d_inode;
1591
+ inode = file->f_path.dentry->d_inode;
15491592 mode = READ_ONCE(inode->i_mode);
15501593 if (!(mode & (S_ISUID|S_ISGID)))
15511594 return;
....@@ -1576,29 +1619,30 @@
15761619 }
15771620
15781621 /*
1622
+ * Compute bprm->cred based upon the final binary.
1623
+ */
1624
+static int bprm_creds_from_file(struct linux_binprm *bprm)
1625
+{
1626
+ /* Compute creds based on which file? */
1627
+ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1628
+
1629
+ bprm_fill_uid(bprm, file);
1630
+ return security_bprm_creds_from_file(bprm, file);
1631
+}
1632
+
1633
+/*
15791634 * Fill the binprm structure from the inode.
1580
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1635
+ * Read the first BINPRM_BUF_SIZE bytes
15811636 *
15821637 * This may be called multiple times for binary chains (scripts for example).
15831638 */
1584
-int prepare_binprm(struct linux_binprm *bprm)
1639
+static int prepare_binprm(struct linux_binprm *bprm)
15851640 {
1586
- int retval;
15871641 loff_t pos = 0;
1588
-
1589
- bprm_fill_uid(bprm);
1590
-
1591
- /* fill in binprm security blob */
1592
- retval = security_bprm_set_creds(bprm);
1593
- if (retval)
1594
- return retval;
1595
- bprm->called_set_creds = 1;
15961642
15971643 memset(bprm->buf, 0, BINPRM_BUF_SIZE);
15981644 return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
15991645 }
1600
-
1601
-EXPORT_SYMBOL(prepare_binprm);
16021646
16031647 /*
16041648 * Arguments are '\0' separated strings found at the location bprm->p
....@@ -1645,15 +1689,15 @@
16451689 /*
16461690 * cycle through the list of binary format handlers, until one recognizes the image
16471691 */
1648
-int search_binary_handler(struct linux_binprm *bprm)
1692
+static int search_binary_handler(struct linux_binprm *bprm)
16491693 {
16501694 bool need_retry = IS_ENABLED(CONFIG_MODULES);
16511695 struct linux_binfmt *fmt;
16521696 int retval;
16531697
1654
- /* This allows 4 levels of binfmt rewrites before failing hard. */
1655
- if (bprm->recursion_depth > 5)
1656
- return -ELOOP;
1698
+ retval = prepare_binprm(bprm);
1699
+ if (retval < 0)
1700
+ return retval;
16571701
16581702 retval = security_bprm_check(bprm);
16591703 if (retval)
....@@ -1666,18 +1710,12 @@
16661710 if (!try_module_get(fmt->module))
16671711 continue;
16681712 read_unlock(&binfmt_lock);
1669
- bprm->recursion_depth++;
1713
+
16701714 retval = fmt->load_binary(bprm);
1715
+
16711716 read_lock(&binfmt_lock);
16721717 put_binfmt(fmt);
1673
- bprm->recursion_depth--;
1674
- if (retval < 0 && !bprm->mm) {
1675
- /* we got to flush_old_exec() and failed after it */
1676
- read_unlock(&binfmt_lock);
1677
- force_sigsegv(SIGSEGV, current);
1678
- return retval;
1679
- }
1680
- if (retval != -ENOEXEC || !bprm->file) {
1718
+ if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
16811719 read_unlock(&binfmt_lock);
16821720 return retval;
16831721 }
....@@ -1696,12 +1734,11 @@
16961734
16971735 return retval;
16981736 }
1699
-EXPORT_SYMBOL(search_binary_handler);
17001737
17011738 static int exec_binprm(struct linux_binprm *bprm)
17021739 {
17031740 pid_t old_pid, old_vpid;
1704
- int ret;
1741
+ int ret, depth;
17051742
17061743 /* Need to fetch pid before load_binary changes it */
17071744 old_pid = current->pid;
....@@ -1709,28 +1746,129 @@
17091746 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
17101747 rcu_read_unlock();
17111748
1712
- ret = search_binary_handler(bprm);
1713
- if (ret >= 0) {
1714
- audit_bprm(bprm);
1715
- trace_sched_process_exec(current, old_pid, bprm);
1716
- ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1717
- proc_exec_connector(current);
1749
+ /* This allows 4 levels of binfmt rewrites before failing hard. */
1750
+ for (depth = 0;; depth++) {
1751
+ struct file *exec;
1752
+ if (depth > 5)
1753
+ return -ELOOP;
1754
+
1755
+ ret = search_binary_handler(bprm);
1756
+ if (ret < 0)
1757
+ return ret;
1758
+ if (!bprm->interpreter)
1759
+ break;
1760
+
1761
+ exec = bprm->file;
1762
+ bprm->file = bprm->interpreter;
1763
+ bprm->interpreter = NULL;
1764
+
1765
+ allow_write_access(exec);
1766
+ if (unlikely(bprm->have_execfd)) {
1767
+ if (bprm->executable) {
1768
+ fput(exec);
1769
+ return -ENOEXEC;
1770
+ }
1771
+ bprm->executable = exec;
1772
+ } else
1773
+ fput(exec);
17181774 }
17191775
1720
- return ret;
1776
+ audit_bprm(bprm);
1777
+ trace_sched_process_exec(current, old_pid, bprm);
1778
+ ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1779
+ proc_exec_connector(current);
1780
+ return 0;
17211781 }
17221782
17231783 /*
17241784 * sys_execve() executes a new program.
17251785 */
1726
-static int __do_execve_file(int fd, struct filename *filename,
1727
- struct user_arg_ptr argv,
1728
- struct user_arg_ptr envp,
1729
- int flags, struct file *file)
1786
+static int bprm_execve(struct linux_binprm *bprm,
1787
+ int fd, struct filename *filename, int flags)
17301788 {
1731
- char *pathbuf = NULL;
1732
- struct linux_binprm *bprm;
1789
+ struct file *file;
17331790 struct files_struct *displaced;
1791
+ int retval;
1792
+
1793
+ /*
1794
+ * Cancel any io_uring activity across execve
1795
+ */
1796
+ io_uring_task_cancel();
1797
+
1798
+ retval = unshare_files(&displaced);
1799
+ if (retval)
1800
+ return retval;
1801
+
1802
+ retval = prepare_bprm_creds(bprm);
1803
+ if (retval)
1804
+ goto out_files;
1805
+
1806
+ check_unsafe_exec(bprm);
1807
+ current->in_execve = 1;
1808
+
1809
+ file = do_open_execat(fd, filename, flags);
1810
+ retval = PTR_ERR(file);
1811
+ if (IS_ERR(file))
1812
+ goto out_unmark;
1813
+
1814
+ sched_exec();
1815
+
1816
+ bprm->file = file;
1817
+ /*
1818
+ * Record that a name derived from an O_CLOEXEC fd will be
1819
+ * inaccessible after exec. Relies on having exclusive access to
1820
+ * current->files (due to unshare_files above).
1821
+ */
1822
+ if (bprm->fdpath &&
1823
+ close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1824
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1825
+
1826
+ /* Set the unchanging part of bprm->cred */
1827
+ retval = security_bprm_creds_for_exec(bprm);
1828
+ if (retval)
1829
+ goto out;
1830
+
1831
+ retval = exec_binprm(bprm);
1832
+ if (retval < 0)
1833
+ goto out;
1834
+
1835
+ /* execve succeeded */
1836
+ current->fs->in_exec = 0;
1837
+ current->in_execve = 0;
1838
+ rseq_execve(current);
1839
+ acct_update_integrals(current);
1840
+ task_numa_free(current, false);
1841
+ if (displaced)
1842
+ put_files_struct(displaced);
1843
+ return retval;
1844
+
1845
+out:
1846
+ /*
1847
+ * If past the point of no return ensure the the code never
1848
+ * returns to the userspace process. Use an existing fatal
1849
+ * signal if present otherwise terminate the process with
1850
+ * SIGSEGV.
1851
+ */
1852
+ if (bprm->point_of_no_return && !fatal_signal_pending(current))
1853
+ force_sigsegv(SIGSEGV);
1854
+
1855
+out_unmark:
1856
+ current->fs->in_exec = 0;
1857
+ current->in_execve = 0;
1858
+
1859
+out_files:
1860
+ if (displaced)
1861
+ reset_files_struct(displaced);
1862
+
1863
+ return retval;
1864
+}
1865
+
1866
+static int do_execveat_common(int fd, struct filename *filename,
1867
+ struct user_arg_ptr argv,
1868
+ struct user_arg_ptr envp,
1869
+ int flags)
1870
+{
1871
+ struct linux_binprm *bprm;
17341872 int retval;
17351873
17361874 if (IS_ERR(filename))
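
exec_binprm() above replaces the old bprm->recursion_depth counter with an explicit loop that follows bprm->interpreter and gives up with -ELOOP after too many binfmt rewrites. A userspace sketch that provokes that limit with a chain of scripts, each using the next as its interpreter; the /tmp paths and the depth of 8 are illustrative assumptions:

/* Build a deep chain of #! scripts and exec the outermost one; a chain this
 * deep exceeds the binfmt rewrite limit, so the exec fails (expect ELOOP). */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define DEPTH 8

int main(void)
{
	char path[DEPTH][64];
	int i;

	for (i = DEPTH - 1; i >= 0; i--) {
		FILE *f;

		snprintf(path[i], sizeof(path[i]), "/tmp/chain%d.sh", i);
		f = fopen(path[i], "w");
		if (!f) {
			perror("fopen");
			return 1;
		}
		/* innermost script is interpreted by /bin/sh, the others by
		 * the next script in the chain */
		fprintf(f, "#!%s\n", i == DEPTH - 1 ? "/bin/sh" : path[i + 1]);
		fclose(f);
		chmod(path[i], 0755);
	}

	execl(path[0], path[0], (char *)NULL);
	printf("execl(%s) failed: %s (errno=%d)\n",
	       path[0], strerror(errno), errno);
	return 0;
}
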
....@@ -1752,144 +1890,120 @@
17521890 * further execve() calls fail. */
17531891 current->flags &= ~PF_NPROC_EXCEEDED;
17541892
1755
- retval = unshare_files(&displaced);
1756
- if (retval)
1893
+ bprm = alloc_bprm(fd, filename);
1894
+ if (IS_ERR(bprm)) {
1895
+ retval = PTR_ERR(bprm);
17571896 goto out_ret;
1897
+ }
17581898
1759
- retval = -ENOMEM;
1760
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1761
- if (!bprm)
1762
- goto out_files;
1899
+ retval = count(argv, MAX_ARG_STRINGS);
1900
+ if (retval == 0)
1901
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1902
+ current->comm, bprm->filename);
1903
+ if (retval < 0)
1904
+ goto out_free;
1905
+ bprm->argc = retval;
17631906
1764
- retval = prepare_bprm_creds(bprm);
1765
- if (retval)
1907
+ retval = count(envp, MAX_ARG_STRINGS);
1908
+ if (retval < 0)
1909
+ goto out_free;
1910
+ bprm->envc = retval;
1911
+
1912
+ retval = bprm_stack_limits(bprm);
1913
+ if (retval < 0)
17661914 goto out_free;
17671915
1768
- check_unsafe_exec(bprm);
1769
- current->in_execve = 1;
1770
-
1771
- if (!file)
1772
- file = do_open_execat(fd, filename, flags);
1773
- retval = PTR_ERR(file);
1774
- if (IS_ERR(file))
1775
- goto out_unmark;
1776
-
1777
- sched_exec();
1778
-
1779
- bprm->file = file;
1780
- if (!filename) {
1781
- bprm->filename = "none";
1782
- } else if (fd == AT_FDCWD || filename->name[0] == '/') {
1783
- bprm->filename = filename->name;
1784
- } else {
1785
- if (filename->name[0] == '\0')
1786
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1787
- else
1788
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1789
- fd, filename->name);
1790
- if (!pathbuf) {
1791
- retval = -ENOMEM;
1792
- goto out_unmark;
1793
- }
1794
- /*
1795
- * Record that a name derived from an O_CLOEXEC fd will be
1796
- * inaccessible after exec. Relies on having exclusive access to
1797
- * current->files (due to unshare_files above).
1798
- */
1799
- if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1800
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1801
- bprm->filename = pathbuf;
1802
- }
1803
- bprm->interp = bprm->filename;
1804
-
1805
- retval = bprm_mm_init(bprm);
1806
- if (retval)
1807
- goto out_unmark;
1808
-
1809
- bprm->argc = count(argv, MAX_ARG_STRINGS);
1810
- if ((retval = bprm->argc) < 0)
1811
- goto out;
1812
-
1813
- bprm->envc = count(envp, MAX_ARG_STRINGS);
1814
- if ((retval = bprm->envc) < 0)
1815
- goto out;
1816
-
1817
- retval = prepare_binprm(bprm);
1916
+ retval = copy_string_kernel(bprm->filename, bprm);
18181917 if (retval < 0)
1819
- goto out;
1820
-
1821
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
1822
- if (retval < 0)
1823
- goto out;
1824
-
1918
+ goto out_free;
18251919 bprm->exec = bprm->p;
1920
+
18261921 retval = copy_strings(bprm->envc, envp, bprm);
18271922 if (retval < 0)
1828
- goto out;
1923
+ goto out_free;
18291924
18301925 retval = copy_strings(bprm->argc, argv, bprm);
18311926 if (retval < 0)
1832
- goto out;
1927
+ goto out_free;
18331928
1834
- retval = exec_binprm(bprm);
1835
- if (retval < 0)
1836
- goto out;
1837
-
1838
- /* execve succeeded */
1839
- current->fs->in_exec = 0;
1840
- current->in_execve = 0;
1841
- membarrier_execve(current);
1842
- rseq_execve(current);
1843
- acct_update_integrals(current);
1844
- task_numa_free(current, false);
1845
- free_bprm(bprm);
1846
- kfree(pathbuf);
1847
- if (filename)
1848
- putname(filename);
1849
- if (displaced)
1850
- put_files_struct(displaced);
1851
- return retval;
1852
-
1853
-out:
1854
- if (bprm->mm) {
1855
- acct_arg_size(bprm, 0);
1856
- mmput(bprm->mm);
1929
+ /*
1930
+ * When argv is empty, add an empty string ("") as argv[0] to
1931
+ * ensure confused userspace programs that start processing
1932
+ * from argv[1] won't end up walking envp. See also
1933
+ * bprm_stack_limits().
1934
+ */
1935
+ if (bprm->argc == 0) {
1936
+ retval = copy_string_kernel("", bprm);
1937
+ if (retval < 0)
1938
+ goto out_free;
1939
+ bprm->argc = 1;
18571940 }
18581941
1859
-out_unmark:
1860
- current->fs->in_exec = 0;
1861
- current->in_execve = 0;
1862
-
1942
+ retval = bprm_execve(bprm, fd, filename, flags);
18631943 out_free:
18641944 free_bprm(bprm);
1865
- kfree(pathbuf);
18661945
1867
-out_files:
1868
- if (displaced)
1869
- reset_files_struct(displaced);
18701946 out_ret:
1871
- if (filename)
1872
- putname(filename);
1947
+ putname(filename);
18731948 return retval;
18741949 }
18751950
1876
-static int do_execveat_common(int fd, struct filename *filename,
1877
- struct user_arg_ptr argv,
1878
- struct user_arg_ptr envp,
1879
- int flags)
1951
+int kernel_execve(const char *kernel_filename,
1952
+ const char *const *argv, const char *const *envp)
18801953 {
1881
- return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1954
+ struct filename *filename;
1955
+ struct linux_binprm *bprm;
1956
+ int fd = AT_FDCWD;
1957
+ int retval;
1958
+
1959
+ filename = getname_kernel(kernel_filename);
1960
+ if (IS_ERR(filename))
1961
+ return PTR_ERR(filename);
1962
+
1963
+ bprm = alloc_bprm(fd, filename);
1964
+ if (IS_ERR(bprm)) {
1965
+ retval = PTR_ERR(bprm);
1966
+ goto out_ret;
1967
+ }
1968
+
1969
+ retval = count_strings_kernel(argv);
1970
+ if (WARN_ON_ONCE(retval == 0))
1971
+ retval = -EINVAL;
1972
+ if (retval < 0)
1973
+ goto out_free;
1974
+ bprm->argc = retval;
1975
+
1976
+ retval = count_strings_kernel(envp);
1977
+ if (retval < 0)
1978
+ goto out_free;
1979
+ bprm->envc = retval;
1980
+
1981
+ retval = bprm_stack_limits(bprm);
1982
+ if (retval < 0)
1983
+ goto out_free;
1984
+
1985
+ retval = copy_string_kernel(bprm->filename, bprm);
1986
+ if (retval < 0)
1987
+ goto out_free;
1988
+ bprm->exec = bprm->p;
1989
+
1990
+ retval = copy_strings_kernel(bprm->envc, envp, bprm);
1991
+ if (retval < 0)
1992
+ goto out_free;
1993
+
1994
+ retval = copy_strings_kernel(bprm->argc, argv, bprm);
1995
+ if (retval < 0)
1996
+ goto out_free;
1997
+
1998
+ retval = bprm_execve(bprm, fd, filename, 0);
1999
+out_free:
2000
+ free_bprm(bprm);
2001
+out_ret:
2002
+ putname(filename);
2003
+ return retval;
18822004 }
18832005
1884
-int do_execve_file(struct file *file, void *__argv, void *__envp)
1885
-{
1886
- struct user_arg_ptr argv = { .ptr.native = __argv };
1887
- struct user_arg_ptr envp = { .ptr.native = __envp };
1888
-
1889
- return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1890
-}
1891
-
1892
-int do_execve(struct filename *filename,
2006
+static int do_execve(struct filename *filename,
18932007 const char __user *const __user *__argv,
18942008 const char __user *const __user *__envp)
18952009 {
....@@ -1898,7 +2012,7 @@
18982012 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
18992013 }
19002014
1901
-int do_execveat(int fd, struct filename *filename,
2015
+static int do_execveat(int fd, struct filename *filename,
19022016 const char __user *const __user *__argv,
19032017 const char __user *const __user *__envp,
19042018 int flags)
....@@ -1960,15 +2074,10 @@
19602074 */
19612075 void set_dumpable(struct mm_struct *mm, int value)
19622076 {
1963
- unsigned long old, new;
1964
-
19652077 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
19662078 return;
19672079
1968
- do {
1969
- old = READ_ONCE(mm->flags);
1970
- new = (old & ~MMF_DUMPABLE_MASK) | value;
1971
- } while (cmpxchg(&mm->flags, old, new) != old);
2080
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
19722081 }
19732082
19742083 SYSCALL_DEFINE3(execve,