hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/exec.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/fs/exec.c
34 *
....@@ -22,6 +23,7 @@
2223 * formats.
2324 */
2425
26
+#include <linux/kernel_read_file.h>
2527 #include <linux/slab.h>
2628 #include <linux/file.h>
2729 #include <linux/fdtable.h>
....@@ -58,10 +60,10 @@
5860 #include <linux/kmod.h>
5961 #include <linux/fsnotify.h>
6062 #include <linux/fs_struct.h>
61
-#include <linux/pipe_fs_i.h>
6263 #include <linux/oom.h>
6364 #include <linux/compat.h>
6465 #include <linux/vmalloc.h>
66
+#include <linux/io_uring.h>
6567
6668 #include <linux/uaccess.h>
6769 #include <asm/mmu_context.h>
....@@ -71,6 +73,10 @@
7173 #include "internal.h"
7274
7375 #include <trace/events/sched.h>
76
+
77
+EXPORT_TRACEPOINT_SYMBOL_GPL(task_rename);
78
+
79
+static int bprm_creds_from_file(struct linux_binprm *bprm);
7480
7581 int suid_dumpable = 0;
7682
....@@ -139,12 +145,14 @@
139145 if (IS_ERR(file))
140146 goto out;
141147
142
- error = -EINVAL;
143
- if (!S_ISREG(file_inode(file)->i_mode))
144
- goto exit;
145
-
148
+ /*
149
+ * may_open() has already checked for this, so it should be
150
+ * impossible to trip now. But we need to be extra cautious
151
+ * and check again at the very end too.
152
+ */
146153 error = -EACCES;
147
- if (path_noexec(&file->f_path))
154
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
155
+ path_noexec(&file->f_path)))
148156 goto exit;
149157
150158 fsnotify_open(file);
....@@ -213,65 +221,20 @@
213221 * We are doing an exec(). 'current' is the process
214222 * doing the exec and bprm->mm is the new process's mm.
215223 */
216
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
224
+ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
217225 &page, NULL, NULL);
218226 if (ret <= 0)
219227 return NULL;
220228
221
- if (write) {
222
- unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
223
- unsigned long ptr_size, limit;
224
-
225
- /*
226
- * Since the stack will hold pointers to the strings, we
227
- * must account for them as well.
228
- *
229
- * The size calculation is the entire vma while each arg page is
230
- * built, so each time we get here it's calculating how far it
231
- * is currently (rather than each call being just the newly
232
- * added size from the arg page). As a result, we need to
233
- * always add the entire size of the pointers, so that on the
234
- * last call to get_arg_page() we'll actually have the entire
235
- * correct size.
236
- */
237
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
238
- if (ptr_size > ULONG_MAX - size)
239
- goto fail;
240
- size += ptr_size;
241
-
242
- acct_arg_size(bprm, size / PAGE_SIZE);
243
-
244
- /*
245
- * We've historically supported up to 32 pages (ARG_MAX)
246
- * of argument strings even with small stacks
247
- */
248
- if (size <= ARG_MAX)
249
- return page;
250
-
251
- /*
252
- * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
253
- * (whichever is smaller) for the argv+env strings.
254
- * This ensures that:
255
- * - the remaining binfmt code will not run out of stack space,
256
- * - the program will have a reasonable amount of stack left
257
- * to work from.
258
- */
259
- limit = _STK_LIM / 4 * 3;
260
- limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
261
- if (size > limit)
262
- goto fail;
263
- }
229
+ if (write)
230
+ acct_arg_size(bprm, vma_pages(bprm->vma));
264231
265232 return page;
266
-
267
-fail:
268
- put_page(page);
269
- return NULL;
270233 }
271234
272235 static void put_arg_page(struct page *page)
273236 {
274
- put_page(page);
237
+ put_user_page(page);
275238 }
276239
277240 static void free_arg_pages(struct linux_binprm *bprm)
....@@ -295,7 +258,7 @@
295258 return -ENOMEM;
296259 vma_set_anonymous(vma);
297260
298
- if (down_write_killable(&mm->mmap_sem)) {
261
+ if (mmap_write_lock_killable(mm)) {
299262 err = -EINTR;
300263 goto err_free;
301264 }
....@@ -317,12 +280,11 @@
317280 goto err;
318281
319282 mm->stack_vm = mm->total_vm = 1;
320
- arch_bprm_mm_init(mm, vma);
321
- up_write(&mm->mmap_sem);
283
+ mmap_write_unlock(mm);
322284 bprm->p = vma->vm_end - sizeof(void *);
323285 return 0;
324286 err:
325
- up_write(&mm->mmap_sem);
287
+ mmap_write_unlock(mm);
326288 err_free:
327289 bprm->vma = NULL;
328290 vm_area_free(vma);
....@@ -492,6 +454,64 @@
492454 return i;
493455 }
494456
457
+static int count_strings_kernel(const char *const *argv)
458
+{
459
+ int i;
460
+
461
+ if (!argv)
462
+ return 0;
463
+
464
+ for (i = 0; argv[i]; ++i) {
465
+ if (i >= MAX_ARG_STRINGS)
466
+ return -E2BIG;
467
+ if (fatal_signal_pending(current))
468
+ return -ERESTARTNOHAND;
469
+ cond_resched();
470
+ }
471
+ return i;
472
+}
473
+
474
+static int bprm_stack_limits(struct linux_binprm *bprm)
475
+{
476
+ unsigned long limit, ptr_size;
477
+
478
+ /*
479
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
480
+ * (whichever is smaller) for the argv+env strings.
481
+ * This ensures that:
482
+ * - the remaining binfmt code will not run out of stack space,
483
+ * - the program will have a reasonable amount of stack left
484
+ * to work from.
485
+ */
486
+ limit = _STK_LIM / 4 * 3;
487
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
488
+ /*
489
+ * We've historically supported up to 32 pages (ARG_MAX)
490
+ * of argument strings even with small stacks
491
+ */
492
+ limit = max_t(unsigned long, limit, ARG_MAX);
493
+ /*
494
+ * We must account for the size of all the argv and envp pointers to
495
+ * the argv and envp strings, since they will also take up space in
496
+ * the stack. They aren't stored until much later when we can't
497
+ * signal to the parent that the child has run out of stack space.
498
+ * Instead, calculate it here so it's possible to fail gracefully.
499
+ *
500
+ * In the case of argc = 0, make sure there is space for adding a
501
+ * empty string (which will bump argc to 1), to ensure confused
502
+ * userspace programs don't start processing from argv[1], thinking
503
+ * argc can never be 0, to keep them from walking envp by accident.
504
+ * See do_execveat_common().
505
+ */
506
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
507
+ if (limit <= ptr_size)
508
+ return -E2BIG;
509
+ limit -= ptr_size;
510
+
511
+ bprm->argmin = bprm->p - limit;
512
+ return 0;
513
+}
514
+
495515 /*
496516 * 'copy_strings()' copies argument/environment strings from the old
496516 * process's memory to the new process's stack. The call to get_user_pages()
....@@ -527,6 +547,10 @@
527547 pos = bprm->p;
528548 str += len;
529549 bprm->p -= len;
550
+#ifdef CONFIG_MMU
551
+ if (bprm->p < bprm->argmin)
552
+ goto out;
553
+#endif
530554
531555 while (len > 0) {
532556 int offset, bytes_to_copy;
....@@ -586,24 +610,62 @@
586610 }
587611
588612 /*
589
- * Like copy_strings, but get argv and its values from kernel memory.
613
+ * Copy an argument/environment string from the kernel to the process's stack.
590614 */
591
-int copy_strings_kernel(int argc, const char *const *__argv,
592
- struct linux_binprm *bprm)
615
+int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
593616 {
594
- int r;
595
- mm_segment_t oldfs = get_fs();
596
- struct user_arg_ptr argv = {
597
- .ptr.native = (const char __user *const __user *)__argv,
598
- };
617
+ int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
618
+ unsigned long pos = bprm->p;
599619
600
- set_fs(KERNEL_DS);
601
- r = copy_strings(argc, argv, bprm);
602
- set_fs(oldfs);
620
+ if (len == 0)
621
+ return -EFAULT;
622
+ if (!valid_arg_len(bprm, len))
623
+ return -E2BIG;
603624
604
- return r;
625
+ /* We're going to work our way backwards. */
626
+ arg += len;
627
+ bprm->p -= len;
628
+ if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
629
+ return -E2BIG;
630
+
631
+ while (len > 0) {
632
+ unsigned int bytes_to_copy = min_t(unsigned int, len,
633
+ min_not_zero(offset_in_page(pos), PAGE_SIZE));
634
+ struct page *page;
635
+ char *kaddr;
636
+
637
+ pos -= bytes_to_copy;
638
+ arg -= bytes_to_copy;
639
+ len -= bytes_to_copy;
640
+
641
+ page = get_arg_page(bprm, pos, 1);
642
+ if (!page)
643
+ return -E2BIG;
644
+ kaddr = kmap_atomic(page);
645
+ flush_arg_page(bprm, pos & PAGE_MASK, page);
646
+ memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
647
+ flush_kernel_dcache_page(page);
648
+ kunmap_atomic(kaddr);
649
+ put_arg_page(page);
650
+ }
651
+
652
+ return 0;
605653 }
606
-EXPORT_SYMBOL(copy_strings_kernel);
654
+EXPORT_SYMBOL(copy_string_kernel);
655
+
656
+static int copy_strings_kernel(int argc, const char *const *argv,
657
+ struct linux_binprm *bprm)
658
+{
659
+ while (argc-- > 0) {
660
+ int ret = copy_string_kernel(argv[argc], bprm);
661
+ if (ret < 0)
662
+ return ret;
663
+ if (fatal_signal_pending(current))
664
+ return -ERESTARTNOHAND;
665
+ cond_resched();
666
+ }
667
+ return 0;
668
+}
607669
608670 #ifdef CONFIG_MMU
609671
....@@ -735,7 +797,7 @@
735797 bprm->loader -= stack_shift;
736798 bprm->exec -= stack_shift;
737799
738
- if (down_write_killable(&mm->mmap_sem))
800
+ if (mmap_write_lock_killable(mm))
739801 return -EINTR;
740802
741803 vm_flags = VM_STACK_FLAGS;
....@@ -757,6 +819,11 @@
757819 if (ret)
758820 goto out_unlock;
759821 BUG_ON(prev != vma);
822
+
823
+ if (unlikely(vm_flags & VM_EXEC)) {
824
+ pr_warn_once("process '%pD4' started with executable stack\n",
825
+ bprm->file);
826
+ }
760827
761828 /* Move stack pages down in memory. */
762829 if (stack_shift) {
....@@ -792,7 +859,7 @@
792859 ret = -EFAULT;
793860
794861 out_unlock:
795
- up_write(&mm->mmap_sem);
862
+ mmap_write_unlock(mm);
796863 return ret;
797864 }
798865 EXPORT_SYMBOL(setup_arg_pages);
....@@ -854,11 +921,14 @@
854921 if (IS_ERR(file))
855922 goto out;
856923
924
+ /*
925
+ * may_open() has already checked for this, so it should be
926
+ * impossible to trip now. But we need to be extra cautious
927
+ * and check again at the very end too.
928
+ */
857929 err = -EACCES;
858
- if (!S_ISREG(file_inode(file)->i_mode))
859
- goto exit;
860
-
861
- if (path_noexec(&file->f_path))
930
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
931
+ path_noexec(&file->f_path)))
862932 goto exit;
863933
864934 err = deny_write_access(file);
....@@ -889,145 +959,57 @@
889959 }
890960 EXPORT_SYMBOL(open_exec);
891961
892
-int kernel_read_file(struct file *file, void **buf, loff_t *size,
893
- loff_t max_size, enum kernel_read_file_id id)
894
-{
895
- loff_t i_size, pos;
896
- ssize_t bytes = 0;
897
- int ret;
898
-
899
- if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
900
- return -EINVAL;
901
-
902
- ret = deny_write_access(file);
903
- if (ret)
904
- return ret;
905
-
906
- ret = security_kernel_read_file(file, id);
907
- if (ret)
908
- goto out;
909
-
910
- i_size = i_size_read(file_inode(file));
911
- if (max_size > 0 && i_size > max_size) {
912
- ret = -EFBIG;
913
- goto out;
914
- }
915
- if (i_size <= 0) {
916
- ret = -EINVAL;
917
- goto out;
918
- }
919
-
920
- if (id != READING_FIRMWARE_PREALLOC_BUFFER)
921
- *buf = vmalloc(i_size);
922
- if (!*buf) {
923
- ret = -ENOMEM;
924
- goto out;
925
- }
926
-
927
- pos = 0;
928
- while (pos < i_size) {
929
- bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
930
- if (bytes < 0) {
931
- ret = bytes;
932
- goto out_free;
933
- }
934
-
935
- if (bytes == 0)
936
- break;
937
- }
938
-
939
- if (pos != i_size) {
940
- ret = -EIO;
941
- goto out_free;
942
- }
943
-
944
- ret = security_kernel_post_read_file(file, *buf, i_size, id);
945
- if (!ret)
946
- *size = pos;
947
-
948
-out_free:
949
- if (ret < 0) {
950
- if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
951
- vfree(*buf);
952
- *buf = NULL;
953
- }
954
- }
955
-
956
-out:
957
- allow_write_access(file);
958
- return ret;
959
-}
960
-EXPORT_SYMBOL_GPL(kernel_read_file);
961
-
962
-int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
963
- loff_t max_size, enum kernel_read_file_id id)
964
-{
965
- struct file *file;
966
- int ret;
967
-
968
- if (!path || !*path)
969
- return -EINVAL;
970
-
971
- file = filp_open(path, O_RDONLY, 0);
972
- if (IS_ERR(file))
973
- return PTR_ERR(file);
974
-
975
- ret = kernel_read_file(file, buf, size, max_size, id);
976
- fput(file);
977
- return ret;
978
-}
979
-EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
980
-
981
-int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
982
- enum kernel_read_file_id id)
983
-{
984
- struct fd f = fdget(fd);
985
- int ret = -EBADF;
986
-
987
- if (!f.file || !(f.file->f_mode & FMODE_READ))
988
- goto out;
989
-
990
- ret = kernel_read_file(f.file, buf, size, max_size, id);
991
-out:
992
- fdput(f);
993
- return ret;
994
-}
995
-EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
996
-
962
+#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
963
+ defined(CONFIG_BINFMT_ELF_FDPIC)
997964 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
998965 {
999966 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1000967 if (res > 0)
1001
- flush_icache_range(addr, addr + len);
968
+ flush_icache_user_range(addr, addr + len);
1002969 return res;
1003970 }
1004971 EXPORT_SYMBOL(read_code);
972
+#endif
1005973
974
+/*
975
+ * Maps the mm_struct mm into the current task struct.
976
+ * On success, this function returns with exec_update_lock
977
+ * held for writing.
978
+ */
1006979 static int exec_mmap(struct mm_struct *mm)
1007980 {
1008981 struct task_struct *tsk;
1009982 struct mm_struct *old_mm, *active_mm;
983
+ int ret;
1010984
1011985 /* Notify parent that we're no longer interested in the old VM */
1012986 tsk = current;
1013987 old_mm = current->mm;
1014988 exec_mm_release(tsk, old_mm);
989
+ if (old_mm)
990
+ sync_mm_rss(old_mm);
991
+
992
+ ret = down_write_killable(&tsk->signal->exec_update_lock);
993
+ if (ret)
994
+ return ret;
1015995
1016996 if (old_mm) {
1017
- sync_mm_rss(old_mm);
1018997 /*
1019998 * Make sure that if there is a core dump in progress
1020999 * for the old mm, we get out and die instead of going
1021
- * through with the exec. We must hold mmap_sem around
1000
+ * through with the exec. We must hold mmap_lock around
10221001 * checking core_state and changing tsk->mm.
10231002 */
1024
- down_read(&old_mm->mmap_sem);
1003
+ mmap_read_lock(old_mm);
10251004 if (unlikely(old_mm->core_state)) {
1026
- up_read(&old_mm->mmap_sem);
1005
+ mmap_read_unlock(old_mm);
1006
+ up_write(&tsk->signal->exec_update_lock);
10271007 return -EINTR;
10281008 }
10291009 }
1010
+
10301011 task_lock(tsk);
1012
+ membarrier_exec_mmap(mm);
10311013
10321014 local_irq_disable();
10331015 active_mm = tsk->active_mm;
....@@ -1049,7 +1031,7 @@
10491031 vmacache_flush(tsk);
10501032 task_unlock(tsk);
10511033 if (old_mm) {
1052
- up_read(&old_mm->mmap_sem);
1034
+ mmap_read_unlock(old_mm);
10531035 BUG_ON(active_mm != old_mm);
10541036 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
10551037 mm_update_next_owner(old_mm);
....@@ -1060,12 +1042,6 @@
10601042 return 0;
10611043 }
10621044
1063
-/*
1064
- * This function makes sure the current process has its own signal table,
1065
- * so that flush_signal_handlers can later reset the handlers without
1066
- * disturbing other processes. (Other processes might share the signal
1067
- * table via the CLONE_SIGHAND option to clone().)
1068
- */
10691045 static int de_thread(struct task_struct *tsk)
10701046 {
10711047 struct signal_struct *sig = tsk->signal;
....@@ -1097,7 +1073,7 @@
10971073 __set_current_state(TASK_KILLABLE);
10981074 spin_unlock_irq(lock);
10991075 schedule();
1100
- if (unlikely(__fatal_signal_pending(tsk)))
1076
+ if (__fatal_signal_pending(tsk))
11011077 goto killed;
11021078 spin_lock_irq(lock);
11031079 }
....@@ -1125,7 +1101,7 @@
11251101 write_unlock_irq(&tasklist_lock);
11261102 cgroup_threadgroup_change_end(tsk);
11271103 schedule();
1128
- if (unlikely(__fatal_signal_pending(tsk)))
1104
+ if (__fatal_signal_pending(tsk))
11291105 goto killed;
11301106 }
11311107
....@@ -1140,10 +1116,9 @@
11401116 * also take its birthdate (always earlier than our own).
11411117 */
11421118 tsk->start_time = leader->start_time;
1143
- tsk->real_start_time = leader->real_start_time;
1119
+ tsk->start_boottime = leader->start_boottime;
11441120
11451121 BUG_ON(!same_thread_group(leader, tsk));
1146
- BUG_ON(has_group_leader_pid(tsk));
11471122 /*
11481123 * An exec() starts a new thread group with the
11491124 * TGID of the previous thread group. Rehash the
....@@ -1153,11 +1128,8 @@
11531128
11541129 /* Become a process group leader with the old leader's pid.
11551130 * The old leader becomes a thread of this thread group.
1156
- * Note: The old leader also uses this pid until release_task
1157
- * is called. Odd but simple and correct.
11581131 */
1159
- tsk->pid = leader->pid;
1160
- change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1132
+ exchange_tids(tsk, leader);
11611133 transfer_pid(leader, tsk, PIDTYPE_TGID);
11621134 transfer_pid(leader, tsk, PIDTYPE_PGID);
11631135 transfer_pid(leader, tsk, PIDTYPE_SID);
....@@ -1194,34 +1166,6 @@
11941166 /* we have changed execution domain */
11951167 tsk->exit_signal = SIGCHLD;
11961168
1197
-#ifdef CONFIG_POSIX_TIMERS
1198
- exit_itimers(sig);
1199
- flush_itimer_signals();
1200
-#endif
1201
-
1202
- if (atomic_read(&oldsighand->count) != 1) {
1203
- struct sighand_struct *newsighand;
1204
- /*
1205
- * This ->sighand is shared with the CLONE_SIGHAND
1206
- * but not CLONE_THREAD task, switch to the new one.
1207
- */
1208
- newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1209
- if (!newsighand)
1210
- return -ENOMEM;
1211
-
1212
- atomic_set(&newsighand->count, 1);
1213
- memcpy(newsighand->action, oldsighand->action,
1214
- sizeof(newsighand->action));
1215
-
1216
- write_lock_irq(&tasklist_lock);
1217
- spin_lock(&oldsighand->siglock);
1218
- rcu_assign_pointer(tsk->sighand, newsighand);
1219
- spin_unlock(&oldsighand->siglock);
1220
- write_unlock_irq(&tasklist_lock);
1221
-
1222
- __cleanup_sighand(oldsighand);
1223
- }
1224
-
12251169 BUG_ON(!thread_group_leader(tsk));
12261170 return 0;
12271171
....@@ -1232,6 +1176,42 @@
12321176 sig->notify_count = 0;
12331177 read_unlock(&tasklist_lock);
12341178 return -EAGAIN;
1179
+}
1180
+
1181
+
1182
+/*
1183
+ * This function makes sure the current process has its own signal table,
1184
+ * so that flush_signal_handlers can later reset the handlers without
1185
+ * disturbing other processes. (Other processes might share the signal
1186
+ * table via the CLONE_SIGHAND option to clone().)
1187
+ */
1188
+static int unshare_sighand(struct task_struct *me)
1189
+{
1190
+ struct sighand_struct *oldsighand = me->sighand;
1191
+
1192
+ if (refcount_read(&oldsighand->count) != 1) {
1193
+ struct sighand_struct *newsighand;
1194
+ /*
1195
+ * This ->sighand is shared with the CLONE_SIGHAND
1196
+ * but not CLONE_THREAD task, switch to the new one.
1197
+ */
1198
+ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1199
+ if (!newsighand)
1200
+ return -ENOMEM;
1201
+
1202
+ refcount_set(&newsighand->count, 1);
1203
+
1204
+ write_lock_irq(&tasklist_lock);
1205
+ spin_lock(&oldsighand->siglock);
1206
+ memcpy(newsighand->action, oldsighand->action,
1207
+ sizeof(newsighand->action));
1208
+ rcu_assign_pointer(me->sighand, newsighand);
1209
+ spin_unlock(&oldsighand->siglock);
1210
+ write_unlock_irq(&tasklist_lock);
1211
+
1212
+ __cleanup_sighand(oldsighand);
1213
+ }
1214
+ return 0;
12351215 }
12361216
12371217 char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
....@@ -1261,17 +1241,27 @@
12611241 * Calling this is the point of no return. None of the failures will be
12621242 * seen by userspace since either the process is already taking a fatal
12631243 * signal (via de_thread() or coredump), or will have SEGV raised
1264
- * (after exec_mmap()) by search_binary_handlers (see below).
1244
+ * (after exec_mmap()) by search_binary_handler (see below).
12651245 */
1266
-int flush_old_exec(struct linux_binprm * bprm)
1246
+int begin_new_exec(struct linux_binprm * bprm)
12671247 {
1248
+ struct task_struct *me = current;
12681249 int retval;
12691250
1251
+ /* Once we are committed compute the creds */
1252
+ retval = bprm_creds_from_file(bprm);
1253
+ if (retval)
1254
+ return retval;
1255
+
12701256 /*
1271
- * Make sure we have a private signal table and that
1272
- * we are unassociated from the previous thread group.
1257
+ * Ensure all future errors are fatal.
12731258 */
1274
- retval = de_thread(current);
1259
+ bprm->point_of_no_return = true;
1260
+
1261
+ /*
1262
+ * Make this the only thread in the thread group.
1263
+ */
1264
+ retval = de_thread(me);
12751265 if (retval)
12761266 goto out;
12771267
....@@ -1282,7 +1272,10 @@
12821272 */
12831273 set_mm_exe_file(bprm->mm, bprm->file);
12841274
1275
+ /* If the binary is not readable then enforce mm->dumpable=0 */
12851276 would_dump(bprm, bprm->file);
1277
+ if (bprm->have_execfd)
1278
+ would_dump(bprm, bprm->executable);
12861279
12871280 /*
12881281 * Release all of the old mmap stuff
....@@ -1292,19 +1285,33 @@
12921285 if (retval)
12931286 goto out;
12941287
1295
- /*
1296
- * After clearing bprm->mm (to mark that current is using the
1297
- * prepared mm now), we have nothing left of the original
1298
- * process. If anything from here on returns an error, the check
1299
- * in search_binary_handler() will SEGV current.
1300
- */
13011288 bprm->mm = NULL;
13021289
1303
- set_fs(USER_DS);
1304
- current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1290
+#ifdef CONFIG_POSIX_TIMERS
1291
+ spin_lock_irq(&me->sighand->siglock);
1292
+ posix_cpu_timers_exit(me);
1293
+ spin_unlock_irq(&me->sighand->siglock);
1294
+ exit_itimers(me);
1295
+ flush_itimer_signals();
1296
+#endif
1297
+
1298
+ /*
1299
+ * Make the signal table private.
1300
+ */
1301
+ retval = unshare_sighand(me);
1302
+ if (retval)
1303
+ goto out_unlock;
1304
+
1305
+ /*
1306
+ * Ensure that the uaccess routines can actually operate on userspace
1307
+ * pointers:
1308
+ */
1309
+ force_uaccess_begin();
1310
+
1311
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
13051312 PF_NOFREEZE | PF_NO_SETAFFINITY);
13061313 flush_thread();
1307
- current->personality &= ~bprm->per_clear;
1314
+ me->personality &= ~bprm->per_clear;
13081315
13091316 /*
13101317 * We have to apply CLOEXEC before we change whether the process is
....@@ -1312,18 +1319,90 @@
13121319 * trying to access the should-be-closed file descriptors of a process
13131320 * undergoing exec(2).
13141321 */
1315
- do_close_on_exec(current->files);
1322
+ do_close_on_exec(me->files);
1323
+
1324
+ if (bprm->secureexec) {
1325
+ /* Make sure parent cannot signal privileged process. */
1326
+ me->pdeath_signal = 0;
1327
+
1328
+ /*
1329
+ * For secureexec, reset the stack limit to sane default to
1330
+ * avoid bad behavior from the prior rlimits. This has to
1331
+ * happen before arch_pick_mmap_layout(), which examines
1332
+ * RLIMIT_STACK, but after the point of no return to avoid
1333
+ * needing to clean up the change on failure.
1334
+ */
1335
+ if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1336
+ bprm->rlim_stack.rlim_cur = _STK_LIM;
1337
+ }
1338
+
1339
+ me->sas_ss_sp = me->sas_ss_size = 0;
1340
+
1341
+ /*
1342
+ * Figure out dumpability. Note that this checking only of current
1343
+ * is wrong, but userspace depends on it. This should be testing
1344
+ * bprm->secureexec instead.
1345
+ */
1346
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1347
+ !(uid_eq(current_euid(), current_uid()) &&
1348
+ gid_eq(current_egid(), current_gid())))
1349
+ set_dumpable(current->mm, suid_dumpable);
1350
+ else
1351
+ set_dumpable(current->mm, SUID_DUMP_USER);
1352
+
1353
+ perf_event_exec();
1354
+ __set_task_comm(me, kbasename(bprm->filename), true);
1355
+
1356
+ /* An exec changes our domain. We are no longer part of the thread
1357
+ group */
1358
+ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
1359
+ flush_signal_handlers(me, 0);
1360
+
1361
+ /*
1362
+ * install the new credentials for this executable
1363
+ */
1364
+ security_bprm_committing_creds(bprm);
1365
+
1366
+ commit_creds(bprm->cred);
1367
+ bprm->cred = NULL;
1368
+
1369
+ /*
1370
+ * Disable monitoring for regular users
1371
+ * when executing setuid binaries. Must
1372
+ * wait until new credentials are committed
1373
+ * by commit_creds() above
1374
+ */
1375
+ if (get_dumpable(me->mm) != SUID_DUMP_USER)
1376
+ perf_event_exit_task(me);
1377
+ /*
1378
+ * cred_guard_mutex must be held at least to this point to prevent
1379
+ * ptrace_attach() from altering our determination of the task's
1380
+ * credentials; any time after this it may be unlocked.
1381
+ */
1382
+ security_bprm_committed_creds(bprm);
1383
+
1384
+ /* Pass the opened binary to the interpreter. */
1385
+ if (bprm->have_execfd) {
1386
+ retval = get_unused_fd_flags(0);
1387
+ if (retval < 0)
1388
+ goto out_unlock;
1389
+ fd_install(retval, bprm->executable);
1390
+ bprm->executable = NULL;
1391
+ bprm->execfd = retval;
1392
+ }
13161393 return 0;
13171394
1395
+out_unlock:
1396
+ up_write(&me->signal->exec_update_lock);
13181397 out:
13191398 return retval;
13201399 }
1321
-EXPORT_SYMBOL(flush_old_exec);
1400
+EXPORT_SYMBOL(begin_new_exec);
13221401
13231402 void would_dump(struct linux_binprm *bprm, struct file *file)
13241403 {
13251404 struct inode *inode = file_inode(file);
1326
- if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
1405
+ if (inode_permission(inode, MAY_READ) < 0) {
13271406 struct user_namespace *old, *user_ns;
13281407 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
13291408
....@@ -1343,58 +1422,20 @@
13431422
13441423 void setup_new_exec(struct linux_binprm * bprm)
13451424 {
1346
- /*
1347
- * Once here, prepare_binrpm() will not be called any more, so
1348
- * the final state of setuid/setgid/fscaps can be merged into the
1349
- * secureexec flag.
1350
- */
1351
- bprm->secureexec |= bprm->cap_elevated;
1425
+ /* Setup things that can depend upon the personality */
1426
+ struct task_struct *me = current;
13521427
1353
- if (bprm->secureexec) {
1354
- /* Make sure parent cannot signal privileged process. */
1355
- current->pdeath_signal = 0;
1356
-
1357
- /*
1358
- * For secureexec, reset the stack limit to sane default to
1359
- * avoid bad behavior from the prior rlimits. This has to
1360
- * happen before arch_pick_mmap_layout(), which examines
1361
- * RLIMIT_STACK, but after the point of no return to avoid
1362
- * needing to clean up the change on failure.
1363
- */
1364
- if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1365
- bprm->rlim_stack.rlim_cur = _STK_LIM;
1366
- }
1367
-
1368
- arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1369
-
1370
- current->sas_ss_sp = current->sas_ss_size = 0;
1371
-
1372
- /*
1373
- * Figure out dumpability. Note that this checking only of current
1374
- * is wrong, but userspace depends on it. This should be testing
1375
- * bprm->secureexec instead.
1376
- */
1377
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1378
- !(uid_eq(current_euid(), current_uid()) &&
1379
- gid_eq(current_egid(), current_gid())))
1380
- set_dumpable(current->mm, suid_dumpable);
1381
- else
1382
- set_dumpable(current->mm, SUID_DUMP_USER);
1428
+ arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
13831429
13841430 arch_setup_new_exec();
1385
- perf_event_exec();
1386
- __set_task_comm(current, kbasename(bprm->filename), true);
13871431
13881432 /* Set the new mm task size. We have to do that late because it may
13891433 * depend on TIF_32BIT which is only updated in flush_thread() on
13901434 * some architectures like powerpc
13911435 */
1392
- current->mm->task_size = TASK_SIZE;
1393
-
1394
- /* An exec changes our domain. We are no longer part of the thread
1395
- group */
1396
- WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
1397
- flush_signal_handlers(current, 0);
1436
+ me->mm->task_size = TASK_SIZE;
1437
+ up_write(&me->signal->exec_update_lock);
1438
+ mutex_unlock(&me->signal->cred_guard_mutex);
13981439 }
13991440 EXPORT_SYMBOL(setup_new_exec);
14001441
....@@ -1410,11 +1451,11 @@
14101451
14111452 /*
14121453 * Prepare credentials and lock ->cred_guard_mutex.
1413
- * install_exec_creds() commits the new creds and drops the lock.
1454
+ * begin_new_exec() commits the new creds and setup_new_exec() drops the lock.
14141455 * Or, if exec fails before, free_bprm() should release ->cred and
14151456 * unlock.
14161457 */
1417
-int prepare_bprm_creds(struct linux_binprm *bprm)
1458
+static int prepare_bprm_creds(struct linux_binprm *bprm)
14181459 {
14191460 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
14201461 return -ERESTARTNOINTR;
....@@ -1429,6 +1470,10 @@
14291470
14301471 static void free_bprm(struct linux_binprm *bprm)
14311472 {
1473
+ if (bprm->mm) {
1474
+ acct_arg_size(bprm, 0);
1475
+ mmput(bprm->mm);
1476
+ }
14321477 free_arg_pages(bprm);
14331478 if (bprm->cred) {
14341479 mutex_unlock(&current->signal->cred_guard_mutex);
....@@ -1438,10 +1483,46 @@
14381483 allow_write_access(bprm->file);
14391484 fput(bprm->file);
14401485 }
1486
+ if (bprm->executable)
1487
+ fput(bprm->executable);
14411488 /* If a binfmt changed the interp, free it. */
14421489 if (bprm->interp != bprm->filename)
14431490 kfree(bprm->interp);
1491
+ kfree(bprm->fdpath);
14441492 kfree(bprm);
1493
+}
1494
+
1495
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
1496
+{
1497
+ struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1498
+ int retval = -ENOMEM;
1499
+ if (!bprm)
1500
+ goto out;
1501
+
1502
+ if (fd == AT_FDCWD || filename->name[0] == '/') {
1503
+ bprm->filename = filename->name;
1504
+ } else {
1505
+ if (filename->name[0] == '\0')
1506
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1507
+ else
1508
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1509
+ fd, filename->name);
1510
+ if (!bprm->fdpath)
1511
+ goto out_free;
1512
+
1513
+ bprm->filename = bprm->fdpath;
1514
+ }
1515
+ bprm->interp = bprm->filename;
1516
+
1517
+ retval = bprm_mm_init(bprm);
1518
+ if (retval)
1519
+ goto out_free;
1520
+ return bprm;
1521
+
1522
+out_free:
1523
+ free_bprm(bprm);
1524
+out:
1525
+ return ERR_PTR(retval);
14451526 }
14461527
14471528 int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
....@@ -1455,34 +1536,6 @@
14551536 return 0;
14561537 }
14571538 EXPORT_SYMBOL(bprm_change_interp);
1458
-
1459
-/*
1460
- * install the new credentials for this executable
1461
- */
1462
-void install_exec_creds(struct linux_binprm *bprm)
1463
-{
1464
- security_bprm_committing_creds(bprm);
1465
-
1466
- commit_creds(bprm->cred);
1467
- bprm->cred = NULL;
1468
-
1469
- /*
1470
- * Disable monitoring for regular users
1471
- * when executing setuid binaries. Must
1472
- * wait until new credentials are committed
1473
- * by commit_creds() above
1474
- */
1475
- if (get_dumpable(current->mm) != SUID_DUMP_USER)
1476
- perf_event_exit_task(current);
1477
- /*
1478
- * cred_guard_mutex must be held at least to this point to prevent
1479
- * ptrace_attach() from altering our determination of the task's
1480
- * credentials; any time after this it may be unlocked.
1481
- */
1482
- security_bprm_committed_creds(bprm);
1483
- mutex_unlock(&current->signal->cred_guard_mutex);
1484
-}
1485
-EXPORT_SYMBOL(install_exec_creds);
14861539
14871540 /*
14881541 * determine how safe it is to execute the proposed program
....@@ -1521,29 +1574,21 @@
15211574 spin_unlock(&p->fs->lock);
15221575 }
15231576
1524
-static void bprm_fill_uid(struct linux_binprm *bprm)
1577
+static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
15251578 {
1579
+ /* Handle suid and sgid on files */
15261580 struct inode *inode;
15271581 unsigned int mode;
15281582 kuid_t uid;
15291583 kgid_t gid;
15301584
1531
- /*
1532
- * Since this can be called multiple times (via prepare_binprm),
1533
- * we must clear any previous work done when setting set[ug]id
1534
- * bits from any earlier bprm->file uses (for example when run
1535
- * first for a setuid script then again for its interpreter).
1536
- */
1537
- bprm->cred->euid = current_euid();
1538
- bprm->cred->egid = current_egid();
1539
-
1540
- if (!mnt_may_suid(bprm->file->f_path.mnt))
1585
+ if (!mnt_may_suid(file->f_path.mnt))
15411586 return;
15421587
15431588 if (task_no_new_privs(current))
15441589 return;
15451590
1546
- inode = bprm->file->f_path.dentry->d_inode;
1591
+ inode = file->f_path.dentry->d_inode;
15471592 mode = READ_ONCE(inode->i_mode);
15481593 if (!(mode & (S_ISUID|S_ISGID)))
15491594 return;
....@@ -1574,29 +1619,30 @@
15741619 }
15751620
15761621 /*
1622
+ * Compute bprm->cred based upon the final binary.
1623
+ */
1624
+static int bprm_creds_from_file(struct linux_binprm *bprm)
1625
+{
1626
+ /* Compute creds based on which file? */
1627
+ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1628
+
1629
+ bprm_fill_uid(bprm, file);
1630
+ return security_bprm_creds_from_file(bprm, file);
1631
+}
1632
+
1633
+/*
15771634 * Fill the binprm structure from the inode.
1578
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1635
+ * Read the first BINPRM_BUF_SIZE bytes
15791636 *
15801637 * This may be called multiple times for binary chains (scripts for example).
15811638 */
1582
-int prepare_binprm(struct linux_binprm *bprm)
1639
+static int prepare_binprm(struct linux_binprm *bprm)
15831640 {
1584
- int retval;
15851641 loff_t pos = 0;
1586
-
1587
- bprm_fill_uid(bprm);
1588
-
1589
- /* fill in binprm security blob */
1590
- retval = security_bprm_set_creds(bprm);
1591
- if (retval)
1592
- return retval;
1593
- bprm->called_set_creds = 1;
15941642
15951643 memset(bprm->buf, 0, BINPRM_BUF_SIZE);
15961644 return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
15971645 }
1598
-
1599
-EXPORT_SYMBOL(prepare_binprm);
16001646
16011647 /*
16021648 * Arguments are '\0' separated strings found at the location bprm->p
....@@ -1643,15 +1689,15 @@
16431689 /*
16441690 * cycle the list of binary formats handler, until one recognizes the image
16451691 */
1646
-int search_binary_handler(struct linux_binprm *bprm)
1692
+static int search_binary_handler(struct linux_binprm *bprm)
16471693 {
16481694 bool need_retry = IS_ENABLED(CONFIG_MODULES);
16491695 struct linux_binfmt *fmt;
16501696 int retval;
16511697
1652
- /* This allows 4 levels of binfmt rewrites before failing hard. */
1653
- if (bprm->recursion_depth > 5)
1654
- return -ELOOP;
1698
+ retval = prepare_binprm(bprm);
1699
+ if (retval < 0)
1700
+ return retval;
16551701
16561702 retval = security_bprm_check(bprm);
16571703 if (retval)
....@@ -1664,18 +1710,12 @@
16641710 if (!try_module_get(fmt->module))
16651711 continue;
16661712 read_unlock(&binfmt_lock);
1667
- bprm->recursion_depth++;
1713
+
16681714 retval = fmt->load_binary(bprm);
1715
+
16691716 read_lock(&binfmt_lock);
16701717 put_binfmt(fmt);
1671
- bprm->recursion_depth--;
1672
- if (retval < 0 && !bprm->mm) {
1673
- /* we got to flush_old_exec() and failed after it */
1674
- read_unlock(&binfmt_lock);
1675
- force_sigsegv(SIGSEGV, current);
1676
- return retval;
1677
- }
1678
- if (retval != -ENOEXEC || !bprm->file) {
1718
+ if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
16791719 read_unlock(&binfmt_lock);
16801720 return retval;
16811721 }
....@@ -1694,12 +1734,11 @@
16941734
16951735 return retval;
16961736 }
1697
-EXPORT_SYMBOL(search_binary_handler);
16981737
16991738 static int exec_binprm(struct linux_binprm *bprm)
17001739 {
17011740 pid_t old_pid, old_vpid;
1702
- int ret;
1741
+ int ret, depth;
17031742
17041743 /* Need to fetch pid before load_binary changes it */
17051744 old_pid = current->pid;
....@@ -1707,28 +1746,129 @@
17071746 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
17081747 rcu_read_unlock();
17091748
1710
- ret = search_binary_handler(bprm);
1711
- if (ret >= 0) {
1712
- audit_bprm(bprm);
1713
- trace_sched_process_exec(current, old_pid, bprm);
1714
- ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1715
- proc_exec_connector(current);
1749
+ /* This allows 4 levels of binfmt rewrites before failing hard. */
1750
+ for (depth = 0;; depth++) {
1751
+ struct file *exec;
1752
+ if (depth > 5)
1753
+ return -ELOOP;
1754
+
1755
+ ret = search_binary_handler(bprm);
1756
+ if (ret < 0)
1757
+ return ret;
1758
+ if (!bprm->interpreter)
1759
+ break;
1760
+
1761
+ exec = bprm->file;
1762
+ bprm->file = bprm->interpreter;
1763
+ bprm->interpreter = NULL;
1764
+
1765
+ allow_write_access(exec);
1766
+ if (unlikely(bprm->have_execfd)) {
1767
+ if (bprm->executable) {
1768
+ fput(exec);
1769
+ return -ENOEXEC;
1770
+ }
1771
+ bprm->executable = exec;
1772
+ } else
1773
+ fput(exec);
17161774 }
17171775
1718
- return ret;
1776
+ audit_bprm(bprm);
1777
+ trace_sched_process_exec(current, old_pid, bprm);
1778
+ ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1779
+ proc_exec_connector(current);
1780
+ return 0;
17191781 }
17201782
17211783 /*
17221784 * sys_execve() executes a new program.
17231785 */
1724
-static int __do_execve_file(int fd, struct filename *filename,
1725
- struct user_arg_ptr argv,
1726
- struct user_arg_ptr envp,
1727
- int flags, struct file *file)
1786
+static int bprm_execve(struct linux_binprm *bprm,
1787
+ int fd, struct filename *filename, int flags)
17281788 {
1729
- char *pathbuf = NULL;
1730
- struct linux_binprm *bprm;
1789
+ struct file *file;
17311790 struct files_struct *displaced;
1791
+ int retval;
1792
+
1793
+ /*
1794
+ * Cancel any io_uring activity across execve
1795
+ */
1796
+ io_uring_task_cancel();
1797
+
1798
+ retval = unshare_files(&displaced);
1799
+ if (retval)
1800
+ return retval;
1801
+
1802
+ retval = prepare_bprm_creds(bprm);
1803
+ if (retval)
1804
+ goto out_files;
1805
+
1806
+ check_unsafe_exec(bprm);
1807
+ current->in_execve = 1;
1808
+
1809
+ file = do_open_execat(fd, filename, flags);
1810
+ retval = PTR_ERR(file);
1811
+ if (IS_ERR(file))
1812
+ goto out_unmark;
1813
+
1814
+ sched_exec();
1815
+
1816
+ bprm->file = file;
1817
+ /*
1818
+ * Record that a name derived from an O_CLOEXEC fd will be
1819
+ * inaccessible after exec. Relies on having exclusive access to
1820
+ * current->files (due to unshare_files above).
1821
+ */
1822
+ if (bprm->fdpath &&
1823
+ close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1824
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1825
+
1826
+ /* Set the unchanging part of bprm->cred */
1827
+ retval = security_bprm_creds_for_exec(bprm);
1828
+ if (retval)
1829
+ goto out;
1830
+
1831
+ retval = exec_binprm(bprm);
1832
+ if (retval < 0)
1833
+ goto out;
1834
+
1835
+ /* execve succeeded */
1836
+ current->fs->in_exec = 0;
1837
+ current->in_execve = 0;
1838
+ rseq_execve(current);
1839
+ acct_update_integrals(current);
1840
+ task_numa_free(current, false);
1841
+ if (displaced)
1842
+ put_files_struct(displaced);
1843
+ return retval;
1844
+
1845
+out:
1846
+ /*
1847
+ * If past the point of no return ensure the code never
1848
+ * returns to the userspace process. Use an existing fatal
1849
+ * signal if present otherwise terminate the process with
1850
+ * SIGSEGV.
1851
+ */
1852
+ if (bprm->point_of_no_return && !fatal_signal_pending(current))
1853
+ force_sigsegv(SIGSEGV);
1854
+
1855
+out_unmark:
1856
+ current->fs->in_exec = 0;
1857
+ current->in_execve = 0;
1858
+
1859
+out_files:
1860
+ if (displaced)
1861
+ reset_files_struct(displaced);
1862
+
1863
+ return retval;
1864
+}
1865
+
1866
+static int do_execveat_common(int fd, struct filename *filename,
1867
+ struct user_arg_ptr argv,
1868
+ struct user_arg_ptr envp,
1869
+ int flags)
1870
+{
1871
+ struct linux_binprm *bprm;
17321872 int retval;
17331873
17341874 if (IS_ERR(filename))
....@@ -1750,144 +1890,120 @@
17501890 * further execve() calls fail. */
17511891 current->flags &= ~PF_NPROC_EXCEEDED;
17521892
1753
- retval = unshare_files(&displaced);
1754
- if (retval)
1893
+ bprm = alloc_bprm(fd, filename);
1894
+ if (IS_ERR(bprm)) {
1895
+ retval = PTR_ERR(bprm);
17551896 goto out_ret;
1897
+ }
17561898
1757
- retval = -ENOMEM;
1758
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1759
- if (!bprm)
1760
- goto out_files;
1899
+ retval = count(argv, MAX_ARG_STRINGS);
1900
+ if (retval == 0)
1901
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1902
+ current->comm, bprm->filename);
1903
+ if (retval < 0)
1904
+ goto out_free;
1905
+ bprm->argc = retval;
17611906
1762
- retval = prepare_bprm_creds(bprm);
1763
- if (retval)
1907
+ retval = count(envp, MAX_ARG_STRINGS);
1908
+ if (retval < 0)
1909
+ goto out_free;
1910
+ bprm->envc = retval;
1911
+
1912
+ retval = bprm_stack_limits(bprm);
1913
+ if (retval < 0)
17641914 goto out_free;
17651915
1766
- check_unsafe_exec(bprm);
1767
- current->in_execve = 1;
1768
-
1769
- if (!file)
1770
- file = do_open_execat(fd, filename, flags);
1771
- retval = PTR_ERR(file);
1772
- if (IS_ERR(file))
1773
- goto out_unmark;
1774
-
1775
- sched_exec();
1776
-
1777
- bprm->file = file;
1778
- if (!filename) {
1779
- bprm->filename = "none";
1780
- } else if (fd == AT_FDCWD || filename->name[0] == '/') {
1781
- bprm->filename = filename->name;
1782
- } else {
1783
- if (filename->name[0] == '\0')
1784
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1785
- else
1786
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1787
- fd, filename->name);
1788
- if (!pathbuf) {
1789
- retval = -ENOMEM;
1790
- goto out_unmark;
1791
- }
1792
- /*
1793
- * Record that a name derived from an O_CLOEXEC fd will be
1794
- * inaccessible after exec. Relies on having exclusive access to
1795
- * current->files (due to unshare_files above).
1796
- */
1797
- if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1798
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1799
- bprm->filename = pathbuf;
1800
- }
1801
- bprm->interp = bprm->filename;
1802
-
1803
- retval = bprm_mm_init(bprm);
1804
- if (retval)
1805
- goto out_unmark;
1806
-
1807
- bprm->argc = count(argv, MAX_ARG_STRINGS);
1808
- if ((retval = bprm->argc) < 0)
1809
- goto out;
1810
-
1811
- bprm->envc = count(envp, MAX_ARG_STRINGS);
1812
- if ((retval = bprm->envc) < 0)
1813
- goto out;
1814
-
1815
- retval = prepare_binprm(bprm);
1916
+ retval = copy_string_kernel(bprm->filename, bprm);
18161917 if (retval < 0)
1817
- goto out;
1818
-
1819
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
1820
- if (retval < 0)
1821
- goto out;
1822
-
1918
+ goto out_free;
18231919 bprm->exec = bprm->p;
1920
+
18241921 retval = copy_strings(bprm->envc, envp, bprm);
18251922 if (retval < 0)
1826
- goto out;
1923
+ goto out_free;
18271924
18281925 retval = copy_strings(bprm->argc, argv, bprm);
18291926 if (retval < 0)
1830
- goto out;
1927
+ goto out_free;
18311928
1832
- retval = exec_binprm(bprm);
1833
- if (retval < 0)
1834
- goto out;
1835
-
1836
- /* execve succeeded */
1837
- current->fs->in_exec = 0;
1838
- current->in_execve = 0;
1839
- membarrier_execve(current);
1840
- rseq_execve(current);
1841
- acct_update_integrals(current);
1842
- task_numa_free(current, false);
1843
- free_bprm(bprm);
1844
- kfree(pathbuf);
1845
- if (filename)
1846
- putname(filename);
1847
- if (displaced)
1848
- put_files_struct(displaced);
1849
- return retval;
1850
-
1851
-out:
1852
- if (bprm->mm) {
1853
- acct_arg_size(bprm, 0);
1854
- mmput(bprm->mm);
1929
+ /*
1930
+ * When argv is empty, add an empty string ("") as argv[0] to
1931
+ * ensure confused userspace programs that start processing
1932
+ * from argv[1] won't end up walking envp. See also
1933
+ * bprm_stack_limits().
1934
+ */
1935
+ if (bprm->argc == 0) {
1936
+ retval = copy_string_kernel("", bprm);
1937
+ if (retval < 0)
1938
+ goto out_free;
1939
+ bprm->argc = 1;
18551940 }
18561941
1857
-out_unmark:
1858
- current->fs->in_exec = 0;
1859
- current->in_execve = 0;
1860
-
1942
+ retval = bprm_execve(bprm, fd, filename, flags);
18611943 out_free:
18621944 free_bprm(bprm);
1863
- kfree(pathbuf);
18641945
1865
-out_files:
1866
- if (displaced)
1867
- reset_files_struct(displaced);
18681946 out_ret:
1869
- if (filename)
1870
- putname(filename);
1947
+ putname(filename);
18711948 return retval;
18721949 }
18731950
1874
-static int do_execveat_common(int fd, struct filename *filename,
1875
- struct user_arg_ptr argv,
1876
- struct user_arg_ptr envp,
1877
- int flags)
1951
+int kernel_execve(const char *kernel_filename,
1952
+ const char *const *argv, const char *const *envp)
18781953 {
1879
- return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1954
+ struct filename *filename;
1955
+ struct linux_binprm *bprm;
1956
+ int fd = AT_FDCWD;
1957
+ int retval;
1958
+
1959
+ filename = getname_kernel(kernel_filename);
1960
+ if (IS_ERR(filename))
1961
+ return PTR_ERR(filename);
1962
+
1963
+ bprm = alloc_bprm(fd, filename);
1964
+ if (IS_ERR(bprm)) {
1965
+ retval = PTR_ERR(bprm);
1966
+ goto out_ret;
1967
+ }
1968
+
1969
+ retval = count_strings_kernel(argv);
1970
+ if (WARN_ON_ONCE(retval == 0))
1971
+ retval = -EINVAL;
1972
+ if (retval < 0)
1973
+ goto out_free;
1974
+ bprm->argc = retval;
1975
+
1976
+ retval = count_strings_kernel(envp);
1977
+ if (retval < 0)
1978
+ goto out_free;
1979
+ bprm->envc = retval;
1980
+
1981
+ retval = bprm_stack_limits(bprm);
1982
+ if (retval < 0)
1983
+ goto out_free;
1984
+
1985
+ retval = copy_string_kernel(bprm->filename, bprm);
1986
+ if (retval < 0)
1987
+ goto out_free;
1988
+ bprm->exec = bprm->p;
1989
+
1990
+ retval = copy_strings_kernel(bprm->envc, envp, bprm);
1991
+ if (retval < 0)
1992
+ goto out_free;
1993
+
1994
+ retval = copy_strings_kernel(bprm->argc, argv, bprm);
1995
+ if (retval < 0)
1996
+ goto out_free;
1997
+
1998
+ retval = bprm_execve(bprm, fd, filename, 0);
1999
+out_free:
2000
+ free_bprm(bprm);
2001
+out_ret:
2002
+ putname(filename);
2003
+ return retval;
18802004 }
18812005
1882
-int do_execve_file(struct file *file, void *__argv, void *__envp)
1883
-{
1884
- struct user_arg_ptr argv = { .ptr.native = __argv };
1885
- struct user_arg_ptr envp = { .ptr.native = __envp };
1886
-
1887
- return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1888
-}
1889
-
1890
-int do_execve(struct filename *filename,
2006
+static int do_execve(struct filename *filename,
18912007 const char __user *const __user *__argv,
18922008 const char __user *const __user *__envp)
18932009 {
....@@ -1896,7 +2012,7 @@
18962012 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
18972013 }
18982014
1899
-int do_execveat(int fd, struct filename *filename,
2015
+static int do_execveat(int fd, struct filename *filename,
19002016 const char __user *const __user *__argv,
19012017 const char __user *const __user *__envp,
19022018 int flags)
....@@ -1958,15 +2074,10 @@
19582074 */
19592075 void set_dumpable(struct mm_struct *mm, int value)
19602076 {
1961
- unsigned long old, new;
1962
-
19632077 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
19642078 return;
19652079
1966
- do {
1967
- old = READ_ONCE(mm->flags);
1968
- new = (old & ~MMF_DUMPABLE_MASK) | value;
1969
- } while (cmpxchg(&mm->flags, old, new) != old);
2080
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
19702081 }
19712082
19722083 SYSCALL_DEFINE3(execve,