hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/exec.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/fs/exec.c
34 *
....@@ -22,6 +23,7 @@
2223 * formats.
2324 */
2425
26
+#include <linux/kernel_read_file.h>
2527 #include <linux/slab.h>
2628 #include <linux/file.h>
2729 #include <linux/fdtable.h>
....@@ -58,10 +60,10 @@
5860 #include <linux/kmod.h>
5961 #include <linux/fsnotify.h>
6062 #include <linux/fs_struct.h>
61
-#include <linux/pipe_fs_i.h>
6263 #include <linux/oom.h>
6364 #include <linux/compat.h>
6465 #include <linux/vmalloc.h>
66
+#include <linux/io_uring.h>
6567
6668 #include <linux/uaccess.h>
6769 #include <asm/mmu_context.h>
....@@ -71,6 +73,10 @@
7173 #include "internal.h"
7274
7375 #include <trace/events/sched.h>
76
+
77
+EXPORT_TRACEPOINT_SYMBOL_GPL(task_rename);
78
+
79
+static int bprm_creds_from_file(struct linux_binprm *bprm);
7480
7581 int suid_dumpable = 0;
7682
....@@ -139,12 +145,14 @@
139145 if (IS_ERR(file))
140146 goto out;
141147
142
- error = -EINVAL;
143
- if (!S_ISREG(file_inode(file)->i_mode))
144
- goto exit;
145
-
148
+ /*
149
+ * may_open() has already checked for this, so it should be
150
+ * impossible to trip now. But we need to be extra cautious
151
+ * and check again at the very end too.
152
+ */
146153 error = -EACCES;
147
- if (path_noexec(&file->f_path))
154
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
155
+ path_noexec(&file->f_path)))
148156 goto exit;
149157
150158 fsnotify_open(file);
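
The hunk above folds do_open_execat()'s separate "regular file" and "noexec mount" checks into a single WARN_ON_ONCE() returning -EACCES, since may_open() now rejects such opens earlier. A minimal userspace sketch of the visible behaviour (the /tmp path is just an illustrative non-regular file; the errno is printed rather than asserted):

/* Exec of something that is not a regular file is refused; print errno
 * rather than asserting a specific value. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "tmp", NULL };
	char *envp[] = { NULL };

	execve("/tmp", argv, envp);	/* a directory, not a regular file */
	printf("execve(\"/tmp\"): %s (errno=%d)\n", strerror(errno), errno);
	return 0;
}
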
....@@ -213,65 +221,20 @@
213221 * We are doing an exec(). 'current' is the process
214222 * doing the exec and bprm->mm is the new process's mm.
215223 */
216
- ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
224
+ ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
217225 &page, NULL, NULL);
218226 if (ret <= 0)
219227 return NULL;
220228
221
- if (write) {
222
- unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
223
- unsigned long ptr_size, limit;
224
-
225
- /*
226
- * Since the stack will hold pointers to the strings, we
227
- * must account for them as well.
228
- *
229
- * The size calculation is the entire vma while each arg page is
230
- * built, so each time we get here it's calculating how far it
231
- * is currently (rather than each call being just the newly
232
- * added size from the arg page). As a result, we need to
233
- * always add the entire size of the pointers, so that on the
234
- * last call to get_arg_page() we'll actually have the entire
235
- * correct size.
236
- */
237
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
238
- if (ptr_size > ULONG_MAX - size)
239
- goto fail;
240
- size += ptr_size;
241
-
242
- acct_arg_size(bprm, size / PAGE_SIZE);
243
-
244
- /*
245
- * We've historically supported up to 32 pages (ARG_MAX)
246
- * of argument strings even with small stacks
247
- */
248
- if (size <= ARG_MAX)
249
- return page;
250
-
251
- /*
252
- * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
253
- * (whichever is smaller) for the argv+env strings.
254
- * This ensures that:
255
- * - the remaining binfmt code will not run out of stack space,
256
- * - the program will have a reasonable amount of stack left
257
- * to work from.
258
- */
259
- limit = _STK_LIM / 4 * 3;
260
- limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
261
- if (size > limit)
262
- goto fail;
263
- }
229
+ if (write)
230
+ acct_arg_size(bprm, vma_pages(bprm->vma));
264231
265232 return page;
266
-
267
-fail:
268
- put_page(page);
269
- return NULL;
270233 }
271234
272235 static void put_arg_page(struct page *page)
273236 {
274
- put_page(page);
237
+ put_user_page(page);
275238 }
276239
277240 static void free_arg_pages(struct linux_binprm *bprm)
....@@ -295,7 +258,7 @@
295258 return -ENOMEM;
296259 vma_set_anonymous(vma);
297260
298
- if (down_write_killable(&mm->mmap_sem)) {
261
+ if (mmap_write_lock_killable(mm)) {
299262 err = -EINTR;
300263 goto err_free;
301264 }
....@@ -317,12 +280,11 @@
317280 goto err;
318281
319282 mm->stack_vm = mm->total_vm = 1;
320
- arch_bprm_mm_init(mm, vma);
321
- up_write(&mm->mmap_sem);
283
+ mmap_write_unlock(mm);
322284 bprm->p = vma->vm_end - sizeof(void *);
323285 return 0;
324286 err:
325
- up_write(&mm->mmap_sem);
287
+ mmap_write_unlock(mm);
326288 err_free:
327289 bprm->vma = NULL;
328290 vm_area_free(vma);
....@@ -492,6 +454,64 @@
492454 return i;
493455 }
494456
457
+static int count_strings_kernel(const char *const *argv)
458
+{
459
+ int i;
460
+
461
+ if (!argv)
462
+ return 0;
463
+
464
+ for (i = 0; argv[i]; ++i) {
465
+ if (i >= MAX_ARG_STRINGS)
466
+ return -E2BIG;
467
+ if (fatal_signal_pending(current))
468
+ return -ERESTARTNOHAND;
469
+ cond_resched();
470
+ }
471
+ return i;
472
+}
473
+
474
+static int bprm_stack_limits(struct linux_binprm *bprm)
475
+{
476
+ unsigned long limit, ptr_size;
477
+
478
+ /*
479
+ * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
480
+ * (whichever is smaller) for the argv+env strings.
481
+ * This ensures that:
482
+ * - the remaining binfmt code will not run out of stack space,
483
+ * - the program will have a reasonable amount of stack left
484
+ * to work from.
485
+ */
486
+ limit = _STK_LIM / 4 * 3;
487
+ limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
488
+ /*
489
+ * We've historically supported up to 32 pages (ARG_MAX)
490
+ * of argument strings even with small stacks
491
+ */
492
+ limit = max_t(unsigned long, limit, ARG_MAX);
493
+ /*
494
+ * We must account for the size of all the argv and envp pointers to
495
+ * the argv and envp strings, since they will also take up space in
496
+ * the stack. They aren't stored until much later when we can't
497
+ * signal to the parent that the child has run out of stack space.
498
+ * Instead, calculate it here so it's possible to fail gracefully.
499
+ *
500
+	 * In the case of argc = 0, make sure there is space for adding an
501
+ * empty string (which will bump argc to 1), to ensure confused
502
+ * userspace programs don't start processing from argv[1], thinking
503
+ * argc can never be 0, to keep them from walking envp by accident.
504
+ * See do_execveat_common().
505
+ */
506
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
507
+ if (limit <= ptr_size)
508
+ return -E2BIG;
509
+ limit -= ptr_size;
510
+
511
+ bprm->argmin = bprm->p - limit;
512
+ return 0;
513
+}
514
+
495515 /*
496516 * 'copy_strings()' copies argument/environment strings from the old
497517 * process's memory to the new process's stack. The call to get_user_pages()
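
bprm_stack_limits(), added above, caps the argv+envp strings at min(3/4 of _STK_LIM, 1/4 of RLIMIT_STACK), never below ARG_MAX, and then reserves room for the argv/envp pointer array (treating argc as at least 1). A userspace sketch of the arithmetic; the 8 MiB _STK_LIM, 128 KiB ARG_MAX, stack rlimit and argc/envc values are illustrative assumptions:

/* Userspace sketch mirroring the bprm_stack_limits() arithmetic; _STK_LIM,
 * ARG_MAX, the stack rlimit and argc/envc below are assumed example values. */
#include <stdio.h>

int main(void)
{
	unsigned long stk_lim = 8UL << 20;	/* assumed _STK_LIM: 8 MiB */
	unsigned long arg_max = 128UL << 10;	/* assumed ARG_MAX: 32 pages * 4 KiB */
	unsigned long rlim_cur = 8UL << 20;	/* assumed RLIMIT_STACK soft limit */
	unsigned long argc = 2, envc = 30;

	unsigned long limit = stk_lim / 4 * 3;		/* 6 MiB */
	if (rlim_cur / 4 < limit)
		limit = rlim_cur / 4;			/* 2 MiB */
	if (limit < arg_max)
		limit = arg_max;			/* still 2 MiB */

	unsigned long ptr_size = ((argc > 1 ? argc : 1) + envc) * sizeof(void *);
	if (limit <= ptr_size)
		return 1;				/* kernel would return -E2BIG */
	limit -= ptr_size;

	printf("argv+envp strings may use up to %lu bytes\n", limit);
	return 0;
}
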
....@@ -527,6 +547,10 @@
527547 pos = bprm->p;
528548 str += len;
529549 bprm->p -= len;
550
+#ifdef CONFIG_MMU
551
+ if (bprm->p < bprm->argmin)
552
+ goto out;
553
+#endif
530554
531555 while (len > 0) {
532556 int offset, bytes_to_copy;
....@@ -586,24 +610,62 @@
586610 }
587611
588612 /*
589
- * Like copy_strings, but get argv and its values from kernel memory.
613
+ * Copy an argument/environment string from the kernel to the process's stack.
590614 */
591
-int copy_strings_kernel(int argc, const char *const *__argv,
592
- struct linux_binprm *bprm)
615
+int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
593616 {
594
- int r;
595
- mm_segment_t oldfs = get_fs();
596
- struct user_arg_ptr argv = {
597
- .ptr.native = (const char __user *const __user *)__argv,
598
- };
617
+ int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
618
+ unsigned long pos = bprm->p;
599619
600
- set_fs(KERNEL_DS);
601
- r = copy_strings(argc, argv, bprm);
602
- set_fs(oldfs);
620
+ if (len == 0)
621
+ return -EFAULT;
622
+ if (!valid_arg_len(bprm, len))
623
+ return -E2BIG;
603624
604
- return r;
625
+ /* We're going to work our way backwards. */
626
+ arg += len;
627
+ bprm->p -= len;
628
+ if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
629
+ return -E2BIG;
630
+
631
+ while (len > 0) {
632
+ unsigned int bytes_to_copy = min_t(unsigned int, len,
633
+ min_not_zero(offset_in_page(pos), PAGE_SIZE));
634
+ struct page *page;
635
+ char *kaddr;
636
+
637
+ pos -= bytes_to_copy;
638
+ arg -= bytes_to_copy;
639
+ len -= bytes_to_copy;
640
+
641
+ page = get_arg_page(bprm, pos, 1);
642
+ if (!page)
643
+ return -E2BIG;
644
+ kaddr = kmap_atomic(page);
645
+ flush_arg_page(bprm, pos & PAGE_MASK, page);
646
+ memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
647
+ flush_kernel_dcache_page(page);
648
+ kunmap_atomic(kaddr);
649
+ put_arg_page(page);
650
+ }
651
+
652
+ return 0;
605653 }
606
-EXPORT_SYMBOL(copy_strings_kernel);
654
+EXPORT_SYMBOL(copy_string_kernel);
655
+
656
+static int copy_strings_kernel(int argc, const char *const *argv,
657
+ struct linux_binprm *bprm)
658
+{
659
+ while (argc-- > 0) {
660
+ int ret = copy_string_kernel(argv[argc], bprm);
661
+ if (ret < 0)
662
+ return ret;
663
+ if (fatal_signal_pending(current))
664
+ return -ERESTARTNOHAND;
665
+ cond_resched();
666
+ }
667
+ return 0;
668
+}
607669
608670 #ifdef CONFIG_MMU
609671
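
copy_string_kernel(), added above, copies the string backwards in pieces bounded by min_not_zero(offset_in_page(pos), PAGE_SIZE), so no piece crosses a page boundary. A standalone userspace sketch of that chunking, assuming a 4 KiB page size:

/* Userspace sketch of the backwards, page-bounded copy loop in
 * copy_string_kernel(); PAGE_SIZE is assumed to be 4096 here. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static void copy_backwards(char *dst_top, const char *arg, size_t len)
{
	char *pos = dst_top;		/* one past where the string will end */
	const char *src = arg + len;	/* work our way backwards */

	while (len > 0) {
		size_t chunk = (unsigned long)pos & (PAGE_SIZE - 1);
		if (chunk == 0)		/* min_not_zero(): pos is page-aligned */
			chunk = PAGE_SIZE;
		if (chunk > len)
			chunk = len;
		pos -= chunk;
		src -= chunk;
		len -= chunk;
		memcpy(pos, src, chunk);	/* one page-bounded piece at a time */
	}
}

int main(void)
{
	const char *arg = "example-argument";
	size_t len = strlen(arg) + 1;		/* include terminating NUL */
	char *buf = malloc(2 * PAGE_SIZE);

	copy_backwards(buf + 2 * PAGE_SIZE, arg, len);
	printf("copied: %s\n", buf + 2 * PAGE_SIZE - len);
	free(buf);
	return 0;
}
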
....@@ -735,7 +797,7 @@
735797 bprm->loader -= stack_shift;
736798 bprm->exec -= stack_shift;
737799
738
- if (down_write_killable(&mm->mmap_sem))
800
+ if (mmap_write_lock_killable(mm))
739801 return -EINTR;
740802
741803 vm_flags = VM_STACK_FLAGS;
....@@ -757,6 +819,11 @@
757819 if (ret)
758820 goto out_unlock;
759821 BUG_ON(prev != vma);
822
+
823
+ if (unlikely(vm_flags & VM_EXEC)) {
824
+ pr_warn_once("process '%pD4' started with executable stack\n",
825
+ bprm->file);
826
+ }
760827
761828 /* Move stack pages down in memory. */
762829 if (stack_shift) {
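
The hunk above makes setup_arg_pages() warn once when a process ends up with an executable stack. Whether that happens is driven by the binary's PT_GNU_STACK program header; a small userspace check of that header for the running binary (glibc's dl_iterate_phdr is assumed to be available):

/* Report whether this binary's PT_GNU_STACK program header asks for an
 * executable stack (the condition the warning above fires on). */
#define _GNU_SOURCE
#include <elf.h>
#include <link.h>
#include <stdio.h>

static int cb(struct dl_phdr_info *info, size_t size, void *data)
{
	for (int i = 0; i < info->dlpi_phnum; i++) {
		if (info->dlpi_phdr[i].p_type == PT_GNU_STACK)
			printf("%s: GNU_STACK %s executable\n",
			       info->dlpi_name[0] ? info->dlpi_name : "main binary",
			       (info->dlpi_phdr[i].p_flags & PF_X) ? "is" : "is not");
	}
	return 0;
}

int main(void)
{
	dl_iterate_phdr(cb, NULL);
	return 0;
}
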
....@@ -792,7 +859,7 @@
792859 ret = -EFAULT;
793860
794861 out_unlock:
795
- up_write(&mm->mmap_sem);
862
+ mmap_write_unlock(mm);
796863 return ret;
797864 }
798865 EXPORT_SYMBOL(setup_arg_pages);
....@@ -854,11 +921,14 @@
854921 if (IS_ERR(file))
855922 goto out;
856923
924
+ /*
925
+ * may_open() has already checked for this, so it should be
926
+ * impossible to trip now. But we need to be extra cautious
927
+ * and check again at the very end too.
928
+ */
857929 err = -EACCES;
858
- if (!S_ISREG(file_inode(file)->i_mode))
859
- goto exit;
860
-
861
- if (path_noexec(&file->f_path))
930
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
931
+ path_noexec(&file->f_path)))
862932 goto exit;
863933
864934 err = deny_write_access(file);
....@@ -889,146 +959,57 @@
889959 }
890960 EXPORT_SYMBOL(open_exec);
891961
892
-int kernel_read_file(struct file *file, void **buf, loff_t *size,
893
- loff_t max_size, enum kernel_read_file_id id)
894
-{
895
- loff_t i_size, pos;
896
- ssize_t bytes = 0;
897
- int ret;
898
-
899
- if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
900
- return -EINVAL;
901
-
902
- ret = deny_write_access(file);
903
- if (ret)
904
- return ret;
905
-
906
- ret = security_kernel_read_file(file, id);
907
- if (ret)
908
- goto out;
909
-
910
- i_size = i_size_read(file_inode(file));
911
- if (max_size > 0 && i_size > max_size) {
912
- ret = -EFBIG;
913
- goto out;
914
- }
915
- if (i_size <= 0) {
916
- ret = -EINVAL;
917
- goto out;
918
- }
919
-
920
- if (id != READING_FIRMWARE_PREALLOC_BUFFER)
921
- *buf = vmalloc(i_size);
922
- if (!*buf) {
923
- ret = -ENOMEM;
924
- goto out;
925
- }
926
-
927
- pos = 0;
928
- while (pos < i_size) {
929
- bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
930
- if (bytes < 0) {
931
- ret = bytes;
932
- goto out_free;
933
- }
934
-
935
- if (bytes == 0)
936
- break;
937
- }
938
-
939
- if (pos != i_size) {
940
- ret = -EIO;
941
- goto out_free;
942
- }
943
-
944
- ret = security_kernel_post_read_file(file, *buf, i_size, id);
945
- if (!ret)
946
- *size = pos;
947
-
948
-out_free:
949
- if (ret < 0) {
950
- if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
951
- vfree(*buf);
952
- *buf = NULL;
953
- }
954
- }
955
-
956
-out:
957
- allow_write_access(file);
958
- return ret;
959
-}
960
-EXPORT_SYMBOL_GPL(kernel_read_file);
961
-
962
-int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
963
- loff_t max_size, enum kernel_read_file_id id)
964
-{
965
- struct file *file;
966
- int ret;
967
-
968
- if (!path || !*path)
969
- return -EINVAL;
970
-
971
- file = filp_open(path, O_RDONLY, 0);
972
- if (IS_ERR(file))
973
- return PTR_ERR(file);
974
-
975
- ret = kernel_read_file(file, buf, size, max_size, id);
976
- fput(file);
977
- return ret;
978
-}
979
-EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
980
-
981
-int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
982
- enum kernel_read_file_id id)
983
-{
984
- struct fd f = fdget(fd);
985
- int ret = -EBADF;
986
-
987
- if (!f.file || !(f.file->f_mode & FMODE_READ))
988
- goto out;
989
-
990
- ret = kernel_read_file(f.file, buf, size, max_size, id);
991
-out:
992
- fdput(f);
993
- return ret;
994
-}
995
-EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
996
-
962
+#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
963
+ defined(CONFIG_BINFMT_ELF_FDPIC)
997964 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
998965 {
999966 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
1000967 if (res > 0)
1001
- flush_icache_range(addr, addr + len);
968
+ flush_icache_user_range(addr, addr + len);
1002969 return res;
1003970 }
1004971 EXPORT_SYMBOL(read_code);
972
+#endif
1005973
974
+/*
975
+ * Maps the mm_struct mm into the current task struct.
976
+ * On success, this function returns with exec_update_lock
977
+ * held for writing.
978
+ */
1006979 static int exec_mmap(struct mm_struct *mm)
1007980 {
1008981 struct task_struct *tsk;
1009982 struct mm_struct *old_mm, *active_mm;
983
+ int ret;
1010984
1011985 /* Notify parent that we're no longer interested in the old VM */
1012986 tsk = current;
1013987 old_mm = current->mm;
1014988 exec_mm_release(tsk, old_mm);
989
+ if (old_mm)
990
+ sync_mm_rss(old_mm);
991
+
992
+ ret = down_write_killable(&tsk->signal->exec_update_lock);
993
+ if (ret)
994
+ return ret;
1015995
1016996 if (old_mm) {
1017
- sync_mm_rss(old_mm);
1018997 /*
1019998 * Make sure that if there is a core dump in progress
1020999 * for the old mm, we get out and die instead of going
1021
- * through with the exec. We must hold mmap_sem around
1000
+ * through with the exec. We must hold mmap_lock around
10221001 * checking core_state and changing tsk->mm.
10231002 */
1024
- down_read(&old_mm->mmap_sem);
1003
+ mmap_read_lock(old_mm);
10251004 if (unlikely(old_mm->core_state)) {
1026
- up_read(&old_mm->mmap_sem);
1005
+ mmap_read_unlock(old_mm);
1006
+ up_write(&tsk->signal->exec_update_lock);
10271007 return -EINTR;
10281008 }
10291009 }
1010
+
10301011 task_lock(tsk);
1031
- preempt_disable_rt();
1012
+ membarrier_exec_mmap(mm);
10321013
10331014 local_irq_disable();
10341015 active_mm = tsk->active_mm;
....@@ -1048,10 +1029,9 @@
10481029 local_irq_enable();
10491030 tsk->mm->vmacache_seqnum = 0;
10501031 vmacache_flush(tsk);
1051
- preempt_enable_rt();
10521032 task_unlock(tsk);
10531033 if (old_mm) {
1054
- up_read(&old_mm->mmap_sem);
1034
+ mmap_read_unlock(old_mm);
10551035 BUG_ON(active_mm != old_mm);
10561036 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
10571037 mm_update_next_owner(old_mm);
....@@ -1062,12 +1042,6 @@
10621042 return 0;
10631043 }
10641044
1065
-/*
1066
- * This function makes sure the current process has its own signal table,
1067
- * so that flush_signal_handlers can later reset the handlers without
1068
- * disturbing other processes. (Other processes might share the signal
1069
- * table via the CLONE_SIGHAND option to clone().)
1070
- */
10711045 static int de_thread(struct task_struct *tsk)
10721046 {
10731047 struct signal_struct *sig = tsk->signal;
....@@ -1099,7 +1073,7 @@
10991073 __set_current_state(TASK_KILLABLE);
11001074 spin_unlock_irq(lock);
11011075 schedule();
1102
- if (unlikely(__fatal_signal_pending(tsk)))
1076
+ if (__fatal_signal_pending(tsk))
11031077 goto killed;
11041078 spin_lock_irq(lock);
11051079 }
....@@ -1127,7 +1101,7 @@
11271101 write_unlock_irq(&tasklist_lock);
11281102 cgroup_threadgroup_change_end(tsk);
11291103 schedule();
1130
- if (unlikely(__fatal_signal_pending(tsk)))
1104
+ if (__fatal_signal_pending(tsk))
11311105 goto killed;
11321106 }
11331107
....@@ -1142,10 +1116,9 @@
11421116 * also take its birthdate (always earlier than our own).
11431117 */
11441118 tsk->start_time = leader->start_time;
1145
- tsk->real_start_time = leader->real_start_time;
1119
+ tsk->start_boottime = leader->start_boottime;
11461120
11471121 BUG_ON(!same_thread_group(leader, tsk));
1148
- BUG_ON(has_group_leader_pid(tsk));
11491122 /*
11501123 * An exec() starts a new thread group with the
11511124 * TGID of the previous thread group. Rehash the
....@@ -1155,11 +1128,8 @@
11551128
11561129 /* Become a process group leader with the old leader's pid.
11571130 * The old leader becomes a thread of the this thread group.
1158
- * Note: The old leader also uses this pid until release_task
1159
- * is called. Odd but simple and correct.
11601131 */
1161
- tsk->pid = leader->pid;
1162
- change_pid(tsk, PIDTYPE_PID, task_pid(leader));
1132
+ exchange_tids(tsk, leader);
11631133 transfer_pid(leader, tsk, PIDTYPE_TGID);
11641134 transfer_pid(leader, tsk, PIDTYPE_PGID);
11651135 transfer_pid(leader, tsk, PIDTYPE_SID);
....@@ -1196,34 +1166,6 @@
11961166 /* we have changed execution domain */
11971167 tsk->exit_signal = SIGCHLD;
11981168
1199
-#ifdef CONFIG_POSIX_TIMERS
1200
- exit_itimers(sig);
1201
- flush_itimer_signals();
1202
-#endif
1203
-
1204
- if (atomic_read(&oldsighand->count) != 1) {
1205
- struct sighand_struct *newsighand;
1206
- /*
1207
- * This ->sighand is shared with the CLONE_SIGHAND
1208
- * but not CLONE_THREAD task, switch to the new one.
1209
- */
1210
- newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1211
- if (!newsighand)
1212
- return -ENOMEM;
1213
-
1214
- atomic_set(&newsighand->count, 1);
1215
- memcpy(newsighand->action, oldsighand->action,
1216
- sizeof(newsighand->action));
1217
-
1218
- write_lock_irq(&tasklist_lock);
1219
- spin_lock(&oldsighand->siglock);
1220
- rcu_assign_pointer(tsk->sighand, newsighand);
1221
- spin_unlock(&oldsighand->siglock);
1222
- write_unlock_irq(&tasklist_lock);
1223
-
1224
- __cleanup_sighand(oldsighand);
1225
- }
1226
-
12271169 BUG_ON(!thread_group_leader(tsk));
12281170 return 0;
12291171
....@@ -1234,6 +1176,42 @@
12341176 sig->notify_count = 0;
12351177 read_unlock(&tasklist_lock);
12361178 return -EAGAIN;
1179
+}
1180
+
1181
+
1182
+/*
1183
+ * This function makes sure the current process has its own signal table,
1184
+ * so that flush_signal_handlers can later reset the handlers without
1185
+ * disturbing other processes. (Other processes might share the signal
1186
+ * table via the CLONE_SIGHAND option to clone().)
1187
+ */
1188
+static int unshare_sighand(struct task_struct *me)
1189
+{
1190
+ struct sighand_struct *oldsighand = me->sighand;
1191
+
1192
+ if (refcount_read(&oldsighand->count) != 1) {
1193
+ struct sighand_struct *newsighand;
1194
+ /*
1195
+ * This ->sighand is shared with the CLONE_SIGHAND
1196
+ * but not CLONE_THREAD task, switch to the new one.
1197
+ */
1198
+ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1199
+ if (!newsighand)
1200
+ return -ENOMEM;
1201
+
1202
+ refcount_set(&newsighand->count, 1);
1203
+
1204
+ write_lock_irq(&tasklist_lock);
1205
+ spin_lock(&oldsighand->siglock);
1206
+ memcpy(newsighand->action, oldsighand->action,
1207
+ sizeof(newsighand->action));
1208
+ rcu_assign_pointer(me->sighand, newsighand);
1209
+ spin_unlock(&oldsighand->siglock);
1210
+ write_unlock_irq(&tasklist_lock);
1211
+
1212
+ __cleanup_sighand(oldsighand);
1213
+ }
1214
+ return 0;
12371215 }
12381216
12391217 char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
....@@ -1263,17 +1241,27 @@
12631241 * Calling this is the point of no return. None of the failures will be
12641242 * seen by userspace since either the process is already taking a fatal
12651243 * signal (via de_thread() or coredump), or will have SEGV raised
1266
- * (after exec_mmap()) by search_binary_handlers (see below).
1244
+ * (after exec_mmap()) by search_binary_handler (see below).
12671245 */
1268
-int flush_old_exec(struct linux_binprm * bprm)
1246
+int begin_new_exec(struct linux_binprm * bprm)
12691247 {
1248
+ struct task_struct *me = current;
12701249 int retval;
12711250
1251
+ /* Once we are committed compute the creds */
1252
+ retval = bprm_creds_from_file(bprm);
1253
+ if (retval)
1254
+ return retval;
1255
+
12721256 /*
1273
- * Make sure we have a private signal table and that
1274
- * we are unassociated from the previous thread group.
1257
+ * Ensure all future errors are fatal.
12751258 */
1276
- retval = de_thread(current);
1259
+ bprm->point_of_no_return = true;
1260
+
1261
+ /*
1262
+ * Make this the only thread in the thread group.
1263
+ */
1264
+ retval = de_thread(me);
12771265 if (retval)
12781266 goto out;
12791267
....@@ -1284,7 +1272,10 @@
12841272 */
12851273 set_mm_exe_file(bprm->mm, bprm->file);
12861274
1275
+ /* If the binary is not readable then enforce mm->dumpable=0 */
12871276 would_dump(bprm, bprm->file);
1277
+ if (bprm->have_execfd)
1278
+ would_dump(bprm, bprm->executable);
12881279
12891280 /*
12901281 * Release all of the old mmap stuff
....@@ -1294,19 +1285,33 @@
12941285 if (retval)
12951286 goto out;
12961287
1297
- /*
1298
- * After clearing bprm->mm (to mark that current is using the
1299
- * prepared mm now), we have nothing left of the original
1300
- * process. If anything from here on returns an error, the check
1301
- * in search_binary_handler() will SEGV current.
1302
- */
13031288 bprm->mm = NULL;
13041289
1305
- set_fs(USER_DS);
1306
- current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1290
+#ifdef CONFIG_POSIX_TIMERS
1291
+ spin_lock_irq(&me->sighand->siglock);
1292
+ posix_cpu_timers_exit(me);
1293
+ spin_unlock_irq(&me->sighand->siglock);
1294
+ exit_itimers(me);
1295
+ flush_itimer_signals();
1296
+#endif
1297
+
1298
+ /*
1299
+ * Make the signal table private.
1300
+ */
1301
+ retval = unshare_sighand(me);
1302
+ if (retval)
1303
+ goto out_unlock;
1304
+
1305
+ /*
1306
+ * Ensure that the uaccess routines can actually operate on userspace
1307
+ * pointers:
1308
+ */
1309
+ force_uaccess_begin();
1310
+
1311
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
13071312 PF_NOFREEZE | PF_NO_SETAFFINITY);
13081313 flush_thread();
1309
- current->personality &= ~bprm->per_clear;
1314
+ me->personality &= ~bprm->per_clear;
13101315
13111316 /*
13121317 * We have to apply CLOEXEC before we change whether the process is
....@@ -1314,18 +1319,90 @@
13141319 * trying to access the should-be-closed file descriptors of a process
13151320 * undergoing exec(2).
13161321 */
1317
- do_close_on_exec(current->files);
1322
+ do_close_on_exec(me->files);
1323
+
1324
+ if (bprm->secureexec) {
1325
+ /* Make sure parent cannot signal privileged process. */
1326
+ me->pdeath_signal = 0;
1327
+
1328
+ /*
1329
+ * For secureexec, reset the stack limit to sane default to
1330
+ * avoid bad behavior from the prior rlimits. This has to
1331
+ * happen before arch_pick_mmap_layout(), which examines
1332
+ * RLIMIT_STACK, but after the point of no return to avoid
1333
+ * needing to clean up the change on failure.
1334
+ */
1335
+ if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1336
+ bprm->rlim_stack.rlim_cur = _STK_LIM;
1337
+ }
1338
+
1339
+ me->sas_ss_sp = me->sas_ss_size = 0;
1340
+
1341
+ /*
1342
+ * Figure out dumpability. Note that this checking only of current
1343
+ * is wrong, but userspace depends on it. This should be testing
1344
+ * bprm->secureexec instead.
1345
+ */
1346
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1347
+ !(uid_eq(current_euid(), current_uid()) &&
1348
+ gid_eq(current_egid(), current_gid())))
1349
+ set_dumpable(current->mm, suid_dumpable);
1350
+ else
1351
+ set_dumpable(current->mm, SUID_DUMP_USER);
1352
+
1353
+ perf_event_exec();
1354
+ __set_task_comm(me, kbasename(bprm->filename), true);
1355
+
1356
+ /* An exec changes our domain. We are no longer part of the thread
1357
+ group */
1358
+ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
1359
+ flush_signal_handlers(me, 0);
1360
+
1361
+ /*
1362
+ * install the new credentials for this executable
1363
+ */
1364
+ security_bprm_committing_creds(bprm);
1365
+
1366
+ commit_creds(bprm->cred);
1367
+ bprm->cred = NULL;
1368
+
1369
+ /*
1370
+ * Disable monitoring for regular users
1371
+ * when executing setuid binaries. Must
1372
+ * wait until new credentials are committed
1373
+ * by commit_creds() above
1374
+ */
1375
+ if (get_dumpable(me->mm) != SUID_DUMP_USER)
1376
+ perf_event_exit_task(me);
1377
+ /*
1378
+ * cred_guard_mutex must be held at least to this point to prevent
1379
+ * ptrace_attach() from altering our determination of the task's
1380
+ * credentials; any time after this it may be unlocked.
1381
+ */
1382
+ security_bprm_committed_creds(bprm);
1383
+
1384
+ /* Pass the opened binary to the interpreter. */
1385
+ if (bprm->have_execfd) {
1386
+ retval = get_unused_fd_flags(0);
1387
+ if (retval < 0)
1388
+ goto out_unlock;
1389
+ fd_install(retval, bprm->executable);
1390
+ bprm->executable = NULL;
1391
+ bprm->execfd = retval;
1392
+ }
13181393 return 0;
13191394
1395
+out_unlock:
1396
+ up_write(&me->signal->exec_update_lock);
13201397 out:
13211398 return retval;
13221399 }
1323
-EXPORT_SYMBOL(flush_old_exec);
1400
+EXPORT_SYMBOL(begin_new_exec);
13241401
13251402 void would_dump(struct linux_binprm *bprm, struct file *file)
13261403 {
13271404 struct inode *inode = file_inode(file);
1328
- if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
1405
+ if (inode_permission(inode, MAY_READ) < 0) {
13291406 struct user_namespace *old, *user_ns;
13301407 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
13311408
....@@ -1345,58 +1422,20 @@
13451422
13461423 void setup_new_exec(struct linux_binprm * bprm)
13471424 {
1348
- /*
1349
- * Once here, prepare_binrpm() will not be called any more, so
1350
- * the final state of setuid/setgid/fscaps can be merged into the
1351
- * secureexec flag.
1352
- */
1353
- bprm->secureexec |= bprm->cap_elevated;
1425
+ /* Setup things that can depend upon the personality */
1426
+ struct task_struct *me = current;
13541427
1355
- if (bprm->secureexec) {
1356
- /* Make sure parent cannot signal privileged process. */
1357
- current->pdeath_signal = 0;
1358
-
1359
- /*
1360
- * For secureexec, reset the stack limit to sane default to
1361
- * avoid bad behavior from the prior rlimits. This has to
1362
- * happen before arch_pick_mmap_layout(), which examines
1363
- * RLIMIT_STACK, but after the point of no return to avoid
1364
- * needing to clean up the change on failure.
1365
- */
1366
- if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1367
- bprm->rlim_stack.rlim_cur = _STK_LIM;
1368
- }
1369
-
1370
- arch_pick_mmap_layout(current->mm, &bprm->rlim_stack);
1371
-
1372
- current->sas_ss_sp = current->sas_ss_size = 0;
1373
-
1374
- /*
1375
- * Figure out dumpability. Note that this checking only of current
1376
- * is wrong, but userspace depends on it. This should be testing
1377
- * bprm->secureexec instead.
1378
- */
1379
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1380
- !(uid_eq(current_euid(), current_uid()) &&
1381
- gid_eq(current_egid(), current_gid())))
1382
- set_dumpable(current->mm, suid_dumpable);
1383
- else
1384
- set_dumpable(current->mm, SUID_DUMP_USER);
1428
+ arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
13851429
13861430 arch_setup_new_exec();
1387
- perf_event_exec();
1388
- __set_task_comm(current, kbasename(bprm->filename), true);
13891431
13901432 /* Set the new mm task size. We have to do that late because it may
13911433 * depend on TIF_32BIT which is only updated in flush_thread() on
13921434 * some architectures like powerpc
13931435 */
1394
- current->mm->task_size = TASK_SIZE;
1395
-
1396
- /* An exec changes our domain. We are no longer part of the thread
1397
- group */
1398
- WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
1399
- flush_signal_handlers(current, 0);
1436
+ me->mm->task_size = TASK_SIZE;
1437
+ up_write(&me->signal->exec_update_lock);
1438
+ mutex_unlock(&me->signal->cred_guard_mutex);
14001439 }
14011440 EXPORT_SYMBOL(setup_new_exec);
14021441
....@@ -1412,11 +1451,11 @@
14121451
14131452 /*
14141453 * Prepare credentials and lock ->cred_guard_mutex.
1415
- * install_exec_creds() commits the new creds and drops the lock.
1454
+ * setup_new_exec() commits the new creds and drops the lock.
14161455 * Or, if exec fails before, free_bprm() should release ->cred and
14171456 * and unlock.
14181457 */
1419
-int prepare_bprm_creds(struct linux_binprm *bprm)
1458
+static int prepare_bprm_creds(struct linux_binprm *bprm)
14201459 {
14211460 if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
14221461 return -ERESTARTNOINTR;
....@@ -1431,6 +1470,10 @@
14311470
14321471 static void free_bprm(struct linux_binprm *bprm)
14331472 {
1473
+ if (bprm->mm) {
1474
+ acct_arg_size(bprm, 0);
1475
+ mmput(bprm->mm);
1476
+ }
14341477 free_arg_pages(bprm);
14351478 if (bprm->cred) {
14361479 mutex_unlock(&current->signal->cred_guard_mutex);
....@@ -1440,10 +1483,46 @@
14401483 allow_write_access(bprm->file);
14411484 fput(bprm->file);
14421485 }
1486
+ if (bprm->executable)
1487
+ fput(bprm->executable);
14431488 /* If a binfmt changed the interp, free it. */
14441489 if (bprm->interp != bprm->filename)
14451490 kfree(bprm->interp);
1491
+ kfree(bprm->fdpath);
14461492 kfree(bprm);
1493
+}
1494
+
1495
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
1496
+{
1497
+ struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1498
+ int retval = -ENOMEM;
1499
+ if (!bprm)
1500
+ goto out;
1501
+
1502
+ if (fd == AT_FDCWD || filename->name[0] == '/') {
1503
+ bprm->filename = filename->name;
1504
+ } else {
1505
+ if (filename->name[0] == '\0')
1506
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1507
+ else
1508
+ bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1509
+ fd, filename->name);
1510
+ if (!bprm->fdpath)
1511
+ goto out_free;
1512
+
1513
+ bprm->filename = bprm->fdpath;
1514
+ }
1515
+ bprm->interp = bprm->filename;
1516
+
1517
+ retval = bprm_mm_init(bprm);
1518
+ if (retval)
1519
+ goto out_free;
1520
+ return bprm;
1521
+
1522
+out_free:
1523
+ free_bprm(bprm);
1524
+out:
1525
+ return ERR_PTR(retval);
14471526 }
14481527
14491528 int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
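
alloc_bprm(), added above, synthesizes bprm->filename as "/dev/fd/<fd>" (or "/dev/fd/<fd>/<name>") when the binary is reached through a file descriptor rather than a path, which is what execveat(2) with AT_EMPTY_PATH produces. A small userspace illustration; /bin/true as the target is an assumption, and the raw syscall is used to avoid depending on a glibc wrapper:

/* Exec a program via an fd with AT_EMPTY_PATH; the kernel names it
 * /dev/fd/<N> internally.  /bin/true is an assumed example target. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/bin/true", O_PATH | O_CLOEXEC);
	char *argv[] = { "true", NULL };
	char *envp[] = { NULL };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* AT_EMPTY_PATH: pathname is "", the fd itself names the binary */
	syscall(SYS_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
	perror("execveat");	/* only reached on failure */
	return 1;
}
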
....@@ -1457,34 +1536,6 @@
14571536 return 0;
14581537 }
14591538 EXPORT_SYMBOL(bprm_change_interp);
1460
-
1461
-/*
1462
- * install the new credentials for this executable
1463
- */
1464
-void install_exec_creds(struct linux_binprm *bprm)
1465
-{
1466
- security_bprm_committing_creds(bprm);
1467
-
1468
- commit_creds(bprm->cred);
1469
- bprm->cred = NULL;
1470
-
1471
- /*
1472
- * Disable monitoring for regular users
1473
- * when executing setuid binaries. Must
1474
- * wait until new credentials are committed
1475
- * by commit_creds() above
1476
- */
1477
- if (get_dumpable(current->mm) != SUID_DUMP_USER)
1478
- perf_event_exit_task(current);
1479
- /*
1480
- * cred_guard_mutex must be held at least to this point to prevent
1481
- * ptrace_attach() from altering our determination of the task's
1482
- * credentials; any time after this it may be unlocked.
1483
- */
1484
- security_bprm_committed_creds(bprm);
1485
- mutex_unlock(&current->signal->cred_guard_mutex);
1486
-}
1487
-EXPORT_SYMBOL(install_exec_creds);
14881539
14891540 /*
14901541 * determine how safe it is to execute the proposed program
....@@ -1523,29 +1574,21 @@
15231574 spin_unlock(&p->fs->lock);
15241575 }
15251576
1526
-static void bprm_fill_uid(struct linux_binprm *bprm)
1577
+static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
15271578 {
1579
+ /* Handle suid and sgid on files */
15281580 struct inode *inode;
15291581 unsigned int mode;
15301582 kuid_t uid;
15311583 kgid_t gid;
15321584
1533
- /*
1534
- * Since this can be called multiple times (via prepare_binprm),
1535
- * we must clear any previous work done when setting set[ug]id
1536
- * bits from any earlier bprm->file uses (for example when run
1537
- * first for a setuid script then again for its interpreter).
1538
- */
1539
- bprm->cred->euid = current_euid();
1540
- bprm->cred->egid = current_egid();
1541
-
1542
- if (!mnt_may_suid(bprm->file->f_path.mnt))
1585
+ if (!mnt_may_suid(file->f_path.mnt))
15431586 return;
15441587
15451588 if (task_no_new_privs(current))
15461589 return;
15471590
1548
- inode = bprm->file->f_path.dentry->d_inode;
1591
+ inode = file->f_path.dentry->d_inode;
15491592 mode = READ_ONCE(inode->i_mode);
15501593 if (!(mode & (S_ISUID|S_ISGID)))
15511594 return;
....@@ -1576,29 +1619,30 @@
15761619 }
15771620
15781621 /*
1622
+ * Compute bprm->cred based upon the final binary.
1623
+ */
1624
+static int bprm_creds_from_file(struct linux_binprm *bprm)
1625
+{
1626
+ /* Compute creds based on which file? */
1627
+ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1628
+
1629
+ bprm_fill_uid(bprm, file);
1630
+ return security_bprm_creds_from_file(bprm, file);
1631
+}
1632
+
1633
+/*
15791634 * Fill the binprm structure from the inode.
1580
- * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1635
+ * Read the first BINPRM_BUF_SIZE bytes
15811636 *
15821637 * This may be called multiple times for binary chains (scripts for example).
15831638 */
1584
-int prepare_binprm(struct linux_binprm *bprm)
1639
+static int prepare_binprm(struct linux_binprm *bprm)
15851640 {
1586
- int retval;
15871641 loff_t pos = 0;
1588
-
1589
- bprm_fill_uid(bprm);
1590
-
1591
- /* fill in binprm security blob */
1592
- retval = security_bprm_set_creds(bprm);
1593
- if (retval)
1594
- return retval;
1595
- bprm->called_set_creds = 1;
15961642
15971643 memset(bprm->buf, 0, BINPRM_BUF_SIZE);
15981644 return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
15991645 }
1600
-
1601
-EXPORT_SYMBOL(prepare_binprm);
16021646
16031647 /*
16041648 * Arguments are '\0' separated strings found at the location bprm->p
....@@ -1645,15 +1689,15 @@
16451689 /*
16461690 * cycle through the list of binary format handlers, until one recognizes the image
16471691 */
1648
-int search_binary_handler(struct linux_binprm *bprm)
1692
+static int search_binary_handler(struct linux_binprm *bprm)
16491693 {
16501694 bool need_retry = IS_ENABLED(CONFIG_MODULES);
16511695 struct linux_binfmt *fmt;
16521696 int retval;
16531697
1654
- /* This allows 4 levels of binfmt rewrites before failing hard. */
1655
- if (bprm->recursion_depth > 5)
1656
- return -ELOOP;
1698
+ retval = prepare_binprm(bprm);
1699
+ if (retval < 0)
1700
+ return retval;
16571701
16581702 retval = security_bprm_check(bprm);
16591703 if (retval)
....@@ -1666,18 +1710,12 @@
16661710 if (!try_module_get(fmt->module))
16671711 continue;
16681712 read_unlock(&binfmt_lock);
1669
- bprm->recursion_depth++;
1713
+
16701714 retval = fmt->load_binary(bprm);
1715
+
16711716 read_lock(&binfmt_lock);
16721717 put_binfmt(fmt);
1673
- bprm->recursion_depth--;
1674
- if (retval < 0 && !bprm->mm) {
1675
- /* we got to flush_old_exec() and failed after it */
1676
- read_unlock(&binfmt_lock);
1677
- force_sigsegv(SIGSEGV, current);
1678
- return retval;
1679
- }
1680
- if (retval != -ENOEXEC || !bprm->file) {
1718
+ if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
16811719 read_unlock(&binfmt_lock);
16821720 return retval;
16831721 }
....@@ -1696,12 +1734,11 @@
16961734
16971735 return retval;
16981736 }
1699
-EXPORT_SYMBOL(search_binary_handler);
17001737
17011738 static int exec_binprm(struct linux_binprm *bprm)
17021739 {
17031740 pid_t old_pid, old_vpid;
1704
- int ret;
1741
+ int ret, depth;
17051742
17061743 /* Need to fetch pid before load_binary changes it */
17071744 old_pid = current->pid;
....@@ -1709,28 +1746,129 @@
17091746 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
17101747 rcu_read_unlock();
17111748
1712
- ret = search_binary_handler(bprm);
1713
- if (ret >= 0) {
1714
- audit_bprm(bprm);
1715
- trace_sched_process_exec(current, old_pid, bprm);
1716
- ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1717
- proc_exec_connector(current);
1749
+ /* This allows 4 levels of binfmt rewrites before failing hard. */
1750
+ for (depth = 0;; depth++) {
1751
+ struct file *exec;
1752
+ if (depth > 5)
1753
+ return -ELOOP;
1754
+
1755
+ ret = search_binary_handler(bprm);
1756
+ if (ret < 0)
1757
+ return ret;
1758
+ if (!bprm->interpreter)
1759
+ break;
1760
+
1761
+ exec = bprm->file;
1762
+ bprm->file = bprm->interpreter;
1763
+ bprm->interpreter = NULL;
1764
+
1765
+ allow_write_access(exec);
1766
+ if (unlikely(bprm->have_execfd)) {
1767
+ if (bprm->executable) {
1768
+ fput(exec);
1769
+ return -ENOEXEC;
1770
+ }
1771
+ bprm->executable = exec;
1772
+ } else
1773
+ fput(exec);
17181774 }
17191775
1720
- return ret;
1776
+ audit_bprm(bprm);
1777
+ trace_sched_process_exec(current, old_pid, bprm);
1778
+ ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1779
+ proc_exec_connector(current);
1780
+ return 0;
17211781 }
17221782
17231783 /*
17241784 * sys_execve() executes a new program.
17251785 */
1726
-static int __do_execve_file(int fd, struct filename *filename,
1727
- struct user_arg_ptr argv,
1728
- struct user_arg_ptr envp,
1729
- int flags, struct file *file)
1786
+static int bprm_execve(struct linux_binprm *bprm,
1787
+ int fd, struct filename *filename, int flags)
17301788 {
1731
- char *pathbuf = NULL;
1732
- struct linux_binprm *bprm;
1789
+ struct file *file;
17331790 struct files_struct *displaced;
1791
+ int retval;
1792
+
1793
+ /*
1794
+ * Cancel any io_uring activity across execve
1795
+ */
1796
+ io_uring_task_cancel();
1797
+
1798
+ retval = unshare_files(&displaced);
1799
+ if (retval)
1800
+ return retval;
1801
+
1802
+ retval = prepare_bprm_creds(bprm);
1803
+ if (retval)
1804
+ goto out_files;
1805
+
1806
+ check_unsafe_exec(bprm);
1807
+ current->in_execve = 1;
1808
+
1809
+ file = do_open_execat(fd, filename, flags);
1810
+ retval = PTR_ERR(file);
1811
+ if (IS_ERR(file))
1812
+ goto out_unmark;
1813
+
1814
+ sched_exec();
1815
+
1816
+ bprm->file = file;
1817
+ /*
1818
+ * Record that a name derived from an O_CLOEXEC fd will be
1819
+ * inaccessible after exec. Relies on having exclusive access to
1820
+ * current->files (due to unshare_files above).
1821
+ */
1822
+ if (bprm->fdpath &&
1823
+ close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1824
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1825
+
1826
+ /* Set the unchanging part of bprm->cred */
1827
+ retval = security_bprm_creds_for_exec(bprm);
1828
+ if (retval)
1829
+ goto out;
1830
+
1831
+ retval = exec_binprm(bprm);
1832
+ if (retval < 0)
1833
+ goto out;
1834
+
1835
+ /* execve succeeded */
1836
+ current->fs->in_exec = 0;
1837
+ current->in_execve = 0;
1838
+ rseq_execve(current);
1839
+ acct_update_integrals(current);
1840
+ task_numa_free(current, false);
1841
+ if (displaced)
1842
+ put_files_struct(displaced);
1843
+ return retval;
1844
+
1845
+out:
1846
+ /*
1847
+ * If past the point of no return ensure the the code never
1848
+ * returns to the userspace process. Use an existing fatal
1849
+ * signal if present otherwise terminate the process with
1850
+ * SIGSEGV.
1851
+ */
1852
+ if (bprm->point_of_no_return && !fatal_signal_pending(current))
1853
+ force_sigsegv(SIGSEGV);
1854
+
1855
+out_unmark:
1856
+ current->fs->in_exec = 0;
1857
+ current->in_execve = 0;
1858
+
1859
+out_files:
1860
+ if (displaced)
1861
+ reset_files_struct(displaced);
1862
+
1863
+ return retval;
1864
+}
1865
+
1866
+static int do_execveat_common(int fd, struct filename *filename,
1867
+ struct user_arg_ptr argv,
1868
+ struct user_arg_ptr envp,
1869
+ int flags)
1870
+{
1871
+ struct linux_binprm *bprm;
17341872 int retval;
17351873
17361874 if (IS_ERR(filename))
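
exec_binprm() above replaces the old bprm->recursion_depth counter with an explicit loop that follows bprm->interpreter and gives up with -ELOOP after too many binfmt rewrites. A userspace sketch that provokes that limit with a chain of scripts, each using the next as its interpreter; the /tmp paths and the depth of 8 are illustrative assumptions:

/* Build a deep chain of #! scripts and exec the outermost one; a chain this
 * deep exceeds the binfmt rewrite limit, so the exec fails (expect ELOOP). */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define DEPTH 8

int main(void)
{
	char path[DEPTH][64];
	int i;

	for (i = DEPTH - 1; i >= 0; i--) {
		FILE *f;

		snprintf(path[i], sizeof(path[i]), "/tmp/chain%d.sh", i);
		f = fopen(path[i], "w");
		if (!f) {
			perror("fopen");
			return 1;
		}
		/* innermost script is interpreted by /bin/sh, the others by
		 * the next script in the chain */
		fprintf(f, "#!%s\n", i == DEPTH - 1 ? "/bin/sh" : path[i + 1]);
		fclose(f);
		chmod(path[i], 0755);
	}

	execl(path[0], path[0], (char *)NULL);
	printf("execl(%s) failed: %s (errno=%d)\n",
	       path[0], strerror(errno), errno);
	return 0;
}
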
....@@ -1752,144 +1890,120 @@
17521890 * further execve() calls fail. */
17531891 current->flags &= ~PF_NPROC_EXCEEDED;
17541892
1755
- retval = unshare_files(&displaced);
1756
- if (retval)
1893
+ bprm = alloc_bprm(fd, filename);
1894
+ if (IS_ERR(bprm)) {
1895
+ retval = PTR_ERR(bprm);
17571896 goto out_ret;
1897
+ }
17581898
1759
- retval = -ENOMEM;
1760
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1761
- if (!bprm)
1762
- goto out_files;
1899
+ retval = count(argv, MAX_ARG_STRINGS);
1900
+ if (retval == 0)
1901
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1902
+ current->comm, bprm->filename);
1903
+ if (retval < 0)
1904
+ goto out_free;
1905
+ bprm->argc = retval;
17631906
1764
- retval = prepare_bprm_creds(bprm);
1765
- if (retval)
1907
+ retval = count(envp, MAX_ARG_STRINGS);
1908
+ if (retval < 0)
1909
+ goto out_free;
1910
+ bprm->envc = retval;
1911
+
1912
+ retval = bprm_stack_limits(bprm);
1913
+ if (retval < 0)
17661914 goto out_free;
17671915
1768
- check_unsafe_exec(bprm);
1769
- current->in_execve = 1;
1770
-
1771
- if (!file)
1772
- file = do_open_execat(fd, filename, flags);
1773
- retval = PTR_ERR(file);
1774
- if (IS_ERR(file))
1775
- goto out_unmark;
1776
-
1777
- sched_exec();
1778
-
1779
- bprm->file = file;
1780
- if (!filename) {
1781
- bprm->filename = "none";
1782
- } else if (fd == AT_FDCWD || filename->name[0] == '/') {
1783
- bprm->filename = filename->name;
1784
- } else {
1785
- if (filename->name[0] == '\0')
1786
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1787
- else
1788
- pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1789
- fd, filename->name);
1790
- if (!pathbuf) {
1791
- retval = -ENOMEM;
1792
- goto out_unmark;
1793
- }
1794
- /*
1795
- * Record that a name derived from an O_CLOEXEC fd will be
1796
- * inaccessible after exec. Relies on having exclusive access to
1797
- * current->files (due to unshare_files above).
1798
- */
1799
- if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
1800
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1801
- bprm->filename = pathbuf;
1802
- }
1803
- bprm->interp = bprm->filename;
1804
-
1805
- retval = bprm_mm_init(bprm);
1806
- if (retval)
1807
- goto out_unmark;
1808
-
1809
- bprm->argc = count(argv, MAX_ARG_STRINGS);
1810
- if ((retval = bprm->argc) < 0)
1811
- goto out;
1812
-
1813
- bprm->envc = count(envp, MAX_ARG_STRINGS);
1814
- if ((retval = bprm->envc) < 0)
1815
- goto out;
1816
-
1817
- retval = prepare_binprm(bprm);
1916
+ retval = copy_string_kernel(bprm->filename, bprm);
18181917 if (retval < 0)
1819
- goto out;
1820
-
1821
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
1822
- if (retval < 0)
1823
- goto out;
1824
-
1918
+ goto out_free;
18251919 bprm->exec = bprm->p;
1920
+
18261921 retval = copy_strings(bprm->envc, envp, bprm);
18271922 if (retval < 0)
1828
- goto out;
1923
+ goto out_free;
18291924
18301925 retval = copy_strings(bprm->argc, argv, bprm);
18311926 if (retval < 0)
1832
- goto out;
1927
+ goto out_free;
18331928
1834
- retval = exec_binprm(bprm);
1835
- if (retval < 0)
1836
- goto out;
1837
-
1838
- /* execve succeeded */
1839
- current->fs->in_exec = 0;
1840
- current->in_execve = 0;
1841
- membarrier_execve(current);
1842
- rseq_execve(current);
1843
- acct_update_integrals(current);
1844
- task_numa_free(current, false);
1845
- free_bprm(bprm);
1846
- kfree(pathbuf);
1847
- if (filename)
1848
- putname(filename);
1849
- if (displaced)
1850
- put_files_struct(displaced);
1851
- return retval;
1852
-
1853
-out:
1854
- if (bprm->mm) {
1855
- acct_arg_size(bprm, 0);
1856
- mmput(bprm->mm);
1929
+ /*
1930
+ * When argv is empty, add an empty string ("") as argv[0] to
1931
+ * ensure confused userspace programs that start processing
1932
+ * from argv[1] won't end up walking envp. See also
1933
+ * bprm_stack_limits().
1934
+ */
1935
+ if (bprm->argc == 0) {
1936
+ retval = copy_string_kernel("", bprm);
1937
+ if (retval < 0)
1938
+ goto out_free;
1939
+ bprm->argc = 1;
18571940 }
18581941
1859
-out_unmark:
1860
- current->fs->in_exec = 0;
1861
- current->in_execve = 0;
1862
-
1942
+ retval = bprm_execve(bprm, fd, filename, flags);
18631943 out_free:
18641944 free_bprm(bprm);
1865
- kfree(pathbuf);
18661945
1867
-out_files:
1868
- if (displaced)
1869
- reset_files_struct(displaced);
18701946 out_ret:
1871
- if (filename)
1872
- putname(filename);
1947
+ putname(filename);
18731948 return retval;
18741949 }
18751950
1876
-static int do_execveat_common(int fd, struct filename *filename,
1877
- struct user_arg_ptr argv,
1878
- struct user_arg_ptr envp,
1879
- int flags)
1951
+int kernel_execve(const char *kernel_filename,
1952
+ const char *const *argv, const char *const *envp)
18801953 {
1881
- return __do_execve_file(fd, filename, argv, envp, flags, NULL);
1954
+ struct filename *filename;
1955
+ struct linux_binprm *bprm;
1956
+ int fd = AT_FDCWD;
1957
+ int retval;
1958
+
1959
+ filename = getname_kernel(kernel_filename);
1960
+ if (IS_ERR(filename))
1961
+ return PTR_ERR(filename);
1962
+
1963
+ bprm = alloc_bprm(fd, filename);
1964
+ if (IS_ERR(bprm)) {
1965
+ retval = PTR_ERR(bprm);
1966
+ goto out_ret;
1967
+ }
1968
+
1969
+ retval = count_strings_kernel(argv);
1970
+ if (WARN_ON_ONCE(retval == 0))
1971
+ retval = -EINVAL;
1972
+ if (retval < 0)
1973
+ goto out_free;
1974
+ bprm->argc = retval;
1975
+
1976
+ retval = count_strings_kernel(envp);
1977
+ if (retval < 0)
1978
+ goto out_free;
1979
+ bprm->envc = retval;
1980
+
1981
+ retval = bprm_stack_limits(bprm);
1982
+ if (retval < 0)
1983
+ goto out_free;
1984
+
1985
+ retval = copy_string_kernel(bprm->filename, bprm);
1986
+ if (retval < 0)
1987
+ goto out_free;
1988
+ bprm->exec = bprm->p;
1989
+
1990
+ retval = copy_strings_kernel(bprm->envc, envp, bprm);
1991
+ if (retval < 0)
1992
+ goto out_free;
1993
+
1994
+ retval = copy_strings_kernel(bprm->argc, argv, bprm);
1995
+ if (retval < 0)
1996
+ goto out_free;
1997
+
1998
+ retval = bprm_execve(bprm, fd, filename, 0);
1999
+out_free:
2000
+ free_bprm(bprm);
2001
+out_ret:
2002
+ putname(filename);
2003
+ return retval;
18822004 }
18832005
1884
-int do_execve_file(struct file *file, void *__argv, void *__envp)
1885
-{
1886
- struct user_arg_ptr argv = { .ptr.native = __argv };
1887
- struct user_arg_ptr envp = { .ptr.native = __envp };
1888
-
1889
- return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
1890
-}
1891
-
1892
-int do_execve(struct filename *filename,
2006
+static int do_execve(struct filename *filename,
18932007 const char __user *const __user *__argv,
18942008 const char __user *const __user *__envp)
18952009 {
....@@ -1898,7 +2012,7 @@
18982012 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
18992013 }
19002014
1901
-int do_execveat(int fd, struct filename *filename,
2015
+static int do_execveat(int fd, struct filename *filename,
19022016 const char __user *const __user *__argv,
19032017 const char __user *const __user *__envp,
19042018 int flags)
....@@ -1960,15 +2074,10 @@
19602074 */
19612075 void set_dumpable(struct mm_struct *mm, int value)
19622076 {
1963
- unsigned long old, new;
1964
-
19652077 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
19662078 return;
19672079
1968
- do {
1969
- old = READ_ONCE(mm->flags);
1970
- new = (old & ~MMF_DUMPABLE_MASK) | value;
1971
- } while (cmpxchg(&mm->flags, old, new) != old);
2080
+ set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
19722081 }
19732082
19742083 SYSCALL_DEFINE3(execve,