2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/kernel/fork.c
@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/kernel/fork.c
34 *
@@ -39,9 +40,9 @@
3940 #include <linux/binfmts.h>
4041 #include <linux/mman.h>
4142 #include <linux/mmu_notifier.h>
42
-#include <linux/hmm.h>
4343 #include <linux/fs.h>
4444 #include <linux/mm.h>
45
+#include <linux/kprobes.h>
4546 #include <linux/vmacache.h>
4647 #include <linux/nsproxy.h>
4748 #include <linux/capability.h>
@@ -79,7 +80,6 @@
7980 #include <linux/blkdev.h>
8081 #include <linux/fs_struct.h>
8182 #include <linux/magic.h>
82
-#include <linux/sched/mm.h>
8383 #include <linux/perf_event.h>
8484 #include <linux/posix-timers.h>
8585 #include <linux/user-return-notifier.h>
@@ -93,10 +93,12 @@
9393 #include <linux/kcov.h>
9494 #include <linux/livepatch.h>
9595 #include <linux/thread_info.h>
96
-#include <linux/cpufreq_times.h>
96
+#include <linux/stackleak.h>
97
+#include <linux/kasan.h>
9798 #include <linux/scs.h>
99
+#include <linux/io_uring.h>
100
+#include <linux/cpufreq_times.h>
98101
99
-#include <asm/pgtable.h>
100102 #include <asm/pgalloc.h>
101103 #include <linux/uaccess.h>
102104 #include <asm/mmu_context.h>
@@ -108,6 +110,8 @@
108110 #define CREATE_TRACE_POINTS
109111 #include <trace/events/task.h>
110112
113
+#undef CREATE_TRACE_POINTS
114
+#include <trace/hooks/sched.h>
111115 /*
112116 * Minimum number of threads to boot the kernel
113117 */
@@ -118,17 +122,29 @@
118122 */
119123 #define MAX_THREADS FUTEX_TID_MASK
120124
125
+EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask);
126
+
121127 /*
122128 * Protected counters by write_lock_irq(&tasklist_lock)
123129 */
124130 unsigned long total_forks; /* Handle normal Linux uptimes. */
125131 int nr_threads; /* The idle threads do not count.. */
126132
127
-int max_threads; /* tunable limit on nr_threads */
133
+static int max_threads; /* tunable limit on nr_threads */
134
+
135
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
136
+
137
+static const char * const resident_page_types[] = {
138
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
139
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
140
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
141
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
142
+};
128143
129144 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
130145
131146 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
147
+EXPORT_SYMBOL_GPL(tasklist_lock);
132148
133149 #ifdef CONFIG_PROVE_RCU
134150 int lockdep_tasklist_lock_is_held(void)
@@ -216,6 +232,9 @@
216232 if (!s)
217233 continue;
218234
235
+ /* Mark stack accessible for KASAN. */
236
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
237
+
219238 /* Clear stale pointers from reused stack. */
220239 memset(s->addr, 0, THREAD_SIZE);
221240
@@ -224,9 +243,14 @@
224243 return s->addr;
225244 }
226245
246
+ /*
247
+ * Allocated stacks are cached and later reused by new threads,
248
+ * so memcg accounting is performed manually on assigning/releasing
249
+ * stacks to tasks. Drop __GFP_ACCOUNT.
250
+ */
227251 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
228252 VMALLOC_START, VMALLOC_END,
229
- THREADINFO_GFP,
253
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
230254 PAGE_KERNEL,
231255 0, node, __builtin_return_address(0));
232256
@@ -245,7 +269,7 @@
245269 THREAD_SIZE_ORDER);
246270
247271 if (likely(page)) {
248
- tsk->stack = page_address(page);
272
+ tsk->stack = kasan_reset_tag(page_address(page));
249273 return tsk->stack;
250274 }
251275 return NULL;
@@ -255,8 +279,13 @@
255279 static inline void free_thread_stack(struct task_struct *tsk)
256280 {
257281 #ifdef CONFIG_VMAP_STACK
258
- if (task_stack_vm_area(tsk)) {
282
+ struct vm_struct *vm = task_stack_vm_area(tsk);
283
+
284
+ if (vm) {
259285 int i;
286
+
287
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
288
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
260289
261290 for (i = 0; i < NR_CACHED_STACKS; i++) {
262291 if (this_cpu_cmpxchg(cached_stacks[i],
@@ -266,7 +295,7 @@
266295 return;
267296 }
268297
269
- vfree_atomic(tsk->stack);
298
+ vfree(tsk->stack);
270299 return;
271300 }
272301 #endif
@@ -281,6 +310,7 @@
281310 {
282311 unsigned long *stack;
283312 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
313
+ stack = kasan_reset_tag(stack);
284314 tsk->stack = stack;
285315 return stack;
286316 }
@@ -333,8 +363,15 @@
333363 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
334364
335365 if (new) {
336
- *new = *orig;
337
- INIT_LIST_HEAD(&new->anon_vma_chain);
366
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
367
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
368
+ /*
369
+ * orig->shared.rb may be modified concurrently, but the clone
370
+ * will be reinitialized.
371
+ */
372
+ *new = data_race(*orig);
373
+ INIT_VMA(new);
374
+ new->vm_next = new->vm_prev = NULL;
338375 }
339376 return new;
340377 }
@@ -349,6 +386,22 @@
349386 void *stack = task_stack_page(tsk);
350387 struct vm_struct *vm = task_stack_vm_area(tsk);
351388
389
+
390
+ /* All stack pages are in the same node. */
391
+ if (vm)
392
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
393
+ account * (THREAD_SIZE / 1024));
394
+ else
395
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
396
+ account * (THREAD_SIZE / 1024));
397
+}
398
+
399
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
400
+{
401
+#ifdef CONFIG_VMAP_STACK
402
+ struct vm_struct *vm = task_stack_vm_area(tsk);
403
+ int ret;
404
+
352405 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
353406
354407 if (vm) {
@@ -357,27 +410,19 @@
357410 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
358411
359412 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
360
- mod_zone_page_state(page_zone(vm->pages[i]),
361
- NR_KERNEL_STACK_KB,
362
- PAGE_SIZE / 1024 * account);
413
+ /*
414
+ * If memcg_kmem_charge_page() fails, page->mem_cgroup
415
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
416
+ * free_thread_stack() will ignore this page.
417
+ */
418
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
419
+ 0);
420
+ if (ret)
421
+ return ret;
363422 }
364
-
365
- /* All stack pages belong to the same memcg. */
366
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
367
- account * (THREAD_SIZE / 1024));
368
- } else {
369
- /*
370
- * All stack pages are in the same zone and belong to the
371
- * same memcg.
372
- */
373
- struct page *first_page = virt_to_page(stack);
374
-
375
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
376
- THREAD_SIZE / 1024 * account);
377
-
378
- mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
379
- account * (THREAD_SIZE / 1024));
380423 }
424
+#endif
425
+ return 0;
381426 }
382427
383428 static void release_task_stack(struct task_struct *tsk)
@@ -396,9 +441,10 @@
396441 #ifdef CONFIG_THREAD_INFO_IN_TASK
397442 void put_task_stack(struct task_struct *tsk)
398443 {
399
- if (atomic_dec_and_test(&tsk->stack_refcount))
444
+ if (refcount_dec_and_test(&tsk->stack_refcount))
400445 release_task_stack(tsk);
401446 }
447
+EXPORT_SYMBOL_GPL(put_task_stack);
402448 #endif
403449
404450 void free_task(struct task_struct *tsk)
@@ -406,6 +452,7 @@
406452 cpufreq_task_times_exit(tsk);
407453 scs_release(tsk);
408454
455
+ trace_android_vh_free_task(tsk);
409456 #ifndef CONFIG_THREAD_INFO_IN_TASK
410457 /*
411458 * The task is finally done with both the stack and thread_info,
@@ -417,11 +464,10 @@
417464 * If the task had a separate stack allocation, it should be gone
418465 * by now.
419466 */
420
- WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
467
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
421468 #endif
422469 rt_mutex_debug_task_free(tsk);
423470 ftrace_graph_exit_task(tsk);
424
- put_seccomp_filter(tsk);
425471 arch_release_task_struct(tsk);
426472 if (tsk->flags & PF_KTHREAD)
427473 free_kthread_struct(tsk);
@@ -433,14 +479,14 @@
433479 static __latent_entropy int dup_mmap(struct mm_struct *mm,
434480 struct mm_struct *oldmm)
435481 {
436
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
482
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL;
437483 struct rb_node **rb_link, *rb_parent;
438484 int retval;
439485 unsigned long charge;
440486 LIST_HEAD(uf);
441487
442488 uprobe_start_dup_mmap();
443
- if (down_write_killable(&oldmm->mmap_sem)) {
489
+ if (mmap_write_lock_killable(oldmm)) {
444490 retval = -EINTR;
445491 goto fail_uprobe_end;
446492 }
@@ -449,7 +495,7 @@
449495 /*
450496 * Not linked in yet - no deadlock potential:
451497 */
452
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
498
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
453499
454500 /* No ordering required: file already has been exposed. */
455501 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
@@ -504,14 +550,15 @@
504550 if (retval)
505551 goto fail_nomem_anon_vma_fork;
506552 if (tmp->vm_flags & VM_WIPEONFORK) {
507
- /* VM_WIPEONFORK gets a clean slate in the child. */
553
+ /*
554
+ * VM_WIPEONFORK gets a clean slate in the child.
555
+ * Don't prepare anon_vma until fault since we don't
556
+ * copy page for current vma.
557
+ */
508558 tmp->anon_vma = NULL;
509
- if (anon_vma_prepare(tmp))
510
- goto fail_nomem_anon_vma_fork;
511559 } else if (anon_vma_fork(tmp, mpnt))
512560 goto fail_nomem_anon_vma_fork;
513561 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
514
- tmp->vm_next = tmp->vm_prev = NULL;
515562 file = tmp->vm_file;
516563 if (file) {
517564 struct inode *inode = file_inode(file);
@@ -519,10 +566,10 @@
519566
520567 get_file(file);
521568 if (tmp->vm_flags & VM_DENYWRITE)
522
- atomic_dec(&inode->i_writecount);
569
+ put_write_access(inode);
523570 i_mmap_lock_write(mapping);
524571 if (tmp->vm_flags & VM_SHARED)
525
- atomic_inc(&mapping->i_mmap_writable);
572
+ mapping_allow_writable(mapping);
526573 flush_dcache_mmap_lock(mapping);
527574 /* insert tmp into the share list, just after mpnt */
528575 vma_interval_tree_insert_after(tmp, mpnt,
@@ -552,8 +599,18 @@
552599 rb_parent = &tmp->vm_rb;
553600
554601 mm->map_count++;
555
- if (!(tmp->vm_flags & VM_WIPEONFORK))
556
- retval = copy_page_range(mm, oldmm, mpnt);
602
+ if (!(tmp->vm_flags & VM_WIPEONFORK)) {
603
+ if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
604
+ /*
605
+ * Mark this VMA as changing to prevent the
606
+ * speculative page fault handler from processing
607
+ * it until the TLB is flushed below.
608
+ */
609
+ last = mpnt;
610
+ vm_write_begin(mpnt);
611
+ }
612
+ retval = copy_page_range(tmp, mpnt);
613
+ }
557614
558615 if (tmp->vm_ops && tmp->vm_ops->open)
559616 tmp->vm_ops->open(tmp);
@@ -564,9 +621,25 @@
564621 /* a new mm has just been created */
565622 retval = arch_dup_mmap(oldmm, mm);
566623 out:
567
- up_write(&mm->mmap_sem);
624
+ mmap_write_unlock(mm);
568625 flush_tlb_mm(oldmm);
569
- up_write(&oldmm->mmap_sem);
626
+
627
+ if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
628
+ /*
629
+ * Since the TLB has been flushed, we can safely unmark the
630
+ * copied VMAs and allow the speculative page fault handler to
631
+ * process them again.
632
+ * Walk back the VMA list from the last marked VMA.
633
+ */
634
+ for (; last; last = last->vm_prev) {
635
+ if (last->vm_flags & VM_DONTCOPY)
636
+ continue;
637
+ if (!(last->vm_flags & VM_WIPEONFORK))
638
+ vm_write_end(last);
639
+ }
640
+ }
641
+
642
+ mmap_write_unlock(oldmm);
570643 dup_userfaultfd_complete(&uf);
571644 fail_uprobe_end:
572645 uprobe_end_dup_mmap();
@@ -596,9 +669,9 @@
596669 #else
597670 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
598671 {
599
- down_write(&oldmm->mmap_sem);
672
+ mmap_write_lock(oldmm);
600673 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
601
- up_write(&oldmm->mmap_sem);
674
+ mmap_write_unlock(oldmm);
602675 return 0;
603676 }
604677 #define mm_alloc_pgd(mm) (0)
@@ -609,12 +682,15 @@
609682 {
610683 int i;
611684
685
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
686
+ "Please make sure 'struct resident_page_types[]' is updated as well");
687
+
612688 for (i = 0; i < NR_MM_COUNTERS; i++) {
613689 long x = atomic_long_read(&mm->rss_stat.count[i]);
614690
615691 if (unlikely(x))
616
- printk(KERN_ALERT "BUG: Bad rss-counter state "
617
- "mm:%p idx:%d val:%ld\n", mm, i, x);
692
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
693
+ mm, resident_page_types[i], x);
618694 }
619695
620696 if (mm_pgtables_bytes(mm))
@@ -641,13 +717,25 @@
641717 WARN_ON_ONCE(mm == current->active_mm);
642718 mm_free_pgd(mm);
643719 destroy_context(mm);
644
- hmm_mm_destroy(mm);
645
- mmu_notifier_mm_destroy(mm);
720
+ mmu_notifier_subscriptions_destroy(mm);
646721 check_mm(mm);
647722 put_user_ns(mm->user_ns);
648723 free_mm(mm);
649724 }
650725 EXPORT_SYMBOL_GPL(__mmdrop);
726
+
727
+#ifdef CONFIG_PREEMPT_RT
728
+/*
729
+ * RCU callback for delayed mm drop. Not strictly rcu, but we don't
730
+ * want another facility to make this work.
731
+ */
732
+void __mmdrop_delayed(struct rcu_head *rhp)
733
+{
734
+ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
735
+
736
+ __mmdrop(mm);
737
+}
738
+#endif
651739
652740 static void mmdrop_async_fn(struct work_struct *work)
653741 {
@@ -680,16 +768,26 @@
680768
681769 static inline void put_signal_struct(struct signal_struct *sig)
682770 {
683
- if (atomic_dec_and_test(&sig->sigcnt))
771
+ if (refcount_dec_and_test(&sig->sigcnt))
684772 free_signal_struct(sig);
685773 }
686774
687775 void __put_task_struct(struct task_struct *tsk)
688776 {
689777 WARN_ON(!tsk->exit_state);
690
- WARN_ON(atomic_read(&tsk->usage));
778
+ WARN_ON(refcount_read(&tsk->usage));
691779 WARN_ON(tsk == current);
692780
781
+ /*
782
+ * Remove function-return probe instances associated with this
783
+ * task and put them back on the free list.
784
+ */
785
+ kprobe_flush_task(tsk);
786
+
787
+ /* Task is done with its stack. */
788
+ put_task_stack(tsk);
789
+
790
+ io_uring_free(tsk);
693791 cgroup_free(tsk);
694792 task_numa_free(tsk, true);
695793 security_task_free(tsk);
@@ -710,15 +808,16 @@
710808 static void set_max_threads(unsigned int max_threads_suggested)
711809 {
712810 u64 threads;
811
+ unsigned long nr_pages = totalram_pages();
713812
714813 /*
715814 * The number of threads shall be limited such that the thread
716815 * structures may only consume a small part of the available memory.
717816 */
718
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
817
+ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
719818 threads = MAX_THREADS;
720819 else
721
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
820
+ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
722821 (u64) THREAD_SIZE * 8UL);
723822
724823 if (threads > max_threads_suggested)
@@ -732,6 +831,7 @@
732831 int arch_task_struct_size __read_mostly;
733832 #endif
734833
834
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
735835 static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
736836 {
737837 /* Fetch thread_struct whitelist for the architecture. */
....@@ -746,6 +846,7 @@
746846 else
747847 *offset += offsetof(struct task_struct, thread);
748848 }
849
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
749850
750851 void __init fork_init(void)
751852 {
@@ -787,6 +888,7 @@
787888 scs_init();
788889
789890 lockdep_init_task(&init_task);
891
+ uprobes_init();
790892 }
791893
792894 int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -808,7 +910,7 @@
808910 {
809911 struct task_struct *tsk;
810912 unsigned long *stack;
811
- struct vm_struct *stack_vm_area;
913
+ struct vm_struct *stack_vm_area __maybe_unused;
812914 int err;
813915
814916 if (node == NUMA_NO_NODE)
@@ -820,6 +922,9 @@
820922 stack = alloc_thread_stack_node(tsk, node);
821923 if (!stack)
822924 goto free_tsk;
925
+
926
+ if (memcg_charge_kernel_stack(tsk))
927
+ goto free_stack;
823928
824929 stack_vm_area = task_stack_vm_area(tsk);
825930
@@ -835,7 +940,7 @@
835940 tsk->stack_vm_area = stack_vm_area;
836941 #endif
837942 #ifdef CONFIG_THREAD_INFO_IN_TASK
838
- atomic_set(&tsk->stack_refcount, 1);
943
+ refcount_set(&tsk->stack_refcount, 1);
839944 #endif
840945
841946 if (err)
@@ -863,22 +968,29 @@
863968 #ifdef CONFIG_STACKPROTECTOR
864969 tsk->stack_canary = get_random_canary();
865970 #endif
971
+ if (orig->cpus_ptr == &orig->cpus_mask)
972
+ tsk->cpus_ptr = &tsk->cpus_mask;
866973
867974 /*
868
- * One for us, one for whoever does the "release_task()" (usually
869
- * parent)
975
+ * One for the user space visible state that goes away when reaped.
976
+ * One for the scheduler.
870977 */
871
- atomic_set(&tsk->usage, 2);
978
+ refcount_set(&tsk->rcu_users, 2);
979
+ /* One for the rcu users */
980
+ refcount_set(&tsk->usage, 1);
872981 #ifdef CONFIG_BLK_DEV_IO_TRACE
873982 tsk->btrace_seq = 0;
874983 #endif
875984 tsk->splice_pipe = NULL;
876985 tsk->task_frag.page = NULL;
877986 tsk->wake_q.next = NULL;
987
+ tsk->wake_q_sleeper.next = NULL;
988
+ tsk->pf_io_worker = NULL;
878989
879990 account_kernel_stack(tsk, 1);
880991
881992 kcov_task_init(tsk);
993
+ kmap_local_fork(tsk);
882994
883995 #ifdef CONFIG_FAULT_INJECTION
884996 tsk->fail_nth = 0;
@@ -892,6 +1004,11 @@
8921004 #ifdef CONFIG_MEMCG
8931005 tsk->active_memcg = NULL;
8941006 #endif
1007
+
1008
+ android_init_vendor_data(tsk, 1);
1009
+ android_init_oem_data(tsk, 1);
1010
+
1011
+ trace_android_vh_dup_task_struct(tsk, orig);
8951012 return tsk;
8961013
8971014 free_stack:
@@ -941,6 +1058,13 @@
9411058 #endif
9421059 }
9431060
1061
+static void mm_init_pasid(struct mm_struct *mm)
1062
+{
1063
+#ifdef CONFIG_IOMMU_SUPPORT
1064
+ mm->pasid = INIT_PASID;
1065
+#endif
1066
+}
1067
+
9441068 static void mm_init_uprobes_state(struct mm_struct *mm)
9451069 {
9461070 #ifdef CONFIG_UPROBES
@@ -954,24 +1078,30 @@
9541078 mm->mmap = NULL;
9551079 mm->mm_rb = RB_ROOT;
9561080 mm->vmacache_seqnum = 0;
1081
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
1082
+ rwlock_init(&mm->mm_rb_lock);
1083
+#endif
9571084 atomic_set(&mm->mm_users, 1);
9581085 atomic_set(&mm->mm_count, 1);
959
- init_rwsem(&mm->mmap_sem);
1086
+ seqcount_init(&mm->write_protect_seq);
1087
+ mmap_init_lock(mm);
9601088 INIT_LIST_HEAD(&mm->mmlist);
9611089 mm->core_state = NULL;
9621090 mm_pgtables_bytes_init(mm);
9631091 mm->map_count = 0;
9641092 mm->locked_vm = 0;
965
- mm->pinned_vm = 0;
1093
+ atomic_set(&mm->has_pinned, 0);
1094
+ atomic64_set(&mm->pinned_vm, 0);
9661095 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
9671096 spin_lock_init(&mm->page_table_lock);
9681097 spin_lock_init(&mm->arg_lock);
9691098 mm_init_cpumask(mm);
9701099 mm_init_aio(mm);
9711100 mm_init_owner(mm, p);
1101
+ mm_init_pasid(mm);
9721102 RCU_INIT_POINTER(mm->exe_file, NULL);
973
- mmu_notifier_mm_init(mm);
974
- hmm_mm_init(mm);
1103
+ if (!mmu_notifier_subscriptions_init(mm))
1104
+ goto fail_nopgd;
9751105 init_tlb_flush_pending(mm);
9761106 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
9771107 mm->pmd_huge_pte = NULL;
@@ -1046,8 +1176,10 @@
10461176 {
10471177 might_sleep();
10481178
1049
- if (atomic_dec_and_test(&mm->mm_users))
1179
+ if (atomic_dec_and_test(&mm->mm_users)) {
1180
+ trace_android_vh_mmput(NULL);
10501181 __mmput(mm);
1182
+ }
10511183 }
10521184 EXPORT_SYMBOL_GPL(mmput);
10531185
@@ -1067,6 +1199,7 @@
10671199 schedule_work(&mm->async_put_work);
10681200 }
10691201 }
1202
+EXPORT_SYMBOL_GPL(mmput_async);
10701203 #endif
10711204
10721205 /**
@@ -1171,7 +1304,7 @@
11711304 struct mm_struct *mm;
11721305 int err;
11731306
1174
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
1307
+ err = down_read_killable(&task->signal->exec_update_lock);
11751308 if (err)
11761309 return ERR_PTR(err);
11771310
@@ -1181,7 +1314,7 @@
11811314 mmput(mm);
11821315 mm = ERR_PTR(-EACCES);
11831316 }
1184
- mutex_unlock(&task->signal->cred_guard_mutex);
1317
+ up_read(&task->signal->exec_update_lock);
11851318
11861319 return mm;
11871320 }
@@ -1279,13 +1412,20 @@
12791412 mm_release(tsk, mm);
12801413 }
12811414
1282
-/*
1283
- * Allocate a new mm structure and copy contents from the
1284
- * mm structure of the passed in task structure.
1415
+/**
1416
+ * dup_mm() - duplicates an existing mm structure
1417
+ * @tsk: the task_struct with which the new mm will be associated.
1418
+ * @oldmm: the mm to duplicate.
1419
+ *
1420
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
1421
+ * content into it.
1422
+ *
1423
+ * Return: the duplicated mm or NULL on failure.
12851424 */
1286
-static struct mm_struct *dup_mm(struct task_struct *tsk)
1425
+static struct mm_struct *dup_mm(struct task_struct *tsk,
1426
+ struct mm_struct *oldmm)
12871427 {
1288
- struct mm_struct *mm, *oldmm = current->mm;
1428
+ struct mm_struct *mm;
12891429 int err;
12901430
12911431 mm = allocate_mm();
@@ -1353,7 +1493,7 @@
13531493 }
13541494
13551495 retval = -ENOMEM;
1356
- mm = dup_mm(tsk);
1496
+ mm = dup_mm(tsk, current->mm);
13571497 if (!mm)
13581498 goto fail_nomem;
13591499
@@ -1403,7 +1543,7 @@
14031543 goto out;
14041544 }
14051545
1406
- newf = dup_fd(oldf, &error);
1546
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
14071547 if (!newf)
14081548 goto out;
14091549
@@ -1444,24 +1584,29 @@
14441584 struct sighand_struct *sig;
14451585
14461586 if (clone_flags & CLONE_SIGHAND) {
1447
- atomic_inc(&current->sighand->count);
1587
+ refcount_inc(&current->sighand->count);
14481588 return 0;
14491589 }
14501590 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1451
- rcu_assign_pointer(tsk->sighand, sig);
1591
+ RCU_INIT_POINTER(tsk->sighand, sig);
14521592 if (!sig)
14531593 return -ENOMEM;
14541594
1455
- atomic_set(&sig->count, 1);
1595
+ refcount_set(&sig->count, 1);
14561596 spin_lock_irq(&current->sighand->siglock);
14571597 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
14581598 spin_unlock_irq(&current->sighand->siglock);
1599
+
1600
+ /* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
1601
+ if (clone_flags & CLONE_CLEAR_SIGHAND)
1602
+ flush_signal_handlers(tsk, 0);
1603
+
14591604 return 0;
14601605 }
14611606
14621607 void __cleanup_sighand(struct sighand_struct *sighand)
14631608 {
1464
- if (atomic_dec_and_test(&sighand->count)) {
1609
+ if (refcount_dec_and_test(&sighand->count)) {
14651610 signalfd_cleanup(sighand);
14661611 /*
14671612 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1471,28 +1616,17 @@
14711616 }
14721617 }
14731618
1474
-#ifdef CONFIG_POSIX_TIMERS
14751619 /*
14761620 * Initialize POSIX timer handling for a thread group.
14771621 */
14781622 static void posix_cpu_timers_init_group(struct signal_struct *sig)
14791623 {
1624
+ struct posix_cputimers *pct = &sig->posix_cputimers;
14801625 unsigned long cpu_limit;
14811626
14821627 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1483
- if (cpu_limit != RLIM_INFINITY) {
1484
- sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
1485
- sig->cputimer.running = true;
1486
- }
1487
-
1488
- /* The timer lists. */
1489
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
1490
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
1491
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
1628
+ posix_cputimers_group_init(pct, cpu_limit);
14921629 }
1493
-#else
1494
-static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
1495
-#endif
14961630
14971631 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
14981632 {
@@ -1508,7 +1642,7 @@
15081642
15091643 sig->nr_threads = 1;
15101644 atomic_set(&sig->live, 1);
1511
- atomic_set(&sig->sigcnt, 1);
1645
+ refcount_set(&sig->sigcnt, 1);
15121646
15131647 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
15141648 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -1540,6 +1674,7 @@
15401674 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
15411675
15421676 mutex_init(&sig->cred_guard_mutex);
1677
+ init_rwsem(&sig->exec_update_lock);
15431678
15441679 return 0;
15451680 }
@@ -1594,23 +1729,6 @@
15941729 #endif
15951730 }
15961731
1597
-#ifdef CONFIG_POSIX_TIMERS
1598
-/*
1599
- * Initialize POSIX timer handling for a single task.
1600
- */
1601
-static void posix_cpu_timers_init(struct task_struct *tsk)
1602
-{
1603
- tsk->cputime_expires.prof_exp = 0;
1604
- tsk->cputime_expires.virt_exp = 0;
1605
- tsk->cputime_expires.sched_exp = 0;
1606
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1607
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
1608
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1609
-}
1610
-#else
1611
-static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
1612
-#endif
1613
-
16141732 static inline void init_task_pid_links(struct task_struct *task)
16151733 {
16161734 enum pid_type type;
@@ -1642,7 +1760,125 @@
16421760 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
16431761 p->rcu_tasks_idle_cpu = -1;
16441762 #endif /* #ifdef CONFIG_TASKS_RCU */
1763
+#ifdef CONFIG_TASKS_TRACE_RCU
1764
+ p->trc_reader_nesting = 0;
1765
+ p->trc_reader_special.s = 0;
1766
+ INIT_LIST_HEAD(&p->trc_holdout_list);
1767
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
16451768 }
1769
+
1770
+struct pid *pidfd_pid(const struct file *file)
1771
+{
1772
+ if (file->f_op == &pidfd_fops)
1773
+ return file->private_data;
1774
+
1775
+ return ERR_PTR(-EBADF);
1776
+}
1777
+
1778
+static int pidfd_release(struct inode *inode, struct file *file)
1779
+{
1780
+ struct pid *pid = file->private_data;
1781
+
1782
+ file->private_data = NULL;
1783
+ put_pid(pid);
1784
+ return 0;
1785
+}
1786
+
1787
+#ifdef CONFIG_PROC_FS
1788
+/**
1789
+ * pidfd_show_fdinfo - print information about a pidfd
1790
+ * @m: proc fdinfo file
1791
+ * @f: file referencing a pidfd
1792
+ *
1793
+ * Pid:
1794
+ * This function will print the pid that a given pidfd refers to in the
1795
+ * pid namespace of the procfs instance.
1796
+ * If the pid namespace of the process is not a descendant of the pid
1797
+ * namespace of the procfs instance 0 will be shown as its pid. This is
1798
+ * similar to calling getppid() on a process whose parent is outside of
1799
+ * its pid namespace.
1800
+ *
1801
+ * NSpid:
1802
+ * If pid namespaces are supported then this function will also print
1803
+ * the pid of a given pidfd refers to for all descendant pid namespaces
1804
+ * starting from the current pid namespace of the instance, i.e. the
1805
+ * Pid field and the first entry in the NSpid field will be identical.
1806
+ * If the pid namespace of the process is not a descendant of the pid
1807
+ * namespace of the procfs instance 0 will be shown as its first NSpid
1808
+ * entry and no others will be shown.
1809
+ * Note that this differs from the Pid and NSpid fields in
1810
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
1811
+ * the pid namespace of the procfs instance. The difference becomes
1812
+ * obvious when sending around a pidfd between pid namespaces from a
1813
+ * different branch of the tree, i.e. where no ancestral relation is
1814
+ * present between the pid namespaces:
1815
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
1816
+ * namespace (also take care to create new mount namespaces in the
1817
+ * new pid namespace and mount procfs)
1818
+ * - create a process with a pidfd in ns1
1819
+ * - send pidfd from ns1 to ns2
1820
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
1821
+ * have exactly one entry, which is 0
1822
+ */
1823
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1824
+{
1825
+ struct pid *pid = f->private_data;
1826
+ struct pid_namespace *ns;
1827
+ pid_t nr = -1;
1828
+
1829
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1830
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
1831
+ nr = pid_nr_ns(pid, ns);
1832
+ }
1833
+
1834
+ seq_put_decimal_ll(m, "Pid:\t", nr);
1835
+
1836
+#ifdef CONFIG_PID_NS
1837
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1838
+ if (nr > 0) {
1839
+ int i;
1840
+
1841
+ /* If nr is non-zero it means that 'pid' is valid and that
1842
+ * ns, i.e. the pid namespace associated with the procfs
1843
+ * instance, is in the pid namespace hierarchy of pid.
1844
+ * Start at one below the already printed level.
1845
+ */
1846
+ for (i = ns->level + 1; i <= pid->level; i++)
1847
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1848
+ }
1849
+#endif
1850
+ seq_putc(m, '\n');
1851
+}
1852
+#endif
1853
+
1854
+/*
1855
+ * Poll support for process exit notification.
1856
+ */
1857
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1858
+{
1859
+ struct pid *pid = file->private_data;
1860
+ __poll_t poll_flags = 0;
1861
+
1862
+ poll_wait(file, &pid->wait_pidfd, pts);
1863
+
1864
+ /*
1865
+ * Inform pollers only when the whole thread group exits.
1866
+ * If the thread group leader exits before all other threads in the
1867
+ * group, then poll(2) should block, similar to the wait(2) family.
1868
+ */
1869
+ if (thread_group_exited(pid))
1870
+ poll_flags = EPOLLIN | EPOLLRDNORM;
1871
+
1872
+ return poll_flags;
1873
+}
1874
+
1875
+const struct file_operations pidfd_fops = {
1876
+ .release = pidfd_release,
1877
+ .poll = pidfd_poll,
1878
+#ifdef CONFIG_PROC_FS
1879
+ .show_fdinfo = pidfd_show_fdinfo,
1880
+#endif
1881
+};
16461882
16471883 static void __delayed_free_task(struct rcu_head *rhp)
16481884 {
@@ -1657,84 +1893,6 @@
16571893 call_rcu(&tsk->rcu, __delayed_free_task);
16581894 else
16591895 free_task(tsk);
1660
-}
1661
-
1662
-static int pidfd_release(struct inode *inode, struct file *file)
1663
-{
1664
- struct pid *pid = file->private_data;
1665
-
1666
- file->private_data = NULL;
1667
- put_pid(pid);
1668
- return 0;
1669
-}
1670
-
1671
-#ifdef CONFIG_PROC_FS
1672
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1673
-{
1674
- struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1675
- struct pid *pid = f->private_data;
1676
-
1677
- seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1678
- seq_putc(m, '\n');
1679
-}
1680
-#endif
1681
-
1682
-/*
1683
- * Poll support for process exit notification.
1684
- */
1685
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1686
-{
1687
- struct task_struct *task;
1688
- struct pid *pid = file->private_data;
1689
- __poll_t poll_flags = 0;
1690
-
1691
- poll_wait(file, &pid->wait_pidfd, pts);
1692
-
1693
- rcu_read_lock();
1694
- task = pid_task(pid, PIDTYPE_PID);
1695
- /*
1696
- * Inform pollers only when the whole thread group exits.
1697
- * If the thread group leader exits before all other threads in the
1698
- * group, then poll(2) should block, similar to the wait(2) family.
1699
- */
1700
- if (!task || (task->exit_state && thread_group_empty(task)))
1701
- poll_flags = EPOLLIN | EPOLLRDNORM;
1702
- rcu_read_unlock();
1703
-
1704
- return poll_flags;
1705
-}
1706
-
1707
-const struct file_operations pidfd_fops = {
1708
- .release = pidfd_release,
1709
- .poll = pidfd_poll,
1710
-#ifdef CONFIG_PROC_FS
1711
- .show_fdinfo = pidfd_show_fdinfo,
1712
-#endif
1713
-};
1714
-
1715
-/**
1716
- * pidfd_create() - Create a new pid file descriptor.
1717
- *
1718
- * @pid: struct pid that the pidfd will reference
1719
- *
1720
- * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1721
- *
1722
- * Note, that this function can only be called after the fd table has
1723
- * been unshared to avoid leaking the pidfd to the new process.
1724
- *
1725
- * Return: On success, a cloexec pidfd is returned.
1726
- * On error, a negative errno number will be returned.
1727
- */
1728
-static int pidfd_create(struct pid *pid)
1729
-{
1730
- int fd;
1731
-
1732
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1733
- O_RDWR | O_CLOEXEC);
1734
- if (fd < 0)
1735
- put_pid(pid);
1736
-
1737
- return fd;
17381896 }
17391897
17401898 static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
@@ -1765,19 +1923,17 @@
17651923 * flags). The actual kick-off is left to the caller.
17661924 */
17671925 static __latent_entropy struct task_struct *copy_process(
1768
- unsigned long clone_flags,
1769
- unsigned long stack_start,
1770
- unsigned long stack_size,
1771
- int __user *parent_tidptr,
1772
- int __user *child_tidptr,
17731926 struct pid *pid,
17741927 int trace,
1775
- unsigned long tls,
1776
- int node)
1928
+ int node,
1929
+ struct kernel_clone_args *args)
17771930 {
17781931 int pidfd = -1, retval;
17791932 struct task_struct *p;
17801933 struct multiprocess_signals delayed;
1934
+ struct file *pidfile = NULL;
1935
+ u64 clone_flags = args->flags;
1936
+ struct nsproxy *nsp = current->nsproxy;
17811937
17821938 /*
17831939 * Don't allow sharing the root directory with processes in a different
@@ -1820,21 +1976,26 @@
18201976 */
18211977 if (clone_flags & CLONE_THREAD) {
18221978 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1823
- (task_active_pid_ns(current) !=
1824
- current->nsproxy->pid_ns_for_children))
1979
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
1980
+ return ERR_PTR(-EINVAL);
1981
+ }
1982
+
1983
+ /*
1984
+ * If the new process will be in a different time namespace
1985
+ * do not allow it to share VM or a thread group with the forking task.
1986
+ */
1987
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
1988
+ if (nsp->time_ns != nsp->time_ns_for_children)
18251989 return ERR_PTR(-EINVAL);
18261990 }
18271991
18281992 if (clone_flags & CLONE_PIDFD) {
18291993 /*
1830
- * - CLONE_PARENT_SETTID is useless for pidfds and also
1831
- * parent_tidptr is used to return pidfds.
18321994 * - CLONE_DETACHED is blocked so that we can potentially
18331995 * reuse it later for CLONE_PIDFD.
18341996 * - CLONE_THREAD is blocked until someone really needs it.
18351997 */
1836
- if (clone_flags &
1837
- (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1998
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
18381999 return ERR_PTR(-EINVAL);
18392000 }
18402001
@@ -1853,13 +2014,21 @@
18532014 recalc_sigpending();
18542015 spin_unlock_irq(&current->sighand->siglock);
18552016 retval = -ERESTARTNOINTR;
1856
- if (signal_pending(current))
2017
+ if (task_sigpending(current))
18572018 goto fork_out;
18582019
18592020 retval = -ENOMEM;
18602021 p = dup_task_struct(current, node);
18612022 if (!p)
18622023 goto fork_out;
2024
+ if (args->io_thread) {
2025
+ /*
2026
+ * Mark us an IO worker, and block any signal that isn't
2027
+ * fatal or STOP
2028
+ */
2029
+ p->flags |= PF_IO_WORKER;
2030
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2031
+ }
18632032
18642033 cpufreq_task_times_init(p);
18652034
@@ -1869,18 +2038,18 @@
18692038 * p->set_child_tid which is (ab)used as a kthread's data pointer for
18702039 * kernel threads (PF_KTHREAD).
18712040 */
1872
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
2041
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
18732042 /*
18742043 * Clear TID on mm_release()?
18752044 */
1876
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
2045
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
18772046
18782047 ftrace_graph_init_task(p);
18792048
18802049 rt_mutex_init_task(p);
18812050
2051
+ lockdep_assert_irqs_enabled();
18822052 #ifdef CONFIG_PROVE_LOCKING
1883
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
18842053 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
18852054 #endif
18862055 retval = -EAGAIN;
@@ -1902,7 +2071,7 @@
19022071 * to stop root fork bombs.
19032072 */
19042073 retval = -EAGAIN;
1905
- if (nr_threads >= max_threads)
2074
+ if (data_race(nr_threads >= max_threads))
19062075 goto bad_fork_cleanup_count;
19072076
19082077 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
....@@ -1915,6 +2084,7 @@
19152084 spin_lock_init(&p->alloc_lock);
19162085
19172086 init_sigpending(&p->pending);
2087
+ p->sigqueue_cache = NULL;
19182088
19192089 p->utime = p->stime = p->gtime = 0;
19202090 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@ -1915,6 +2084,7 @@
19262096 seqcount_init(&p->vtime.seqcount);
19272097 p->vtime.starttime = 0;
19282098 p->vtime.state = VTIME_INACTIVE;
2099
+#endif
2100
+
2101
+#ifdef CONFIG_IO_URING
2102
+ p->io_uring = NULL;
19292103 #endif
19302104
19312105 #if defined(SPLIT_RSS_COUNTING)
@@ -1926,6 +2096,10 @@
19412115 task_io_accounting_init(&p->ioac);
19422116 acct_clear_integrals(p);
19432117
1944
- posix_cpu_timers_init(p);
2118
+ posix_cputimers_init(&p->posix_cputimers);
19452119
19462120 p->io_context = NULL;
19472121 audit_set_context(p, NULL);
@@ -1941,7 +2115,7 @@
19572131 #ifdef CONFIG_CPUSETS
19582132 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
19592133 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1960
- seqcount_init(&p->mems_allowed_seq);
2134
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
19612135 #endif
19622136 #ifdef CONFIG_TRACE_IRQFLAGS
1963
- p->irq_events = 0;
1964
- p->hardirqs_enabled = 0;
1965
- p->hardirq_enable_ip = 0;
1966
- p->hardirq_enable_event = 0;
1967
- p->hardirq_disable_ip = _THIS_IP_;
1968
- p->hardirq_disable_event = 0;
1969
- p->softirqs_enabled = 1;
1970
- p->softirq_enable_ip = _THIS_IP_;
1971
- p->softirq_enable_event = 0;
1972
- p->softirq_disable_ip = 0;
1973
- p->softirq_disable_event = 0;
1974
- p->hardirq_context = 0;
1975
- p->softirq_context = 0;
2137
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2138
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2139
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
2140
+ p->softirqs_enabled = 1;
2141
+ p->softirq_context = 0;
19762142 #endif
19772143
19782144 p->pagefault_disabled = 0;
19792145
19802146 #ifdef CONFIG_LOCKDEP
1981
- p->lockdep_depth = 0; /* no locks held yet */
1982
- p->curr_chain_key = 0;
1983
- p->lockdep_recursion = 0;
19842147 lockdep_init_task(p);
19852148 #endif
19862149
@@ -2032,12 +2195,15 @@
20322195 retval = copy_io(clone_flags, p);
20332196 if (retval)
20342197 goto bad_fork_cleanup_namespaces;
2035
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
2198
+ retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
20362199 if (retval)
20372200 goto bad_fork_cleanup_io;
20382201
2202
+ stackleak_task_init(p);
2203
+
20392204 if (pid != &init_struct_pid) {
2040
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
2205
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2206
+ args->set_tid_size);
20412207 if (IS_ERR(pid)) {
20422208 retval = PTR_ERR(pid);
20432209 goto bad_fork_cleanup_thread;
@@ -2050,12 +2216,22 @@
20502216 * if the fd table isn't shared).
20512217 */
20522218 if (clone_flags & CLONE_PIDFD) {
2053
- retval = pidfd_create(pid);
2219
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
20542220 if (retval < 0)
20552221 goto bad_fork_free_pid;
20562222
20572223 pidfd = retval;
2058
- retval = put_user(pidfd, parent_tidptr);
2224
+
2225
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2226
+ O_RDWR | O_CLOEXEC);
2227
+ if (IS_ERR(pidfile)) {
2228
+ put_unused_fd(pidfd);
2229
+ retval = PTR_ERR(pidfile);
2230
+ goto bad_fork_free_pid;
2231
+ }
2232
+ get_pid(pid); /* held by pidfile now */
2233
+
2234
+ retval = put_user(pidfd, args->pidfd);
20592235 if (retval)
20602236 goto bad_fork_put_pidfd;
20612237 }
@@ -2080,7 +2256,7 @@
20802256 #ifdef TIF_SYSCALL_EMU
20812257 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
20822258 #endif
2083
- clear_all_latency_tracing(p);
2259
+ clear_tsk_latency_tracing(p);
20842260
20852261 /* ok, now we should be set up.. */
20862262 p->pid = pid_nr(pid);
@@ -2099,17 +2275,28 @@
20992275 p->pdeath_signal = 0;
21002276 INIT_LIST_HEAD(&p->thread_group);
21012277 p->task_works = NULL;
2278
+ clear_posix_cputimers_work(p);
21022279
2103
- cgroup_threadgroup_change_begin(current);
21042280 /*
21052281 * Ensure that the cgroup subsystem policies allow the new process to be
2106
- * forked. It should be noted the the new process's css_set can be changed
2282
+ * forked. It should be noted that the new process's css_set can be changed
21072283 * between here and cgroup_post_fork() if an organisation operation is in
21082284 * progress.
21092285 */
2110
- retval = cgroup_can_fork(p);
2286
+ retval = cgroup_can_fork(p, args);
21112287 if (retval)
2112
- goto bad_fork_cgroup_threadgroup_change_end;
2288
+ goto bad_fork_put_pidfd;
2289
+
2290
+ /*
2291
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
2292
+ * the new task on the correct runqueue. All this *before* the task
2293
+ * becomes visible.
2294
+ *
2295
+ * This isn't part of ->can_fork() because while the re-cloning is
2296
+ * cgroup specific, it unconditionally needs to place the task on a
2297
+ * runqueue.
2298
+ */
2299
+ sched_cgroup_fork(p, args);
21132300
21142301 /*
21152302 * From this point on we must avoid any synchronous user-space
@@ -2120,7 +2307,7 @@
21202307 */
21212308
21222309 p->start_time = ktime_get_ns();
2123
- p->real_start_time = ktime_get_boot_ns();
2310
+ p->start_boottime = ktime_get_boottime_ns();
21242311
21252312 /*
21262313 * Make it visible to the rest of the system, but dont wake it up yet.
@@ -2139,7 +2326,7 @@
21392326 } else {
21402327 p->real_parent = current;
21412328 p->parent_exec_id = current->self_exec_id;
2142
- p->exit_signal = (clone_flags & CSIGNAL);
2329
+ p->exit_signal = args->exit_signal;
21432330 }
21442331
21452332 klp_copy_process(p);
@@ -2165,7 +2352,6 @@
21652352 retval = -EINTR;
21662353 goto bad_fork_cancel_cgroup;
21672354 }
2168
-
21692355
21702356 init_task_pid_links(p);
21712357 if (likely(p->pid)) {
@@ -2199,7 +2385,7 @@
21992385 } else {
22002386 current->signal->nr_threads++;
22012387 atomic_inc(&current->signal->live);
2202
- atomic_inc(&current->signal->sigcnt);
2388
+ refcount_inc(&current->signal->sigcnt);
22032389 task_join_group_stop(p);
22042390 list_add_tail_rcu(&p->thread_group,
22052391 &p->group_leader->thread_group);
@@ -2215,9 +2401,12 @@
22152401 syscall_tracepoint_update(p);
22162402 write_unlock_irq(&tasklist_lock);
22172403
2404
+ if (pidfile)
2405
+ fd_install(pidfd, pidfile);
2406
+
22182407 proc_fork_connector(p);
2219
- cgroup_post_fork(p);
2220
- cgroup_threadgroup_change_end(current);
2408
+ sched_post_fork(p);
2409
+ cgroup_post_fork(p, args);
22212410 perf_event_fork(p);
22222411
22232412 trace_task_newtask(p, clone_flags);
@@ -2230,12 +2419,12 @@
22302419 bad_fork_cancel_cgroup:
22312420 spin_unlock(&current->sighand->siglock);
22322421 write_unlock_irq(&tasklist_lock);
2233
- cgroup_cancel_fork(p);
2234
-bad_fork_cgroup_threadgroup_change_end:
2235
- cgroup_threadgroup_change_end(current);
2422
+ cgroup_cancel_fork(p, args);
22362423 bad_fork_put_pidfd:
2237
- if (clone_flags & CLONE_PIDFD)
2238
- ksys_close(pidfd);
2424
+ if (clone_flags & CLONE_PIDFD) {
2425
+ fput(pidfile);
2426
+ put_unused_fd(pidfd);
2427
+ }
22392428 bad_fork_free_pid:
22402429 if (pid != &init_struct_pid)
22412430 free_pid(pid);
@@ -2299,11 +2488,14 @@
22992488 }
23002489 }
23012490
2302
-struct task_struct *fork_idle(int cpu)
2491
+struct task_struct * __init fork_idle(int cpu)
23032492 {
23042493 struct task_struct *task;
2305
- task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
2306
- cpu_to_node(cpu));
2494
+ struct kernel_clone_args args = {
2495
+ .flags = CLONE_VM,
2496
+ };
2497
+
2498
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
23072499 if (!IS_ERR(task)) {
23082500 init_idle_pids(task);
23092501 init_idle(task, cpu);
@@ -2312,24 +2504,63 @@
23122504 return task;
23132505 }
23142506
2507
+struct mm_struct *copy_init_mm(void)
2508
+{
2509
+ return dup_mm(NULL, &init_mm);
2510
+}
2511
+
2512
+/*
2513
+ * This is like kernel_clone(), but shaved down and tailored to just
2514
+ * creating io_uring workers. It returns a created task, or an error pointer.
2515
+ * The returned task is inactive, and the caller must fire it up through
2516
+ * wake_up_new_task(p). All signals are blocked in the created task.
2517
+ */
2518
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2519
+{
2520
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2521
+ CLONE_IO;
2522
+ struct kernel_clone_args args = {
2523
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
2524
+ CLONE_UNTRACED) & ~CSIGNAL),
2525
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2526
+ .stack = (unsigned long)fn,
2527
+ .stack_size = (unsigned long)arg,
2528
+ .io_thread = 1,
2529
+ };
2530
+
2531
+ return copy_process(NULL, 0, node, &args);
2532
+}
2533
+
23152534 /*
23162535 * Ok, this is the main fork-routine.
23172536 *
23182537 * It copies the process, and if successful kick-starts
23192538 * it and waits for it to finish using the VM if required.
2539
+ *
2540
+ * args->exit_signal is expected to be checked for sanity by the caller.
23202541 */
2321
-long _do_fork(unsigned long clone_flags,
2322
- unsigned long stack_start,
2323
- unsigned long stack_size,
2324
- int __user *parent_tidptr,
2325
- int __user *child_tidptr,
2326
- unsigned long tls)
2542
+pid_t kernel_clone(struct kernel_clone_args *args)
23272543 {
2544
+ u64 clone_flags = args->flags;
23282545 struct completion vfork;
23292546 struct pid *pid;
23302547 struct task_struct *p;
23312548 int trace = 0;
2332
- long nr;
2549
+ pid_t nr;
2550
+
2551
+ /*
2552
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2553
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2554
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2555
+ * field in struct clone_args and it still doesn't make sense to have
2556
+ * them both point at the same memory location. Performing this check
2557
+ * here has the advantage that we don't need to have a separate helper
2558
+ * to check for legacy clone().
2559
+ */
2560
+ if ((args->flags & CLONE_PIDFD) &&
2561
+ (args->flags & CLONE_PARENT_SETTID) &&
2562
+ (args->pidfd == args->parent_tid))
2563
+ return -EINVAL;
23332564
23342565 /*
23352566 * Determine whether and which event to report to ptracer. When
@@ -2340,7 +2571,7 @@
23402571 if (!(clone_flags & CLONE_UNTRACED)) {
23412572 if (clone_flags & CLONE_VFORK)
23422573 trace = PTRACE_EVENT_VFORK;
2343
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
2574
+ else if (args->exit_signal != SIGCHLD)
23442575 trace = PTRACE_EVENT_CLONE;
23452576 else
23462577 trace = PTRACE_EVENT_FORK;
@@ -2349,8 +2580,7 @@
23492580 trace = 0;
23502581 }
23512582
2352
- p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
2353
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2583
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
23542584 add_latent_entropy();
23552585
23562586 if (IS_ERR(p))
@@ -2368,7 +2598,7 @@
23682598 nr = pid_vnr(pid);
23692599
23702600 if (clone_flags & CLONE_PARENT_SETTID)
2371
- put_user(nr, parent_tidptr);
2601
+ put_user(nr, args->parent_tid);
23722602
23732603 if (clone_flags & CLONE_VFORK) {
23742604 p->vfork_done = &vfork;
@@ -2391,34 +2621,31 @@
23912621 return nr;
23922622 }
23932623
2394
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
2395
-/* For compatibility with architectures that call do_fork directly rather than
2396
- * using the syscall entry points below. */
2397
-long do_fork(unsigned long clone_flags,
2398
- unsigned long stack_start,
2399
- unsigned long stack_size,
2400
- int __user *parent_tidptr,
2401
- int __user *child_tidptr)
2402
-{
2403
- return _do_fork(clone_flags, stack_start, stack_size,
2404
- parent_tidptr, child_tidptr, 0);
2405
-}
2406
-#endif
2407
-
24082624 /*
24092625 * Create a kernel thread.
24102626 */
24112627 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
24122628 {
2413
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
2414
- (unsigned long)arg, NULL, NULL, 0);
2629
+ struct kernel_clone_args args = {
2630
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
2631
+ CLONE_UNTRACED) & ~CSIGNAL),
2632
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2633
+ .stack = (unsigned long)fn,
2634
+ .stack_size = (unsigned long)arg,
2635
+ };
2636
+
2637
+ return kernel_clone(&args);
24152638 }
24162639
24172640 #ifdef __ARCH_WANT_SYS_FORK
24182641 SYSCALL_DEFINE0(fork)
24192642 {
24202643 #ifdef CONFIG_MMU
2421
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
2644
+ struct kernel_clone_args args = {
2645
+ .exit_signal = SIGCHLD,
2646
+ };
2647
+
2648
+ return kernel_clone(&args);
24222649 #else
24232650 /* can not support in nommu mode */
24242651 return -EINVAL;
@@ -2429,8 +2656,12 @@
24292656 #ifdef __ARCH_WANT_SYS_VFORK
24302657 SYSCALL_DEFINE0(vfork)
24312658 {
2432
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
2433
- 0, NULL, NULL, 0);
2659
+ struct kernel_clone_args args = {
2660
+ .flags = CLONE_VFORK | CLONE_VM,
2661
+ .exit_signal = SIGCHLD,
2662
+ };
2663
+
2664
+ return kernel_clone(&args);
24342665 }
24352666 #endif
24362667
@@ -2458,7 +2689,175 @@
24582689 unsigned long, tls)
24592690 #endif
24602691 {
2461
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
2692
+ struct kernel_clone_args args = {
2693
+ .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2694
+ .pidfd = parent_tidptr,
2695
+ .child_tid = child_tidptr,
2696
+ .parent_tid = parent_tidptr,
2697
+ .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2698
+ .stack = newsp,
2699
+ .tls = tls,
2700
+ };
2701
+
2702
+ return kernel_clone(&args);
2703
+}
2704
+#endif
2705
+
2706
+#ifdef __ARCH_WANT_SYS_CLONE3
2707
+
2708
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2709
+ struct clone_args __user *uargs,
2710
+ size_t usize)
2711
+{
2712
+ int err;
2713
+ struct clone_args args;
2714
+ pid_t *kset_tid = kargs->set_tid;
2715
+
2716
+ BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2717
+ CLONE_ARGS_SIZE_VER0);
2718
+ BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2719
+ CLONE_ARGS_SIZE_VER1);
2720
+ BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2721
+ CLONE_ARGS_SIZE_VER2);
2722
+ BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2723
+
2724
+ if (unlikely(usize > PAGE_SIZE))
2725
+ return -E2BIG;
2726
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2727
+ return -EINVAL;
2728
+
2729
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2730
+ if (err)
2731
+ return err;
2732
+
2733
+ if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2734
+ return -EINVAL;
2735
+
2736
+ if (unlikely(!args.set_tid && args.set_tid_size > 0))
2737
+ return -EINVAL;
2738
+
2739
+ if (unlikely(args.set_tid && args.set_tid_size == 0))
2740
+ return -EINVAL;
2741
+
2742
+ /*
2743
+ * Verify that higher 32bits of exit_signal are unset and that
2744
+ * it is a valid signal
2745
+ */
2746
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2747
+ !valid_signal(args.exit_signal)))
2748
+ return -EINVAL;
2749
+
2750
+ if ((args.flags & CLONE_INTO_CGROUP) &&
2751
+ (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
2752
+ return -EINVAL;
2753
+
2754
+ *kargs = (struct kernel_clone_args){
2755
+ .flags = args.flags,
2756
+ .pidfd = u64_to_user_ptr(args.pidfd),
2757
+ .child_tid = u64_to_user_ptr(args.child_tid),
2758
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
2759
+ .exit_signal = args.exit_signal,
2760
+ .stack = args.stack,
2761
+ .stack_size = args.stack_size,
2762
+ .tls = args.tls,
2763
+ .set_tid_size = args.set_tid_size,
2764
+ .cgroup = args.cgroup,
2765
+ };
2766
+
2767
+ if (args.set_tid &&
2768
+ copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2769
+ (kargs->set_tid_size * sizeof(pid_t))))
2770
+ return -EFAULT;
2771
+
2772
+ kargs->set_tid = kset_tid;
2773
+
2774
+ return 0;
2775
+}
2776
+
2777
+/**
2778
+ * clone3_stack_valid - check and prepare stack
2779
+ * @kargs: kernel clone args
2780
+ *
2781
+ * Verify that the stack arguments userspace gave us are sane.
2782
+ * In addition, set the stack direction for userspace since it's easy for us to
2783
+ * determine.
2784
+ */
2785
+static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2786
+{
2787
+ if (kargs->stack == 0) {
2788
+ if (kargs->stack_size > 0)
2789
+ return false;
2790
+ } else {
2791
+ if (kargs->stack_size == 0)
2792
+ return false;
2793
+
2794
+ if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2795
+ return false;
2796
+
2797
+#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
2798
+ kargs->stack += kargs->stack_size;
2799
+#endif
2800
+ }
2801
+
2802
+ return true;
2803
+}
2804
+
2805
+static bool clone3_args_valid(struct kernel_clone_args *kargs)
2806
+{
2807
+ /* Verify that no unknown flags are passed along. */
2808
+ if (kargs->flags &
2809
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2810
+ return false;
2811
+
2812
+ /*
2813
+ * - make the CLONE_DETACHED bit reuseable for clone3
2814
+ * - make the CSIGNAL bits reuseable for clone3
2815
+ */
2816
+ if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2817
+ return false;
2818
+
2819
+ if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2820
+ (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2821
+ return false;
2822
+
2823
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2824
+ kargs->exit_signal)
2825
+ return false;
2826
+
2827
+ if (!clone3_stack_valid(kargs))
2828
+ return false;
2829
+
2830
+ return true;
2831
+}
2832
+
2833
+/**
2834
+ * clone3 - create a new process with specific properties
2835
+ * @uargs: argument structure
2836
+ * @size: size of @uargs
2837
+ *
2838
+ * clone3() is the extensible successor to clone()/clone2().
2839
+ * It takes a struct as argument that is versioned by its size.
2840
+ *
2841
+ * Return: On success, a positive PID for the child process.
2842
+ * On error, a negative errno number.
2843
+ */
2844
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2845
+{
2846
+ int err;
2847
+
2848
+ struct kernel_clone_args kargs;
2849
+ pid_t set_tid[MAX_PID_NS_LEVEL];
2850
+
2851
+ kargs.set_tid = set_tid;
2852
+
2853
+ err = copy_clone_args_from_user(&kargs, uargs, size);
2854
+ if (err)
2855
+ return err;
2856
+
2857
+ if (!clone3_args_valid(&kargs))
2858
+ return -EINVAL;
2859
+
2860
+ return kernel_clone(&kargs);
24622861 }
24632862 #endif
24642863
@@ -2553,7 +2952,8 @@
25532952 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
25542953 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
25552954 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
2556
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
2955
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
2956
+ CLONE_NEWTIME))
25572957 return -EINVAL;
25582958 /*
25592959 * Not implemented, but pretend it works if there is nothing
@@ -2566,7 +2966,7 @@
25662966 return -EINVAL;
25672967 }
25682968 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
2569
- if (atomic_read(&current->sighand->count) > 1)
2969
+ if (refcount_read(&current->sighand->count) > 1)
25702970 return -EINVAL;
25712971 }
25722972 if (unshare_flags & CLONE_VM) {
@@ -2601,14 +3001,15 @@
26013001 /*
26023002 * Unshare file descriptor table if it is being shared
26033003 */
2604
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
3004
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
3005
+ struct files_struct **new_fdp)
26053006 {
26063007 struct files_struct *fd = current->files;
26073008 int error = 0;
26083009
26093010 if ((unshare_flags & CLONE_FILES) &&
26103011 (fd && atomic_read(&fd->count) > 1)) {
2611
- *new_fdp = dup_fd(fd, &error);
3012
+ *new_fdp = dup_fd(fd, max_fds, &error);
26123013 if (!*new_fdp)
26133014 return error;
26143015 }
@@ -2619,7 +3020,7 @@
26193020 /*
26203021 * unshare allows a process to 'unshare' part of the process
26213022 * context which was originally shared using clone. copy_*
2622
- * functions used by do_fork() cannot be used here directly
3023
+ * functions used by kernel_clone() cannot be used here directly
26233024 * because they modify an inactive task_struct that is being
26243025 * constructed. Here we are modifying the current, active,
26253026 * task_struct.
@@ -2668,7 +3069,7 @@
26683069 err = unshare_fs(unshare_flags, &new_fs);
26693070 if (err)
26703071 goto bad_unshare_out;
2671
- err = unshare_fd(unshare_flags, &new_fd);
3072
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
26723073 if (err)
26733074 goto bad_unshare_cleanup_fs;
26743075 err = unshare_userns(unshare_flags, &new_cred);
@@ -2757,7 +3158,7 @@
27573158 struct files_struct *copy = NULL;
27583159 int error;
27593160
2760
- error = unshare_fd(CLONE_FILES, &copy);
3161
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
27613162 if (error || !copy) {
27623163 *displaced = NULL;
27633164 return error;
@@ -2770,7 +3171,7 @@
27703171 }
27713172
27723173 int sysctl_max_threads(struct ctl_table *table, int write,
2773
- void __user *buffer, size_t *lenp, loff_t *ppos)
3174
+ void *buffer, size_t *lenp, loff_t *ppos)
27743175 {
27753176 struct ctl_table t;
27763177 int ret;