2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/fork.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * linux/kernel/fork.c
34 *
....@@ -39,10 +40,8 @@
3940 #include <linux/binfmts.h>
4041 #include <linux/mman.h>
4142 #include <linux/mmu_notifier.h>
42
-#include <linux/hmm.h>
4343 #include <linux/fs.h>
4444 #include <linux/mm.h>
45
-#include <linux/kprobes.h>
4645 #include <linux/vmacache.h>
4746 #include <linux/nsproxy.h>
4847 #include <linux/capability.h>
....@@ -80,7 +79,6 @@
8079 #include <linux/blkdev.h>
8180 #include <linux/fs_struct.h>
8281 #include <linux/magic.h>
83
-#include <linux/sched/mm.h>
8482 #include <linux/perf_event.h>
8583 #include <linux/posix-timers.h>
8684 #include <linux/user-return-notifier.h>
....@@ -94,10 +92,12 @@
9492 #include <linux/kcov.h>
9593 #include <linux/livepatch.h>
9694 #include <linux/thread_info.h>
97
-#include <linux/cpufreq_times.h>
95
+#include <linux/stackleak.h>
96
+#include <linux/kasan.h>
9897 #include <linux/scs.h>
98
+#include <linux/io_uring.h>
99
+#include <linux/cpufreq_times.h>
99100
100
-#include <asm/pgtable.h>
101101 #include <asm/pgalloc.h>
102102 #include <linux/uaccess.h>
103103 #include <asm/mmu_context.h>
....@@ -109,6 +109,8 @@
109109 #define CREATE_TRACE_POINTS
110110 #include <trace/events/task.h>
111111
112
+#undef CREATE_TRACE_POINTS
113
+#include <trace/hooks/sched.h>
112114 /*
113115 * Minimum number of threads to boot the kernel
114116 */
....@@ -119,17 +121,29 @@
119121 */
120122 #define MAX_THREADS FUTEX_TID_MASK
121123
124
+EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask);
125
+
122126 /*
123127 * Protected counters by write_lock_irq(&tasklist_lock)
124128 */
125129 unsigned long total_forks; /* Handle normal Linux uptimes. */
126130 int nr_threads; /* The idle threads do not count.. */
127131
128
-int max_threads; /* tunable limit on nr_threads */
132
+static int max_threads; /* tunable limit on nr_threads */
133
+
134
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
135
+
136
+static const char * const resident_page_types[] = {
137
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
138
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
139
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
140
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
141
+};
129142
130143 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
131144
132145 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
146
+EXPORT_SYMBOL_GPL(tasklist_lock);
133147
134148 #ifdef CONFIG_PROVE_RCU
135149 int lockdep_tasklist_lock_is_held(void)
....@@ -217,6 +231,9 @@
217231 if (!s)
218232 continue;
219233
234
+ /* Mark stack accessible for KASAN. */
235
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
236
+
220237 /* Clear stale pointers from reused stack. */
221238 memset(s->addr, 0, THREAD_SIZE);
222239
....@@ -225,9 +242,14 @@
225242 return s->addr;
226243 }
227244
245
+ /*
246
+ * Allocated stacks are cached and later reused by new threads,
247
+ * so memcg accounting is performed manually on assigning/releasing
248
+ * stacks to tasks. Drop __GFP_ACCOUNT.
249
+ */
228250 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
229251 VMALLOC_START, VMALLOC_END,
230
- THREADINFO_GFP,
252
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
231253 PAGE_KERNEL,
232254 0, node, __builtin_return_address(0));
233255
....@@ -246,7 +268,7 @@
246268 THREAD_SIZE_ORDER);
247269
248270 if (likely(page)) {
249
- tsk->stack = page_address(page);
271
+ tsk->stack = kasan_reset_tag(page_address(page));
250272 return tsk->stack;
251273 }
252274 return NULL;
....@@ -256,8 +278,13 @@
256278 static inline void free_thread_stack(struct task_struct *tsk)
257279 {
258280 #ifdef CONFIG_VMAP_STACK
259
- if (task_stack_vm_area(tsk)) {
281
+ struct vm_struct *vm = task_stack_vm_area(tsk);
282
+
283
+ if (vm) {
260284 int i;
285
+
286
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
287
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
261288
262289 for (i = 0; i < NR_CACHED_STACKS; i++) {
263290 if (this_cpu_cmpxchg(cached_stacks[i],
....@@ -282,6 +309,7 @@
282309 {
283310 unsigned long *stack;
284311 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
312
+ stack = kasan_reset_tag(stack);
285313 tsk->stack = stack;
286314 return stack;
287315 }
....@@ -334,8 +362,15 @@
334362 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
335363
336364 if (new) {
337
- *new = *orig;
338
- INIT_LIST_HEAD(&new->anon_vma_chain);
365
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
366
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
367
+ /*
368
+ * orig->shared.rb may be modified concurrently, but the clone
369
+ * will be reinitialized.
370
+ */
371
+ *new = data_race(*orig);
372
+ INIT_VMA(new);
373
+ new->vm_next = new->vm_prev = NULL;
339374 }
340375 return new;
341376 }
....@@ -350,6 +385,22 @@
350385 void *stack = task_stack_page(tsk);
351386 struct vm_struct *vm = task_stack_vm_area(tsk);
352387
388
+
389
+ /* All stack pages are in the same node. */
390
+ if (vm)
391
+ mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
392
+ account * (THREAD_SIZE / 1024));
393
+ else
394
+ mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
395
+ account * (THREAD_SIZE / 1024));
396
+}
397
+
398
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
399
+{
400
+#ifdef CONFIG_VMAP_STACK
401
+ struct vm_struct *vm = task_stack_vm_area(tsk);
402
+ int ret;
403
+
353404 BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
354405
355406 if (vm) {
....@@ -358,27 +409,19 @@
358409 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
359410
360411 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
361
- mod_zone_page_state(page_zone(vm->pages[i]),
362
- NR_KERNEL_STACK_KB,
363
- PAGE_SIZE / 1024 * account);
412
+ /*
413
+ * If memcg_kmem_charge_page() fails, page->mem_cgroup
414
+ * pointer is NULL, and memcg_kmem_uncharge_page() in
415
+ * free_thread_stack() will ignore this page.
416
+ */
417
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
418
+ 0);
419
+ if (ret)
420
+ return ret;
364421 }
365
-
366
- /* All stack pages belong to the same memcg. */
367
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
368
- account * (THREAD_SIZE / 1024));
369
- } else {
370
- /*
371
- * All stack pages are in the same zone and belong to the
372
- * same memcg.
373
- */
374
- struct page *first_page = virt_to_page(stack);
375
-
376
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
377
- THREAD_SIZE / 1024 * account);
378
-
379
- mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
380
- account * (THREAD_SIZE / 1024));
381422 }
423
+#endif
424
+ return 0;
382425 }
383426
384427 static void release_task_stack(struct task_struct *tsk)
....@@ -397,16 +440,21 @@
397440 #ifdef CONFIG_THREAD_INFO_IN_TASK
398441 void put_task_stack(struct task_struct *tsk)
399442 {
400
- if (atomic_dec_and_test(&tsk->stack_refcount))
443
+ if (refcount_dec_and_test(&tsk->stack_refcount))
401444 release_task_stack(tsk);
402445 }
446
+EXPORT_SYMBOL_GPL(put_task_stack);
403447 #endif
404448
405449 void free_task(struct task_struct *tsk)
406450 {
451
+#ifdef CONFIG_SECCOMP
452
+ WARN_ON_ONCE(tsk->seccomp.filter);
453
+#endif
407454 cpufreq_task_times_exit(tsk);
408455 scs_release(tsk);
409456
457
+ trace_android_vh_free_task(tsk);
410458 #ifndef CONFIG_THREAD_INFO_IN_TASK
411459 /*
412460 * The task is finally done with both the stack and thread_info,
....@@ -418,11 +466,10 @@
418466 * If the task had a separate stack allocation, it should be gone
419467 * by now.
420468 */
421
- WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
469
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
422470 #endif
423471 rt_mutex_debug_task_free(tsk);
424472 ftrace_graph_exit_task(tsk);
425
- put_seccomp_filter(tsk);
426473 arch_release_task_struct(tsk);
427474 if (tsk->flags & PF_KTHREAD)
428475 free_kthread_struct(tsk);
....@@ -434,14 +481,14 @@
434481 static __latent_entropy int dup_mmap(struct mm_struct *mm,
435482 struct mm_struct *oldmm)
436483 {
437
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
484
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL;
438485 struct rb_node **rb_link, *rb_parent;
439486 int retval;
440487 unsigned long charge;
441488 LIST_HEAD(uf);
442489
443490 uprobe_start_dup_mmap();
444
- if (down_write_killable(&oldmm->mmap_sem)) {
491
+ if (mmap_write_lock_killable(oldmm)) {
445492 retval = -EINTR;
446493 goto fail_uprobe_end;
447494 }
....@@ -450,7 +497,7 @@
450497 /*
451498 * Not linked in yet - no deadlock potential:
452499 */
453
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
500
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
454501
455502 /* No ordering required: file already has been exposed. */
456503 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
....@@ -505,14 +552,15 @@
505552 if (retval)
506553 goto fail_nomem_anon_vma_fork;
507554 if (tmp->vm_flags & VM_WIPEONFORK) {
508
- /* VM_WIPEONFORK gets a clean slate in the child. */
555
+ /*
556
+ * VM_WIPEONFORK gets a clean slate in the child.
557
+ * Don't prepare anon_vma until fault since we don't
558
+ * copy page for current vma.
559
+ */
509560 tmp->anon_vma = NULL;
510
- if (anon_vma_prepare(tmp))
511
- goto fail_nomem_anon_vma_fork;
512561 } else if (anon_vma_fork(tmp, mpnt))
513562 goto fail_nomem_anon_vma_fork;
514563 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
515
- tmp->vm_next = tmp->vm_prev = NULL;
516564 file = tmp->vm_file;
517565 if (file) {
518566 struct inode *inode = file_inode(file);
....@@ -520,10 +568,10 @@
520568
521569 get_file(file);
522570 if (tmp->vm_flags & VM_DENYWRITE)
523
- atomic_dec(&inode->i_writecount);
571
+ put_write_access(inode);
524572 i_mmap_lock_write(mapping);
525573 if (tmp->vm_flags & VM_SHARED)
526
- atomic_inc(&mapping->i_mmap_writable);
574
+ mapping_allow_writable(mapping);
527575 flush_dcache_mmap_lock(mapping);
528576 /* insert tmp into the share list, just after mpnt */
529577 vma_interval_tree_insert_after(tmp, mpnt,
....@@ -553,8 +601,18 @@
553601 rb_parent = &tmp->vm_rb;
554602
555603 mm->map_count++;
556
- if (!(tmp->vm_flags & VM_WIPEONFORK))
557
- retval = copy_page_range(mm, oldmm, mpnt);
604
+ if (!(tmp->vm_flags & VM_WIPEONFORK)) {
605
+ if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
606
+ /*
607
+ * Mark this VMA as changing to prevent the
608
+ * speculative page fault hanlder to process
609
+ * it until the TLB are flushed below.
610
+ */
611
+ last = mpnt;
612
+ vm_write_begin(mpnt);
613
+ }
614
+ retval = copy_page_range(tmp, mpnt);
615
+ }
558616
559617 if (tmp->vm_ops && tmp->vm_ops->open)
560618 tmp->vm_ops->open(tmp);
....@@ -565,9 +623,25 @@
565623 /* a new mm has just been created */
566624 retval = arch_dup_mmap(oldmm, mm);
567625 out:
568
- up_write(&mm->mmap_sem);
626
+ mmap_write_unlock(mm);
569627 flush_tlb_mm(oldmm);
570
- up_write(&oldmm->mmap_sem);
628
+
629
+ if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
630
+ /*
631
+	 * Since the TLB has been flushed, we can safely unmark the
632
+	 * copied VMAs and allow the speculative page fault handler to
633
+ * process them again.
634
+ * Walk back the VMA list from the last marked VMA.
635
+ */
636
+ for (; last; last = last->vm_prev) {
637
+ if (last->vm_flags & VM_DONTCOPY)
638
+ continue;
639
+ if (!(last->vm_flags & VM_WIPEONFORK))
640
+ vm_write_end(last);
641
+ }
642
+ }
643
+
644
+ mmap_write_unlock(oldmm);
571645 dup_userfaultfd_complete(&uf);
572646 fail_uprobe_end:
573647 uprobe_end_dup_mmap();
....@@ -597,9 +671,9 @@
597671 #else
598672 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
599673 {
600
- down_write(&oldmm->mmap_sem);
674
+ mmap_write_lock(oldmm);
601675 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
602
- up_write(&oldmm->mmap_sem);
676
+ mmap_write_unlock(oldmm);
603677 return 0;
604678 }
605679 #define mm_alloc_pgd(mm) (0)
....@@ -610,12 +684,15 @@
610684 {
611685 int i;
612686
687
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
688
+ "Please make sure 'struct resident_page_types[]' is updated as well");
689
+
613690 for (i = 0; i < NR_MM_COUNTERS; i++) {
614691 long x = atomic_long_read(&mm->rss_stat.count[i]);
615692
616693 if (unlikely(x))
617
- printk(KERN_ALERT "BUG: Bad rss-counter state "
618
- "mm:%p idx:%d val:%ld\n", mm, i, x);
694
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
695
+ mm, resident_page_types[i], x);
619696 }
620697
621698 if (mm_pgtables_bytes(mm))
....@@ -642,26 +719,12 @@
642719 WARN_ON_ONCE(mm == current->active_mm);
643720 mm_free_pgd(mm);
644721 destroy_context(mm);
645
- hmm_mm_destroy(mm);
646
- mmu_notifier_mm_destroy(mm);
722
+ mmu_notifier_subscriptions_destroy(mm);
647723 check_mm(mm);
648724 put_user_ns(mm->user_ns);
649725 free_mm(mm);
650726 }
651727 EXPORT_SYMBOL_GPL(__mmdrop);
652
-
653
-#ifdef CONFIG_PREEMPT_RT_BASE
654
-/*
655
- * RCU callback for delayed mm drop. Not strictly rcu, but we don't
656
- * want another facility to make this work.
657
- */
658
-void __mmdrop_delayed(struct rcu_head *rhp)
659
-{
660
- struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
661
-
662
- __mmdrop(mm);
663
-}
664
-#endif
665728
666729 static void mmdrop_async_fn(struct work_struct *work)
667730 {
....@@ -694,27 +757,17 @@
694757
695758 static inline void put_signal_struct(struct signal_struct *sig)
696759 {
697
- if (atomic_dec_and_test(&sig->sigcnt))
760
+ if (refcount_dec_and_test(&sig->sigcnt))
698761 free_signal_struct(sig);
699762 }
700
-#ifdef CONFIG_PREEMPT_RT_BASE
701
-static
702
-#endif
763
+
703764 void __put_task_struct(struct task_struct *tsk)
704765 {
705766 WARN_ON(!tsk->exit_state);
706
- WARN_ON(atomic_read(&tsk->usage));
767
+ WARN_ON(refcount_read(&tsk->usage));
707768 WARN_ON(tsk == current);
708769
709
- /*
710
- * Remove function-return probe instances associated with this
711
- * task and put them back on the free list.
712
- */
713
- kprobe_flush_task(tsk);
714
-
715
- /* Task is done with its stack. */
716
- put_task_stack(tsk);
717
-
770
+ io_uring_free(tsk);
718771 cgroup_free(tsk);
719772 task_numa_free(tsk, true);
720773 security_task_free(tsk);
....@@ -725,18 +778,15 @@
725778 if (!profile_handoff_task(tsk))
726779 free_task(tsk);
727780 }
728
-#ifndef CONFIG_PREEMPT_RT_BASE
729781 EXPORT_SYMBOL_GPL(__put_task_struct);
730
-#else
731
-void __put_task_struct_cb(struct rcu_head *rhp)
782
+
783
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
732784 {
733
- struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
785
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
734786
735
- __put_task_struct(tsk);
736
-
787
+ __put_task_struct(task);
737788 }
738
-EXPORT_SYMBOL_GPL(__put_task_struct_cb);
739
-#endif
789
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
740790
741791 void __init __weak arch_task_cache_init(void) { }
742792
....@@ -746,15 +796,16 @@
746796 static void set_max_threads(unsigned int max_threads_suggested)
747797 {
748798 u64 threads;
799
+ unsigned long nr_pages = totalram_pages();
749800
750801 /*
751802 * The number of threads shall be limited such that the thread
752803 * structures may only consume a small part of the available memory.
753804 */
754
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
805
+ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
755806 threads = MAX_THREADS;
756807 else
757
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
808
+ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
758809 (u64) THREAD_SIZE * 8UL);
759810
760811 if (threads > max_threads_suggested)
....@@ -768,6 +819,7 @@
768819 int arch_task_struct_size __read_mostly;
769820 #endif
770821
822
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
771823 static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
772824 {
773825 /* Fetch thread_struct whitelist for the architecture. */
....@@ -782,6 +834,7 @@
782834 else
783835 *offset += offsetof(struct task_struct, thread);
784836 }
837
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
785838
786839 void __init fork_init(void)
787840 {
....@@ -823,6 +876,7 @@
823876 scs_init();
824877
825878 lockdep_init_task(&init_task);
879
+ uprobes_init();
826880 }
827881
828882 int __weak arch_dup_task_struct(struct task_struct *dst,
....@@ -844,7 +898,7 @@
844898 {
845899 struct task_struct *tsk;
846900 unsigned long *stack;
847
- struct vm_struct *stack_vm_area;
901
+ struct vm_struct *stack_vm_area __maybe_unused;
848902 int err;
849903
850904 if (node == NUMA_NO_NODE)
....@@ -856,6 +910,9 @@
856910 stack = alloc_thread_stack_node(tsk, node);
857911 if (!stack)
858912 goto free_tsk;
913
+
914
+ if (memcg_charge_kernel_stack(tsk))
915
+ goto free_stack;
859916
860917 stack_vm_area = task_stack_vm_area(tsk);
861918
....@@ -871,7 +928,7 @@
871928 tsk->stack_vm_area = stack_vm_area;
872929 #endif
873930 #ifdef CONFIG_THREAD_INFO_IN_TASK
874
- atomic_set(&tsk->stack_refcount, 1);
931
+ refcount_set(&tsk->stack_refcount, 1);
875932 #endif
876933
877934 if (err)
....@@ -903,17 +960,19 @@
903960 tsk->cpus_ptr = &tsk->cpus_mask;
904961
905962 /*
906
- * One for us, one for whoever does the "release_task()" (usually
907
- * parent)
963
+ * One for the user space visible state that goes away when reaped.
964
+ * One for the scheduler.
908965 */
909
- atomic_set(&tsk->usage, 2);
966
+ refcount_set(&tsk->rcu_users, 2);
967
+ /* One for the rcu users */
968
+ refcount_set(&tsk->usage, 1);
910969 #ifdef CONFIG_BLK_DEV_IO_TRACE
911970 tsk->btrace_seq = 0;
912971 #endif
913972 tsk->splice_pipe = NULL;
914973 tsk->task_frag.page = NULL;
915974 tsk->wake_q.next = NULL;
916
- tsk->wake_q_sleeper.next = NULL;
975
+ tsk->pf_io_worker = NULL;
917976
918977 account_kernel_stack(tsk, 1);
919978
....@@ -931,6 +990,11 @@
931990 #ifdef CONFIG_MEMCG
932991 tsk->active_memcg = NULL;
933992 #endif
993
+
994
+ android_init_vendor_data(tsk, 1);
995
+ android_init_oem_data(tsk, 1);
996
+
997
+ trace_android_vh_dup_task_struct(tsk, orig);
934998 return tsk;
935999
9361000 free_stack:
....@@ -980,6 +1044,13 @@
9801044 #endif
9811045 }
9821046
1047
+static void mm_init_pasid(struct mm_struct *mm)
1048
+{
1049
+#ifdef CONFIG_IOMMU_SUPPORT
1050
+ mm->pasid = INIT_PASID;
1051
+#endif
1052
+}
1053
+
9831054 static void mm_init_uprobes_state(struct mm_struct *mm)
9841055 {
9851056 #ifdef CONFIG_UPROBES
....@@ -993,24 +1064,30 @@
9931064 mm->mmap = NULL;
9941065 mm->mm_rb = RB_ROOT;
9951066 mm->vmacache_seqnum = 0;
1067
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
1068
+ rwlock_init(&mm->mm_rb_lock);
1069
+#endif
9961070 atomic_set(&mm->mm_users, 1);
9971071 atomic_set(&mm->mm_count, 1);
998
- init_rwsem(&mm->mmap_sem);
1072
+ seqcount_init(&mm->write_protect_seq);
1073
+ mmap_init_lock(mm);
9991074 INIT_LIST_HEAD(&mm->mmlist);
10001075 mm->core_state = NULL;
10011076 mm_pgtables_bytes_init(mm);
10021077 mm->map_count = 0;
10031078 mm->locked_vm = 0;
1004
- mm->pinned_vm = 0;
1079
+ atomic_set(&mm->has_pinned, 0);
1080
+ atomic64_set(&mm->pinned_vm, 0);
10051081 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
10061082 spin_lock_init(&mm->page_table_lock);
10071083 spin_lock_init(&mm->arg_lock);
10081084 mm_init_cpumask(mm);
10091085 mm_init_aio(mm);
10101086 mm_init_owner(mm, p);
1087
+ mm_init_pasid(mm);
10111088 RCU_INIT_POINTER(mm->exe_file, NULL);
1012
- mmu_notifier_mm_init(mm);
1013
- hmm_mm_init(mm);
1089
+ if (!mmu_notifier_subscriptions_init(mm))
1090
+ goto fail_nopgd;
10141091 init_tlb_flush_pending(mm);
10151092 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
10161093 mm->pmd_huge_pte = NULL;
....@@ -1085,8 +1162,10 @@
10851162 {
10861163 might_sleep();
10871164
1088
- if (atomic_dec_and_test(&mm->mm_users))
1165
+ if (atomic_dec_and_test(&mm->mm_users)) {
1166
+ trace_android_vh_mmput(NULL);
10891167 __mmput(mm);
1168
+ }
10901169 }
10911170 EXPORT_SYMBOL_GPL(mmput);
10921171
....@@ -1106,6 +1185,7 @@
11061185 schedule_work(&mm->async_put_work);
11071186 }
11081187 }
1188
+EXPORT_SYMBOL_GPL(mmput_async);
11091189 #endif
11101190
11111191 /**
....@@ -1210,7 +1290,7 @@
12101290 struct mm_struct *mm;
12111291 int err;
12121292
1213
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
1293
+ err = down_read_killable(&task->signal->exec_update_lock);
12141294 if (err)
12151295 return ERR_PTR(err);
12161296
....@@ -1220,7 +1300,7 @@
12201300 mmput(mm);
12211301 mm = ERR_PTR(-EACCES);
12221302 }
1223
- mutex_unlock(&task->signal->cred_guard_mutex);
1303
+ up_read(&task->signal->exec_update_lock);
12241304
12251305 return mm;
12261306 }
....@@ -1318,13 +1398,20 @@
13181398 mm_release(tsk, mm);
13191399 }
13201400
1321
-/*
1322
- * Allocate a new mm structure and copy contents from the
1323
- * mm structure of the passed in task structure.
1401
+/**
1402
+ * dup_mm() - duplicates an existing mm structure
1403
+ * @tsk: the task_struct with which the new mm will be associated.
1404
+ * @oldmm: the mm to duplicate.
1405
+ *
1406
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
1407
+ * content into it.
1408
+ *
1409
+ * Return: the duplicated mm or NULL on failure.
13241410 */
1325
-static struct mm_struct *dup_mm(struct task_struct *tsk)
1411
+static struct mm_struct *dup_mm(struct task_struct *tsk,
1412
+ struct mm_struct *oldmm)
13261413 {
1327
- struct mm_struct *mm, *oldmm = current->mm;
1414
+ struct mm_struct *mm;
13281415 int err;
13291416
13301417 mm = allocate_mm();
....@@ -1392,7 +1479,7 @@
13921479 }
13931480
13941481 retval = -ENOMEM;
1395
- mm = dup_mm(tsk);
1482
+ mm = dup_mm(tsk, current->mm);
13961483 if (!mm)
13971484 goto fail_nomem;
13981485
....@@ -1442,7 +1529,7 @@
14421529 goto out;
14431530 }
14441531
1445
- newf = dup_fd(oldf, &error);
1532
+ newf = dup_fd(oldf, NR_OPEN_MAX, &error);
14461533 if (!newf)
14471534 goto out;
14481535
....@@ -1483,24 +1570,29 @@
14831570 struct sighand_struct *sig;
14841571
14851572 if (clone_flags & CLONE_SIGHAND) {
1486
- atomic_inc(&current->sighand->count);
1573
+ refcount_inc(&current->sighand->count);
14871574 return 0;
14881575 }
14891576 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1490
- rcu_assign_pointer(tsk->sighand, sig);
1577
+ RCU_INIT_POINTER(tsk->sighand, sig);
14911578 if (!sig)
14921579 return -ENOMEM;
14931580
1494
- atomic_set(&sig->count, 1);
1581
+ refcount_set(&sig->count, 1);
14951582 spin_lock_irq(&current->sighand->siglock);
14961583 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
14971584 spin_unlock_irq(&current->sighand->siglock);
1585
+
1586
+	/* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
1587
+ if (clone_flags & CLONE_CLEAR_SIGHAND)
1588
+ flush_signal_handlers(tsk, 0);
1589
+
14981590 return 0;
14991591 }
15001592
15011593 void __cleanup_sighand(struct sighand_struct *sighand)
15021594 {
1503
- if (atomic_dec_and_test(&sighand->count)) {
1595
+ if (refcount_dec_and_test(&sighand->count)) {
15041596 signalfd_cleanup(sighand);
15051597 /*
15061598 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
....@@ -1510,28 +1602,17 @@
15101602 }
15111603 }
15121604
1513
-#ifdef CONFIG_POSIX_TIMERS
15141605 /*
15151606 * Initialize POSIX timer handling for a thread group.
15161607 */
15171608 static void posix_cpu_timers_init_group(struct signal_struct *sig)
15181609 {
1610
+ struct posix_cputimers *pct = &sig->posix_cputimers;
15191611 unsigned long cpu_limit;
15201612
15211613 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1522
- if (cpu_limit != RLIM_INFINITY) {
1523
- sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
1524
- sig->cputimer.running = true;
1525
- }
1526
-
1527
- /* The timer lists. */
1528
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
1529
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
1530
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
1614
+ posix_cputimers_group_init(pct, cpu_limit);
15311615 }
1532
-#else
1533
-static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
1534
-#endif
15351616
15361617 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
15371618 {
....@@ -1547,7 +1628,7 @@
15471628
15481629 sig->nr_threads = 1;
15491630 atomic_set(&sig->live, 1);
1550
- atomic_set(&sig->sigcnt, 1);
1631
+ refcount_set(&sig->sigcnt, 1);
15511632
15521633 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
15531634 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
....@@ -1579,6 +1660,7 @@
15791660 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
15801661
15811662 mutex_init(&sig->cred_guard_mutex);
1663
+ init_rwsem(&sig->exec_update_lock);
15821664
15831665 return 0;
15841666 }
....@@ -1633,26 +1715,6 @@
16331715 #endif
16341716 }
16351717
1636
-#ifdef CONFIG_POSIX_TIMERS
1637
-/*
1638
- * Initialize POSIX timer handling for a single task.
1639
- */
1640
-static void posix_cpu_timers_init(struct task_struct *tsk)
1641
-{
1642
-#ifdef CONFIG_PREEMPT_RT_BASE
1643
- tsk->posix_timer_list = NULL;
1644
-#endif
1645
- tsk->cputime_expires.prof_exp = 0;
1646
- tsk->cputime_expires.virt_exp = 0;
1647
- tsk->cputime_expires.sched_exp = 0;
1648
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1649
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
1650
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1651
-}
1652
-#else
1653
-static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
1654
-#endif
1655
-
16561718 static inline void init_task_pid_links(struct task_struct *task)
16571719 {
16581720 enum pid_type type;
....@@ -1684,7 +1746,125 @@
16841746 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
16851747 p->rcu_tasks_idle_cpu = -1;
16861748 #endif /* #ifdef CONFIG_TASKS_RCU */
1749
+#ifdef CONFIG_TASKS_TRACE_RCU
1750
+ p->trc_reader_nesting = 0;
1751
+ p->trc_reader_special.s = 0;
1752
+ INIT_LIST_HEAD(&p->trc_holdout_list);
1753
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
16871754 }
1755
+
1756
+struct pid *pidfd_pid(const struct file *file)
1757
+{
1758
+ if (file->f_op == &pidfd_fops)
1759
+ return file->private_data;
1760
+
1761
+ return ERR_PTR(-EBADF);
1762
+}
1763
+
1764
+static int pidfd_release(struct inode *inode, struct file *file)
1765
+{
1766
+ struct pid *pid = file->private_data;
1767
+
1768
+ file->private_data = NULL;
1769
+ put_pid(pid);
1770
+ return 0;
1771
+}
1772
+
1773
+#ifdef CONFIG_PROC_FS
1774
+/**
1775
+ * pidfd_show_fdinfo - print information about a pidfd
1776
+ * @m: proc fdinfo file
1777
+ * @f: file referencing a pidfd
1778
+ *
1779
+ * Pid:
1780
+ * This function will print the pid that a given pidfd refers to in the
1781
+ * pid namespace of the procfs instance.
1782
+ * If the pid namespace of the process is not a descendant of the pid
1783
+ * namespace of the procfs instance 0 will be shown as its pid. This is
1784
+ * similar to calling getppid() on a process whose parent is outside of
1785
+ * its pid namespace.
1786
+ *
1787
+ * NSpid:
1788
+ * If pid namespaces are supported then this function will also print
1789
+ * the pid of a given pidfd refers to for all descendant pid namespaces
1790
+ * starting from the current pid namespace of the instance, i.e. the
1791
+ * Pid field and the first entry in the NSpid field will be identical.
1792
+ * If the pid namespace of the process is not a descendant of the pid
1793
+ * namespace of the procfs instance 0 will be shown as its first NSpid
1794
+ * entry and no others will be shown.
1795
+ * Note that this differs from the Pid and NSpid fields in
1796
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
1797
+ * the pid namespace of the procfs instance. The difference becomes
1798
+ * obvious when sending around a pidfd between pid namespaces from a
1799
+ * different branch of the tree, i.e. where no ancestral relation is
1800
+ * present between the pid namespaces:
1801
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
1802
+ * namespace (also take care to create new mount namespaces in the
1803
+ * new pid namespace and mount procfs)
1804
+ * - create a process with a pidfd in ns1
1805
+ * - send pidfd from ns1 to ns2
1806
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
1807
+ * have exactly one entry, which is 0
1808
+ */
1809
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1810
+{
1811
+ struct pid *pid = f->private_data;
1812
+ struct pid_namespace *ns;
1813
+ pid_t nr = -1;
1814
+
1815
+ if (likely(pid_has_task(pid, PIDTYPE_PID))) {
1816
+ ns = proc_pid_ns(file_inode(m->file)->i_sb);
1817
+ nr = pid_nr_ns(pid, ns);
1818
+ }
1819
+
1820
+ seq_put_decimal_ll(m, "Pid:\t", nr);
1821
+
1822
+#ifdef CONFIG_PID_NS
1823
+ seq_put_decimal_ll(m, "\nNSpid:\t", nr);
1824
+ if (nr > 0) {
1825
+ int i;
1826
+
1827
+ /* If nr is non-zero it means that 'pid' is valid and that
1828
+ * ns, i.e. the pid namespace associated with the procfs
1829
+ * instance, is in the pid namespace hierarchy of pid.
1830
+ * Start at one below the already printed level.
1831
+ */
1832
+ for (i = ns->level + 1; i <= pid->level; i++)
1833
+ seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
1834
+ }
1835
+#endif
1836
+ seq_putc(m, '\n');
1837
+}
1838
+#endif
1839
+
1840
+/*
1841
+ * Poll support for process exit notification.
1842
+ */
1843
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1844
+{
1845
+ struct pid *pid = file->private_data;
1846
+ __poll_t poll_flags = 0;
1847
+
1848
+ poll_wait(file, &pid->wait_pidfd, pts);
1849
+
1850
+ /*
1851
+ * Inform pollers only when the whole thread group exits.
1852
+ * If the thread group leader exits before all other threads in the
1853
+ * group, then poll(2) should block, similar to the wait(2) family.
1854
+ */
1855
+ if (thread_group_exited(pid))
1856
+ poll_flags = EPOLLIN | EPOLLRDNORM;
1857
+
1858
+ return poll_flags;
1859
+}
1860
+
1861
+const struct file_operations pidfd_fops = {
1862
+ .release = pidfd_release,
1863
+ .poll = pidfd_poll,
1864
+#ifdef CONFIG_PROC_FS
1865
+ .show_fdinfo = pidfd_show_fdinfo,
1866
+#endif
1867
+};
16881868
16891869 static void __delayed_free_task(struct rcu_head *rhp)
16901870 {
....@@ -1699,84 +1879,6 @@
16991879 call_rcu(&tsk->rcu, __delayed_free_task);
17001880 else
17011881 free_task(tsk);
1702
-}
1703
-
1704
-static int pidfd_release(struct inode *inode, struct file *file)
1705
-{
1706
- struct pid *pid = file->private_data;
1707
-
1708
- file->private_data = NULL;
1709
- put_pid(pid);
1710
- return 0;
1711
-}
1712
-
1713
-#ifdef CONFIG_PROC_FS
1714
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1715
-{
1716
- struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1717
- struct pid *pid = f->private_data;
1718
-
1719
- seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1720
- seq_putc(m, '\n');
1721
-}
1722
-#endif
1723
-
1724
-/*
1725
- * Poll support for process exit notification.
1726
- */
1727
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
1728
-{
1729
- struct task_struct *task;
1730
- struct pid *pid = file->private_data;
1731
- __poll_t poll_flags = 0;
1732
-
1733
- poll_wait(file, &pid->wait_pidfd, pts);
1734
-
1735
- rcu_read_lock();
1736
- task = pid_task(pid, PIDTYPE_PID);
1737
- /*
1738
- * Inform pollers only when the whole thread group exits.
1739
- * If the thread group leader exits before all other threads in the
1740
- * group, then poll(2) should block, similar to the wait(2) family.
1741
- */
1742
- if (!task || (task->exit_state && thread_group_empty(task)))
1743
- poll_flags = EPOLLIN | EPOLLRDNORM;
1744
- rcu_read_unlock();
1745
-
1746
- return poll_flags;
1747
-}
1748
-
1749
-const struct file_operations pidfd_fops = {
1750
- .release = pidfd_release,
1751
- .poll = pidfd_poll,
1752
-#ifdef CONFIG_PROC_FS
1753
- .show_fdinfo = pidfd_show_fdinfo,
1754
-#endif
1755
-};
1756
-
1757
-/**
1758
- * pidfd_create() - Create a new pid file descriptor.
1759
- *
1760
- * @pid: struct pid that the pidfd will reference
1761
- *
1762
- * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1763
- *
1764
- * Note, that this function can only be called after the fd table has
1765
- * been unshared to avoid leaking the pidfd to the new process.
1766
- *
1767
- * Return: On success, a cloexec pidfd is returned.
1768
- * On error, a negative errno number will be returned.
1769
- */
1770
-static int pidfd_create(struct pid *pid)
1771
-{
1772
- int fd;
1773
-
1774
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1775
- O_RDWR | O_CLOEXEC);
1776
- if (fd < 0)
1777
- put_pid(pid);
1778
-
1779
- return fd;
17801882 }
17811883
17821884 static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
....@@ -1807,19 +1909,17 @@
18071909 * flags). The actual kick-off is left to the caller.
18081910 */
18091911 static __latent_entropy struct task_struct *copy_process(
1810
- unsigned long clone_flags,
1811
- unsigned long stack_start,
1812
- unsigned long stack_size,
1813
- int __user *parent_tidptr,
1814
- int __user *child_tidptr,
18151912 struct pid *pid,
18161913 int trace,
1817
- unsigned long tls,
1818
- int node)
1914
+ int node,
1915
+ struct kernel_clone_args *args)
18191916 {
18201917 int pidfd = -1, retval;
18211918 struct task_struct *p;
18221919 struct multiprocess_signals delayed;
1920
+ struct file *pidfile = NULL;
1921
+ u64 clone_flags = args->flags;
1922
+ struct nsproxy *nsp = current->nsproxy;
18231923
18241924 /*
18251925 * Don't allow sharing the root directory with processes in a different
....@@ -1862,21 +1962,26 @@
18621962 */
18631963 if (clone_flags & CLONE_THREAD) {
18641964 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1865
- (task_active_pid_ns(current) !=
1866
- current->nsproxy->pid_ns_for_children))
1965
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
1966
+ return ERR_PTR(-EINVAL);
1967
+ }
1968
+
1969
+ /*
1970
+ * If the new process will be in a different time namespace
1971
+ * do not allow it to share VM or a thread group with the forking task.
1972
+ */
1973
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
1974
+ if (nsp->time_ns != nsp->time_ns_for_children)
18671975 return ERR_PTR(-EINVAL);
18681976 }
18691977
18701978 if (clone_flags & CLONE_PIDFD) {
18711979 /*
1872
- * - CLONE_PARENT_SETTID is useless for pidfds and also
1873
- * parent_tidptr is used to return pidfds.
18741980 * - CLONE_DETACHED is blocked so that we can potentially
18751981 * reuse it later for CLONE_PIDFD.
18761982 * - CLONE_THREAD is blocked until someone really needs it.
18771983 */
1878
- if (clone_flags &
1879
- (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1984
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
18801985 return ERR_PTR(-EINVAL);
18811986 }
18821987
....@@ -1895,13 +2000,21 @@
18952000 recalc_sigpending();
18962001 spin_unlock_irq(&current->sighand->siglock);
18972002 retval = -ERESTARTNOINTR;
1898
- if (signal_pending(current))
2003
+ if (task_sigpending(current))
18992004 goto fork_out;
19002005
19012006 retval = -ENOMEM;
19022007 p = dup_task_struct(current, node);
19032008 if (!p)
19042009 goto fork_out;
2010
+ if (args->io_thread) {
2011
+ /*
2012
+ * Mark us an IO worker, and block any signal that isn't
2013
+ * fatal or STOP
2014
+ */
2015
+ p->flags |= PF_IO_WORKER;
2016
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2017
+ }
19052018
19062019 cpufreq_task_times_init(p);
19072020
....@@ -1911,18 +2024,18 @@
19112024 * p->set_child_tid which is (ab)used as a kthread's data pointer for
19122025 * kernel threads (PF_KTHREAD).
19132026 */
1914
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
2027
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
19152028 /*
19162029 * Clear TID on mm_release()?
19172030 */
1918
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
2031
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
19192032
19202033 ftrace_graph_init_task(p);
19212034
19222035 rt_mutex_init_task(p);
19232036
2037
+ lockdep_assert_irqs_enabled();
19242038 #ifdef CONFIG_PROVE_LOCKING
1925
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
19262039 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
19272040 #endif
19282041 retval = -EAGAIN;
....@@ -1944,7 +2057,7 @@
19442057 * to stop root fork bombs.
19452058 */
19462059 retval = -EAGAIN;
1947
- if (nr_threads >= max_threads)
2060
+ if (data_race(nr_threads >= max_threads))
19482061 goto bad_fork_cleanup_count;
19492062
19502063 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
....@@ -1957,7 +2070,6 @@
19572070 spin_lock_init(&p->alloc_lock);
19582071
19592072 init_sigpending(&p->pending);
1960
- p->sigqueue_cache = NULL;
19612073
19622074 p->utime = p->stime = p->gtime = 0;
19632075 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
....@@ -1969,6 +2081,10 @@
19692081 seqcount_init(&p->vtime.seqcount);
19702082 p->vtime.starttime = 0;
19712083 p->vtime.state = VTIME_INACTIVE;
2084
+#endif
2085
+
2086
+#ifdef CONFIG_IO_URING
2087
+ p->io_uring = NULL;
19722088 #endif
19732089
19742090 #if defined(SPLIT_RSS_COUNTING)
....@@ -1984,7 +2100,7 @@
19842100 task_io_accounting_init(&p->ioac);
19852101 acct_clear_integrals(p);
19862102
1987
- posix_cpu_timers_init(p);
2103
+ posix_cputimers_init(&p->posix_cputimers);
19882104
19892105 p->io_context = NULL;
19902106 audit_set_context(p, NULL);
....@@ -2000,30 +2116,19 @@
20002116 #ifdef CONFIG_CPUSETS
20012117 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
20022118 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
2003
- seqcount_init(&p->mems_allowed_seq);
2119
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
20042120 #endif
20052121 #ifdef CONFIG_TRACE_IRQFLAGS
2006
- p->irq_events = 0;
2007
- p->hardirqs_enabled = 0;
2008
- p->hardirq_enable_ip = 0;
2009
- p->hardirq_enable_event = 0;
2010
- p->hardirq_disable_ip = _THIS_IP_;
2011
- p->hardirq_disable_event = 0;
2012
- p->softirqs_enabled = 1;
2013
- p->softirq_enable_ip = _THIS_IP_;
2014
- p->softirq_enable_event = 0;
2015
- p->softirq_disable_ip = 0;
2016
- p->softirq_disable_event = 0;
2017
- p->hardirq_context = 0;
2018
- p->softirq_context = 0;
2122
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2123
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2124
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
2125
+ p->softirqs_enabled = 1;
2126
+ p->softirq_context = 0;
20192127 #endif
20202128
20212129 p->pagefault_disabled = 0;
20222130
20232131 #ifdef CONFIG_LOCKDEP
2024
- p->lockdep_depth = 0; /* no locks held yet */
2025
- p->curr_chain_key = 0;
2026
- p->lockdep_recursion = 0;
20272132 lockdep_init_task(p);
20282133 #endif
20292134
....@@ -2075,12 +2180,15 @@
20752180 retval = copy_io(clone_flags, p);
20762181 if (retval)
20772182 goto bad_fork_cleanup_namespaces;
2078
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
2183
+ retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
20792184 if (retval)
20802185 goto bad_fork_cleanup_io;
20812186
2187
+ stackleak_task_init(p);
2188
+
20822189 if (pid != &init_struct_pid) {
2083
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
2190
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2191
+ args->set_tid_size);
20842192 if (IS_ERR(pid)) {
20852193 retval = PTR_ERR(pid);
20862194 goto bad_fork_cleanup_thread;
....@@ -2093,12 +2201,22 @@
20932201 * if the fd table isn't shared).
20942202 */
20952203 if (clone_flags & CLONE_PIDFD) {
2096
- retval = pidfd_create(pid);
2204
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
20972205 if (retval < 0)
20982206 goto bad_fork_free_pid;
20992207
21002208 pidfd = retval;
2101
- retval = put_user(pidfd, parent_tidptr);
2209
+
2210
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2211
+ O_RDWR | O_CLOEXEC);
2212
+ if (IS_ERR(pidfile)) {
2213
+ put_unused_fd(pidfd);
2214
+ retval = PTR_ERR(pidfile);
2215
+ goto bad_fork_free_pid;
2216
+ }
2217
+ get_pid(pid); /* held by pidfile now */
2218
+
2219
+ retval = put_user(pidfd, args->pidfd);
21022220 if (retval)
21032221 goto bad_fork_put_pidfd;
21042222 }
....@@ -2123,7 +2241,7 @@
21232241 #ifdef TIF_SYSCALL_EMU
21242242 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
21252243 #endif
2126
- clear_all_latency_tracing(p);
2244
+ clear_tsk_latency_tracing(p);
21272245
21282246 /* ok, now we should be set up.. */
21292247 p->pid = pid_nr(pid);
....@@ -2142,17 +2260,28 @@
21422260 p->pdeath_signal = 0;
21432261 INIT_LIST_HEAD(&p->thread_group);
21442262 p->task_works = NULL;
2263
+ clear_posix_cputimers_work(p);
21452264
2146
- cgroup_threadgroup_change_begin(current);
21472265 /*
21482266 * Ensure that the cgroup subsystem policies allow the new process to be
2149
- * forked. It should be noted the the new process's css_set can be changed
2267
+ * forked. It should be noted that the new process's css_set can be changed
21502268 * between here and cgroup_post_fork() if an organisation operation is in
21512269 * progress.
21522270 */
2153
- retval = cgroup_can_fork(p);
2271
+ retval = cgroup_can_fork(p, args);
21542272 if (retval)
2155
- goto bad_fork_cgroup_threadgroup_change_end;
2273
+ goto bad_fork_put_pidfd;
2274
+
2275
+ /*
2276
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
2277
+ * the new task on the correct runqueue. All this *before* the task
2278
+ * becomes visible.
2279
+ *
2280
+ * This isn't part of ->can_fork() because while the re-cloning is
2281
+ * cgroup specific, it unconditionally needs to place the task on a
2282
+ * runqueue.
2283
+ */
2284
+ sched_cgroup_fork(p, args);
21562285
21572286 /*
21582287 * From this point on we must avoid any synchronous user-space
....@@ -2163,7 +2292,7 @@
21632292 */
21642293
21652294 p->start_time = ktime_get_ns();
2166
- p->real_start_time = ktime_get_boot_ns();
2295
+ p->start_boottime = ktime_get_boottime_ns();
21672296
21682297 /*
21692298 * Make it visible to the rest of the system, but dont wake it up yet.
....@@ -2182,18 +2311,12 @@
21822311 } else {
21832312 p->real_parent = current;
21842313 p->parent_exec_id = current->self_exec_id;
2185
- p->exit_signal = (clone_flags & CSIGNAL);
2314
+ p->exit_signal = args->exit_signal;
21862315 }
21872316
21882317 klp_copy_process(p);
21892318
21902319 spin_lock(&current->sighand->siglock);
2191
-
2192
- /*
2193
- * Copy seccomp details explicitly here, in case they were changed
2194
- * before holding sighand lock.
2195
- */
2196
- copy_seccomp(p);
21972320
21982321 rseq_fork(p, clone_flags);
21992322
....@@ -2209,6 +2332,13 @@
22092332 goto bad_fork_cancel_cgroup;
22102333 }
22112334
2335
+ /* No more failure paths after this point. */
2336
+
2337
+ /*
2338
+ * Copy seccomp details explicitly here, in case they were changed
2339
+ * before holding sighand lock.
2340
+ */
2341
+ copy_seccomp(p);
22122342
22132343 init_task_pid_links(p);
22142344 if (likely(p->pid)) {
....@@ -2242,7 +2372,7 @@
22422372 } else {
22432373 current->signal->nr_threads++;
22442374 atomic_inc(&current->signal->live);
2245
- atomic_inc(&current->signal->sigcnt);
2375
+ refcount_inc(&current->signal->sigcnt);
22462376 task_join_group_stop(p);
22472377 list_add_tail_rcu(&p->thread_group,
22482378 &p->group_leader->thread_group);
....@@ -2258,9 +2388,12 @@
22582388 syscall_tracepoint_update(p);
22592389 write_unlock_irq(&tasklist_lock);
22602390
2391
+ if (pidfile)
2392
+ fd_install(pidfd, pidfile);
2393
+
22612394 proc_fork_connector(p);
2262
- cgroup_post_fork(p);
2263
- cgroup_threadgroup_change_end(current);
2395
+ sched_post_fork(p);
2396
+ cgroup_post_fork(p, args);
22642397 perf_event_fork(p);
22652398
22662399 trace_task_newtask(p, clone_flags);
....@@ -2273,12 +2406,12 @@
22732406 bad_fork_cancel_cgroup:
22742407 spin_unlock(&current->sighand->siglock);
22752408 write_unlock_irq(&tasklist_lock);
2276
- cgroup_cancel_fork(p);
2277
-bad_fork_cgroup_threadgroup_change_end:
2278
- cgroup_threadgroup_change_end(current);
2409
+ cgroup_cancel_fork(p, args);
22792410 bad_fork_put_pidfd:
2280
- if (clone_flags & CLONE_PIDFD)
2281
- ksys_close(pidfd);
2411
+ if (clone_flags & CLONE_PIDFD) {
2412
+ fput(pidfile);
2413
+ put_unused_fd(pidfd);
2414
+ }
22822415 bad_fork_free_pid:
22832416 if (pid != &init_struct_pid)
22842417 free_pid(pid);
....@@ -2342,11 +2475,14 @@
23422475 }
23432476 }
23442477
2345
-struct task_struct *fork_idle(int cpu)
2478
+struct task_struct * __init fork_idle(int cpu)
23462479 {
23472480 struct task_struct *task;
2348
- task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
2349
- cpu_to_node(cpu));
2481
+ struct kernel_clone_args args = {
2482
+ .flags = CLONE_VM,
2483
+ };
2484
+
2485
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
23502486 if (!IS_ERR(task)) {
23512487 init_idle_pids(task);
23522488 init_idle(task, cpu);
....@@ -2356,23 +2492,57 @@
23562492 }
23572493
23582494 /*
2495
+ * This is like kernel_clone(), but shaved down and tailored to just
2496
+ * creating io_uring workers. It returns a created task, or an error pointer.
2497
+ * The returned task is inactive, and the caller must fire it up through
2498
+ * wake_up_new_task(p). All signals are blocked in the created task.
2499
+ */
2500
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2501
+{
2502
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2503
+ CLONE_IO;
2504
+ struct kernel_clone_args args = {
2505
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
2506
+ CLONE_UNTRACED) & ~CSIGNAL),
2507
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2508
+ .stack = (unsigned long)fn,
2509
+ .stack_size = (unsigned long)arg,
2510
+ .io_thread = 1,
2511
+ };
2512
+
2513
+ return copy_process(NULL, 0, node, &args);
2514
+}
2515
+
2516
+/*
23592517 * Ok, this is the main fork-routine.
23602518 *
23612519 * It copies the process, and if successful kick-starts
23622520 * it and waits for it to finish using the VM if required.
2521
+ *
2522
+ * args->exit_signal is expected to be checked for sanity by the caller.
23632523 */
2364
-long _do_fork(unsigned long clone_flags,
2365
- unsigned long stack_start,
2366
- unsigned long stack_size,
2367
- int __user *parent_tidptr,
2368
- int __user *child_tidptr,
2369
- unsigned long tls)
2524
+pid_t kernel_clone(struct kernel_clone_args *args)
23702525 {
2526
+ u64 clone_flags = args->flags;
23712527 struct completion vfork;
23722528 struct pid *pid;
23732529 struct task_struct *p;
23742530 int trace = 0;
2375
- long nr;
2531
+ pid_t nr;
2532
+
2533
+ /*
2534
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2535
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2536
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2537
+ * field in struct clone_args and it still doesn't make sense to have
2538
+ * them both point at the same memory location. Performing this check
2539
+ * here has the advantage that we don't need to have a separate helper
2540
+ * to check for legacy clone().
2541
+ */
2542
+ if ((args->flags & CLONE_PIDFD) &&
2543
+ (args->flags & CLONE_PARENT_SETTID) &&
2544
+ (args->pidfd == args->parent_tid))
2545
+ return -EINVAL;
23762546
23772547 /*
23782548 * Determine whether and which event to report to ptracer. When
....@@ -2383,7 +2553,7 @@
23832553 if (!(clone_flags & CLONE_UNTRACED)) {
23842554 if (clone_flags & CLONE_VFORK)
23852555 trace = PTRACE_EVENT_VFORK;
2386
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
2556
+ else if (args->exit_signal != SIGCHLD)
23872557 trace = PTRACE_EVENT_CLONE;
23882558 else
23892559 trace = PTRACE_EVENT_FORK;
....@@ -2392,8 +2562,7 @@
23922562 trace = 0;
23932563 }
23942564
2395
- p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
2396
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2565
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
23972566 add_latent_entropy();
23982567
23992568 if (IS_ERR(p))
....@@ -2411,7 +2580,7 @@
24112580 nr = pid_vnr(pid);
24122581
24132582 if (clone_flags & CLONE_PARENT_SETTID)
2414
- put_user(nr, parent_tidptr);
2583
+ put_user(nr, args->parent_tid);
24152584
24162585 if (clone_flags & CLONE_VFORK) {
24172586 p->vfork_done = &vfork;
....@@ -2434,34 +2603,31 @@
24342603 return nr;
24352604 }
24362605
2437
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
2438
-/* For compatibility with architectures that call do_fork directly rather than
2439
- * using the syscall entry points below. */
2440
-long do_fork(unsigned long clone_flags,
2441
- unsigned long stack_start,
2442
- unsigned long stack_size,
2443
- int __user *parent_tidptr,
2444
- int __user *child_tidptr)
2445
-{
2446
- return _do_fork(clone_flags, stack_start, stack_size,
2447
- parent_tidptr, child_tidptr, 0);
2448
-}
2449
-#endif
2450
-
24512606 /*
24522607 * Create a kernel thread.
24532608 */
24542609 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
24552610 {
2456
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
2457
- (unsigned long)arg, NULL, NULL, 0);
2611
+ struct kernel_clone_args args = {
2612
+ .flags = ((lower_32_bits(flags) | CLONE_VM |
2613
+ CLONE_UNTRACED) & ~CSIGNAL),
2614
+ .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2615
+ .stack = (unsigned long)fn,
2616
+ .stack_size = (unsigned long)arg,
2617
+ };
2618
+
2619
+ return kernel_clone(&args);
24582620 }
24592621
24602622 #ifdef __ARCH_WANT_SYS_FORK
24612623 SYSCALL_DEFINE0(fork)
24622624 {
24632625 #ifdef CONFIG_MMU
2464
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
2626
+ struct kernel_clone_args args = {
2627
+ .exit_signal = SIGCHLD,
2628
+ };
2629
+
2630
+ return kernel_clone(&args);
24652631 #else
24662632 /* can not support in nommu mode */
24672633 return -EINVAL;
....@@ -2472,8 +2638,12 @@
24722638 #ifdef __ARCH_WANT_SYS_VFORK
24732639 SYSCALL_DEFINE0(vfork)
24742640 {
2475
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
2476
- 0, NULL, NULL, 0);
2641
+ struct kernel_clone_args args = {
2642
+ .flags = CLONE_VFORK | CLONE_VM,
2643
+ .exit_signal = SIGCHLD,
2644
+ };
2645
+
2646
+ return kernel_clone(&args);
24772647 }
24782648 #endif
24792649
....@@ -2501,7 +2671,175 @@
25012671 unsigned long, tls)
25022672 #endif
25032673 {
2504
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
2674
+ struct kernel_clone_args args = {
2675
+ .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2676
+ .pidfd = parent_tidptr,
2677
+ .child_tid = child_tidptr,
2678
+ .parent_tid = parent_tidptr,
2679
+ .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2680
+ .stack = newsp,
2681
+ .tls = tls,
2682
+ };
2683
+
2684
+ return kernel_clone(&args);
2685
+}
2686
+#endif
2687
+
2688
+#ifdef __ARCH_WANT_SYS_CLONE3
2689
+
2690
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2691
+ struct clone_args __user *uargs,
2692
+ size_t usize)
2693
+{
2694
+ int err;
2695
+ struct clone_args args;
2696
+ pid_t *kset_tid = kargs->set_tid;
2697
+
2698
+ BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2699
+ CLONE_ARGS_SIZE_VER0);
2700
+ BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2701
+ CLONE_ARGS_SIZE_VER1);
2702
+ BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2703
+ CLONE_ARGS_SIZE_VER2);
2704
+ BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2705
+
2706
+ if (unlikely(usize > PAGE_SIZE))
2707
+ return -E2BIG;
2708
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2709
+ return -EINVAL;
2710
+
2711
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2712
+ if (err)
2713
+ return err;
2714
+
2715
+ if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2716
+ return -EINVAL;
2717
+
2718
+ if (unlikely(!args.set_tid && args.set_tid_size > 0))
2719
+ return -EINVAL;
2720
+
2721
+ if (unlikely(args.set_tid && args.set_tid_size == 0))
2722
+ return -EINVAL;
2723
+
2724
+ /*
2725
+ * Verify that higher 32bits of exit_signal are unset and that
2726
+ * it is a valid signal
2727
+ */
2728
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2729
+ !valid_signal(args.exit_signal)))
2730
+ return -EINVAL;
2731
+
2732
+ if ((args.flags & CLONE_INTO_CGROUP) &&
2733
+ (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
2734
+ return -EINVAL;
2735
+
2736
+ *kargs = (struct kernel_clone_args){
2737
+ .flags = args.flags,
2738
+ .pidfd = u64_to_user_ptr(args.pidfd),
2739
+ .child_tid = u64_to_user_ptr(args.child_tid),
2740
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
2741
+ .exit_signal = args.exit_signal,
2742
+ .stack = args.stack,
2743
+ .stack_size = args.stack_size,
2744
+ .tls = args.tls,
2745
+ .set_tid_size = args.set_tid_size,
2746
+ .cgroup = args.cgroup,
2747
+ };
2748
+
2749
+ if (args.set_tid &&
2750
+ copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2751
+ (kargs->set_tid_size * sizeof(pid_t))))
2752
+ return -EFAULT;
2753
+
2754
+ kargs->set_tid = kset_tid;
2755
+
2756
+ return 0;
2757
+}
2758
+
2759
+/**
2760
+ * clone3_stack_valid - check and prepare stack
2761
+ * @kargs: kernel clone args
2762
+ *
2763
+ * Verify that the stack arguments userspace gave us are sane.
2764
+ * In addition, set the stack direction for userspace since it's easy for us to
2765
+ * determine.
2766
+ */
2767
+static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2768
+{
2769
+ if (kargs->stack == 0) {
2770
+ if (kargs->stack_size > 0)
2771
+ return false;
2772
+ } else {
2773
+ if (kargs->stack_size == 0)
2774
+ return false;
2775
+
2776
+ if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2777
+ return false;
2778
+
2779
+#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
2780
+ kargs->stack += kargs->stack_size;
2781
+#endif
2782
+ }
2783
+
2784
+ return true;
2785
+}
2786
+
2787
+static bool clone3_args_valid(struct kernel_clone_args *kargs)
2788
+{
2789
+ /* Verify that no unknown flags are passed along. */
2790
+ if (kargs->flags &
2791
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2792
+ return false;
2793
+
2794
+ /*
2795
+	 * - make the CLONE_DETACHED bit reusable for clone3
2796
+ * - make the CSIGNAL bits reuseable for clone3
2797
+	 * - make the CSIGNAL bits reusable for clone3
2798
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
2799
+ return false;
2800
+
2801
+ if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2802
+ (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2803
+ return false;
2804
+
2805
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2806
+ kargs->exit_signal)
2807
+ return false;
2808
+
2809
+ if (!clone3_stack_valid(kargs))
2810
+ return false;
2811
+
2812
+ return true;
2813
+}
2814
+
2815
+/**
2816
+ * clone3 - create a new process with specific properties
2817
+ * @uargs: argument structure
2818
+ * @size: size of @uargs
2819
+ *
2820
+ * clone3() is the extensible successor to clone()/clone2().
2821
+ * It takes a struct as argument that is versioned by its size.
2822
+ *
2823
+ * Return: On success, a positive PID for the child process.
2824
+ * On error, a negative errno number.
2825
+ */
2826
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2827
+{
2828
+ int err;
2829
+
2830
+ struct kernel_clone_args kargs;
2831
+ pid_t set_tid[MAX_PID_NS_LEVEL];
2832
+
2833
+ kargs.set_tid = set_tid;
2834
+
2835
+ err = copy_clone_args_from_user(&kargs, uargs, size);
2836
+ if (err)
2837
+ return err;
2838
+
2839
+ if (!clone3_args_valid(&kargs))
2840
+ return -EINVAL;
2841
+
2842
+ return kernel_clone(&kargs);
25052843 }
25062844 #endif
25072845
....@@ -2549,10 +2887,27 @@
25492887 init_waitqueue_head(&sighand->signalfd_wqh);
25502888 }
25512889
2552
-void __init proc_caches_init(void)
2890
+void __init mm_cache_init(void)
25532891 {
25542892 unsigned int mm_size;
25552893
2894
+ /*
2895
+ * The mm_cpumask is located at the end of mm_struct, and is
2896
+ * dynamically sized based on the maximum CPU number this system
2897
+ * can have, taking hotplug into account (nr_cpu_ids).
2898
+ */
2899
+ mm_size = sizeof(struct mm_struct) + cpumask_size();
2900
+
2901
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
2902
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
2903
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2904
+ offsetof(struct mm_struct, saved_auxv),
2905
+ sizeof_field(struct mm_struct, saved_auxv),
2906
+ NULL);
2907
+}
2908
+
2909
+void __init proc_caches_init(void)
2910
+{
25562911 sighand_cachep = kmem_cache_create("sighand_cache",
25572912 sizeof(struct sighand_struct), 0,
25582913 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
....@@ -2570,19 +2925,6 @@
25702925 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
25712926 NULL);
25722927
2573
- /*
2574
- * The mm_cpumask is located at the end of mm_struct, and is
2575
- * dynamically sized based on the maximum CPU number this system
2576
- * can have, taking hotplug into account (nr_cpu_ids).
2577
- */
2578
- mm_size = sizeof(struct mm_struct) + cpumask_size();
2579
-
2580
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
2581
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
2582
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2583
- offsetof(struct mm_struct, saved_auxv),
2584
- sizeof_field(struct mm_struct, saved_auxv),
2585
- NULL);
25862928 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
25872929 mmap_init();
25882930 nsproxy_cache_init();
....@@ -2596,7 +2938,8 @@
25962938 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
25972939 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
25982940 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
2599
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
2941
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
2942
+ CLONE_NEWTIME))
26002943 return -EINVAL;
26012944 /*
26022945 * Not implemented, but pretend it works if there is nothing
....@@ -2609,7 +2952,7 @@
26092952 return -EINVAL;
26102953 }
26112954 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
2612
- if (atomic_read(&current->sighand->count) > 1)
2955
+ if (refcount_read(&current->sighand->count) > 1)
26132956 return -EINVAL;
26142957 }
26152958 if (unshare_flags & CLONE_VM) {
....@@ -2644,14 +2987,15 @@
26442987 /*
26452988 * Unshare file descriptor table if it is being shared
26462989 */
2647
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
2990
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
2991
+ struct files_struct **new_fdp)
26482992 {
26492993 struct files_struct *fd = current->files;
26502994 int error = 0;
26512995
26522996 if ((unshare_flags & CLONE_FILES) &&
26532997 (fd && atomic_read(&fd->count) > 1)) {
2654
- *new_fdp = dup_fd(fd, &error);
2998
+ *new_fdp = dup_fd(fd, max_fds, &error);
26552999 if (!*new_fdp)
26563000 return error;
26573001 }
....@@ -2662,7 +3006,7 @@
26623006 /*
26633007 * unshare allows a process to 'unshare' part of the process
26643008 * context which was originally shared using clone. copy_*
2665
- * functions used by do_fork() cannot be used here directly
3009
+ * functions used by kernel_clone() cannot be used here directly
26663010 * because they modify an inactive task_struct that is being
26673011 * constructed. Here we are modifying the current, active,
26683012 * task_struct.
....@@ -2711,7 +3055,7 @@
27113055 err = unshare_fs(unshare_flags, &new_fs);
27123056 if (err)
27133057 goto bad_unshare_out;
2714
- err = unshare_fd(unshare_flags, &new_fd);
3058
+ err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
27153059 if (err)
27163060 goto bad_unshare_cleanup_fs;
27173061 err = unshare_userns(unshare_flags, &new_cred);
....@@ -2800,7 +3144,7 @@
28003144 struct files_struct *copy = NULL;
28013145 int error;
28023146
2803
- error = unshare_fd(CLONE_FILES, &copy);
3147
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
28043148 if (error || !copy) {
28053149 *displaced = NULL;
28063150 return error;
....@@ -2813,7 +3157,7 @@
28133157 }
28143158
28153159 int sysctl_max_threads(struct ctl_table *table, int write,
2816
- void __user *buffer, size_t *lenp, loff_t *ppos)
3160
+ void *buffer, size_t *lenp, loff_t *ppos)
28173161 {
28183162 struct ctl_table t;
28193163 int ret;
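
Below is a minimal, hypothetical userspace sketch (not part of the patch) showing how the clone3() syscall and the pidfd poll support introduced above could be exercised. It assumes the toolchain's <linux/sched.h> provides struct clone_args and CLONE_PIDFD and that <sys/syscall.h> defines SYS_clone3; error handling is kept minimal.

/* Hypothetical sketch: fork a child via clone3() and wait for it through its pidfd. */
#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <sys/syscall.h>	/* SYS_clone3 */
#include <sys/wait.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;
	long child;

	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;		/* ask the kernel for a pidfd */
	args.pidfd = (uint64_t)(uintptr_t)&pidfd;
	args.exit_signal = SIGCHLD;

	/* clone3() is versioned by the size of struct clone_args. */
	child = syscall(SYS_clone3, &args, sizeof(args));
	if (child < 0) {
		perror("clone3");
		return 1;
	}
	if (child == 0)
		_exit(0);			/* child: exit immediately */

	/* pidfd_poll() reports (E)POLLIN once the whole thread group exits. */
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
		printf("child %ld exited, pidfd %d readable\n", child, pidfd);

	waitpid(child, NULL, 0);		/* reap the child */
	close(pidfd);
	return 0;
}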