.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/kernel/fork.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
39 | 40 | #include <linux/binfmts.h> |
---|
40 | 41 | #include <linux/mman.h> |
---|
41 | 42 | #include <linux/mmu_notifier.h> |
---|
42 | | -#include <linux/hmm.h> |
---|
43 | 43 | #include <linux/fs.h> |
---|
44 | 44 | #include <linux/mm.h> |
---|
45 | | -#include <linux/kprobes.h> |
---|
46 | 45 | #include <linux/vmacache.h> |
---|
47 | 46 | #include <linux/nsproxy.h> |
---|
48 | 47 | #include <linux/capability.h> |
---|
.. | .. |
---|
80 | 79 | #include <linux/blkdev.h> |
---|
81 | 80 | #include <linux/fs_struct.h> |
---|
82 | 81 | #include <linux/magic.h> |
---|
83 | | -#include <linux/sched/mm.h> |
---|
84 | 82 | #include <linux/perf_event.h> |
---|
85 | 83 | #include <linux/posix-timers.h> |
---|
86 | 84 | #include <linux/user-return-notifier.h> |
---|
.. | .. |
---|
94 | 92 | #include <linux/kcov.h> |
---|
95 | 93 | #include <linux/livepatch.h> |
---|
96 | 94 | #include <linux/thread_info.h> |
---|
97 | | -#include <linux/cpufreq_times.h> |
---|
| 95 | +#include <linux/stackleak.h> |
---|
| 96 | +#include <linux/kasan.h> |
---|
98 | 97 | #include <linux/scs.h> |
---|
| 98 | +#include <linux/io_uring.h> |
---|
| 99 | +#include <linux/cpufreq_times.h> |
---|
99 | 100 | |
---|
100 | | -#include <asm/pgtable.h> |
---|
101 | 101 | #include <asm/pgalloc.h> |
---|
102 | 102 | #include <linux/uaccess.h> |
---|
103 | 103 | #include <asm/mmu_context.h> |
---|
.. | .. |
---|
109 | 109 | #define CREATE_TRACE_POINTS |
---|
110 | 110 | #include <trace/events/task.h> |
---|
111 | 111 | |
---|
| 112 | +#undef CREATE_TRACE_POINTS |
---|
| 113 | +#include <trace/hooks/sched.h> |
---|
112 | 114 | /* |
---|
113 | 115 | * Minimum number of threads to boot the kernel |
---|
114 | 116 | */ |
---|
.. | .. |
---|
119 | 121 | */ |
---|
120 | 122 | #define MAX_THREADS FUTEX_TID_MASK |
---|
121 | 123 | |
---|
| 124 | +EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask); |
---|
| 125 | + |
---|
122 | 126 | /* |
---|
123 | 127 | * Protected counters by write_lock_irq(&tasklist_lock) |
---|
124 | 128 | */ |
---|
125 | 129 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
---|
126 | 130 | int nr_threads; /* The idle threads do not count.. */ |
---|
127 | 131 | |
---|
128 | | -int max_threads; /* tunable limit on nr_threads */ |
---|
| 132 | +static int max_threads; /* tunable limit on nr_threads */ |
---|
| 133 | + |
---|
| 134 | +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) |
---|
| 135 | + |
---|
| 136 | +static const char * const resident_page_types[] = { |
---|
| 137 | + NAMED_ARRAY_INDEX(MM_FILEPAGES), |
---|
| 138 | + NAMED_ARRAY_INDEX(MM_ANONPAGES), |
---|
| 139 | + NAMED_ARRAY_INDEX(MM_SWAPENTS), |
---|
| 140 | + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), |
---|
| 141 | +}; |
---|
129 | 142 | |
---|
130 | 143 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
---|
131 | 144 | |
---|
132 | 145 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
---|
| 146 | +EXPORT_SYMBOL_GPL(tasklist_lock); |
---|
133 | 147 | |
---|
134 | 148 | #ifdef CONFIG_PROVE_RCU |
---|
135 | 149 | int lockdep_tasklist_lock_is_held(void) |
---|
.. | .. |
---|
217 | 231 | if (!s) |
---|
218 | 232 | continue; |
---|
219 | 233 | |
---|
| 234 | + /* Mark stack accessible for KASAN. */ |
---|
| 235 | + kasan_unpoison_range(s->addr, THREAD_SIZE); |
---|
| 236 | + |
---|
220 | 237 | /* Clear stale pointers from reused stack. */ |
---|
221 | 238 | memset(s->addr, 0, THREAD_SIZE); |
---|
222 | 239 | |
---|
.. | .. |
---|
225 | 242 | return s->addr; |
---|
226 | 243 | } |
---|
227 | 244 | |
---|
| 245 | + /* |
---|
| 246 | + * Allocated stacks are cached and later reused by new threads, |
---|
| 247 | + * so memcg accounting is performed manually on assigning/releasing |
---|
| 248 | + * stacks to tasks. Drop __GFP_ACCOUNT. |
---|
| 249 | + */ |
---|
228 | 250 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, |
---|
229 | 251 | VMALLOC_START, VMALLOC_END, |
---|
230 | | - THREADINFO_GFP, |
---|
| 252 | + THREADINFO_GFP & ~__GFP_ACCOUNT, |
---|
231 | 253 | PAGE_KERNEL, |
---|
232 | 254 | 0, node, __builtin_return_address(0)); |
---|
233 | 255 | |
---|
.. | .. |
---|
246 | 268 | THREAD_SIZE_ORDER); |
---|
247 | 269 | |
---|
248 | 270 | if (likely(page)) { |
---|
249 | | - tsk->stack = page_address(page); |
---|
| 271 | + tsk->stack = kasan_reset_tag(page_address(page)); |
---|
250 | 272 | return tsk->stack; |
---|
251 | 273 | } |
---|
252 | 274 | return NULL; |
---|
.. | .. |
---|
256 | 278 | static inline void free_thread_stack(struct task_struct *tsk) |
---|
257 | 279 | { |
---|
258 | 280 | #ifdef CONFIG_VMAP_STACK |
---|
259 | | - if (task_stack_vm_area(tsk)) { |
---|
| 281 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 282 | + |
---|
| 283 | + if (vm) { |
---|
260 | 284 | int i; |
---|
| 285 | + |
---|
| 286 | + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) |
---|
| 287 | + memcg_kmem_uncharge_page(vm->pages[i], 0); |
---|
261 | 288 | |
---|
262 | 289 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
---|
263 | 290 | if (this_cpu_cmpxchg(cached_stacks[i], |
---|
.. | .. |
---|
282 | 309 | { |
---|
283 | 310 | unsigned long *stack; |
---|
284 | 311 | stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
---|
| 312 | + stack = kasan_reset_tag(stack); |
---|
285 | 313 | tsk->stack = stack; |
---|
286 | 314 | return stack; |
---|
287 | 315 | } |
---|
.. | .. |
---|
334 | 362 | struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
---|
335 | 363 | |
---|
336 | 364 | if (new) { |
---|
337 | | - *new = *orig; |
---|
338 | | - INIT_LIST_HEAD(&new->anon_vma_chain); |
---|
| 365 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); |
---|
| 366 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); |
---|
| 367 | + /* |
---|
| 368 | + * orig->shared.rb may be modified concurrently, but the clone |
---|
| 369 | + * will be reinitialized. |
---|
| 370 | + */ |
---|
| 371 | + *new = data_race(*orig); |
---|
| 372 | + INIT_VMA(new); |
---|
| 373 | + new->vm_next = new->vm_prev = NULL; |
---|
339 | 374 | } |
---|
340 | 375 | return new; |
---|
341 | 376 | } |
---|
.. | .. |
---|
350 | 385 | void *stack = task_stack_page(tsk); |
---|
351 | 386 | struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
352 | 387 | |
---|
| 388 | + |
---|
| 389 | + /* All stack pages are in the same node. */ |
---|
| 390 | + if (vm) |
---|
| 391 | + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, |
---|
| 392 | + account * (THREAD_SIZE / 1024)); |
---|
| 393 | + else |
---|
| 394 | + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, |
---|
| 395 | + account * (THREAD_SIZE / 1024)); |
---|
| 396 | +} |
---|
| 397 | + |
---|
| 398 | +static int memcg_charge_kernel_stack(struct task_struct *tsk) |
---|
| 399 | +{ |
---|
| 400 | +#ifdef CONFIG_VMAP_STACK |
---|
| 401 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 402 | + int ret; |
---|
| 403 | + |
---|
353 | 404 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); |
---|
354 | 405 | |
---|
355 | 406 | if (vm) { |
---|
.. | .. |
---|
358 | 409 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
---|
359 | 410 | |
---|
360 | 411 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
---|
361 | | - mod_zone_page_state(page_zone(vm->pages[i]), |
---|
362 | | - NR_KERNEL_STACK_KB, |
---|
363 | | - PAGE_SIZE / 1024 * account); |
---|
| 412 | + /* |
---|
| 413 | + * If memcg_kmem_charge_page() fails, page->mem_cgroup |
---|
| 414 | + * pointer is NULL, and memcg_kmem_uncharge_page() in |
---|
| 415 | + * free_thread_stack() will ignore this page. |
---|
| 416 | + */ |
---|
| 417 | + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, |
---|
| 418 | + 0); |
---|
| 419 | + if (ret) |
---|
| 420 | + return ret; |
---|
364 | 421 | } |
---|
365 | | - |
---|
366 | | - /* All stack pages belong to the same memcg. */ |
---|
367 | | - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
---|
368 | | - account * (THREAD_SIZE / 1024)); |
---|
369 | | - } else { |
---|
370 | | - /* |
---|
371 | | - * All stack pages are in the same zone and belong to the |
---|
372 | | - * same memcg. |
---|
373 | | - */ |
---|
374 | | - struct page *first_page = virt_to_page(stack); |
---|
375 | | - |
---|
376 | | - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
---|
377 | | - THREAD_SIZE / 1024 * account); |
---|
378 | | - |
---|
379 | | - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
---|
380 | | - account * (THREAD_SIZE / 1024)); |
---|
381 | 422 | } |
---|
| 423 | +#endif |
---|
| 424 | + return 0; |
---|
382 | 425 | } |
---|
383 | 426 | |
---|
384 | 427 | static void release_task_stack(struct task_struct *tsk) |
---|
.. | .. |
---|
397 | 440 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
398 | 441 | void put_task_stack(struct task_struct *tsk) |
---|
399 | 442 | { |
---|
400 | | - if (atomic_dec_and_test(&tsk->stack_refcount)) |
---|
| 443 | + if (refcount_dec_and_test(&tsk->stack_refcount)) |
---|
401 | 444 | release_task_stack(tsk); |
---|
402 | 445 | } |
---|
| 446 | +EXPORT_SYMBOL_GPL(put_task_stack); |
---|
403 | 447 | #endif |
---|
404 | 448 | |
---|
405 | 449 | void free_task(struct task_struct *tsk) |
---|
406 | 450 | { |
---|
| 451 | +#ifdef CONFIG_SECCOMP |
---|
| 452 | + WARN_ON_ONCE(tsk->seccomp.filter); |
---|
| 453 | +#endif |
---|
407 | 454 | cpufreq_task_times_exit(tsk); |
---|
408 | 455 | scs_release(tsk); |
---|
409 | 456 | |
---|
| 457 | + trace_android_vh_free_task(tsk); |
---|
410 | 458 | #ifndef CONFIG_THREAD_INFO_IN_TASK |
---|
411 | 459 | /* |
---|
412 | 460 | * The task is finally done with both the stack and thread_info, |
---|
.. | .. |
---|
418 | 466 | * If the task had a separate stack allocation, it should be gone |
---|
419 | 467 | * by now. |
---|
420 | 468 | */ |
---|
421 | | - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); |
---|
| 469 | + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); |
---|
422 | 470 | #endif |
---|
423 | 471 | rt_mutex_debug_task_free(tsk); |
---|
424 | 472 | ftrace_graph_exit_task(tsk); |
---|
425 | | - put_seccomp_filter(tsk); |
---|
426 | 473 | arch_release_task_struct(tsk); |
---|
427 | 474 | if (tsk->flags & PF_KTHREAD) |
---|
428 | 475 | free_kthread_struct(tsk); |
---|
.. | .. |
---|
434 | 481 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
---|
435 | 482 | struct mm_struct *oldmm) |
---|
436 | 483 | { |
---|
437 | | - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
---|
| 484 | + struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL; |
---|
438 | 485 | struct rb_node **rb_link, *rb_parent; |
---|
439 | 486 | int retval; |
---|
440 | 487 | unsigned long charge; |
---|
441 | 488 | LIST_HEAD(uf); |
---|
442 | 489 | |
---|
443 | 490 | uprobe_start_dup_mmap(); |
---|
444 | | - if (down_write_killable(&oldmm->mmap_sem)) { |
---|
| 491 | + if (mmap_write_lock_killable(oldmm)) { |
---|
445 | 492 | retval = -EINTR; |
---|
446 | 493 | goto fail_uprobe_end; |
---|
447 | 494 | } |
---|
.. | .. |
---|
450 | 497 | /* |
---|
451 | 498 | * Not linked in yet - no deadlock potential: |
---|
452 | 499 | */ |
---|
453 | | - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
---|
| 500 | + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
---|
454 | 501 | |
---|
455 | 502 | /* No ordering required: file already has been exposed. */ |
---|
456 | 503 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
.. | .. |
---|
505 | 552 | if (retval) |
---|
506 | 553 | goto fail_nomem_anon_vma_fork; |
---|
507 | 554 | if (tmp->vm_flags & VM_WIPEONFORK) { |
---|
508 | | - /* VM_WIPEONFORK gets a clean slate in the child. */ |
---|
| 555 | + /* |
---|
| 556 | + * VM_WIPEONFORK gets a clean slate in the child. |
---|
| 557 | + * Don't prepare anon_vma until a fault occurs, since we |
---|
| 558 | + * don't copy the page for the current vma. |
---|
| 559 | + */ |
---|
509 | 560 | tmp->anon_vma = NULL; |
---|
510 | | - if (anon_vma_prepare(tmp)) |
---|
511 | | - goto fail_nomem_anon_vma_fork; |
---|
512 | 561 | } else if (anon_vma_fork(tmp, mpnt)) |
---|
513 | 562 | goto fail_nomem_anon_vma_fork; |
---|
514 | 563 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
---|
515 | | - tmp->vm_next = tmp->vm_prev = NULL; |
---|
516 | 564 | file = tmp->vm_file; |
---|
517 | 565 | if (file) { |
---|
518 | 566 | struct inode *inode = file_inode(file); |
---|
.. | .. |
---|
520 | 568 | |
---|
521 | 569 | get_file(file); |
---|
522 | 570 | if (tmp->vm_flags & VM_DENYWRITE) |
---|
523 | | - atomic_dec(&inode->i_writecount); |
---|
| 571 | + put_write_access(inode); |
---|
524 | 572 | i_mmap_lock_write(mapping); |
---|
525 | 573 | if (tmp->vm_flags & VM_SHARED) |
---|
526 | | - atomic_inc(&mapping->i_mmap_writable); |
---|
| 574 | + mapping_allow_writable(mapping); |
---|
527 | 575 | flush_dcache_mmap_lock(mapping); |
---|
528 | 576 | /* insert tmp into the share list, just after mpnt */ |
---|
529 | 577 | vma_interval_tree_insert_after(tmp, mpnt, |
---|
.. | .. |
---|
553 | 601 | rb_parent = &tmp->vm_rb; |
---|
554 | 602 | |
---|
555 | 603 | mm->map_count++; |
---|
556 | | - if (!(tmp->vm_flags & VM_WIPEONFORK)) |
---|
557 | | - retval = copy_page_range(mm, oldmm, mpnt); |
---|
| 604 | + if (!(tmp->vm_flags & VM_WIPEONFORK)) { |
---|
| 605 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 606 | + /* |
---|
| 607 | + * Mark this VMA as changing to prevent the |
---|
| 608 | + * speculative page fault handler from processing |
---|
| 609 | + * it until the TLB is flushed below. |
---|
| 610 | + */ |
---|
| 611 | + last = mpnt; |
---|
| 612 | + vm_write_begin(mpnt); |
---|
| 613 | + } |
---|
| 614 | + retval = copy_page_range(tmp, mpnt); |
---|
| 615 | + } |
---|
558 | 616 | |
---|
559 | 617 | if (tmp->vm_ops && tmp->vm_ops->open) |
---|
560 | 618 | tmp->vm_ops->open(tmp); |
---|
.. | .. |
---|
565 | 623 | /* a new mm has just been created */ |
---|
566 | 624 | retval = arch_dup_mmap(oldmm, mm); |
---|
567 | 625 | out: |
---|
568 | | - up_write(&mm->mmap_sem); |
---|
| 626 | + mmap_write_unlock(mm); |
---|
569 | 627 | flush_tlb_mm(oldmm); |
---|
570 | | - up_write(&oldmm->mmap_sem); |
---|
| 628 | + |
---|
| 629 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 630 | + /* |
---|
| 631 | + * Since the TLB has been flushed, we can safely unmark the |
---|
| 632 | + * copied VMAs and allow the speculative page fault handler to |
---|
| 633 | + * process them again. |
---|
| 634 | + * Walk back the VMA list from the last marked VMA. |
---|
| 635 | + */ |
---|
| 636 | + for (; last; last = last->vm_prev) { |
---|
| 637 | + if (last->vm_flags & VM_DONTCOPY) |
---|
| 638 | + continue; |
---|
| 639 | + if (!(last->vm_flags & VM_WIPEONFORK)) |
---|
| 640 | + vm_write_end(last); |
---|
| 641 | + } |
---|
| 642 | + } |
---|
| 643 | + |
---|
| 644 | + mmap_write_unlock(oldmm); |
---|
571 | 645 | dup_userfaultfd_complete(&uf); |
---|
572 | 646 | fail_uprobe_end: |
---|
573 | 647 | uprobe_end_dup_mmap(); |
---|
.. | .. |
---|
597 | 671 | #else |
---|
598 | 672 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
---|
599 | 673 | { |
---|
600 | | - down_write(&oldmm->mmap_sem); |
---|
| 674 | + mmap_write_lock(oldmm); |
---|
601 | 675 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
602 | | - up_write(&oldmm->mmap_sem); |
---|
| 676 | + mmap_write_unlock(oldmm); |
---|
603 | 677 | return 0; |
---|
604 | 678 | } |
---|
605 | 679 | #define mm_alloc_pgd(mm) (0) |
---|
.. | .. |
---|
610 | 684 | { |
---|
611 | 685 | int i; |
---|
612 | 686 | |
---|
| 687 | + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, |
---|
| 688 | + "Please make sure 'struct resident_page_types[]' is updated as well"); |
---|
| 689 | + |
---|
613 | 690 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
---|
614 | 691 | long x = atomic_long_read(&mm->rss_stat.count[i]); |
---|
615 | 692 | |
---|
616 | 693 | if (unlikely(x)) |
---|
617 | | - printk(KERN_ALERT "BUG: Bad rss-counter state " |
---|
618 | | - "mm:%p idx:%d val:%ld\n", mm, i, x); |
---|
| 694 | + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", |
---|
| 695 | + mm, resident_page_types[i], x); |
---|
619 | 696 | } |
---|
620 | 697 | |
---|
621 | 698 | if (mm_pgtables_bytes(mm)) |
---|
.. | .. |
---|
642 | 719 | WARN_ON_ONCE(mm == current->active_mm); |
---|
643 | 720 | mm_free_pgd(mm); |
---|
644 | 721 | destroy_context(mm); |
---|
645 | | - hmm_mm_destroy(mm); |
---|
646 | | - mmu_notifier_mm_destroy(mm); |
---|
| 722 | + mmu_notifier_subscriptions_destroy(mm); |
---|
647 | 723 | check_mm(mm); |
---|
648 | 724 | put_user_ns(mm->user_ns); |
---|
649 | 725 | free_mm(mm); |
---|
650 | 726 | } |
---|
651 | 727 | EXPORT_SYMBOL_GPL(__mmdrop); |
---|
652 | | - |
---|
653 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
---|
654 | | -/* |
---|
655 | | - * RCU callback for delayed mm drop. Not strictly rcu, but we don't |
---|
656 | | - * want another facility to make this work. |
---|
657 | | - */ |
---|
658 | | -void __mmdrop_delayed(struct rcu_head *rhp) |
---|
659 | | -{ |
---|
660 | | - struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); |
---|
661 | | - |
---|
662 | | - __mmdrop(mm); |
---|
663 | | -} |
---|
664 | | -#endif |
---|
665 | 728 | |
---|
666 | 729 | static void mmdrop_async_fn(struct work_struct *work) |
---|
667 | 730 | { |
---|
.. | .. |
---|
694 | 757 | |
---|
695 | 758 | static inline void put_signal_struct(struct signal_struct *sig) |
---|
696 | 759 | { |
---|
697 | | - if (atomic_dec_and_test(&sig->sigcnt)) |
---|
| 760 | + if (refcount_dec_and_test(&sig->sigcnt)) |
---|
698 | 761 | free_signal_struct(sig); |
---|
699 | 762 | } |
---|
700 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
---|
701 | | -static |
---|
702 | | -#endif |
---|
| 763 | + |
---|
703 | 764 | void __put_task_struct(struct task_struct *tsk) |
---|
704 | 765 | { |
---|
705 | 766 | WARN_ON(!tsk->exit_state); |
---|
706 | | - WARN_ON(atomic_read(&tsk->usage)); |
---|
| 767 | + WARN_ON(refcount_read(&tsk->usage)); |
---|
707 | 768 | WARN_ON(tsk == current); |
---|
708 | 769 | |
---|
709 | | - /* |
---|
710 | | - * Remove function-return probe instances associated with this |
---|
711 | | - * task and put them back on the free list. |
---|
712 | | - */ |
---|
713 | | - kprobe_flush_task(tsk); |
---|
714 | | - |
---|
715 | | - /* Task is done with its stack. */ |
---|
716 | | - put_task_stack(tsk); |
---|
717 | | - |
---|
| 770 | + io_uring_free(tsk); |
---|
718 | 771 | cgroup_free(tsk); |
---|
719 | 772 | task_numa_free(tsk, true); |
---|
720 | 773 | security_task_free(tsk); |
---|
.. | .. |
---|
725 | 778 | if (!profile_handoff_task(tsk)) |
---|
726 | 779 | free_task(tsk); |
---|
727 | 780 | } |
---|
728 | | -#ifndef CONFIG_PREEMPT_RT_BASE |
---|
729 | 781 | EXPORT_SYMBOL_GPL(__put_task_struct); |
---|
730 | | -#else |
---|
731 | | -void __put_task_struct_cb(struct rcu_head *rhp) |
---|
| 782 | + |
---|
| 783 | +void __put_task_struct_rcu_cb(struct rcu_head *rhp) |
---|
732 | 784 | { |
---|
733 | | - struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); |
---|
| 785 | + struct task_struct *task = container_of(rhp, struct task_struct, rcu); |
---|
734 | 786 | |
---|
735 | | - __put_task_struct(tsk); |
---|
736 | | - |
---|
| 787 | + __put_task_struct(task); |
---|
737 | 788 | } |
---|
738 | | -EXPORT_SYMBOL_GPL(__put_task_struct_cb); |
---|
739 | | -#endif |
---|
| 789 | +EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); |
---|
740 | 790 | |
---|
741 | 791 | void __init __weak arch_task_cache_init(void) { } |
---|
742 | 792 | |
---|
.. | .. |
---|
746 | 796 | static void set_max_threads(unsigned int max_threads_suggested) |
---|
747 | 797 | { |
---|
748 | 798 | u64 threads; |
---|
| 799 | + unsigned long nr_pages = totalram_pages(); |
---|
749 | 800 | |
---|
750 | 801 | /* |
---|
751 | 802 | * The number of threads shall be limited such that the thread |
---|
752 | 803 | * structures may only consume a small part of the available memory. |
---|
753 | 804 | */ |
---|
754 | | - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) |
---|
| 805 | + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) |
---|
755 | 806 | threads = MAX_THREADS; |
---|
756 | 807 | else |
---|
757 | | - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, |
---|
| 808 | + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, |
---|
758 | 809 | (u64) THREAD_SIZE * 8UL); |
---|
759 | 810 | |
---|
760 | 811 | if (threads > max_threads_suggested) |
---|
.. | .. |
---|
768 | 819 | int arch_task_struct_size __read_mostly; |
---|
769 | 820 | #endif |
---|
770 | 821 | |
---|
| 822 | +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
---|
771 | 823 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) |
---|
772 | 824 | { |
---|
773 | 825 | /* Fetch thread_struct whitelist for the architecture. */ |
---|
.. | .. |
---|
782 | 834 | else |
---|
783 | 835 | *offset += offsetof(struct task_struct, thread); |
---|
784 | 836 | } |
---|
| 837 | +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ |
---|
785 | 838 | |
---|
786 | 839 | void __init fork_init(void) |
---|
787 | 840 | { |
---|
.. | .. |
---|
823 | 876 | scs_init(); |
---|
824 | 877 | |
---|
825 | 878 | lockdep_init_task(&init_task); |
---|
| 879 | + uprobes_init(); |
---|
826 | 880 | } |
---|
827 | 881 | |
---|
828 | 882 | int __weak arch_dup_task_struct(struct task_struct *dst, |
---|
.. | .. |
---|
844 | 898 | { |
---|
845 | 899 | struct task_struct *tsk; |
---|
846 | 900 | unsigned long *stack; |
---|
847 | | - struct vm_struct *stack_vm_area; |
---|
| 901 | + struct vm_struct *stack_vm_area __maybe_unused; |
---|
848 | 902 | int err; |
---|
849 | 903 | |
---|
850 | 904 | if (node == NUMA_NO_NODE) |
---|
.. | .. |
---|
856 | 910 | stack = alloc_thread_stack_node(tsk, node); |
---|
857 | 911 | if (!stack) |
---|
858 | 912 | goto free_tsk; |
---|
| 913 | + |
---|
| 914 | + if (memcg_charge_kernel_stack(tsk)) |
---|
| 915 | + goto free_stack; |
---|
859 | 916 | |
---|
860 | 917 | stack_vm_area = task_stack_vm_area(tsk); |
---|
861 | 918 | |
---|
.. | .. |
---|
871 | 928 | tsk->stack_vm_area = stack_vm_area; |
---|
872 | 929 | #endif |
---|
873 | 930 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
874 | | - atomic_set(&tsk->stack_refcount, 1); |
---|
| 931 | + refcount_set(&tsk->stack_refcount, 1); |
---|
875 | 932 | #endif |
---|
876 | 933 | |
---|
877 | 934 | if (err) |
---|
.. | .. |
---|
903 | 960 | tsk->cpus_ptr = &tsk->cpus_mask; |
---|
904 | 961 | |
---|
905 | 962 | /* |
---|
906 | | - * One for us, one for whoever does the "release_task()" (usually |
---|
907 | | - * parent) |
---|
| 963 | + * One for the user space visible state that goes away when reaped. |
---|
| 964 | + * One for the scheduler. |
---|
908 | 965 | */ |
---|
909 | | - atomic_set(&tsk->usage, 2); |
---|
| 966 | + refcount_set(&tsk->rcu_users, 2); |
---|
| 967 | + /* One for the rcu users */ |
---|
| 968 | + refcount_set(&tsk->usage, 1); |
---|
910 | 969 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
---|
911 | 970 | tsk->btrace_seq = 0; |
---|
912 | 971 | #endif |
---|
913 | 972 | tsk->splice_pipe = NULL; |
---|
914 | 973 | tsk->task_frag.page = NULL; |
---|
915 | 974 | tsk->wake_q.next = NULL; |
---|
916 | | - tsk->wake_q_sleeper.next = NULL; |
---|
| 975 | + tsk->pf_io_worker = NULL; |
---|
917 | 976 | |
---|
918 | 977 | account_kernel_stack(tsk, 1); |
---|
919 | 978 | |
---|
.. | .. |
---|
931 | 990 | #ifdef CONFIG_MEMCG |
---|
932 | 991 | tsk->active_memcg = NULL; |
---|
933 | 992 | #endif |
---|
| 993 | + |
---|
| 994 | + android_init_vendor_data(tsk, 1); |
---|
| 995 | + android_init_oem_data(tsk, 1); |
---|
| 996 | + |
---|
| 997 | + trace_android_vh_dup_task_struct(tsk, orig); |
---|
934 | 998 | return tsk; |
---|
935 | 999 | |
---|
936 | 1000 | free_stack: |
---|
.. | .. |
---|
980 | 1044 | #endif |
---|
981 | 1045 | } |
---|
982 | 1046 | |
---|
| 1047 | +static void mm_init_pasid(struct mm_struct *mm) |
---|
| 1048 | +{ |
---|
| 1049 | +#ifdef CONFIG_IOMMU_SUPPORT |
---|
| 1050 | + mm->pasid = INIT_PASID; |
---|
| 1051 | +#endif |
---|
| 1052 | +} |
---|
| 1053 | + |
---|
983 | 1054 | static void mm_init_uprobes_state(struct mm_struct *mm) |
---|
984 | 1055 | { |
---|
985 | 1056 | #ifdef CONFIG_UPROBES |
---|
.. | .. |
---|
993 | 1064 | mm->mmap = NULL; |
---|
994 | 1065 | mm->mm_rb = RB_ROOT; |
---|
995 | 1066 | mm->vmacache_seqnum = 0; |
---|
| 1067 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 1068 | + rwlock_init(&mm->mm_rb_lock); |
---|
| 1069 | +#endif |
---|
996 | 1070 | atomic_set(&mm->mm_users, 1); |
---|
997 | 1071 | atomic_set(&mm->mm_count, 1); |
---|
998 | | - init_rwsem(&mm->mmap_sem); |
---|
| 1072 | + seqcount_init(&mm->write_protect_seq); |
---|
| 1073 | + mmap_init_lock(mm); |
---|
999 | 1074 | INIT_LIST_HEAD(&mm->mmlist); |
---|
1000 | 1075 | mm->core_state = NULL; |
---|
1001 | 1076 | mm_pgtables_bytes_init(mm); |
---|
1002 | 1077 | mm->map_count = 0; |
---|
1003 | 1078 | mm->locked_vm = 0; |
---|
1004 | | - mm->pinned_vm = 0; |
---|
| 1079 | + atomic_set(&mm->has_pinned, 0); |
---|
| 1080 | + atomic64_set(&mm->pinned_vm, 0); |
---|
1005 | 1081 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
---|
1006 | 1082 | spin_lock_init(&mm->page_table_lock); |
---|
1007 | 1083 | spin_lock_init(&mm->arg_lock); |
---|
1008 | 1084 | mm_init_cpumask(mm); |
---|
1009 | 1085 | mm_init_aio(mm); |
---|
1010 | 1086 | mm_init_owner(mm, p); |
---|
| 1087 | + mm_init_pasid(mm); |
---|
1011 | 1088 | RCU_INIT_POINTER(mm->exe_file, NULL); |
---|
1012 | | - mmu_notifier_mm_init(mm); |
---|
1013 | | - hmm_mm_init(mm); |
---|
| 1089 | + if (!mmu_notifier_subscriptions_init(mm)) |
---|
| 1090 | + goto fail_nopgd; |
---|
1014 | 1091 | init_tlb_flush_pending(mm); |
---|
1015 | 1092 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
---|
1016 | 1093 | mm->pmd_huge_pte = NULL; |
---|
.. | .. |
---|
1085 | 1162 | { |
---|
1086 | 1163 | might_sleep(); |
---|
1087 | 1164 | |
---|
1088 | | - if (atomic_dec_and_test(&mm->mm_users)) |
---|
| 1165 | + if (atomic_dec_and_test(&mm->mm_users)) { |
---|
| 1166 | + trace_android_vh_mmput(NULL); |
---|
1089 | 1167 | __mmput(mm); |
---|
| 1168 | + } |
---|
1090 | 1169 | } |
---|
1091 | 1170 | EXPORT_SYMBOL_GPL(mmput); |
---|
1092 | 1171 | |
---|
.. | .. |
---|
1106 | 1185 | schedule_work(&mm->async_put_work); |
---|
1107 | 1186 | } |
---|
1108 | 1187 | } |
---|
| 1188 | +EXPORT_SYMBOL_GPL(mmput_async); |
---|
1109 | 1189 | #endif |
---|
1110 | 1190 | |
---|
1111 | 1191 | /** |
---|
.. | .. |
---|
1210 | 1290 | struct mm_struct *mm; |
---|
1211 | 1291 | int err; |
---|
1212 | 1292 | |
---|
1213 | | - err = mutex_lock_killable(&task->signal->cred_guard_mutex); |
---|
| 1293 | + err = down_read_killable(&task->signal->exec_update_lock); |
---|
1214 | 1294 | if (err) |
---|
1215 | 1295 | return ERR_PTR(err); |
---|
1216 | 1296 | |
---|
.. | .. |
---|
1220 | 1300 | mmput(mm); |
---|
1221 | 1301 | mm = ERR_PTR(-EACCES); |
---|
1222 | 1302 | } |
---|
1223 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
---|
| 1303 | + up_read(&task->signal->exec_update_lock); |
---|
1224 | 1304 | |
---|
1225 | 1305 | return mm; |
---|
1226 | 1306 | } |
---|
.. | .. |
---|
1318 | 1398 | mm_release(tsk, mm); |
---|
1319 | 1399 | } |
---|
1320 | 1400 | |
---|
1321 | | -/* |
---|
1322 | | - * Allocate a new mm structure and copy contents from the |
---|
1323 | | - * mm structure of the passed in task structure. |
---|
| 1401 | +/** |
---|
| 1402 | + * dup_mm() - duplicates an existing mm structure |
---|
| 1403 | + * @tsk: the task_struct with which the new mm will be associated. |
---|
| 1404 | + * @oldmm: the mm to duplicate. |
---|
| 1405 | + * |
---|
| 1406 | + * Allocates a new mm structure and duplicates the provided @oldmm structure |
---|
| 1407 | + * content into it. |
---|
| 1408 | + * |
---|
| 1409 | + * Return: the duplicated mm or NULL on failure. |
---|
1324 | 1410 | */ |
---|
1325 | | -static struct mm_struct *dup_mm(struct task_struct *tsk) |
---|
| 1411 | +static struct mm_struct *dup_mm(struct task_struct *tsk, |
---|
| 1412 | + struct mm_struct *oldmm) |
---|
1326 | 1413 | { |
---|
1327 | | - struct mm_struct *mm, *oldmm = current->mm; |
---|
| 1414 | + struct mm_struct *mm; |
---|
1328 | 1415 | int err; |
---|
1329 | 1416 | |
---|
1330 | 1417 | mm = allocate_mm(); |
---|
.. | .. |
---|
1392 | 1479 | } |
---|
1393 | 1480 | |
---|
1394 | 1481 | retval = -ENOMEM; |
---|
1395 | | - mm = dup_mm(tsk); |
---|
| 1482 | + mm = dup_mm(tsk, current->mm); |
---|
1396 | 1483 | if (!mm) |
---|
1397 | 1484 | goto fail_nomem; |
---|
1398 | 1485 | |
---|
.. | .. |
---|
1442 | 1529 | goto out; |
---|
1443 | 1530 | } |
---|
1444 | 1531 | |
---|
1445 | | - newf = dup_fd(oldf, &error); |
---|
| 1532 | + newf = dup_fd(oldf, NR_OPEN_MAX, &error); |
---|
1446 | 1533 | if (!newf) |
---|
1447 | 1534 | goto out; |
---|
1448 | 1535 | |
---|
.. | .. |
---|
1483 | 1570 | struct sighand_struct *sig; |
---|
1484 | 1571 | |
---|
1485 | 1572 | if (clone_flags & CLONE_SIGHAND) { |
---|
1486 | | - atomic_inc(¤t->sighand->count); |
---|
| 1573 | + refcount_inc(¤t->sighand->count); |
---|
1487 | 1574 | return 0; |
---|
1488 | 1575 | } |
---|
1489 | 1576 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
---|
1490 | | - rcu_assign_pointer(tsk->sighand, sig); |
---|
| 1577 | + RCU_INIT_POINTER(tsk->sighand, sig); |
---|
1491 | 1578 | if (!sig) |
---|
1492 | 1579 | return -ENOMEM; |
---|
1493 | 1580 | |
---|
1494 | | - atomic_set(&sig->count, 1); |
---|
| 1581 | + refcount_set(&sig->count, 1); |
---|
1495 | 1582 | spin_lock_irq(¤t->sighand->siglock); |
---|
1496 | 1583 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
---|
1497 | 1584 | spin_unlock_irq(¤t->sighand->siglock); |
---|
| 1585 | + |
---|
| 1586 | + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ |
---|
| 1587 | + if (clone_flags & CLONE_CLEAR_SIGHAND) |
---|
| 1588 | + flush_signal_handlers(tsk, 0); |
---|
| 1589 | + |
---|
1498 | 1590 | return 0; |
---|
1499 | 1591 | } |
---|
1500 | 1592 | |
---|
1501 | 1593 | void __cleanup_sighand(struct sighand_struct *sighand) |
---|
1502 | 1594 | { |
---|
1503 | | - if (atomic_dec_and_test(&sighand->count)) { |
---|
| 1595 | + if (refcount_dec_and_test(&sighand->count)) { |
---|
1504 | 1596 | signalfd_cleanup(sighand); |
---|
1505 | 1597 | /* |
---|
1506 | 1598 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
---|
.. | .. |
---|
1510 | 1602 | } |
---|
1511 | 1603 | } |
---|
1512 | 1604 | |
---|
1513 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1514 | 1605 | /* |
---|
1515 | 1606 | * Initialize POSIX timer handling for a thread group. |
---|
1516 | 1607 | */ |
---|
1517 | 1608 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
---|
1518 | 1609 | { |
---|
| 1610 | + struct posix_cputimers *pct = &sig->posix_cputimers; |
---|
1519 | 1611 | unsigned long cpu_limit; |
---|
1520 | 1612 | |
---|
1521 | 1613 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
---|
1522 | | - if (cpu_limit != RLIM_INFINITY) { |
---|
1523 | | - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; |
---|
1524 | | - sig->cputimer.running = true; |
---|
1525 | | - } |
---|
1526 | | - |
---|
1527 | | - /* The timer lists. */ |
---|
1528 | | - INIT_LIST_HEAD(&sig->cpu_timers[0]); |
---|
1529 | | - INIT_LIST_HEAD(&sig->cpu_timers[1]); |
---|
1530 | | - INIT_LIST_HEAD(&sig->cpu_timers[2]); |
---|
| 1614 | + posix_cputimers_group_init(pct, cpu_limit); |
---|
1531 | 1615 | } |
---|
1532 | | -#else |
---|
1533 | | -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } |
---|
1534 | | -#endif |
---|
1535 | 1616 | |
---|
1536 | 1617 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) |
---|
1537 | 1618 | { |
---|
.. | .. |
---|
1547 | 1628 | |
---|
1548 | 1629 | sig->nr_threads = 1; |
---|
1549 | 1630 | atomic_set(&sig->live, 1); |
---|
1550 | | - atomic_set(&sig->sigcnt, 1); |
---|
| 1631 | + refcount_set(&sig->sigcnt, 1); |
---|
1551 | 1632 | |
---|
1552 | 1633 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ |
---|
1553 | 1634 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); |
---|
.. | .. |
---|
1579 | 1660 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
---|
1580 | 1661 | |
---|
1581 | 1662 | mutex_init(&sig->cred_guard_mutex); |
---|
| 1663 | + init_rwsem(&sig->exec_update_lock); |
---|
1582 | 1664 | |
---|
1583 | 1665 | return 0; |
---|
1584 | 1666 | } |
---|
.. | .. |
---|
1633 | 1715 | #endif |
---|
1634 | 1716 | } |
---|
1635 | 1717 | |
---|
1636 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1637 | | -/* |
---|
1638 | | - * Initialize POSIX timer handling for a single task. |
---|
1639 | | - */ |
---|
1640 | | -static void posix_cpu_timers_init(struct task_struct *tsk) |
---|
1641 | | -{ |
---|
1642 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
---|
1643 | | - tsk->posix_timer_list = NULL; |
---|
1644 | | -#endif |
---|
1645 | | - tsk->cputime_expires.prof_exp = 0; |
---|
1646 | | - tsk->cputime_expires.virt_exp = 0; |
---|
1647 | | - tsk->cputime_expires.sched_exp = 0; |
---|
1648 | | - INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
---|
1649 | | - INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
---|
1650 | | - INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
---|
1651 | | -} |
---|
1652 | | -#else |
---|
1653 | | -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } |
---|
1654 | | -#endif |
---|
1655 | | - |
---|
1656 | 1718 | static inline void init_task_pid_links(struct task_struct *task) |
---|
1657 | 1719 | { |
---|
1658 | 1720 | enum pid_type type; |
---|
.. | .. |
---|
1684 | 1746 | INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); |
---|
1685 | 1747 | p->rcu_tasks_idle_cpu = -1; |
---|
1686 | 1748 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
---|
| 1749 | +#ifdef CONFIG_TASKS_TRACE_RCU |
---|
| 1750 | + p->trc_reader_nesting = 0; |
---|
| 1751 | + p->trc_reader_special.s = 0; |
---|
| 1752 | + INIT_LIST_HEAD(&p->trc_holdout_list); |
---|
| 1753 | +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
---|
1687 | 1754 | } |
---|
| 1755 | + |
---|
| 1756 | +struct pid *pidfd_pid(const struct file *file) |
---|
| 1757 | +{ |
---|
| 1758 | + if (file->f_op == &pidfd_fops) |
---|
| 1759 | + return file->private_data; |
---|
| 1760 | + |
---|
| 1761 | + return ERR_PTR(-EBADF); |
---|
| 1762 | +} |
---|
| 1763 | + |
---|
| 1764 | +static int pidfd_release(struct inode *inode, struct file *file) |
---|
| 1765 | +{ |
---|
| 1766 | + struct pid *pid = file->private_data; |
---|
| 1767 | + |
---|
| 1768 | + file->private_data = NULL; |
---|
| 1769 | + put_pid(pid); |
---|
| 1770 | + return 0; |
---|
| 1771 | +} |
---|
| 1772 | + |
---|
| 1773 | +#ifdef CONFIG_PROC_FS |
---|
| 1774 | +/** |
---|
| 1775 | + * pidfd_show_fdinfo - print information about a pidfd |
---|
| 1776 | + * @m: proc fdinfo file |
---|
| 1777 | + * @f: file referencing a pidfd |
---|
| 1778 | + * |
---|
| 1779 | + * Pid: |
---|
| 1780 | + * This function will print the pid that a given pidfd refers to in the |
---|
| 1781 | + * pid namespace of the procfs instance. |
---|
| 1782 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1783 | + * namespace of the procfs instance 0 will be shown as its pid. This is |
---|
| 1784 | + * similar to calling getppid() on a process whose parent is outside of |
---|
| 1785 | + * its pid namespace. |
---|
| 1786 | + * |
---|
| 1787 | + * NSpid: |
---|
| 1788 | + * If pid namespaces are supported then this function will also print |
---|
| 1789 | + * the pid of a given pidfd refers to for all descendant pid namespaces |
---|
| 1790 | + * starting from the current pid namespace of the instance, i.e. the |
---|
| 1791 | + * Pid field and the first entry in the NSpid field will be identical. |
---|
| 1792 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1793 | + * namespace of the procfs instance 0 will be shown as its first NSpid |
---|
| 1794 | + * entry and no others will be shown. |
---|
| 1795 | + * Note that this differs from the Pid and NSpid fields in |
---|
| 1796 | + * /proc/<pid>/status where Pid and NSpid are always shown relative to |
---|
| 1797 | + * the pid namespace of the procfs instance. The difference becomes |
---|
| 1798 | + * obvious when sending around a pidfd between pid namespaces from a |
---|
| 1799 | + * different branch of the tree, i.e. where no ancestoral relation is |
---|
| 1800 | + * present between the pid namespaces: |
---|
| 1801 | + * - create two new pid namespaces ns1 and ns2 in the initial pid |
---|
| 1802 | + * namespace (also take care to create new mount namespaces in the |
---|
| 1803 | + * new pid namespace and mount procfs) |
---|
| 1804 | + * - create a process with a pidfd in ns1 |
---|
| 1805 | + * - send pidfd from ns1 to ns2 |
---|
| 1806 | + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid |
---|
| 1807 | + * have exactly one entry, which is 0 |
---|
| 1808 | + */ |
---|
| 1809 | +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
| 1810 | +{ |
---|
| 1811 | + struct pid *pid = f->private_data; |
---|
| 1812 | + struct pid_namespace *ns; |
---|
| 1813 | + pid_t nr = -1; |
---|
| 1814 | + |
---|
| 1815 | + if (likely(pid_has_task(pid, PIDTYPE_PID))) { |
---|
| 1816 | + ns = proc_pid_ns(file_inode(m->file)->i_sb); |
---|
| 1817 | + nr = pid_nr_ns(pid, ns); |
---|
| 1818 | + } |
---|
| 1819 | + |
---|
| 1820 | + seq_put_decimal_ll(m, "Pid:\t", nr); |
---|
| 1821 | + |
---|
| 1822 | +#ifdef CONFIG_PID_NS |
---|
| 1823 | + seq_put_decimal_ll(m, "\nNSpid:\t", nr); |
---|
| 1824 | + if (nr > 0) { |
---|
| 1825 | + int i; |
---|
| 1826 | + |
---|
| 1827 | + /* If nr is non-zero it means that 'pid' is valid and that |
---|
| 1828 | + * ns, i.e. the pid namespace associated with the procfs |
---|
| 1829 | + * instance, is in the pid namespace hierarchy of pid. |
---|
| 1830 | + * Start at one below the already printed level. |
---|
| 1831 | + */ |
---|
| 1832 | + for (i = ns->level + 1; i <= pid->level; i++) |
---|
| 1833 | + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); |
---|
| 1834 | + } |
---|
| 1835 | +#endif |
---|
| 1836 | + seq_putc(m, '\n'); |
---|
| 1837 | +} |
---|
| 1838 | +#endif |
---|
| 1839 | + |
---|
| 1840 | +/* |
---|
| 1841 | + * Poll support for process exit notification. |
---|
| 1842 | + */ |
---|
| 1843 | +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
| 1844 | +{ |
---|
| 1845 | + struct pid *pid = file->private_data; |
---|
| 1846 | + __poll_t poll_flags = 0; |
---|
| 1847 | + |
---|
| 1848 | + poll_wait(file, &pid->wait_pidfd, pts); |
---|
| 1849 | + |
---|
| 1850 | + /* |
---|
| 1851 | + * Inform pollers only when the whole thread group exits. |
---|
| 1852 | + * If the thread group leader exits before all other threads in the |
---|
| 1853 | + * group, then poll(2) should block, similar to the wait(2) family. |
---|
| 1854 | + */ |
---|
| 1855 | + if (thread_group_exited(pid)) |
---|
| 1856 | + poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
| 1857 | + |
---|
| 1858 | + return poll_flags; |
---|
| 1859 | +} |
---|
| 1860 | + |
---|
| 1861 | +const struct file_operations pidfd_fops = { |
---|
| 1862 | + .release = pidfd_release, |
---|
| 1863 | + .poll = pidfd_poll, |
---|
| 1864 | +#ifdef CONFIG_PROC_FS |
---|
| 1865 | + .show_fdinfo = pidfd_show_fdinfo, |
---|
| 1866 | +#endif |
---|
| 1867 | +}; |
---|
1688 | 1868 | |
---|
1689 | 1869 | static void __delayed_free_task(struct rcu_head *rhp) |
---|
1690 | 1870 | { |
---|
.. | .. |
---|
1699 | 1879 | call_rcu(&tsk->rcu, __delayed_free_task); |
---|
1700 | 1880 | else |
---|
1701 | 1881 | free_task(tsk); |
---|
1702 | | -} |
---|
1703 | | - |
---|
1704 | | -static int pidfd_release(struct inode *inode, struct file *file) |
---|
1705 | | -{ |
---|
1706 | | - struct pid *pid = file->private_data; |
---|
1707 | | - |
---|
1708 | | - file->private_data = NULL; |
---|
1709 | | - put_pid(pid); |
---|
1710 | | - return 0; |
---|
1711 | | -} |
---|
1712 | | - |
---|
1713 | | -#ifdef CONFIG_PROC_FS |
---|
1714 | | -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
1715 | | -{ |
---|
1716 | | - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); |
---|
1717 | | - struct pid *pid = f->private_data; |
---|
1718 | | - |
---|
1719 | | - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); |
---|
1720 | | - seq_putc(m, '\n'); |
---|
1721 | | -} |
---|
1722 | | -#endif |
---|
1723 | | - |
---|
1724 | | -/* |
---|
1725 | | - * Poll support for process exit notification. |
---|
1726 | | - */ |
---|
1727 | | -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
1728 | | -{ |
---|
1729 | | - struct task_struct *task; |
---|
1730 | | - struct pid *pid = file->private_data; |
---|
1731 | | - __poll_t poll_flags = 0; |
---|
1732 | | - |
---|
1733 | | - poll_wait(file, &pid->wait_pidfd, pts); |
---|
1734 | | - |
---|
1735 | | - rcu_read_lock(); |
---|
1736 | | - task = pid_task(pid, PIDTYPE_PID); |
---|
1737 | | - /* |
---|
1738 | | - * Inform pollers only when the whole thread group exits. |
---|
1739 | | - * If the thread group leader exits before all other threads in the |
---|
1740 | | - * group, then poll(2) should block, similar to the wait(2) family. |
---|
1741 | | - */ |
---|
1742 | | - if (!task || (task->exit_state && thread_group_empty(task))) |
---|
1743 | | - poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
1744 | | - rcu_read_unlock(); |
---|
1745 | | - |
---|
1746 | | - return poll_flags; |
---|
1747 | | -} |
---|
1748 | | - |
---|
1749 | | -const struct file_operations pidfd_fops = { |
---|
1750 | | - .release = pidfd_release, |
---|
1751 | | - .poll = pidfd_poll, |
---|
1752 | | -#ifdef CONFIG_PROC_FS |
---|
1753 | | - .show_fdinfo = pidfd_show_fdinfo, |
---|
1754 | | -#endif |
---|
1755 | | -}; |
---|
1756 | | - |
---|
1757 | | -/** |
---|
1758 | | - * pidfd_create() - Create a new pid file descriptor. |
---|
1759 | | - * |
---|
1760 | | - * @pid: struct pid that the pidfd will reference |
---|
1761 | | - * |
---|
1762 | | - * This creates a new pid file descriptor with the O_CLOEXEC flag set. |
---|
1763 | | - * |
---|
1764 | | - * Note, that this function can only be called after the fd table has |
---|
1765 | | - * been unshared to avoid leaking the pidfd to the new process. |
---|
1766 | | - * |
---|
1767 | | - * Return: On success, a cloexec pidfd is returned. |
---|
1768 | | - * On error, a negative errno number will be returned. |
---|
1769 | | - */ |
---|
1770 | | -static int pidfd_create(struct pid *pid) |
---|
1771 | | -{ |
---|
1772 | | - int fd; |
---|
1773 | | - |
---|
1774 | | - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), |
---|
1775 | | - O_RDWR | O_CLOEXEC); |
---|
1776 | | - if (fd < 0) |
---|
1777 | | - put_pid(pid); |
---|
1778 | | - |
---|
1779 | | - return fd; |
---|
1780 | 1882 | } |
---|
1781 | 1883 | |
---|
1782 | 1884 | static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) |
---|
.. | .. |
---|
1807 | 1909 | * flags). The actual kick-off is left to the caller. |
---|
1808 | 1910 | */ |
---|
1809 | 1911 | static __latent_entropy struct task_struct *copy_process( |
---|
1810 | | - unsigned long clone_flags, |
---|
1811 | | - unsigned long stack_start, |
---|
1812 | | - unsigned long stack_size, |
---|
1813 | | - int __user *parent_tidptr, |
---|
1814 | | - int __user *child_tidptr, |
---|
1815 | 1912 | struct pid *pid, |
---|
1816 | 1913 | int trace, |
---|
1817 | | - unsigned long tls, |
---|
1818 | | - int node) |
---|
| 1914 | + int node, |
---|
| 1915 | + struct kernel_clone_args *args) |
---|
1819 | 1916 | { |
---|
1820 | 1917 | int pidfd = -1, retval; |
---|
1821 | 1918 | struct task_struct *p; |
---|
1822 | 1919 | struct multiprocess_signals delayed; |
---|
| 1920 | + struct file *pidfile = NULL; |
---|
| 1921 | + u64 clone_flags = args->flags; |
---|
| 1922 | + struct nsproxy *nsp = current->nsproxy; |
---|
1823 | 1923 | |
---|
1824 | 1924 | /* |
---|
1825 | 1925 | * Don't allow sharing the root directory with processes in a different |
---|
.. | .. |
---|
1862 | 1962 | */ |
---|
1863 | 1963 | if (clone_flags & CLONE_THREAD) { |
---|
1864 | 1964 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
---|
1865 | | - (task_active_pid_ns(current) != |
---|
1866 | | - current->nsproxy->pid_ns_for_children)) |
---|
| 1965 | + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) |
---|
| 1966 | + return ERR_PTR(-EINVAL); |
---|
| 1967 | + } |
---|
| 1968 | + |
---|
| 1969 | + /* |
---|
| 1970 | + * If the new process will be in a different time namespace |
---|
| 1971 | + * do not allow it to share VM or a thread group with the forking task. |
---|
| 1972 | + */ |
---|
| 1973 | + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { |
---|
| 1974 | + if (nsp->time_ns != nsp->time_ns_for_children) |
---|
1867 | 1975 | return ERR_PTR(-EINVAL); |
---|
1868 | 1976 | } |
---|
1869 | 1977 | |
---|
1870 | 1978 | if (clone_flags & CLONE_PIDFD) { |
---|
1871 | 1979 | /* |
---|
1872 | | - * - CLONE_PARENT_SETTID is useless for pidfds and also |
---|
1873 | | - * parent_tidptr is used to return pidfds. |
---|
1874 | 1980 | * - CLONE_DETACHED is blocked so that we can potentially |
---|
1875 | 1981 | * reuse it later for CLONE_PIDFD. |
---|
1876 | 1982 | * - CLONE_THREAD is blocked until someone really needs it. |
---|
1877 | 1983 | */ |
---|
1878 | | - if (clone_flags & |
---|
1879 | | - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) |
---|
| 1984 | + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) |
---|
1880 | 1985 | return ERR_PTR(-EINVAL); |
---|
1881 | 1986 | } |
---|
1882 | 1987 | |
---|
.. | .. |
---|
1895 | 2000 | recalc_sigpending(); |
---|
1896 | 2001 | spin_unlock_irq(¤t->sighand->siglock); |
---|
1897 | 2002 | retval = -ERESTARTNOINTR; |
---|
1898 | | - if (signal_pending(current)) |
---|
| 2003 | + if (task_sigpending(current)) |
---|
1899 | 2004 | goto fork_out; |
---|
1900 | 2005 | |
---|
1901 | 2006 | retval = -ENOMEM; |
---|
1902 | 2007 | p = dup_task_struct(current, node); |
---|
1903 | 2008 | if (!p) |
---|
1904 | 2009 | goto fork_out; |
---|
| 2010 | + if (args->io_thread) { |
---|
| 2011 | + /* |
---|
| 2012 | + * Mark us an IO worker, and block any signal that isn't |
---|
| 2013 | + * fatal or STOP |
---|
| 2014 | + */ |
---|
| 2015 | + p->flags |= PF_IO_WORKER; |
---|
| 2016 | + siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
---|
| 2017 | + } |
---|
1905 | 2018 | |
---|
1906 | 2019 | cpufreq_task_times_init(p); |
---|
1907 | 2020 | |
---|
.. | .. |
---|
1911 | 2024 | * p->set_child_tid which is (ab)used as a kthread's data pointer for |
---|
1912 | 2025 | * kernel threads (PF_KTHREAD). |
---|
1913 | 2026 | */ |
---|
1914 | | - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
---|
| 2027 | + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; |
---|
1915 | 2028 | /* |
---|
1916 | 2029 | * Clear TID on mm_release()? |
---|
1917 | 2030 | */ |
---|
1918 | | - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
---|
| 2031 | + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; |
---|
1919 | 2032 | |
---|
1920 | 2033 | ftrace_graph_init_task(p); |
---|
1921 | 2034 | |
---|
1922 | 2035 | rt_mutex_init_task(p); |
---|
1923 | 2036 | |
---|
| 2037 | + lockdep_assert_irqs_enabled(); |
---|
1924 | 2038 | #ifdef CONFIG_PROVE_LOCKING |
---|
1925 | | - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
---|
1926 | 2039 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); |
---|
1927 | 2040 | #endif |
---|
1928 | 2041 | retval = -EAGAIN; |
---|
.. | .. |
---|
1944 | 2057 | * to stop root fork bombs. |
---|
1945 | 2058 | */ |
---|
1946 | 2059 | retval = -EAGAIN; |
---|
1947 | | - if (nr_threads >= max_threads) |
---|
| 2060 | + if (data_race(nr_threads >= max_threads)) |
---|
1948 | 2061 | goto bad_fork_cleanup_count; |
---|
1949 | 2062 | |
---|
1950 | 2063 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
---|
.. | .. |
---|
1957 | 2070 | spin_lock_init(&p->alloc_lock); |
---|
1958 | 2071 | |
---|
1959 | 2072 | init_sigpending(&p->pending); |
---|
1960 | | - p->sigqueue_cache = NULL; |
---|
1961 | 2073 | |
---|
1962 | 2074 | p->utime = p->stime = p->gtime = 0; |
---|
1963 | 2075 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME |
---|
.. | .. |
---|
1969 | 2081 | seqcount_init(&p->vtime.seqcount); |
---|
1970 | 2082 | p->vtime.starttime = 0; |
---|
1971 | 2083 | p->vtime.state = VTIME_INACTIVE; |
---|
| 2084 | +#endif |
---|
| 2085 | + |
---|
| 2086 | +#ifdef CONFIG_IO_URING |
---|
| 2087 | + p->io_uring = NULL; |
---|
1972 | 2088 | #endif |
---|
1973 | 2089 | |
---|
1974 | 2090 | #if defined(SPLIT_RSS_COUNTING) |
---|
.. | .. |
---|
1984 | 2100 | task_io_accounting_init(&p->ioac); |
---|
1985 | 2101 | acct_clear_integrals(p); |
---|
1986 | 2102 | |
---|
1987 | | - posix_cpu_timers_init(p); |
---|
| 2103 | + posix_cputimers_init(&p->posix_cputimers); |
---|
1988 | 2104 | |
---|
1989 | 2105 | p->io_context = NULL; |
---|
1990 | 2106 | audit_set_context(p, NULL); |
---|
.. | .. |
---|
2000 | 2116 | #ifdef CONFIG_CPUSETS |
---|
2001 | 2117 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
---|
2002 | 2118 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
---|
2003 | | - seqcount_init(&p->mems_allowed_seq); |
---|
| 2119 | + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); |
---|
2004 | 2120 | #endif |
---|
2005 | 2121 | #ifdef CONFIG_TRACE_IRQFLAGS |
---|
2006 | | - p->irq_events = 0; |
---|
2007 | | - p->hardirqs_enabled = 0; |
---|
2008 | | - p->hardirq_enable_ip = 0; |
---|
2009 | | - p->hardirq_enable_event = 0; |
---|
2010 | | - p->hardirq_disable_ip = _THIS_IP_; |
---|
2011 | | - p->hardirq_disable_event = 0; |
---|
2012 | | - p->softirqs_enabled = 1; |
---|
2013 | | - p->softirq_enable_ip = _THIS_IP_; |
---|
2014 | | - p->softirq_enable_event = 0; |
---|
2015 | | - p->softirq_disable_ip = 0; |
---|
2016 | | - p->softirq_disable_event = 0; |
---|
2017 | | - p->hardirq_context = 0; |
---|
2018 | | - p->softirq_context = 0; |
---|
| 2122 | + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); |
---|
| 2123 | + p->irqtrace.hardirq_disable_ip = _THIS_IP_; |
---|
| 2124 | + p->irqtrace.softirq_enable_ip = _THIS_IP_; |
---|
| 2125 | + p->softirqs_enabled = 1; |
---|
| 2126 | + p->softirq_context = 0; |
---|
2019 | 2127 | #endif |
---|
2020 | 2128 | |
---|
2021 | 2129 | p->pagefault_disabled = 0; |
---|
2022 | 2130 | |
---|
2023 | 2131 | #ifdef CONFIG_LOCKDEP |
---|
2024 | | - p->lockdep_depth = 0; /* no locks held yet */ |
---|
2025 | | - p->curr_chain_key = 0; |
---|
2026 | | - p->lockdep_recursion = 0; |
---|
2027 | 2132 | lockdep_init_task(p); |
---|
2028 | 2133 | #endif |
---|
2029 | 2134 | |
---|
.. | .. |
---|
2075 | 2180 | retval = copy_io(clone_flags, p); |
---|
2076 | 2181 | if (retval) |
---|
2077 | 2182 | goto bad_fork_cleanup_namespaces; |
---|
2078 | | - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); |
---|
| 2183 | + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); |
---|
2079 | 2184 | if (retval) |
---|
2080 | 2185 | goto bad_fork_cleanup_io; |
---|
2081 | 2186 | |
---|
| 2187 | + stackleak_task_init(p); |
---|
| 2188 | + |
---|
2082 | 2189 | if (pid != &init_struct_pid) { |
---|
2083 | | - pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
---|
| 2190 | + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, |
---|
| 2191 | + args->set_tid_size); |
---|
2084 | 2192 | if (IS_ERR(pid)) { |
---|
2085 | 2193 | retval = PTR_ERR(pid); |
---|
2086 | 2194 | goto bad_fork_cleanup_thread; |
---|
.. | .. |
---|
2093 | 2201 | * if the fd table isn't shared). |
---|
2094 | 2202 | */ |
---|
2095 | 2203 | if (clone_flags & CLONE_PIDFD) { |
---|
2096 | | - retval = pidfd_create(pid); |
---|
| 2204 | + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); |
---|
2097 | 2205 | if (retval < 0) |
---|
2098 | 2206 | goto bad_fork_free_pid; |
---|
2099 | 2207 | |
---|
2100 | 2208 | pidfd = retval; |
---|
2101 | | - retval = put_user(pidfd, parent_tidptr); |
---|
| 2209 | + |
---|
| 2210 | + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, |
---|
| 2211 | + O_RDWR | O_CLOEXEC); |
---|
| 2212 | + if (IS_ERR(pidfile)) { |
---|
| 2213 | + put_unused_fd(pidfd); |
---|
| 2214 | + retval = PTR_ERR(pidfile); |
---|
| 2215 | + goto bad_fork_free_pid; |
---|
| 2216 | + } |
---|
| 2217 | + get_pid(pid); /* held by pidfile now */ |
---|
| 2218 | + |
---|
| 2219 | + retval = put_user(pidfd, args->pidfd); |
---|
2102 | 2220 | if (retval) |
---|
2103 | 2221 | goto bad_fork_put_pidfd; |
---|
2104 | 2222 | } |
---|
.. | .. |
---|
2123 | 2241 | #ifdef TIF_SYSCALL_EMU |
---|
2124 | 2242 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
---|
2125 | 2243 | #endif |
---|
2126 | | - clear_all_latency_tracing(p); |
---|
| 2244 | + clear_tsk_latency_tracing(p); |
---|
2127 | 2245 | |
---|
2128 | 2246 | /* ok, now we should be set up.. */ |
---|
2129 | 2247 | p->pid = pid_nr(pid); |
---|
.. | .. |
---|
2142 | 2260 | p->pdeath_signal = 0; |
---|
2143 | 2261 | INIT_LIST_HEAD(&p->thread_group); |
---|
2144 | 2262 | p->task_works = NULL; |
---|
| 2263 | + clear_posix_cputimers_work(p); |
---|
2145 | 2264 | |
---|
2146 | | - cgroup_threadgroup_change_begin(current); |
---|
2147 | 2265 | /* |
---|
2148 | 2266 | * Ensure that the cgroup subsystem policies allow the new process to be |
---|
2149 | | - * forked. It should be noted the the new process's css_set can be changed |
---|
| 2267 | + * forked. It should be noted that the new process's css_set can be changed |
---|
2150 | 2268 | * between here and cgroup_post_fork() if an organisation operation is in |
---|
2151 | 2269 | * progress. |
---|
2152 | 2270 | */ |
---|
2153 | | - retval = cgroup_can_fork(p); |
---|
| 2271 | + retval = cgroup_can_fork(p, args); |
---|
2154 | 2272 | if (retval) |
---|
2155 | | - goto bad_fork_cgroup_threadgroup_change_end; |
---|
| 2273 | + goto bad_fork_put_pidfd; |
---|
| 2274 | + |
---|
| 2275 | + /* |
---|
| 2276 | + * Now that the cgroups are pinned, re-clone the parent cgroup and put |
---|
| 2277 | + * the new task on the correct runqueue. All this *before* the task |
---|
| 2278 | + * becomes visible. |
---|
| 2279 | + * |
---|
| 2280 | + * This isn't part of ->can_fork() because while the re-cloning is |
---|
| 2281 | + * cgroup specific, it unconditionally needs to place the task on a |
---|
| 2282 | + * runqueue. |
---|
| 2283 | + */ |
---|
| 2284 | + sched_cgroup_fork(p, args); |
---|
2156 | 2285 | |
---|
2157 | 2286 | /* |
---|
2158 | 2287 | * From this point on we must avoid any synchronous user-space |
---|
.. | .. |
---|
2163 | 2292 | */ |
---|
2164 | 2293 | |
---|
2165 | 2294 | p->start_time = ktime_get_ns(); |
---|
2166 | | - p->real_start_time = ktime_get_boot_ns(); |
---|
| 2295 | + p->start_boottime = ktime_get_boottime_ns(); |
---|
2167 | 2296 | |
---|
2168 | 2297 | /* |
---|
2169 | 2298 | * Make it visible to the rest of the system, but dont wake it up yet. |
---|
.. | .. |
---|
2182 | 2311 | } else { |
---|
2183 | 2312 | p->real_parent = current; |
---|
2184 | 2313 | p->parent_exec_id = current->self_exec_id; |
---|
2185 | | - p->exit_signal = (clone_flags & CSIGNAL); |
---|
| 2314 | + p->exit_signal = args->exit_signal; |
---|
2186 | 2315 | } |
---|
2187 | 2316 | |
---|
2188 | 2317 | klp_copy_process(p); |
---|
2189 | 2318 | |
---|
2190 | 2319 | spin_lock(¤t->sighand->siglock); |
---|
2191 | | - |
---|
2192 | | - /* |
---|
2193 | | - * Copy seccomp details explicitly here, in case they were changed |
---|
2194 | | - * before holding sighand lock. |
---|
2195 | | - */ |
---|
2196 | | - copy_seccomp(p); |
---|
2197 | 2320 | |
---|
2198 | 2321 | rseq_fork(p, clone_flags); |
---|
2199 | 2322 | |
---|
.. | .. |
---|
2209 | 2332 | goto bad_fork_cancel_cgroup; |
---|
2210 | 2333 | } |
---|
2211 | 2334 | |
---|
| 2335 | + /* No more failure paths after this point. */ |
---|
| 2336 | + |
---|
| 2337 | + /* |
---|
| 2338 | + * Copy seccomp details explicitly here, in case they were changed |
---|
| 2339 | + * before holding sighand lock. |
---|
| 2340 | + */ |
---|
| 2341 | + copy_seccomp(p); |
---|
2212 | 2342 | |
---|
2213 | 2343 | init_task_pid_links(p); |
---|
2214 | 2344 | if (likely(p->pid)) { |
---|
.. | .. |
---|
2242 | 2372 | } else { |
---|
2243 | 2373 | current->signal->nr_threads++; |
---|
2244 | 2374 | atomic_inc(¤t->signal->live); |
---|
2245 | | - atomic_inc(¤t->signal->sigcnt); |
---|
| 2375 | + refcount_inc(¤t->signal->sigcnt); |
---|
2246 | 2376 | task_join_group_stop(p); |
---|
2247 | 2377 | list_add_tail_rcu(&p->thread_group, |
---|
2248 | 2378 | &p->group_leader->thread_group); |
---|
.. | .. |
---|
2258 | 2388 | syscall_tracepoint_update(p); |
---|
2259 | 2389 | write_unlock_irq(&tasklist_lock); |
---|
2260 | 2390 | |
---|
| 2391 | + if (pidfile) |
---|
| 2392 | + fd_install(pidfd, pidfile); |
---|
| 2393 | + |
---|
2261 | 2394 | proc_fork_connector(p); |
---|
2262 | | - cgroup_post_fork(p); |
---|
2263 | | - cgroup_threadgroup_change_end(current); |
---|
| 2395 | + sched_post_fork(p); |
---|
| 2396 | + cgroup_post_fork(p, args); |
---|
2264 | 2397 | perf_event_fork(p); |
---|
2265 | 2398 | |
---|
2266 | 2399 | trace_task_newtask(p, clone_flags); |
---|
.. | .. |
---|
2273 | 2406 | bad_fork_cancel_cgroup: |
---|
2274 | 2407 | spin_unlock(&current->sighand->siglock); 
---|
2275 | 2408 | write_unlock_irq(&tasklist_lock); |
---|
2276 | | - cgroup_cancel_fork(p); |
---|
2277 | | -bad_fork_cgroup_threadgroup_change_end: |
---|
2278 | | - cgroup_threadgroup_change_end(current); |
---|
| 2409 | + cgroup_cancel_fork(p, args); |
---|
2279 | 2410 | bad_fork_put_pidfd: |
---|
2280 | | - if (clone_flags & CLONE_PIDFD) |
---|
2281 | | - ksys_close(pidfd); |
---|
| 2411 | + if (clone_flags & CLONE_PIDFD) { |
---|
| 2412 | + fput(pidfile); |
---|
| 2413 | + put_unused_fd(pidfd); |
---|
| 2414 | + } |
---|
2282 | 2415 | bad_fork_free_pid: |
---|
2283 | 2416 | if (pid != &init_struct_pid) |
---|
2284 | 2417 | free_pid(pid); |
---|
.. | .. |
---|
2342 | 2475 | } |
---|
2343 | 2476 | } |
---|
2344 | 2477 | |
---|
2345 | | -struct task_struct *fork_idle(int cpu) |
---|
| 2478 | +struct task_struct * __init fork_idle(int cpu) |
---|
2346 | 2479 | { |
---|
2347 | 2480 | struct task_struct *task; |
---|
2348 | | - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, |
---|
2349 | | - cpu_to_node(cpu)); |
---|
| 2481 | + struct kernel_clone_args args = { |
---|
| 2482 | + .flags = CLONE_VM, |
---|
| 2483 | + }; |
---|
| 2484 | + |
---|
| 2485 | + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); |
---|
2350 | 2486 | if (!IS_ERR(task)) { |
---|
2351 | 2487 | init_idle_pids(task); |
---|
2352 | 2488 | init_idle(task, cpu); |
---|
.. | .. |
---|
2356 | 2492 | } |
---|
2357 | 2493 | |
---|
2358 | 2494 | /* |
---|
| 2495 | + * This is like kernel_clone(), but shaved down and tailored to just |
---|
| 2496 | + * creating io_uring workers. It returns a created task, or an error pointer. |
---|
| 2497 | + * The returned task is inactive, and the caller must fire it up through |
---|
| 2498 | + * wake_up_new_task(p). All signals are blocked in the created task. |
---|
| 2499 | + */ |
---|
| 2500 | +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) |
---|
| 2501 | +{ |
---|
| 2502 | + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| |
---|
| 2503 | + CLONE_IO; |
---|
| 2504 | + struct kernel_clone_args args = { |
---|
| 2505 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2506 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2507 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2508 | + .stack = (unsigned long)fn, |
---|
| 2509 | + .stack_size = (unsigned long)arg, |
---|
| 2510 | + .io_thread = 1, |
---|
| 2511 | + }; |
---|
| 2512 | + |
---|
| 2513 | + return copy_process(NULL, 0, node, &args); |
---|
| 2514 | +} |
---|
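As an illustrative sketch only (the helper names my_io_worker, my_start_io_worker and my_data are placeholders, not from this file), an in-kernel caller would pair create_io_thread() with wake_up_new_task() roughly like this; io_uring's worker setup follows the same pattern:

static int my_io_worker(void *data)
{
	/* Worker main loop; all signals are blocked when this runs. */
	return 0;
}

static int my_start_io_worker(void *my_data)
{
	struct task_struct *tsk;

	/* New task shares VM, files, fs, signal handlers and I/O context. */
	tsk = create_io_thread(my_io_worker, my_data, NUMA_NO_NODE);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* copy_process() leaves the task inactive; it only runs once woken. */
	wake_up_new_task(tsk);
	return 0;
}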
| 2515 | + |
---|
| 2516 | +/* |
---|
2359 | 2517 | * Ok, this is the main fork-routine. |
---|
2360 | 2518 | * |
---|
2361 | 2519 | * It copies the process, and if successful kick-starts |
---|
2362 | 2520 | * it and waits for it to finish using the VM if required. |
---|
| 2521 | + * |
---|
| 2522 | + * args->exit_signal is expected to be checked for sanity by the caller. |
---|
2363 | 2523 | */ |
---|
2364 | | -long _do_fork(unsigned long clone_flags, |
---|
2365 | | - unsigned long stack_start, |
---|
2366 | | - unsigned long stack_size, |
---|
2367 | | - int __user *parent_tidptr, |
---|
2368 | | - int __user *child_tidptr, |
---|
2369 | | - unsigned long tls) |
---|
| 2524 | +pid_t kernel_clone(struct kernel_clone_args *args) |
---|
2370 | 2525 | { |
---|
| 2526 | + u64 clone_flags = args->flags; |
---|
2371 | 2527 | struct completion vfork; |
---|
2372 | 2528 | struct pid *pid; |
---|
2373 | 2529 | struct task_struct *p; |
---|
2374 | 2530 | int trace = 0; |
---|
2375 | | - long nr; |
---|
| 2531 | + pid_t nr; |
---|
| 2532 | + |
---|
| 2533 | + /* |
---|
| 2534 | + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument |
---|
| 2535 | + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are |
---|
| 2536 | + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate |
---|
| 2537 | + * field in struct clone_args and it still doesn't make sense to have |
---|
| 2538 | + * them both point at the same memory location. Performing this check |
---|
| 2539 | + * here has the advantage that we don't need to have a separate helper |
---|
| 2540 | + * to check for legacy clone(). |
---|
| 2541 | + */ |
---|
| 2542 | + if ((args->flags & CLONE_PIDFD) && |
---|
| 2543 | + (args->flags & CLONE_PARENT_SETTID) && |
---|
| 2544 | + (args->pidfd == args->parent_tid)) |
---|
| 2545 | + return -EINVAL; |
---|
2376 | 2546 | |
---|
2377 | 2547 | /* |
---|
2378 | 2548 | * Determine whether and which event to report to ptracer. When |
---|
.. | .. |
---|
2383 | 2553 | if (!(clone_flags & CLONE_UNTRACED)) { |
---|
2384 | 2554 | if (clone_flags & CLONE_VFORK) |
---|
2385 | 2555 | trace = PTRACE_EVENT_VFORK; |
---|
2386 | | - else if ((clone_flags & CSIGNAL) != SIGCHLD) |
---|
| 2556 | + else if (args->exit_signal != SIGCHLD) |
---|
2387 | 2557 | trace = PTRACE_EVENT_CLONE; |
---|
2388 | 2558 | else |
---|
2389 | 2559 | trace = PTRACE_EVENT_FORK; |
---|
.. | .. |
---|
2392 | 2562 | trace = 0; |
---|
2393 | 2563 | } |
---|
2394 | 2564 | |
---|
2395 | | - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, |
---|
2396 | | - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
---|
| 2565 | + p = copy_process(NULL, trace, NUMA_NO_NODE, args); |
---|
2397 | 2566 | add_latent_entropy(); |
---|
2398 | 2567 | |
---|
2399 | 2568 | if (IS_ERR(p)) |
---|
.. | .. |
---|
2411 | 2580 | nr = pid_vnr(pid); |
---|
2412 | 2581 | |
---|
2413 | 2582 | if (clone_flags & CLONE_PARENT_SETTID) |
---|
2414 | | - put_user(nr, parent_tidptr); |
---|
| 2583 | + put_user(nr, args->parent_tid); |
---|
2415 | 2584 | |
---|
2416 | 2585 | if (clone_flags & CLONE_VFORK) { |
---|
2417 | 2586 | p->vfork_done = &vfork; |
---|
.. | .. |
---|
2434 | 2603 | return nr; |
---|
2435 | 2604 | } |
---|
2436 | 2605 | |
---|
2437 | | -#ifndef CONFIG_HAVE_COPY_THREAD_TLS |
---|
2438 | | -/* For compatibility with architectures that call do_fork directly rather than |
---|
2439 | | - * using the syscall entry points below. */ |
---|
2440 | | -long do_fork(unsigned long clone_flags, |
---|
2441 | | - unsigned long stack_start, |
---|
2442 | | - unsigned long stack_size, |
---|
2443 | | - int __user *parent_tidptr, |
---|
2444 | | - int __user *child_tidptr) |
---|
2445 | | -{ |
---|
2446 | | - return _do_fork(clone_flags, stack_start, stack_size, |
---|
2447 | | - parent_tidptr, child_tidptr, 0); |
---|
2448 | | -} |
---|
2449 | | -#endif |
---|
2450 | | - |
---|
2451 | 2606 | /* |
---|
2452 | 2607 | * Create a kernel thread. |
---|
2453 | 2608 | */ |
---|
2454 | 2609 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
---|
2455 | 2610 | { |
---|
2456 | | - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
---|
2457 | | - (unsigned long)arg, NULL, NULL, 0); |
---|
| 2611 | + struct kernel_clone_args args = { |
---|
| 2612 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2613 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2614 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2615 | + .stack = (unsigned long)fn, |
---|
| 2616 | + .stack_size = (unsigned long)arg, |
---|
| 2617 | + }; |
---|
| 2618 | + |
---|
| 2619 | + return kernel_clone(&args); |
---|
2458 | 2620 | } |
---|
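For context, a minimal sketch of a direct caller (the kthread machinery invokes kernel_thread() in essentially this form); my_thread_fn, my_spawn_thread and my_data are placeholder names, and the low CSIGNAL bits of the flags become the exit signal shown above:

static int my_thread_fn(void *data)
{
	/* Runs in the new kernel thread until it returns or calls do_exit(). */
	return 0;
}

static pid_t my_spawn_thread(void *my_data)
{
	/* SIGCHLD in the low bits ends up in args.exit_signal. */
	return kernel_thread(my_thread_fn, my_data,
			     CLONE_FS | CLONE_FILES | SIGCHLD);
}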
2459 | 2621 | |
---|
2460 | 2622 | #ifdef __ARCH_WANT_SYS_FORK |
---|
2461 | 2623 | SYSCALL_DEFINE0(fork) |
---|
2462 | 2624 | { |
---|
2463 | 2625 | #ifdef CONFIG_MMU |
---|
2464 | | - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); |
---|
| 2626 | + struct kernel_clone_args args = { |
---|
| 2627 | + .exit_signal = SIGCHLD, |
---|
| 2628 | + }; |
---|
| 2629 | + |
---|
| 2630 | + return kernel_clone(&args); |
---|
2465 | 2631 | #else |
---|
2466 | 2632 | /* can not support in nommu mode */ |
---|
2467 | 2633 | return -EINVAL; |
---|
.. | .. |
---|
2472 | 2638 | #ifdef __ARCH_WANT_SYS_VFORK |
---|
2473 | 2639 | SYSCALL_DEFINE0(vfork) |
---|
2474 | 2640 | { |
---|
2475 | | - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, |
---|
2476 | | - 0, NULL, NULL, 0); |
---|
| 2641 | + struct kernel_clone_args args = { |
---|
| 2642 | + .flags = CLONE_VFORK | CLONE_VM, |
---|
| 2643 | + .exit_signal = SIGCHLD, |
---|
| 2644 | + }; |
---|
| 2645 | + |
---|
| 2646 | + return kernel_clone(&args); |
---|
2477 | 2647 | } |
---|
2478 | 2648 | #endif |
---|
2479 | 2649 | |
---|
.. | .. |
---|
2501 | 2671 | unsigned long, tls) |
---|
2502 | 2672 | #endif |
---|
2503 | 2673 | { |
---|
2504 | | - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); |
---|
| 2674 | + struct kernel_clone_args args = { |
---|
| 2675 | + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
---|
| 2676 | + .pidfd = parent_tidptr, |
---|
| 2677 | + .child_tid = child_tidptr, |
---|
| 2678 | + .parent_tid = parent_tidptr, |
---|
| 2679 | + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
---|
| 2680 | + .stack = newsp, |
---|
| 2681 | + .tls = tls, |
---|
| 2682 | + }; |
---|
| 2683 | + |
---|
| 2684 | + return kernel_clone(&args); |
---|
| 2685 | +} |
---|
| 2686 | +#endif |
---|
| 2687 | + |
---|
| 2688 | +#ifdef __ARCH_WANT_SYS_CLONE3 |
---|
| 2689 | + |
---|
| 2690 | +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, |
---|
| 2691 | + struct clone_args __user *uargs, |
---|
| 2692 | + size_t usize) |
---|
| 2693 | +{ |
---|
| 2694 | + int err; |
---|
| 2695 | + struct clone_args args; |
---|
| 2696 | + pid_t *kset_tid = kargs->set_tid; |
---|
| 2697 | + |
---|
| 2698 | + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != |
---|
| 2699 | + CLONE_ARGS_SIZE_VER0); |
---|
| 2700 | + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != |
---|
| 2701 | + CLONE_ARGS_SIZE_VER1); |
---|
| 2702 | + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != |
---|
| 2703 | + CLONE_ARGS_SIZE_VER2); |
---|
| 2704 | + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); |
---|
| 2705 | + |
---|
| 2706 | + if (unlikely(usize > PAGE_SIZE)) |
---|
| 2707 | + return -E2BIG; |
---|
| 2708 | + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) |
---|
| 2709 | + return -EINVAL; |
---|
| 2710 | + |
---|
| 2711 | + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); |
---|
| 2712 | + if (err) |
---|
| 2713 | + return err; |
---|
| 2714 | + |
---|
| 2715 | + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) |
---|
| 2716 | + return -EINVAL; |
---|
| 2717 | + |
---|
| 2718 | + if (unlikely(!args.set_tid && args.set_tid_size > 0)) |
---|
| 2719 | + return -EINVAL; |
---|
| 2720 | + |
---|
| 2721 | + if (unlikely(args.set_tid && args.set_tid_size == 0)) |
---|
| 2722 | + return -EINVAL; |
---|
| 2723 | + |
---|
| 2724 | + /* |
---|
| 2725 | + * Verify that higher 32bits of exit_signal are unset and that |
---|
| 2726 | + * it is a valid signal |
---|
| 2727 | + */ |
---|
| 2728 | + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || |
---|
| 2729 | + !valid_signal(args.exit_signal))) |
---|
| 2730 | + return -EINVAL; |
---|
| 2731 | + |
---|
| 2732 | + if ((args.flags & CLONE_INTO_CGROUP) && |
---|
| 2733 | + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) |
---|
| 2734 | + return -EINVAL; |
---|
| 2735 | + |
---|
| 2736 | + *kargs = (struct kernel_clone_args){ |
---|
| 2737 | + .flags = args.flags, |
---|
| 2738 | + .pidfd = u64_to_user_ptr(args.pidfd), |
---|
| 2739 | + .child_tid = u64_to_user_ptr(args.child_tid), |
---|
| 2740 | + .parent_tid = u64_to_user_ptr(args.parent_tid), |
---|
| 2741 | + .exit_signal = args.exit_signal, |
---|
| 2742 | + .stack = args.stack, |
---|
| 2743 | + .stack_size = args.stack_size, |
---|
| 2744 | + .tls = args.tls, |
---|
| 2745 | + .set_tid_size = args.set_tid_size, |
---|
| 2746 | + .cgroup = args.cgroup, |
---|
| 2747 | + }; |
---|
| 2748 | + |
---|
| 2749 | + if (args.set_tid && |
---|
| 2750 | + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), |
---|
| 2751 | + (kargs->set_tid_size * sizeof(pid_t)))) |
---|
| 2752 | + return -EFAULT; |
---|
| 2753 | + |
---|
| 2754 | + kargs->set_tid = kset_tid; |
---|
| 2755 | + |
---|
| 2756 | + return 0; |
---|
| 2757 | +} |
---|
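Seen from userspace, the size-versioned copy above means an older caller can keep passing the smaller CLONE_ARGS_SIZE_VER0 layout and the kernel zero-fills the fields it added later (set_tid, set_tid_size, cgroup). A hedged fragment of such a call (pidfd and child are the caller's own variables; needs <linux/sched.h>, <sys/syscall.h> and <signal.h>):

	int pidfd = -1;
	struct clone_args args = {
		.flags       = CLONE_PIDFD,
		.pidfd       = (__u64)(uintptr_t)&pidfd,
		.exit_signal = SIGCHLD,
	};

	/*
	 * The original 64-byte layout is still accepted; a size above
	 * PAGE_SIZE, or extra non-zero trailing bytes, is what makes
	 * copy_struct_from_user() reject the call.
	 */
	pid_t child = syscall(__NR_clone3, &args, CLONE_ARGS_SIZE_VER0);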
| 2758 | + |
---|
| 2759 | +/** |
---|
| 2760 | + * clone3_stack_valid - check and prepare stack |
---|
| 2761 | + * @kargs: kernel clone args |
---|
| 2762 | + * |
---|
| 2763 | + * Verify that the stack arguments userspace gave us are sane. |
---|
| 2764 | + * In addition, set the stack direction for userspace since it's easy for us to |
---|
| 2765 | + * determine. |
---|
| 2766 | + */ |
---|
| 2767 | +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) |
---|
| 2768 | +{ |
---|
| 2769 | + if (kargs->stack == 0) { |
---|
| 2770 | + if (kargs->stack_size > 0) |
---|
| 2771 | + return false; |
---|
| 2772 | + } else { |
---|
| 2773 | + if (kargs->stack_size == 0) |
---|
| 2774 | + return false; |
---|
| 2775 | + |
---|
| 2776 | + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) |
---|
| 2777 | + return false; |
---|
| 2778 | + |
---|
| 2779 | +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) |
---|
| 2780 | + kargs->stack += kargs->stack_size; |
---|
| 2781 | +#endif |
---|
| 2782 | + } |
---|
| 2783 | + |
---|
| 2784 | + return true; |
---|
| 2785 | +} |
---|
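To make the direction fix-up above concrete: on the usual downward-growing stack, userspace hands clone3() the lowest address of the stack buffer plus its size, and this helper itself points the child at the top. A hedged userspace-side fragment (STACK_SIZE and args are the caller's own):

	void *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

	if (stack == MAP_FAILED)
		return -1;

	args.stack      = (__u64)(uintptr_t)stack;	/* lowest address */
	args.stack_size = STACK_SIZE;			/* not a top-of-stack pointer */
	/* The child's initial stack pointer ends up near stack + STACK_SIZE. */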
| 2786 | + |
---|
| 2787 | +static bool clone3_args_valid(struct kernel_clone_args *kargs) |
---|
| 2788 | +{ |
---|
| 2789 | + /* Verify that no unknown flags are passed along. */ |
---|
| 2790 | + if (kargs->flags & |
---|
| 2791 | + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) |
---|
| 2792 | + return false; |
---|
| 2793 | + |
---|
| 2794 | + /* |
---|
| 2795 | + * - make the CLONE_DETACHED bit reuseable for clone3 |
---|
| 2796 | + * - make the CSIGNAL bits reuseable for clone3 |
---|
| 2797 | + */ |
---|
| 2798 | + if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) |
---|
| 2799 | + return false; |
---|
| 2800 | + |
---|
| 2801 | + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == |
---|
| 2802 | + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) |
---|
| 2803 | + return false; |
---|
| 2804 | + |
---|
| 2805 | + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && |
---|
| 2806 | + kargs->exit_signal) |
---|
| 2807 | + return false; |
---|
| 2808 | + |
---|
| 2809 | + if (!clone3_stack_valid(kargs)) |
---|
| 2810 | + return false; |
---|
| 2811 | + |
---|
| 2812 | + return true; |
---|
| 2813 | +} |
---|
| 2814 | + |
---|
| 2815 | +/** |
---|
| 2816 | + * clone3 - create a new process with specific properties |
---|
| 2817 | + * @uargs: argument structure |
---|
| 2818 | + * @size: size of @uargs |
---|
| 2819 | + * |
---|
| 2820 | + * clone3() is the extensible successor to clone()/clone2(). |
---|
| 2821 | + * It takes a struct as argument that is versioned by its size. |
---|
| 2822 | + * |
---|
| 2823 | + * Return: On success, a positive PID for the child process. |
---|
| 2824 | + * On error, a negative errno number. |
---|
| 2825 | + */ |
---|
| 2826 | +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) |
---|
| 2827 | +{ |
---|
| 2828 | + int err; |
---|
| 2829 | + |
---|
| 2830 | + struct kernel_clone_args kargs; |
---|
| 2831 | + pid_t set_tid[MAX_PID_NS_LEVEL]; |
---|
| 2832 | + |
---|
| 2833 | + kargs.set_tid = set_tid; |
---|
| 2834 | + |
---|
| 2835 | + err = copy_clone_args_from_user(&kargs, uargs, size); |
---|
| 2836 | + if (err) |
---|
| 2837 | + return err; |
---|
| 2838 | + |
---|
| 2839 | + if (!clone3_args_valid(&kargs)) |
---|
| 2840 | + return -EINVAL; |
---|
| 2841 | + |
---|
| 2842 | + return kernel_clone(&kargs); |
---|
2505 | 2843 | } |
---|
2506 | 2844 | #endif |
---|
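Since glibc does not wrap clone3(), userspace reaches it through syscall(2). A minimal, self-contained sketch of a fork()-style use, assuming headers that define __NR_clone3 and struct clone_args (error handling kept short):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_* */
#include <sys/syscall.h>
#include <sys/wait.h>
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static pid_t my_clone3(struct clone_args *args, size_t size)
{
	return syscall(__NR_clone3, args, size);
}

int main(void)
{
	struct clone_args args;
	pid_t pid;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;	/* behave like a plain fork() */

	pid = my_clone3(&args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0) {
		/* child */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("child %d reaped\n", pid);
	return 0;
}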
2507 | 2845 | |
---|
.. | .. |
---|
2549 | 2887 | init_waitqueue_head(&sighand->signalfd_wqh); |
---|
2550 | 2888 | } |
---|
2551 | 2889 | |
---|
2552 | | -void __init proc_caches_init(void) |
---|
| 2890 | +void __init mm_cache_init(void) |
---|
2553 | 2891 | { |
---|
2554 | 2892 | unsigned int mm_size; |
---|
2555 | 2893 | |
---|
| 2894 | + /* |
---|
| 2895 | + * The mm_cpumask is located at the end of mm_struct, and is |
---|
| 2896 | + * dynamically sized based on the maximum CPU number this system |
---|
| 2897 | + * can have, taking hotplug into account (nr_cpu_ids). |
---|
| 2898 | + */ |
---|
| 2899 | + mm_size = sizeof(struct mm_struct) + cpumask_size(); |
---|
| 2900 | + |
---|
| 2901 | + mm_cachep = kmem_cache_create_usercopy("mm_struct", |
---|
| 2902 | + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, |
---|
| 2903 | + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
---|
| 2904 | + offsetof(struct mm_struct, saved_auxv), |
---|
| 2905 | + sizeof_field(struct mm_struct, saved_auxv), |
---|
| 2906 | + NULL); |
---|
| 2907 | +} |
---|
| 2908 | + |
---|
| 2909 | +void __init proc_caches_init(void) |
---|
| 2910 | +{ |
---|
2556 | 2911 | sighand_cachep = kmem_cache_create("sighand_cache", |
---|
2557 | 2912 | sizeof(struct sighand_struct), 0, |
---|
2558 | 2913 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| |
---|
.. | .. |
---|
2570 | 2925 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
---|
2571 | 2926 | NULL); |
---|
2572 | 2927 | |
---|
2573 | | - /* |
---|
2574 | | - * The mm_cpumask is located at the end of mm_struct, and is |
---|
2575 | | - * dynamically sized based on the maximum CPU number this system |
---|
2576 | | - * can have, taking hotplug into account (nr_cpu_ids). |
---|
2577 | | - */ |
---|
2578 | | - mm_size = sizeof(struct mm_struct) + cpumask_size(); |
---|
2579 | | - |
---|
2580 | | - mm_cachep = kmem_cache_create_usercopy("mm_struct", |
---|
2581 | | - mm_size, ARCH_MIN_MMSTRUCT_ALIGN, |
---|
2582 | | - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
---|
2583 | | - offsetof(struct mm_struct, saved_auxv), |
---|
2584 | | - sizeof_field(struct mm_struct, saved_auxv), |
---|
2585 | | - NULL); |
---|
2586 | 2928 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); |
---|
2587 | 2929 | mmap_init(); |
---|
2588 | 2930 | nsproxy_cache_init(); |
---|
.. | .. |
---|
2596 | 2938 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
---|
2597 | 2939 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
---|
2598 | 2940 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
---|
2599 | | - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) |
---|
| 2941 | + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| |
---|
| 2942 | + CLONE_NEWTIME)) |
---|
2600 | 2943 | return -EINVAL; |
---|
2601 | 2944 | /* |
---|
2602 | 2945 | * Not implemented, but pretend it works if there is nothing |
---|
.. | .. |
---|
2609 | 2952 | return -EINVAL; |
---|
2610 | 2953 | } |
---|
2611 | 2954 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { |
---|
2612 | | - if (atomic_read(&current->sighand->count) > 1) 
---|
| 2955 | + if (refcount_read(&current->sighand->count) > 1) 
---|
2613 | 2956 | return -EINVAL; |
---|
2614 | 2957 | } |
---|
2615 | 2958 | if (unshare_flags & CLONE_VM) { |
---|
.. | .. |
---|
2644 | 2987 | /* |
---|
2645 | 2988 | * Unshare file descriptor table if it is being shared |
---|
2646 | 2989 | */ |
---|
2647 | | -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
---|
| 2990 | +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, |
---|
| 2991 | + struct files_struct **new_fdp) |
---|
2648 | 2992 | { |
---|
2649 | 2993 | struct files_struct *fd = current->files; |
---|
2650 | 2994 | int error = 0; |
---|
2651 | 2995 | |
---|
2652 | 2996 | if ((unshare_flags & CLONE_FILES) && |
---|
2653 | 2997 | (fd && atomic_read(&fd->count) > 1)) { |
---|
2654 | | - *new_fdp = dup_fd(fd, &error); |
---|
| 2998 | + *new_fdp = dup_fd(fd, max_fds, &error); |
---|
2655 | 2999 | if (!*new_fdp) |
---|
2656 | 3000 | return error; |
---|
2657 | 3001 | } |
---|
.. | .. |
---|
2662 | 3006 | /* |
---|
2663 | 3007 | * unshare allows a process to 'unshare' part of the process |
---|
2664 | 3008 | * context which was originally shared using clone. copy_* |
---|
2665 | | - * functions used by do_fork() cannot be used here directly |
---|
| 3009 | + * functions used by kernel_clone() cannot be used here directly |
---|
2666 | 3010 | * because they modify an inactive task_struct that is being |
---|
2667 | 3011 | * constructed. Here we are modifying the current, active, |
---|
2668 | 3012 | * task_struct. |
---|
.. | .. |
---|
2711 | 3055 | err = unshare_fs(unshare_flags, &new_fs); |
---|
2712 | 3056 | if (err) |
---|
2713 | 3057 | goto bad_unshare_out; |
---|
2714 | | - err = unshare_fd(unshare_flags, &new_fd); |
---|
| 3058 | + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); |
---|
2715 | 3059 | if (err) |
---|
2716 | 3060 | goto bad_unshare_cleanup_fs; |
---|
2717 | 3061 | err = unshare_userns(unshare_flags, &new_cred); |
---|
.. | .. |
---|
2800 | 3144 | struct files_struct *copy = NULL; |
---|
2801 | 3145 | int error; |
---|
2802 | 3146 | |
---|
2803 | | - error = unshare_fd(CLONE_FILES, &copy); 
---|
| 3147 | + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); 
---|
2804 | 3148 | if (error || !copy) { |
---|
2805 | 3149 | *displaced = NULL; |
---|
2806 | 3150 | return error; |
---|
.. | .. |
---|
2813 | 3157 | } |
---|
2814 | 3158 | |
---|
2815 | 3159 | int sysctl_max_threads(struct ctl_table *table, int write, |
---|
2816 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 3160 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
2817 | 3161 | { |
---|
2818 | 3162 | struct ctl_table t; |
---|
2819 | 3163 | int ret; |
---|