| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * linux/kernel/fork.c |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 39 | 40 | #include <linux/binfmts.h> |
|---|
| 40 | 41 | #include <linux/mman.h> |
|---|
| 41 | 42 | #include <linux/mmu_notifier.h> |
|---|
| 42 | | -#include <linux/hmm.h> |
|---|
| 43 | 43 | #include <linux/fs.h> |
|---|
| 44 | 44 | #include <linux/mm.h> |
|---|
| 45 | | -#include <linux/kprobes.h> |
|---|
| 46 | 45 | #include <linux/vmacache.h> |
|---|
| 47 | 46 | #include <linux/nsproxy.h> |
|---|
| 48 | 47 | #include <linux/capability.h> |
|---|
| .. | .. |
|---|
| 80 | 79 | #include <linux/blkdev.h> |
|---|
| 81 | 80 | #include <linux/fs_struct.h> |
|---|
| 82 | 81 | #include <linux/magic.h> |
|---|
| 83 | | -#include <linux/sched/mm.h> |
|---|
| 84 | 82 | #include <linux/perf_event.h> |
|---|
| 85 | 83 | #include <linux/posix-timers.h> |
|---|
| 86 | 84 | #include <linux/user-return-notifier.h> |
|---|
| .. | .. |
|---|
| 94 | 92 | #include <linux/kcov.h> |
|---|
| 95 | 93 | #include <linux/livepatch.h> |
|---|
| 96 | 94 | #include <linux/thread_info.h> |
|---|
| 97 | | -#include <linux/cpufreq_times.h> |
|---|
| 95 | +#include <linux/stackleak.h> |
|---|
| 96 | +#include <linux/kasan.h> |
|---|
| 98 | 97 | #include <linux/scs.h> |
|---|
| 98 | +#include <linux/io_uring.h> |
|---|
| 99 | +#include <linux/cpufreq_times.h> |
|---|
| 99 | 100 | |
|---|
| 100 | | -#include <asm/pgtable.h> |
|---|
| 101 | 101 | #include <asm/pgalloc.h> |
|---|
| 102 | 102 | #include <linux/uaccess.h> |
|---|
| 103 | 103 | #include <asm/mmu_context.h> |
|---|
| .. | .. |
|---|
| 109 | 109 | #define CREATE_TRACE_POINTS |
|---|
| 110 | 110 | #include <trace/events/task.h> |
|---|
| 111 | 111 | |
|---|
| 112 | +#undef CREATE_TRACE_POINTS |
|---|
| 113 | +#include <trace/hooks/sched.h> |
|---|
| 112 | 114 | /* |
|---|
| 113 | 115 | * Minimum number of threads to boot the kernel |
|---|
| 114 | 116 | */ |
|---|
| .. | .. |
|---|
| 119 | 121 | */ |
|---|
| 120 | 122 | #define MAX_THREADS FUTEX_TID_MASK |
|---|
| 121 | 123 | |
|---|
| 124 | +EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask); |
|---|
| 125 | + |
|---|
| 122 | 126 | /* |
|---|
| 123 | 127 | * Protected counters by write_lock_irq(&tasklist_lock) |
|---|
| 124 | 128 | */ |
|---|
| 125 | 129 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
|---|
| 126 | 130 | int nr_threads; /* The idle threads do not count.. */ |
|---|
| 127 | 131 | |
|---|
| 128 | | -int max_threads; /* tunable limit on nr_threads */ |
|---|
| 132 | +static int max_threads; /* tunable limit on nr_threads */ |
|---|
| 133 | + |
|---|
| 134 | +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) |
|---|
| 135 | + |
|---|
| 136 | +static const char * const resident_page_types[] = { |
|---|
| 137 | + NAMED_ARRAY_INDEX(MM_FILEPAGES), |
|---|
| 138 | + NAMED_ARRAY_INDEX(MM_ANONPAGES), |
|---|
| 139 | + NAMED_ARRAY_INDEX(MM_SWAPENTS), |
|---|
| 140 | + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), |
|---|
| 141 | +}; |
|---|
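The `NAMED_ARRAY_INDEX(x) [x] = __stringify(x)` pattern above keeps the name table used by `check_mm()` in sync with the `MM_*` enum by construction: each string lands at its own enum index, and the `BUILD_BUG_ON` added later in `check_mm()` catches a stale table at compile time. A minimal userspace sketch of the same designated-initializer trick, using the plain `#` operator where the kernel uses `__stringify`:

```c
#include <stdio.h>

/* Same trick as the kernel's NAMED_ARRAY_INDEX, with plain # stringification. */
#define NAMED_ARRAY_INDEX(x) [x] = #x

enum mm_counter { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, MM_SHMEMPAGES, NR_MM_COUNTERS };

static const char *const resident_page_types[] = {
	NAMED_ARRAY_INDEX(MM_FILEPAGES),
	NAMED_ARRAY_INDEX(MM_ANONPAGES),
	NAMED_ARRAY_INDEX(MM_SWAPENTS),
	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

int main(void)
{
	/* Each slot sits at its enum index, so lookups by counter id are safe. */
	for (int i = 0; i < NR_MM_COUNTERS; i++)
		printf("%d -> %s\n", i, resident_page_types[i]);
	return 0;
}
```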
| 129 | 142 | |
|---|
| 130 | 143 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
|---|
| 131 | 144 | |
|---|
| 132 | 145 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
|---|
| 146 | +EXPORT_SYMBOL_GPL(tasklist_lock); |
|---|
| 133 | 147 | |
|---|
| 134 | 148 | #ifdef CONFIG_PROVE_RCU |
|---|
| 135 | 149 | int lockdep_tasklist_lock_is_held(void) |
|---|
| .. | .. |
|---|
| 217 | 231 | if (!s) |
|---|
| 218 | 232 | continue; |
|---|
| 219 | 233 | |
|---|
| 234 | + /* Mark stack accessible for KASAN. */ |
|---|
| 235 | + kasan_unpoison_range(s->addr, THREAD_SIZE); |
|---|
| 236 | + |
|---|
| 220 | 237 | /* Clear stale pointers from reused stack. */ |
|---|
| 221 | 238 | memset(s->addr, 0, THREAD_SIZE); |
|---|
| 222 | 239 | |
|---|
| .. | .. |
|---|
| 225 | 242 | return s->addr; |
|---|
| 226 | 243 | } |
|---|
| 227 | 244 | |
|---|
| 245 | + /* |
|---|
| 246 | + * Allocated stacks are cached and later reused by new threads, |
|---|
| 247 | + * so memcg accounting is performed manually on assigning/releasing |
|---|
| 248 | + * stacks to tasks. Drop __GFP_ACCOUNT. |
|---|
| 249 | + */ |
|---|
| 228 | 250 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, |
|---|
| 229 | 251 | VMALLOC_START, VMALLOC_END, |
|---|
| 230 | | - THREADINFO_GFP, |
|---|
| 252 | + THREADINFO_GFP & ~__GFP_ACCOUNT, |
|---|
| 231 | 253 | PAGE_KERNEL, |
|---|
| 232 | 254 | 0, node, __builtin_return_address(0)); |
|---|
| 233 | 255 | |
|---|
| .. | .. |
|---|
| 246 | 268 | THREAD_SIZE_ORDER); |
|---|
| 247 | 269 | |
|---|
| 248 | 270 | if (likely(page)) { |
|---|
| 249 | | - tsk->stack = page_address(page); |
|---|
| 271 | + tsk->stack = kasan_reset_tag(page_address(page)); |
|---|
| 250 | 272 | return tsk->stack; |
|---|
| 251 | 273 | } |
|---|
| 252 | 274 | return NULL; |
|---|
| .. | .. |
|---|
| 256 | 278 | static inline void free_thread_stack(struct task_struct *tsk) |
|---|
| 257 | 279 | { |
|---|
| 258 | 280 | #ifdef CONFIG_VMAP_STACK |
|---|
| 259 | | - if (task_stack_vm_area(tsk)) { |
|---|
| 281 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
|---|
| 282 | + |
|---|
| 283 | + if (vm) { |
|---|
| 260 | 284 | int i; |
|---|
| 285 | + |
|---|
| 286 | + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) |
|---|
| 287 | + memcg_kmem_uncharge_page(vm->pages[i], 0); |
|---|
| 261 | 288 | |
|---|
| 262 | 289 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
|---|
| 263 | 290 | if (this_cpu_cmpxchg(cached_stacks[i], |
|---|
| .. | .. |
|---|
| 282 | 309 | { |
|---|
| 283 | 310 | unsigned long *stack; |
|---|
| 284 | 311 | stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
|---|
| 312 | + stack = kasan_reset_tag(stack); |
|---|
| 285 | 313 | tsk->stack = stack; |
|---|
| 286 | 314 | return stack; |
|---|
| 287 | 315 | } |
|---|
| .. | .. |
|---|
| 334 | 362 | struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
|---|
| 335 | 363 | |
|---|
| 336 | 364 | if (new) { |
|---|
| 337 | | - *new = *orig; |
|---|
| 338 | | - INIT_LIST_HEAD(&new->anon_vma_chain); |
|---|
| 365 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); |
|---|
| 366 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); |
|---|
| 367 | + /* |
|---|
| 368 | + * orig->shared.rb may be modified concurrently, but the clone |
|---|
| 369 | + * will be reinitialized. |
|---|
| 370 | + */ |
|---|
| 371 | + *new = data_race(*orig); |
|---|
| 372 | + INIT_VMA(new); |
|---|
| 373 | + new->vm_next = new->vm_prev = NULL; |
|---|
| 339 | 374 | } |
|---|
| 340 | 375 | return new; |
|---|
| 341 | 376 | } |
|---|
| .. | .. |
|---|
| 350 | 385 | void *stack = task_stack_page(tsk); |
|---|
| 351 | 386 | struct vm_struct *vm = task_stack_vm_area(tsk); |
|---|
| 352 | 387 | |
|---|
| 388 | + |
|---|
| 389 | + /* All stack pages are in the same node. */ |
|---|
| 390 | + if (vm) |
|---|
| 391 | + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, |
|---|
| 392 | + account * (THREAD_SIZE / 1024)); |
|---|
| 393 | + else |
|---|
| 394 | + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, |
|---|
| 395 | + account * (THREAD_SIZE / 1024)); |
|---|
| 396 | +} |
|---|
| 397 | + |
|---|
| 398 | +static int memcg_charge_kernel_stack(struct task_struct *tsk) |
|---|
| 399 | +{ |
|---|
| 400 | +#ifdef CONFIG_VMAP_STACK |
|---|
| 401 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
|---|
| 402 | + int ret; |
|---|
| 403 | + |
|---|
| 353 | 404 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); |
|---|
| 354 | 405 | |
|---|
| 355 | 406 | if (vm) { |
|---|
| .. | .. |
|---|
| 358 | 409 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
|---|
| 359 | 410 | |
|---|
| 360 | 411 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
|---|
| 361 | | - mod_zone_page_state(page_zone(vm->pages[i]), |
|---|
| 362 | | - NR_KERNEL_STACK_KB, |
|---|
| 363 | | - PAGE_SIZE / 1024 * account); |
|---|
| 412 | + /* |
|---|
| 413 | + * If memcg_kmem_charge_page() fails, page->mem_cgroup |
|---|
| 414 | + * pointer is NULL, and memcg_kmem_uncharge_page() in |
|---|
| 415 | + * free_thread_stack() will ignore this page. |
|---|
| 416 | + */ |
|---|
| 417 | + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, |
|---|
| 418 | + 0); |
|---|
| 419 | + if (ret) |
|---|
| 420 | + return ret; |
|---|
| 364 | 421 | } |
|---|
| 365 | | - |
|---|
| 366 | | - /* All stack pages belong to the same memcg. */ |
|---|
| 367 | | - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
|---|
| 368 | | - account * (THREAD_SIZE / 1024)); |
|---|
| 369 | | - } else { |
|---|
| 370 | | - /* |
|---|
| 371 | | - * All stack pages are in the same zone and belong to the |
|---|
| 372 | | - * same memcg. |
|---|
| 373 | | - */ |
|---|
| 374 | | - struct page *first_page = virt_to_page(stack); |
|---|
| 375 | | - |
|---|
| 376 | | - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
|---|
| 377 | | - THREAD_SIZE / 1024 * account); |
|---|
| 378 | | - |
|---|
| 379 | | - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
|---|
| 380 | | - account * (THREAD_SIZE / 1024)); |
|---|
| 381 | 422 | } |
|---|
| 423 | +#endif |
|---|
| 424 | + return 0; |
|---|
| 382 | 425 | } |
|---|
| 383 | 426 | |
|---|
| 384 | 427 | static void release_task_stack(struct task_struct *tsk) |
|---|
| .. | .. |
|---|
| 397 | 440 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
|---|
| 398 | 441 | void put_task_stack(struct task_struct *tsk) |
|---|
| 399 | 442 | { |
|---|
| 400 | | - if (atomic_dec_and_test(&tsk->stack_refcount)) |
|---|
| 443 | + if (refcount_dec_and_test(&tsk->stack_refcount)) |
|---|
| 401 | 444 | release_task_stack(tsk); |
|---|
| 402 | 445 | } |
|---|
| 446 | +EXPORT_SYMBOL_GPL(put_task_stack); |
|---|
| 403 | 447 | #endif |
|---|
| 404 | 448 | |
|---|
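Several counters in this file (`stack_refcount` here, and `usage`, `sigcnt`, and `sighand->count` below) move from `atomic_t` to `refcount_t`, which saturates instead of wrapping and warns on misuse; the release-on-last-put shape of the code is unchanged. A standalone sketch of that shape using C11 atomics — the kernel's `refcount_t` adds the saturation checks on top of this:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool refcount_dec_and_test(atomic_int *r)
{
	/* Returns true exactly once: for whoever drops the last reference. */
	return atomic_fetch_sub_explicit(r, 1, memory_order_acq_rel) == 1;
}

struct obj { atomic_int refs; };

static void put_obj(struct obj *o)
{
	if (refcount_dec_and_test(&o->refs))
		printf("last put: release resources here\n");
}

int main(void)
{
	struct obj o;

	atomic_init(&o.refs, 2);	/* e.g. one user-visible ref, one scheduler ref */
	put_obj(&o);			/* still alive */
	put_obj(&o);			/* last put frees */
	return 0;
}
```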
| 405 | 449 | void free_task(struct task_struct *tsk) |
|---|
| 406 | 450 | { |
|---|
| 451 | +#ifdef CONFIG_SECCOMP |
|---|
| 452 | + WARN_ON_ONCE(tsk->seccomp.filter); |
|---|
| 453 | +#endif |
|---|
| 407 | 454 | cpufreq_task_times_exit(tsk); |
|---|
| 408 | 455 | scs_release(tsk); |
|---|
| 409 | 456 | |
|---|
| 457 | + trace_android_vh_free_task(tsk); |
|---|
| 410 | 458 | #ifndef CONFIG_THREAD_INFO_IN_TASK |
|---|
| 411 | 459 | /* |
|---|
| 412 | 460 | * The task is finally done with both the stack and thread_info, |
|---|
| .. | .. |
|---|
| 418 | 466 | * If the task had a separate stack allocation, it should be gone |
|---|
| 419 | 467 | * by now. |
|---|
| 420 | 468 | */ |
|---|
| 421 | | - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); |
|---|
| 469 | + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); |
|---|
| 422 | 470 | #endif |
|---|
| 423 | 471 | rt_mutex_debug_task_free(tsk); |
|---|
| 424 | 472 | ftrace_graph_exit_task(tsk); |
|---|
| 425 | | - put_seccomp_filter(tsk); |
|---|
| 426 | 473 | arch_release_task_struct(tsk); |
|---|
| 427 | 474 | if (tsk->flags & PF_KTHREAD) |
|---|
| 428 | 475 | free_kthread_struct(tsk); |
|---|
| .. | .. |
|---|
| 434 | 481 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
|---|
| 435 | 482 | struct mm_struct *oldmm) |
|---|
| 436 | 483 | { |
|---|
| 437 | | - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
|---|
| 484 | + struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL; |
|---|
| 438 | 485 | struct rb_node **rb_link, *rb_parent; |
|---|
| 439 | 486 | int retval; |
|---|
| 440 | 487 | unsigned long charge; |
|---|
| 441 | 488 | LIST_HEAD(uf); |
|---|
| 442 | 489 | |
|---|
| 443 | 490 | uprobe_start_dup_mmap(); |
|---|
| 444 | | - if (down_write_killable(&oldmm->mmap_sem)) { |
|---|
| 491 | + if (mmap_write_lock_killable(oldmm)) { |
|---|
| 445 | 492 | retval = -EINTR; |
|---|
| 446 | 493 | goto fail_uprobe_end; |
|---|
| 447 | 494 | } |
|---|
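Throughout this hunk and the rest of the file, direct `down_write(&mm->mmap_sem)` / `up_write()` calls become `mmap_write_lock()` / `mmap_write_unlock()` wrappers. The point of the wrapper layer is that call sites stop naming the lock or its type, so the locking implementation can evolve without touching every caller. A userspace analogy of the same layering — the `mm_fake` type and wrapper names merely mirror the kernel API (compile with `-lpthread`):

```c
#include <pthread.h>

/* The lock is hidden behind a type + accessors, never touched directly. */
struct mm_fake { pthread_rwlock_t mmap_lock; };

static inline void mmap_init_lock(struct mm_fake *mm)
{
	pthread_rwlock_init(&mm->mmap_lock, NULL);
}

static inline void mmap_write_lock(struct mm_fake *mm)
{
	pthread_rwlock_wrlock(&mm->mmap_lock);
}

static inline void mmap_write_unlock(struct mm_fake *mm)
{
	pthread_rwlock_unlock(&mm->mmap_lock);
}

int main(void)
{
	struct mm_fake mm;

	mmap_init_lock(&mm);
	mmap_write_lock(&mm);
	/* ... mutate address-space bookkeeping ... */
	mmap_write_unlock(&mm);
	return 0;
}
```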
| .. | .. |
|---|
| 450 | 497 | /* |
|---|
| 451 | 498 | * Not linked in yet - no deadlock potential: |
|---|
| 452 | 499 | */ |
|---|
| 453 | | - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
|---|
| 500 | + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
|---|
| 454 | 501 | |
|---|
| 455 | 502 | /* No ordering required: file already has been exposed. */ |
|---|
| 456 | 503 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
|---|
| .. | .. |
|---|
| 505 | 552 | if (retval) |
|---|
| 506 | 553 | goto fail_nomem_anon_vma_fork; |
|---|
| 507 | 554 | if (tmp->vm_flags & VM_WIPEONFORK) { |
|---|
| 508 | | - /* VM_WIPEONFORK gets a clean slate in the child. */ |
|---|
| 555 | + /* |
|---|
| 556 | + * VM_WIPEONFORK gets a clean slate in the child. |
|---|
| 557 | + * Don't prepare anon_vma until a fault occurs, since |
|---|
| 558 | + * we don't copy pages for the current vma. |
|---|
| 559 | + */ |
|---|
| 509 | 560 | tmp->anon_vma = NULL; |
|---|
| 510 | | - if (anon_vma_prepare(tmp)) |
|---|
| 511 | | - goto fail_nomem_anon_vma_fork; |
|---|
| 512 | 561 | } else if (anon_vma_fork(tmp, mpnt)) |
|---|
| 513 | 562 | goto fail_nomem_anon_vma_fork; |
|---|
| 514 | 563 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
|---|
| 515 | | - tmp->vm_next = tmp->vm_prev = NULL; |
|---|
| 516 | 564 | file = tmp->vm_file; |
|---|
| 517 | 565 | if (file) { |
|---|
| 518 | 566 | struct inode *inode = file_inode(file); |
|---|
| .. | .. |
|---|
| 520 | 568 | |
|---|
| 521 | 569 | get_file(file); |
|---|
| 522 | 570 | if (tmp->vm_flags & VM_DENYWRITE) |
|---|
| 523 | | - atomic_dec(&inode->i_writecount); |
|---|
| 571 | + put_write_access(inode); |
|---|
| 524 | 572 | i_mmap_lock_write(mapping); |
|---|
| 525 | 573 | if (tmp->vm_flags & VM_SHARED) |
|---|
| 526 | | - atomic_inc(&mapping->i_mmap_writable); |
|---|
| 574 | + mapping_allow_writable(mapping); |
|---|
| 527 | 575 | flush_dcache_mmap_lock(mapping); |
|---|
| 528 | 576 | /* insert tmp into the share list, just after mpnt */ |
|---|
| 529 | 577 | vma_interval_tree_insert_after(tmp, mpnt, |
|---|
| .. | .. |
|---|
| 553 | 601 | rb_parent = &tmp->vm_rb; |
|---|
| 554 | 602 | |
|---|
| 555 | 603 | mm->map_count++; |
|---|
| 556 | | - if (!(tmp->vm_flags & VM_WIPEONFORK)) |
|---|
| 557 | | - retval = copy_page_range(mm, oldmm, mpnt); |
|---|
| 604 | + if (!(tmp->vm_flags & VM_WIPEONFORK)) { |
|---|
| 605 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
|---|
| 606 | + /* |
|---|
| 607 | + * Mark this VMA as changing to prevent the |
|---|
| 608 | + * speculative page fault hanlder to process |
|---|
| 609 | + * it until the TLB are flushed below. |
|---|
| 610 | + */ |
|---|
| 611 | + last = mpnt; |
|---|
| 612 | + vm_write_begin(mpnt); |
|---|
| 613 | + } |
|---|
| 614 | + retval = copy_page_range(tmp, mpnt); |
|---|
| 615 | + } |
|---|
| 558 | 616 | |
|---|
| 559 | 617 | if (tmp->vm_ops && tmp->vm_ops->open) |
|---|
| 560 | 618 | tmp->vm_ops->open(tmp); |
|---|
| .. | .. |
|---|
| 565 | 623 | /* a new mm has just been created */ |
|---|
| 566 | 624 | retval = arch_dup_mmap(oldmm, mm); |
|---|
| 567 | 625 | out: |
|---|
| 568 | | - up_write(&mm->mmap_sem); |
|---|
| 626 | + mmap_write_unlock(mm); |
|---|
| 569 | 627 | flush_tlb_mm(oldmm); |
|---|
| 570 | | - up_write(&oldmm->mmap_sem); |
|---|
| 628 | + |
|---|
| 629 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
|---|
| 630 | + /* |
|---|
| 631 | + * Since the TLB has been flush, we can safely unmark the |
|---|
| 632 | + * copied VMAs and allows the speculative page fault handler to |
|---|
| 633 | + * process them again. |
|---|
| 634 | + * Walk back the VMA list from the last marked VMA. |
|---|
| 635 | + */ |
|---|
| 636 | + for (; last; last = last->vm_prev) { |
|---|
| 637 | + if (last->vm_flags & VM_DONTCOPY) |
|---|
| 638 | + continue; |
|---|
| 639 | + if (!(last->vm_flags & VM_WIPEONFORK)) |
|---|
| 640 | + vm_write_end(last); |
|---|
| 641 | + } |
|---|
| 642 | + } |
|---|
| 643 | + |
|---|
| 644 | + mmap_write_unlock(oldmm); |
|---|
| 571 | 645 | dup_userfaultfd_complete(&uf); |
|---|
| 572 | 646 | fail_uprobe_end: |
|---|
| 573 | 647 | uprobe_end_dup_mmap(); |
|---|
| .. | .. |
|---|
| 597 | 671 | #else |
|---|
| 598 | 672 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
|---|
| 599 | 673 | { |
|---|
| 600 | | - down_write(&oldmm->mmap_sem); |
|---|
| 674 | + mmap_write_lock(oldmm); |
|---|
| 601 | 675 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
|---|
| 602 | | - up_write(&oldmm->mmap_sem); |
|---|
| 676 | + mmap_write_unlock(oldmm); |
|---|
| 603 | 677 | return 0; |
|---|
| 604 | 678 | } |
|---|
| 605 | 679 | #define mm_alloc_pgd(mm) (0) |
|---|
| .. | .. |
|---|
| 610 | 684 | { |
|---|
| 611 | 685 | int i; |
|---|
| 612 | 686 | |
|---|
| 687 | + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, |
|---|
| 688 | + "Please make sure 'struct resident_page_types[]' is updated as well"); |
|---|
| 689 | + |
|---|
| 613 | 690 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
|---|
| 614 | 691 | long x = atomic_long_read(&mm->rss_stat.count[i]); |
|---|
| 615 | 692 | |
|---|
| 616 | 693 | if (unlikely(x)) |
|---|
| 617 | | - printk(KERN_ALERT "BUG: Bad rss-counter state " |
|---|
| 618 | | - "mm:%p idx:%d val:%ld\n", mm, i, x); |
|---|
| 694 | + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", |
|---|
| 695 | + mm, resident_page_types[i], x); |
|---|
| 619 | 696 | } |
|---|
| 620 | 697 | |
|---|
| 621 | 698 | if (mm_pgtables_bytes(mm)) |
|---|
| .. | .. |
|---|
| 642 | 719 | WARN_ON_ONCE(mm == current->active_mm); |
|---|
| 643 | 720 | mm_free_pgd(mm); |
|---|
| 644 | 721 | destroy_context(mm); |
|---|
| 645 | | - hmm_mm_destroy(mm); |
|---|
| 646 | | - mmu_notifier_mm_destroy(mm); |
|---|
| 722 | + mmu_notifier_subscriptions_destroy(mm); |
|---|
| 647 | 723 | check_mm(mm); |
|---|
| 648 | 724 | put_user_ns(mm->user_ns); |
|---|
| 649 | 725 | free_mm(mm); |
|---|
| 650 | 726 | } |
|---|
| 651 | 727 | EXPORT_SYMBOL_GPL(__mmdrop); |
|---|
| 652 | | - |
|---|
| 653 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
|---|
| 654 | | -/* |
|---|
| 655 | | - * RCU callback for delayed mm drop. Not strictly rcu, but we don't |
|---|
| 656 | | - * want another facility to make this work. |
|---|
| 657 | | - */ |
|---|
| 658 | | -void __mmdrop_delayed(struct rcu_head *rhp) |
|---|
| 659 | | -{ |
|---|
| 660 | | - struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); |
|---|
| 661 | | - |
|---|
| 662 | | - __mmdrop(mm); |
|---|
| 663 | | -} |
|---|
| 664 | | -#endif |
|---|
| 665 | 728 | |
|---|
| 666 | 729 | static void mmdrop_async_fn(struct work_struct *work) |
|---|
| 667 | 730 | { |
|---|
| .. | .. |
|---|
| 694 | 757 | |
|---|
| 695 | 758 | static inline void put_signal_struct(struct signal_struct *sig) |
|---|
| 696 | 759 | { |
|---|
| 697 | | - if (atomic_dec_and_test(&sig->sigcnt)) |
|---|
| 760 | + if (refcount_dec_and_test(&sig->sigcnt)) |
|---|
| 698 | 761 | free_signal_struct(sig); |
|---|
| 699 | 762 | } |
|---|
| 700 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
|---|
| 701 | | -static |
|---|
| 702 | | -#endif |
|---|
| 763 | + |
|---|
| 703 | 764 | void __put_task_struct(struct task_struct *tsk) |
|---|
| 704 | 765 | { |
|---|
| 705 | 766 | WARN_ON(!tsk->exit_state); |
|---|
| 706 | | - WARN_ON(atomic_read(&tsk->usage)); |
|---|
| 767 | + WARN_ON(refcount_read(&tsk->usage)); |
|---|
| 707 | 768 | WARN_ON(tsk == current); |
|---|
| 708 | 769 | |
|---|
| 709 | | - /* |
|---|
| 710 | | - * Remove function-return probe instances associated with this |
|---|
| 711 | | - * task and put them back on the free list. |
|---|
| 712 | | - */ |
|---|
| 713 | | - kprobe_flush_task(tsk); |
|---|
| 714 | | - |
|---|
| 715 | | - /* Task is done with its stack. */ |
|---|
| 716 | | - put_task_stack(tsk); |
|---|
| 717 | | - |
|---|
| 770 | + io_uring_free(tsk); |
|---|
| 718 | 771 | cgroup_free(tsk); |
|---|
| 719 | 772 | task_numa_free(tsk, true); |
|---|
| 720 | 773 | security_task_free(tsk); |
|---|
| .. | .. |
|---|
| 725 | 778 | if (!profile_handoff_task(tsk)) |
|---|
| 726 | 779 | free_task(tsk); |
|---|
| 727 | 780 | } |
|---|
| 728 | | -#ifndef CONFIG_PREEMPT_RT_BASE |
|---|
| 729 | 781 | EXPORT_SYMBOL_GPL(__put_task_struct); |
|---|
| 730 | | -#else |
|---|
| 731 | | -void __put_task_struct_cb(struct rcu_head *rhp) |
|---|
| 782 | + |
|---|
| 783 | +void __put_task_struct_rcu_cb(struct rcu_head *rhp) |
|---|
| 732 | 784 | { |
|---|
| 733 | | - struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); |
|---|
| 785 | + struct task_struct *task = container_of(rhp, struct task_struct, rcu); |
|---|
| 734 | 786 | |
|---|
| 735 | | - __put_task_struct(tsk); |
|---|
| 736 | | - |
|---|
| 787 | + __put_task_struct(task); |
|---|
| 737 | 788 | } |
|---|
| 738 | | -EXPORT_SYMBOL_GPL(__put_task_struct_cb); |
|---|
| 739 | | -#endif |
|---|
| 789 | +EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); |
|---|
| 740 | 790 | |
|---|
| 741 | 791 | void __init __weak arch_task_cache_init(void) { } |
|---|
| 742 | 792 | |
|---|
| .. | .. |
|---|
| 746 | 796 | static void set_max_threads(unsigned int max_threads_suggested) |
|---|
| 747 | 797 | { |
|---|
| 748 | 798 | u64 threads; |
|---|
| 799 | + unsigned long nr_pages = totalram_pages(); |
|---|
| 749 | 800 | |
|---|
| 750 | 801 | /* |
|---|
| 751 | 802 | * The number of threads shall be limited such that the thread |
|---|
| 752 | 803 | * structures may only consume a small part of the available memory. |
|---|
| 753 | 804 | */ |
|---|
| 754 | | - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) |
|---|
| 805 | + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) |
|---|
| 755 | 806 | threads = MAX_THREADS; |
|---|
| 756 | 807 | else |
|---|
| 757 | | - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, |
|---|
| 808 | + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, |
|---|
| 758 | 809 | (u64) THREAD_SIZE * 8UL); |
|---|
| 759 | 810 | |
|---|
| 760 | 811 | if (threads > max_threads_suggested) |
|---|
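The only change in this hunk is `totalram_pages` becoming the `totalram_pages()` accessor, but the sizing rule is worth a worked example: thread structures may consume at most 1/8 of RAM, so `threads = ram_bytes / (THREAD_SIZE * 8)`, clamped to `FUTEX_TID_MASK` and the suggested value. A quick standalone check — the `PAGE_SIZE`/`THREAD_SIZE` values below are typical, not universal:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE   4096ULL		/* typical */
#define THREAD_SIZE 16384ULL		/* e.g. 16 KiB kernel stacks on x86_64/arm64 */
#define MAX_THREADS 0x3fffffffULL	/* FUTEX_TID_MASK */

int main(void)
{
	uint64_t nr_pages = (4ULL << 30) / PAGE_SIZE;	/* a 4 GiB machine */
	uint64_t threads = nr_pages * PAGE_SIZE / (THREAD_SIZE * 8);

	if (threads > MAX_THREADS)
		threads = MAX_THREADS;
	/* 2^32 / 2^17 = 32768: at most 32768 threads' worth of 16 KiB stacks
	 * fit in 1/8 of 4 GiB. */
	printf("max_threads for 4 GiB: %llu\n", (unsigned long long)threads);
	return 0;
}
```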
| .. | .. |
|---|
| 768 | 819 | int arch_task_struct_size __read_mostly; |
|---|
| 769 | 820 | #endif |
|---|
| 770 | 821 | |
|---|
| 822 | +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
|---|
| 771 | 823 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) |
|---|
| 772 | 824 | { |
|---|
| 773 | 825 | /* Fetch thread_struct whitelist for the architecture. */ |
|---|
| .. | .. |
|---|
| 782 | 834 | else |
|---|
| 783 | 835 | *offset += offsetof(struct task_struct, thread); |
|---|
| 784 | 836 | } |
|---|
| 837 | +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ |
|---|
| 785 | 838 | |
|---|
| 786 | 839 | void __init fork_init(void) |
|---|
| 787 | 840 | { |
|---|
| .. | .. |
|---|
| 823 | 876 | scs_init(); |
|---|
| 824 | 877 | |
|---|
| 825 | 878 | lockdep_init_task(&init_task); |
|---|
| 879 | + uprobes_init(); |
|---|
| 826 | 880 | } |
|---|
| 827 | 881 | |
|---|
| 828 | 882 | int __weak arch_dup_task_struct(struct task_struct *dst, |
|---|
| .. | .. |
|---|
| 844 | 898 | { |
|---|
| 845 | 899 | struct task_struct *tsk; |
|---|
| 846 | 900 | unsigned long *stack; |
|---|
| 847 | | - struct vm_struct *stack_vm_area; |
|---|
| 901 | + struct vm_struct *stack_vm_area __maybe_unused; |
|---|
| 848 | 902 | int err; |
|---|
| 849 | 903 | |
|---|
| 850 | 904 | if (node == NUMA_NO_NODE) |
|---|
| .. | .. |
|---|
| 856 | 910 | stack = alloc_thread_stack_node(tsk, node); |
|---|
| 857 | 911 | if (!stack) |
|---|
| 858 | 912 | goto free_tsk; |
|---|
| 913 | + |
|---|
| 914 | + if (memcg_charge_kernel_stack(tsk)) |
|---|
| 915 | + goto free_stack; |
|---|
| 859 | 916 | |
|---|
| 860 | 917 | stack_vm_area = task_stack_vm_area(tsk); |
|---|
| 861 | 918 | |
|---|
| .. | .. |
|---|
| 871 | 928 | tsk->stack_vm_area = stack_vm_area; |
|---|
| 872 | 929 | #endif |
|---|
| 873 | 930 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
|---|
| 874 | | - atomic_set(&tsk->stack_refcount, 1); |
|---|
| 931 | + refcount_set(&tsk->stack_refcount, 1); |
|---|
| 875 | 932 | #endif |
|---|
| 876 | 933 | |
|---|
| 877 | 934 | if (err) |
|---|
| .. | .. |
|---|
| 903 | 960 | tsk->cpus_ptr = &tsk->cpus_mask; |
|---|
| 904 | 961 | |
|---|
| 905 | 962 | /* |
|---|
| 906 | | - * One for us, one for whoever does the "release_task()" (usually |
|---|
| 907 | | - * parent) |
|---|
| 963 | + * One for the user space visible state that goes away when reaped. |
|---|
| 964 | + * One for the scheduler. |
|---|
| 908 | 965 | */ |
|---|
| 909 | | - atomic_set(&tsk->usage, 2); |
|---|
| 966 | + refcount_set(&tsk->rcu_users, 2); |
|---|
| 967 | + /* One for the rcu users */ |
|---|
| 968 | + refcount_set(&tsk->usage, 1); |
|---|
| 910 | 969 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
|---|
| 911 | 970 | tsk->btrace_seq = 0; |
|---|
| 912 | 971 | #endif |
|---|
| 913 | 972 | tsk->splice_pipe = NULL; |
|---|
| 914 | 973 | tsk->task_frag.page = NULL; |
|---|
| 915 | 974 | tsk->wake_q.next = NULL; |
|---|
| 916 | | - tsk->wake_q_sleeper.next = NULL; |
|---|
| 975 | + tsk->pf_io_worker = NULL; |
|---|
| 917 | 976 | |
|---|
| 918 | 977 | account_kernel_stack(tsk, 1); |
|---|
| 919 | 978 | |
|---|
| .. | .. |
|---|
| 931 | 990 | #ifdef CONFIG_MEMCG |
|---|
| 932 | 991 | tsk->active_memcg = NULL; |
|---|
| 933 | 992 | #endif |
|---|
| 993 | + |
|---|
| 994 | + android_init_vendor_data(tsk, 1); |
|---|
| 995 | + android_init_oem_data(tsk, 1); |
|---|
| 996 | + |
|---|
| 997 | + trace_android_vh_dup_task_struct(tsk, orig); |
|---|
| 934 | 998 | return tsk; |
|---|
| 935 | 999 | |
|---|
| 936 | 1000 | free_stack: |
|---|
| .. | .. |
|---|
| 980 | 1044 | #endif |
|---|
| 981 | 1045 | } |
|---|
| 982 | 1046 | |
|---|
| 1047 | +static void mm_init_pasid(struct mm_struct *mm) |
|---|
| 1048 | +{ |
|---|
| 1049 | +#ifdef CONFIG_IOMMU_SUPPORT |
|---|
| 1050 | + mm->pasid = INIT_PASID; |
|---|
| 1051 | +#endif |
|---|
| 1052 | +} |
|---|
| 1053 | + |
|---|
| 983 | 1054 | static void mm_init_uprobes_state(struct mm_struct *mm) |
|---|
| 984 | 1055 | { |
|---|
| 985 | 1056 | #ifdef CONFIG_UPROBES |
|---|
| .. | .. |
|---|
| 993 | 1064 | mm->mmap = NULL; |
|---|
| 994 | 1065 | mm->mm_rb = RB_ROOT; |
|---|
| 995 | 1066 | mm->vmacache_seqnum = 0; |
|---|
| 1067 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
|---|
| 1068 | + rwlock_init(&mm->mm_rb_lock); |
|---|
| 1069 | +#endif |
|---|
| 996 | 1070 | atomic_set(&mm->mm_users, 1); |
|---|
| 997 | 1071 | atomic_set(&mm->mm_count, 1); |
|---|
| 998 | | - init_rwsem(&mm->mmap_sem); |
|---|
| 1072 | + seqcount_init(&mm->write_protect_seq); |
|---|
| 1073 | + mmap_init_lock(mm); |
|---|
| 999 | 1074 | INIT_LIST_HEAD(&mm->mmlist); |
|---|
| 1000 | 1075 | mm->core_state = NULL; |
|---|
| 1001 | 1076 | mm_pgtables_bytes_init(mm); |
|---|
| 1002 | 1077 | mm->map_count = 0; |
|---|
| 1003 | 1078 | mm->locked_vm = 0; |
|---|
| 1004 | | - mm->pinned_vm = 0; |
|---|
| 1079 | + atomic_set(&mm->has_pinned, 0); |
|---|
| 1080 | + atomic64_set(&mm->pinned_vm, 0); |
|---|
| 1005 | 1081 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
|---|
| 1006 | 1082 | spin_lock_init(&mm->page_table_lock); |
|---|
| 1007 | 1083 | spin_lock_init(&mm->arg_lock); |
|---|
| 1008 | 1084 | mm_init_cpumask(mm); |
|---|
| 1009 | 1085 | mm_init_aio(mm); |
|---|
| 1010 | 1086 | mm_init_owner(mm, p); |
|---|
| 1087 | + mm_init_pasid(mm); |
|---|
| 1011 | 1088 | RCU_INIT_POINTER(mm->exe_file, NULL); |
|---|
| 1012 | | - mmu_notifier_mm_init(mm); |
|---|
| 1013 | | - hmm_mm_init(mm); |
|---|
| 1089 | + if (!mmu_notifier_subscriptions_init(mm)) |
|---|
| 1090 | + goto fail_nopgd; |
|---|
| 1014 | 1091 | init_tlb_flush_pending(mm); |
|---|
| 1015 | 1092 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
|---|
| 1016 | 1093 | mm->pmd_huge_pte = NULL; |
|---|
| .. | .. |
|---|
| 1085 | 1162 | { |
|---|
| 1086 | 1163 | might_sleep(); |
|---|
| 1087 | 1164 | |
|---|
| 1088 | | - if (atomic_dec_and_test(&mm->mm_users)) |
|---|
| 1165 | + if (atomic_dec_and_test(&mm->mm_users)) { |
|---|
| 1166 | + trace_android_vh_mmput(NULL); |
|---|
| 1089 | 1167 | __mmput(mm); |
|---|
| 1168 | + } |
|---|
| 1090 | 1169 | } |
|---|
| 1091 | 1170 | EXPORT_SYMBOL_GPL(mmput); |
|---|
| 1092 | 1171 | |
|---|
| .. | .. |
|---|
| 1106 | 1185 | schedule_work(&mm->async_put_work); |
|---|
| 1107 | 1186 | } |
|---|
| 1108 | 1187 | } |
|---|
| 1188 | +EXPORT_SYMBOL_GPL(mmput_async); |
|---|
| 1109 | 1189 | #endif |
|---|
| 1110 | 1190 | |
|---|
| 1111 | 1191 | /** |
|---|
| .. | .. |
|---|
| 1210 | 1290 | struct mm_struct *mm; |
|---|
| 1211 | 1291 | int err; |
|---|
| 1212 | 1292 | |
|---|
| 1213 | | - err = mutex_lock_killable(&task->signal->cred_guard_mutex); |
|---|
| 1293 | + err = down_read_killable(&task->signal->exec_update_lock); |
|---|
| 1214 | 1294 | if (err) |
|---|
| 1215 | 1295 | return ERR_PTR(err); |
|---|
| 1216 | 1296 | |
|---|
| .. | .. |
|---|
| 1220 | 1300 | mmput(mm); |
|---|
| 1221 | 1301 | mm = ERR_PTR(-EACCES); |
|---|
| 1222 | 1302 | } |
|---|
| 1223 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
|---|
| 1303 | + up_read(&task->signal->exec_update_lock); |
|---|
| 1224 | 1304 | |
|---|
| 1225 | 1305 | return mm; |
|---|
| 1226 | 1306 | } |
|---|
| .. | .. |
|---|
| 1318 | 1398 | mm_release(tsk, mm); |
|---|
| 1319 | 1399 | } |
|---|
| 1320 | 1400 | |
|---|
| 1321 | | -/* |
|---|
| 1322 | | - * Allocate a new mm structure and copy contents from the |
|---|
| 1323 | | - * mm structure of the passed in task structure. |
|---|
| 1401 | +/** |
|---|
| 1402 | + * dup_mm() - duplicates an existing mm structure |
|---|
| 1403 | + * @tsk: the task_struct with which the new mm will be associated. |
|---|
| 1404 | + * @oldmm: the mm to duplicate. |
|---|
| 1405 | + * |
|---|
| 1406 | + * Allocates a new mm structure and duplicates the provided @oldmm structure |
|---|
| 1407 | + * content into it. |
|---|
| 1408 | + * |
|---|
| 1409 | + * Return: the duplicated mm or NULL on failure. |
|---|
| 1324 | 1410 | */ |
|---|
| 1325 | | -static struct mm_struct *dup_mm(struct task_struct *tsk) |
|---|
| 1411 | +static struct mm_struct *dup_mm(struct task_struct *tsk, |
|---|
| 1412 | + struct mm_struct *oldmm) |
|---|
| 1326 | 1413 | { |
|---|
| 1327 | | - struct mm_struct *mm, *oldmm = current->mm; |
|---|
| 1414 | + struct mm_struct *mm; |
|---|
| 1328 | 1415 | int err; |
|---|
| 1329 | 1416 | |
|---|
| 1330 | 1417 | mm = allocate_mm(); |
|---|
| .. | .. |
|---|
| 1392 | 1479 | } |
|---|
| 1393 | 1480 | |
|---|
| 1394 | 1481 | retval = -ENOMEM; |
|---|
| 1395 | | - mm = dup_mm(tsk); |
|---|
| 1482 | + mm = dup_mm(tsk, current->mm); |
|---|
| 1396 | 1483 | if (!mm) |
|---|
| 1397 | 1484 | goto fail_nomem; |
|---|
| 1398 | 1485 | |
|---|
| .. | .. |
|---|
| 1442 | 1529 | goto out; |
|---|
| 1443 | 1530 | } |
|---|
| 1444 | 1531 | |
|---|
| 1445 | | - newf = dup_fd(oldf, &error); |
|---|
| 1532 | + newf = dup_fd(oldf, NR_OPEN_MAX, &error); |
|---|
| 1446 | 1533 | if (!newf) |
|---|
| 1447 | 1534 | goto out; |
|---|
| 1448 | 1535 | |
|---|
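`dup_fd()` grows a `max_fds` cap here; `copy_files()` passes `NR_OPEN_MAX` to keep fork semantics unchanged, while the cap exists for callers like `close_range(..., CLOSE_RANGE_UNSHARE)`, which can duplicate only the portion of the table that stays open. A hedged userspace demo using the raw syscall, since libc wrappers may be absent — syscall number 436 is the asm-generic/x86_64 value and requires Linux >= 5.9:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_close_range
#define __NR_close_range 436
#endif
#define CLOSE_RANGE_UNSHARE (1U << 1)	/* from linux/close_range.h */

int main(void)
{
	/* Unshare the fd table and close every fd >= 3 in the private copy. */
	if (syscall(__NR_close_range, 3, ~0U, CLOSE_RANGE_UNSHARE) != 0)
		perror("close_range");
	else
		printf("fd table unshared; only fds 0-2 remain\n");
	return 0;
}
```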
| .. | .. |
|---|
| 1483 | 1570 | struct sighand_struct *sig; |
|---|
| 1484 | 1571 | |
|---|
| 1485 | 1572 | if (clone_flags & CLONE_SIGHAND) { |
|---|
| 1486 | | - atomic_inc(¤t->sighand->count); |
|---|
| 1573 | + refcount_inc(¤t->sighand->count); |
|---|
| 1487 | 1574 | return 0; |
|---|
| 1488 | 1575 | } |
|---|
| 1489 | 1576 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
|---|
| 1490 | | - rcu_assign_pointer(tsk->sighand, sig); |
|---|
| 1577 | + RCU_INIT_POINTER(tsk->sighand, sig); |
|---|
| 1491 | 1578 | if (!sig) |
|---|
| 1492 | 1579 | return -ENOMEM; |
|---|
| 1493 | 1580 | |
|---|
| 1494 | | - atomic_set(&sig->count, 1); |
|---|
| 1581 | + refcount_set(&sig->count, 1); |
|---|
| 1495 | 1582 | spin_lock_irq(¤t->sighand->siglock); |
|---|
| 1496 | 1583 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
|---|
| 1497 | 1584 | spin_unlock_irq(¤t->sighand->siglock); |
|---|
| 1585 | + |
|---|
| 1586 | + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ |
|---|
| 1587 | + if (clone_flags & CLONE_CLEAR_SIGHAND) |
|---|
| 1588 | + flush_signal_handlers(tsk, 0); |
|---|
| 1589 | + |
|---|
| 1498 | 1590 | return 0; |
|---|
| 1499 | 1591 | } |
|---|
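The new `CLONE_CLEAR_SIGHAND` branch gives the child a sighand copy in which every handler not set to `SIG_IGN` is reset to `SIG_DFL` — what a posix_spawn()-style launcher wants. A sketch using `clone3()` directly; the flag value `0x100000000ULL`, syscall number 435, and the trimmed `clone_args_v0` layout come from the uapi headers, but treat them as assumptions if your headers are older:

```c
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435
#endif
#define CLONE_CLEAR_SIGHAND 0x100000000ULL

struct clone_args_v0 {		/* first 8 fields of struct clone_args */
	unsigned long long flags, pidfd, child_tid, parent_tid;
	unsigned long long exit_signal, stack, stack_size, tls;
};

static void handler(int sig) { (void)sig; }

int main(void)
{
	struct clone_args_v0 ca;

	signal(SIGINT, handler);	/* a caught handler: cleared in the child */

	memset(&ca, 0, sizeof(ca));
	ca.flags = CLONE_CLEAR_SIGHAND;
	ca.exit_signal = SIGCHLD;

	long pid = syscall(__NR_clone3, &ca, sizeof(ca));
	if (pid == 0) {
		struct sigaction sa;

		sigaction(SIGINT, NULL, &sa);
		printf("child SIGINT handler: %s\n",
		       sa.sa_handler == SIG_DFL ? "SIG_DFL (cleared)" : "still set");
		_exit(0);
	}
	if (pid < 0)
		perror("clone3");
	wait(NULL);
	return 0;
}
```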
| 1500 | 1592 | |
|---|
| 1501 | 1593 | void __cleanup_sighand(struct sighand_struct *sighand) |
|---|
| 1502 | 1594 | { |
|---|
| 1503 | | - if (atomic_dec_and_test(&sighand->count)) { |
|---|
| 1595 | + if (refcount_dec_and_test(&sighand->count)) { |
|---|
| 1504 | 1596 | signalfd_cleanup(sighand); |
|---|
| 1505 | 1597 | /* |
|---|
| 1506 | 1598 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
|---|
| .. | .. |
|---|
| 1510 | 1602 | } |
|---|
| 1511 | 1603 | } |
|---|
| 1512 | 1604 | |
|---|
| 1513 | | -#ifdef CONFIG_POSIX_TIMERS |
|---|
| 1514 | 1605 | /* |
|---|
| 1515 | 1606 | * Initialize POSIX timer handling for a thread group. |
|---|
| 1516 | 1607 | */ |
|---|
| 1517 | 1608 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
|---|
| 1518 | 1609 | { |
|---|
| 1610 | + struct posix_cputimers *pct = &sig->posix_cputimers; |
|---|
| 1519 | 1611 | unsigned long cpu_limit; |
|---|
| 1520 | 1612 | |
|---|
| 1521 | 1613 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
|---|
| 1522 | | - if (cpu_limit != RLIM_INFINITY) { |
|---|
| 1523 | | - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; |
|---|
| 1524 | | - sig->cputimer.running = true; |
|---|
| 1525 | | - } |
|---|
| 1526 | | - |
|---|
| 1527 | | - /* The timer lists. */ |
|---|
| 1528 | | - INIT_LIST_HEAD(&sig->cpu_timers[0]); |
|---|
| 1529 | | - INIT_LIST_HEAD(&sig->cpu_timers[1]); |
|---|
| 1530 | | - INIT_LIST_HEAD(&sig->cpu_timers[2]); |
|---|
| 1614 | + posix_cputimers_group_init(pct, cpu_limit); |
|---|
| 1531 | 1615 | } |
|---|
| 1532 | | -#else |
|---|
| 1533 | | -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } |
|---|
| 1534 | | -#endif |
|---|
| 1535 | 1616 | |
|---|
| 1536 | 1617 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) |
|---|
| 1537 | 1618 | { |
|---|
| .. | .. |
|---|
| 1547 | 1628 | |
|---|
| 1548 | 1629 | sig->nr_threads = 1; |
|---|
| 1549 | 1630 | atomic_set(&sig->live, 1); |
|---|
| 1550 | | - atomic_set(&sig->sigcnt, 1); |
|---|
| 1631 | + refcount_set(&sig->sigcnt, 1); |
|---|
| 1551 | 1632 | |
|---|
| 1552 | 1633 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ |
|---|
| 1553 | 1634 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); |
|---|
| .. | .. |
|---|
| 1579 | 1660 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
|---|
| 1580 | 1661 | |
|---|
| 1581 | 1662 | mutex_init(&sig->cred_guard_mutex); |
|---|
| 1663 | + init_rwsem(&sig->exec_update_lock); |
|---|
| 1582 | 1664 | |
|---|
| 1583 | 1665 | return 0; |
|---|
| 1584 | 1666 | } |
|---|
| .. | .. |
|---|
| 1633 | 1715 | #endif |
|---|
| 1634 | 1716 | } |
|---|
| 1635 | 1717 | |
|---|
| 1636 | | -#ifdef CONFIG_POSIX_TIMERS |
|---|
| 1637 | | -/* |
|---|
| 1638 | | - * Initialize POSIX timer handling for a single task. |
|---|
| 1639 | | - */ |
|---|
| 1640 | | -static void posix_cpu_timers_init(struct task_struct *tsk) |
|---|
| 1641 | | -{ |
|---|
| 1642 | | -#ifdef CONFIG_PREEMPT_RT_BASE |
|---|
| 1643 | | - tsk->posix_timer_list = NULL; |
|---|
| 1644 | | -#endif |
|---|
| 1645 | | - tsk->cputime_expires.prof_exp = 0; |
|---|
| 1646 | | - tsk->cputime_expires.virt_exp = 0; |
|---|
| 1647 | | - tsk->cputime_expires.sched_exp = 0; |
|---|
| 1648 | | - INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
|---|
| 1649 | | - INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
|---|
| 1650 | | - INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
|---|
| 1651 | | -} |
|---|
| 1652 | | -#else |
|---|
| 1653 | | -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } |
|---|
| 1654 | | -#endif |
|---|
| 1655 | | - |
|---|
| 1656 | 1718 | static inline void init_task_pid_links(struct task_struct *task) |
|---|
| 1657 | 1719 | { |
|---|
| 1658 | 1720 | enum pid_type type; |
|---|
| .. | .. |
|---|
| 1684 | 1746 | INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); |
|---|
| 1685 | 1747 | p->rcu_tasks_idle_cpu = -1; |
|---|
| 1686 | 1748 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
|---|
| 1749 | +#ifdef CONFIG_TASKS_TRACE_RCU |
|---|
| 1750 | + p->trc_reader_nesting = 0; |
|---|
| 1751 | + p->trc_reader_special.s = 0; |
|---|
| 1752 | + INIT_LIST_HEAD(&p->trc_holdout_list); |
|---|
| 1753 | +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
|---|
| 1687 | 1754 | } |
|---|
| 1755 | + |
|---|
| 1756 | +struct pid *pidfd_pid(const struct file *file) |
|---|
| 1757 | +{ |
|---|
| 1758 | + if (file->f_op == &pidfd_fops) |
|---|
| 1759 | + return file->private_data; |
|---|
| 1760 | + |
|---|
| 1761 | + return ERR_PTR(-EBADF); |
|---|
| 1762 | +} |
|---|
| 1763 | + |
|---|
| 1764 | +static int pidfd_release(struct inode *inode, struct file *file) |
|---|
| 1765 | +{ |
|---|
| 1766 | + struct pid *pid = file->private_data; |
|---|
| 1767 | + |
|---|
| 1768 | + file->private_data = NULL; |
|---|
| 1769 | + put_pid(pid); |
|---|
| 1770 | + return 0; |
|---|
| 1771 | +} |
|---|
| 1772 | + |
|---|
| 1773 | +#ifdef CONFIG_PROC_FS |
|---|
| 1774 | +/** |
|---|
| 1775 | + * pidfd_show_fdinfo - print information about a pidfd |
|---|
| 1776 | + * @m: proc fdinfo file |
|---|
| 1777 | + * @f: file referencing a pidfd |
|---|
| 1778 | + * |
|---|
| 1779 | + * Pid: |
|---|
| 1780 | + * This function will print the pid that a given pidfd refers to in the |
|---|
| 1781 | + * pid namespace of the procfs instance. |
|---|
| 1782 | + * If the pid namespace of the process is not a descendant of the pid |
|---|
| 1783 | + * namespace of the procfs instance 0 will be shown as its pid. This is |
|---|
| 1784 | + * similar to calling getppid() on a process whose parent is outside of |
|---|
| 1785 | + * its pid namespace. |
|---|
| 1786 | + * |
|---|
| 1787 | + * NSpid: |
|---|
| 1788 | + * If pid namespaces are supported then this function will also print |
|---|
| 1789 | + * the pid of a given pidfd refers to for all descendant pid namespaces |
|---|
| 1790 | + * starting from the current pid namespace of the instance, i.e. the |
|---|
| 1791 | + * Pid field and the first entry in the NSpid field will be identical. |
|---|
| 1792 | + * If the pid namespace of the process is not a descendant of the pid |
|---|
| 1793 | + * namespace of the procfs instance 0 will be shown as its first NSpid |
|---|
| 1794 | + * entry and no others will be shown. |
|---|
| 1795 | + * Note that this differs from the Pid and NSpid fields in |
|---|
| 1796 | + * /proc/<pid>/status where Pid and NSpid are always shown relative to |
|---|
| 1797 | + * the pid namespace of the procfs instance. The difference becomes |
|---|
| 1798 | + * obvious when sending around a pidfd between pid namespaces from a |
|---|
| 1799 | + * different branch of the tree, i.e. where no ancestoral relation is |
|---|
| 1800 | + * present between the pid namespaces: |
|---|
| 1801 | + * - create two new pid namespaces ns1 and ns2 in the initial pid |
|---|
| 1802 | + * namespace (also take care to create new mount namespaces in the |
|---|
| 1803 | + * new pid namespace and mount procfs) |
|---|
| 1804 | + * - create a process with a pidfd in ns1 |
|---|
| 1805 | + * - send pidfd from ns1 to ns2 |
|---|
| 1806 | + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid |
|---|
| 1807 | + * have exactly one entry, which is 0 |
|---|
| 1808 | + */ |
|---|
| 1809 | +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
|---|
| 1810 | +{ |
|---|
| 1811 | + struct pid *pid = f->private_data; |
|---|
| 1812 | + struct pid_namespace *ns; |
|---|
| 1813 | + pid_t nr = -1; |
|---|
| 1814 | + |
|---|
| 1815 | + if (likely(pid_has_task(pid, PIDTYPE_PID))) { |
|---|
| 1816 | + ns = proc_pid_ns(file_inode(m->file)->i_sb); |
|---|
| 1817 | + nr = pid_nr_ns(pid, ns); |
|---|
| 1818 | + } |
|---|
| 1819 | + |
|---|
| 1820 | + seq_put_decimal_ll(m, "Pid:\t", nr); |
|---|
| 1821 | + |
|---|
| 1822 | +#ifdef CONFIG_PID_NS |
|---|
| 1823 | + seq_put_decimal_ll(m, "\nNSpid:\t", nr); |
|---|
| 1824 | + if (nr > 0) { |
|---|
| 1825 | + int i; |
|---|
| 1826 | + |
|---|
| 1827 | + /* If nr is non-zero it means that 'pid' is valid and that |
|---|
| 1828 | + * ns, i.e. the pid namespace associated with the procfs |
|---|
| 1829 | + * instance, is in the pid namespace hierarchy of pid. |
|---|
| 1830 | + * Start at one below the already printed level. |
|---|
| 1831 | + */ |
|---|
| 1832 | + for (i = ns->level + 1; i <= pid->level; i++) |
|---|
| 1833 | + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); |
|---|
| 1834 | + } |
|---|
| 1835 | +#endif |
|---|
| 1836 | + seq_putc(m, '\n'); |
|---|
| 1837 | +} |
|---|
| 1838 | +#endif |
|---|
| 1839 | + |
|---|
| 1840 | +/* |
|---|
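The Pid:/NSpid: semantics documented above are easy to observe from userspace: open a pidfd and dump its fdinfo entry. A small sketch — `pidfd_open` is syscall 434 on Linux >= 5.3, and the NSpid line needs CONFIG_PID_NS plus a kernel carrying this patch:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif

int main(void)
{
	char path[64], buf[256];
	int pidfd = syscall(__NR_pidfd_open, getpid(), 0);

	if (pidfd < 0) { perror("pidfd_open"); return 1; }

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
	FILE *f = fopen(path, "r");
	if (!f) { perror("fopen"); return 1; }

	/* Expect a "Pid:\t<nr>" line and, with pid namespaces, "NSpid:". */
	while (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);
	fclose(f);
	close(pidfd);
	return 0;
}
```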
| 1841 | + * Poll support for process exit notification. |
|---|
| 1842 | + */ |
|---|
| 1843 | +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
|---|
| 1844 | +{ |
|---|
| 1845 | + struct pid *pid = file->private_data; |
|---|
| 1846 | + __poll_t poll_flags = 0; |
|---|
| 1847 | + |
|---|
| 1848 | + poll_wait(file, &pid->wait_pidfd, pts); |
|---|
| 1849 | + |
|---|
| 1850 | + /* |
|---|
| 1851 | + * Inform pollers only when the whole thread group exits. |
|---|
| 1852 | + * If the thread group leader exits before all other threads in the |
|---|
| 1853 | + * group, then poll(2) should block, similar to the wait(2) family. |
|---|
| 1854 | + */ |
|---|
| 1855 | + if (thread_group_exited(pid)) |
|---|
| 1856 | + poll_flags = EPOLLIN | EPOLLRDNORM; |
|---|
| 1857 | + |
|---|
| 1858 | + return poll_flags; |
|---|
| 1859 | +} |
|---|
| 1860 | + |
|---|
| 1861 | +const struct file_operations pidfd_fops = { |
|---|
| 1862 | + .release = pidfd_release, |
|---|
| 1863 | + .poll = pidfd_poll, |
|---|
| 1864 | +#ifdef CONFIG_PROC_FS |
|---|
| 1865 | + .show_fdinfo = pidfd_show_fdinfo, |
|---|
| 1866 | +#endif |
|---|
| 1867 | +}; |
|---|
| 1688 | 1868 | |
|---|
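`pidfd_poll()` — now backed by the `thread_group_exited()` helper — reports readability only once the whole thread group is gone, so a pidfd can replace SIGCHLD juggling for exit notification. A minimal sketch, again assuming `pidfd_open` is syscall 434:

```c
#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		usleep(100 * 1000);	/* child lingers briefly, then exits */
		_exit(42);
	}

	int pidfd = syscall(__NR_pidfd_open, child, 0);
	if (pidfd < 0) { perror("pidfd_open"); return 1; }

	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	poll(&pfd, 1, -1);	/* blocks until the whole thread group exits */
	printf("child exited (revents=0x%x)\n", pfd.revents);

	waitpid(child, NULL, 0);	/* the child still has to be reaped */
	close(pidfd);
	return 0;
}
```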
| 1689 | 1869 | static void __delayed_free_task(struct rcu_head *rhp) |
|---|
| 1690 | 1870 | { |
|---|
| .. | .. |
|---|
| 1699 | 1879 | call_rcu(&tsk->rcu, __delayed_free_task); |
|---|
| 1700 | 1880 | else |
|---|
| 1701 | 1881 | free_task(tsk); |
|---|
| 1702 | | -} |
|---|
| 1703 | | - |
|---|
| 1704 | | -static int pidfd_release(struct inode *inode, struct file *file) |
|---|
| 1705 | | -{ |
|---|
| 1706 | | - struct pid *pid = file->private_data; |
|---|
| 1707 | | - |
|---|
| 1708 | | - file->private_data = NULL; |
|---|
| 1709 | | - put_pid(pid); |
|---|
| 1710 | | - return 0; |
|---|
| 1711 | | -} |
|---|
| 1712 | | - |
|---|
| 1713 | | -#ifdef CONFIG_PROC_FS |
|---|
| 1714 | | -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
|---|
| 1715 | | -{ |
|---|
| 1716 | | - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); |
|---|
| 1717 | | - struct pid *pid = f->private_data; |
|---|
| 1718 | | - |
|---|
| 1719 | | - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); |
|---|
| 1720 | | - seq_putc(m, '\n'); |
|---|
| 1721 | | -} |
|---|
| 1722 | | -#endif |
|---|
| 1723 | | - |
|---|
| 1724 | | -/* |
|---|
| 1725 | | - * Poll support for process exit notification. |
|---|
| 1726 | | - */ |
|---|
| 1727 | | -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
|---|
| 1728 | | -{ |
|---|
| 1729 | | - struct task_struct *task; |
|---|
| 1730 | | - struct pid *pid = file->private_data; |
|---|
| 1731 | | - __poll_t poll_flags = 0; |
|---|
| 1732 | | - |
|---|
| 1733 | | - poll_wait(file, &pid->wait_pidfd, pts); |
|---|
| 1734 | | - |
|---|
| 1735 | | - rcu_read_lock(); |
|---|
| 1736 | | - task = pid_task(pid, PIDTYPE_PID); |
|---|
| 1737 | | - /* |
|---|
| 1738 | | - * Inform pollers only when the whole thread group exits. |
|---|
| 1739 | | - * If the thread group leader exits before all other threads in the |
|---|
| 1740 | | - * group, then poll(2) should block, similar to the wait(2) family. |
|---|
| 1741 | | - */ |
|---|
| 1742 | | - if (!task || (task->exit_state && thread_group_empty(task))) |
|---|
| 1743 | | - poll_flags = EPOLLIN | EPOLLRDNORM; |
|---|
| 1744 | | - rcu_read_unlock(); |
|---|
| 1745 | | - |
|---|
| 1746 | | - return poll_flags; |
|---|
| 1747 | | -} |
|---|
| 1748 | | - |
|---|
| 1749 | | -const struct file_operations pidfd_fops = { |
|---|
| 1750 | | - .release = pidfd_release, |
|---|
| 1751 | | - .poll = pidfd_poll, |
|---|
| 1752 | | -#ifdef CONFIG_PROC_FS |
|---|
| 1753 | | - .show_fdinfo = pidfd_show_fdinfo, |
|---|
| 1754 | | -#endif |
|---|
| 1755 | | -}; |
|---|
| 1756 | | - |
|---|
| 1757 | | -/** |
|---|
| 1758 | | - * pidfd_create() - Create a new pid file descriptor. |
|---|
| 1759 | | - * |
|---|
| 1760 | | - * @pid: struct pid that the pidfd will reference |
|---|
| 1761 | | - * |
|---|
| 1762 | | - * This creates a new pid file descriptor with the O_CLOEXEC flag set. |
|---|
| 1763 | | - * |
|---|
| 1764 | | - * Note, that this function can only be called after the fd table has |
|---|
| 1765 | | - * been unshared to avoid leaking the pidfd to the new process. |
|---|
| 1766 | | - * |
|---|
| 1767 | | - * Return: On success, a cloexec pidfd is returned. |
|---|
| 1768 | | - * On error, a negative errno number will be returned. |
|---|
| 1769 | | - */ |
|---|
| 1770 | | -static int pidfd_create(struct pid *pid) |
|---|
| 1771 | | -{ |
|---|
| 1772 | | - int fd; |
|---|
| 1773 | | - |
|---|
| 1774 | | - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), |
|---|
| 1775 | | - O_RDWR | O_CLOEXEC); |
|---|
| 1776 | | - if (fd < 0) |
|---|
| 1777 | | - put_pid(pid); |
|---|
| 1778 | | - |
|---|
| 1779 | | - return fd; |
|---|
| 1780 | 1882 | } |
|---|
| 1781 | 1883 | |
|---|
| 1782 | 1884 | static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) |
|---|
| .. | .. |
|---|
| 1807 | 1909 | * flags). The actual kick-off is left to the caller. |
|---|
| 1808 | 1910 | */ |
|---|
| 1809 | 1911 | static __latent_entropy struct task_struct *copy_process( |
|---|
| 1810 | | - unsigned long clone_flags, |
|---|
| 1811 | | - unsigned long stack_start, |
|---|
| 1812 | | - unsigned long stack_size, |
|---|
| 1813 | | - int __user *parent_tidptr, |
|---|
| 1814 | | - int __user *child_tidptr, |
|---|
| 1815 | 1912 | struct pid *pid, |
|---|
| 1816 | 1913 | int trace, |
|---|
| 1817 | | - unsigned long tls, |
|---|
| 1818 | | - int node) |
|---|
| 1914 | + int node, |
|---|
| 1915 | + struct kernel_clone_args *args) |
|---|
| 1819 | 1916 | { |
|---|
| 1820 | 1917 | int pidfd = -1, retval; |
|---|
| 1821 | 1918 | struct task_struct *p; |
|---|
| 1822 | 1919 | struct multiprocess_signals delayed; |
|---|
| 1920 | + struct file *pidfile = NULL; |
|---|
| 1921 | + u64 clone_flags = args->flags; |
|---|
| 1922 | + struct nsproxy *nsp = current->nsproxy; |
|---|
| 1823 | 1923 | |
|---|
| 1824 | 1924 | /* |
|---|
| 1825 | 1925 | * Don't allow sharing the root directory with processes in a different |
|---|
| .. | .. |
|---|
| 1862 | 1962 | */ |
|---|
| 1863 | 1963 | if (clone_flags & CLONE_THREAD) { |
|---|
| 1864 | 1964 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
|---|
| 1865 | | - (task_active_pid_ns(current) != |
|---|
| 1866 | | - current->nsproxy->pid_ns_for_children)) |
|---|
| 1965 | + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) |
|---|
| 1966 | + return ERR_PTR(-EINVAL); |
|---|
| 1967 | + } |
|---|
| 1968 | + |
|---|
| 1969 | + /* |
|---|
| 1970 | + * If the new process will be in a different time namespace |
|---|
| 1971 | + * do not allow it to share VM or a thread group with the forking task. |
|---|
| 1972 | + */ |
|---|
| 1973 | + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { |
|---|
| 1974 | + if (nsp->time_ns != nsp->time_ns_for_children) |
|---|
| 1867 | 1975 | return ERR_PTR(-EINVAL); |
|---|
| 1868 | 1976 | } |
|---|
| 1869 | 1977 | |
|---|
| 1870 | 1978 | if (clone_flags & CLONE_PIDFD) { |
|---|
| 1871 | 1979 | /* |
|---|
| 1872 | | - * - CLONE_PARENT_SETTID is useless for pidfds and also |
|---|
| 1873 | | - * parent_tidptr is used to return pidfds. |
|---|
| 1874 | 1980 | * - CLONE_DETACHED is blocked so that we can potentially |
|---|
| 1875 | 1981 | * reuse it later for CLONE_PIDFD. |
|---|
| 1876 | 1982 | * - CLONE_THREAD is blocked until someone really needs it. |
|---|
| 1877 | 1983 | */ |
|---|
| 1878 | | - if (clone_flags & |
|---|
| 1879 | | - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) |
|---|
| 1984 | + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) |
|---|
| 1880 | 1985 | return ERR_PTR(-EINVAL); |
|---|
| 1881 | 1986 | } |
|---|
| 1882 | 1987 | |
|---|
| .. | .. |
|---|
| 1895 | 2000 | recalc_sigpending(); |
|---|
| 1896 | 2001 | spin_unlock_irq(¤t->sighand->siglock); |
|---|
| 1897 | 2002 | retval = -ERESTARTNOINTR; |
|---|
| 1898 | | - if (signal_pending(current)) |
|---|
| 2003 | + if (task_sigpending(current)) |
|---|
| 1899 | 2004 | goto fork_out; |
|---|
| 1900 | 2005 | |
|---|
| 1901 | 2006 | retval = -ENOMEM; |
|---|
| 1902 | 2007 | p = dup_task_struct(current, node); |
|---|
| 1903 | 2008 | if (!p) |
|---|
| 1904 | 2009 | goto fork_out; |
|---|
| 2010 | + if (args->io_thread) { |
|---|
| 2011 | + /* |
|---|
| 2012 | + * Mark us an IO worker, and block any signal that isn't |
|---|
| 2013 | + * fatal or STOP |
|---|
| 2014 | + */ |
|---|
| 2015 | + p->flags |= PF_IO_WORKER; |
|---|
| 2016 | + siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
|---|
| 2017 | + } |
|---|
| 1905 | 2018 | |
|---|
| 1906 | 2019 | cpufreq_task_times_init(p); |
|---|
| 1907 | 2020 | |
|---|
| .. | .. |
|---|
| 1911 | 2024 | * p->set_child_tid which is (ab)used as a kthread's data pointer for |
|---|
| 1912 | 2025 | * kernel threads (PF_KTHREAD). |
|---|
| 1913 | 2026 | */ |
|---|
| 1914 | | - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
|---|
| 2027 | + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; |
|---|
| 1915 | 2028 | /* |
|---|
| 1916 | 2029 | * Clear TID on mm_release()? |
|---|
| 1917 | 2030 | */ |
|---|
| 1918 | | - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
|---|
| 2031 | + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; |
|---|
| 1919 | 2032 | |
|---|
| 1920 | 2033 | ftrace_graph_init_task(p); |
|---|
| 1921 | 2034 | |
|---|
| 1922 | 2035 | rt_mutex_init_task(p); |
|---|
| 1923 | 2036 | |
|---|
| 2037 | + lockdep_assert_irqs_enabled(); |
|---|
| 1924 | 2038 | #ifdef CONFIG_PROVE_LOCKING |
|---|
| 1925 | | - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
|---|
| 1926 | 2039 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); |
|---|
| 1927 | 2040 | #endif |
|---|
| 1928 | 2041 | retval = -EAGAIN; |
|---|
| .. | .. |
|---|
| 1944 | 2057 | * to stop root fork bombs. |
|---|
| 1945 | 2058 | */ |
|---|
| 1946 | 2059 | retval = -EAGAIN; |
|---|
| 1947 | | - if (nr_threads >= max_threads) |
|---|
| 2060 | + if (data_race(nr_threads >= max_threads)) |
|---|
| 1948 | 2061 | goto bad_fork_cleanup_count; |
|---|
| 1949 | 2062 | |
|---|
| 1950 | 2063 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
|---|
| .. | .. |
|---|
| 1957 | 2070 | spin_lock_init(&p->alloc_lock); |
|---|
| 1958 | 2071 | |
|---|
| 1959 | 2072 | init_sigpending(&p->pending); |
|---|
| 1960 | | - p->sigqueue_cache = NULL; |
|---|
| 1961 | 2073 | |
|---|
| 1962 | 2074 | p->utime = p->stime = p->gtime = 0; |
|---|
| 1963 | 2075 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME |
|---|
| .. | .. |
|---|
| 1969 | 2081 | seqcount_init(&p->vtime.seqcount); |
|---|
| 1970 | 2082 | p->vtime.starttime = 0; |
|---|
| 1971 | 2083 | p->vtime.state = VTIME_INACTIVE; |
|---|
| 2084 | +#endif |
|---|
| 2085 | + |
|---|
| 2086 | +#ifdef CONFIG_IO_URING |
|---|
| 2087 | + p->io_uring = NULL; |
|---|
| 1972 | 2088 | #endif |
|---|
| 1973 | 2089 | |
|---|
| 1974 | 2090 | #if defined(SPLIT_RSS_COUNTING) |
|---|
| .. | .. |
|---|
| 1984 | 2100 | task_io_accounting_init(&p->ioac); |
|---|
| 1985 | 2101 | acct_clear_integrals(p); |
|---|
| 1986 | 2102 | |
|---|
| 1987 | | - posix_cpu_timers_init(p); |
|---|
| 2103 | + posix_cputimers_init(&p->posix_cputimers); |
|---|
| 1988 | 2104 | |
|---|
| 1989 | 2105 | p->io_context = NULL; |
|---|
| 1990 | 2106 | audit_set_context(p, NULL); |
|---|
| .. | .. |
|---|
| 2000 | 2116 | #ifdef CONFIG_CPUSETS |
|---|
| 2001 | 2117 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
|---|
| 2002 | 2118 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
|---|
| 2003 | | - seqcount_init(&p->mems_allowed_seq); |
|---|
| 2119 | + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); |
|---|
| 2004 | 2120 | #endif |
|---|
| 2005 | 2121 | #ifdef CONFIG_TRACE_IRQFLAGS |
|---|
| 2006 | | - p->irq_events = 0; |
|---|
| 2007 | | - p->hardirqs_enabled = 0; |
|---|
| 2008 | | - p->hardirq_enable_ip = 0; |
|---|
| 2009 | | - p->hardirq_enable_event = 0; |
|---|
| 2010 | | - p->hardirq_disable_ip = _THIS_IP_; |
|---|
| 2011 | | - p->hardirq_disable_event = 0; |
|---|
| 2012 | | - p->softirqs_enabled = 1; |
|---|
| 2013 | | - p->softirq_enable_ip = _THIS_IP_; |
|---|
| 2014 | | - p->softirq_enable_event = 0; |
|---|
| 2015 | | - p->softirq_disable_ip = 0; |
|---|
| 2016 | | - p->softirq_disable_event = 0; |
|---|
| 2017 | | - p->hardirq_context = 0; |
|---|
| 2018 | | - p->softirq_context = 0; |
|---|
| 2122 | + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); |
|---|
| 2123 | + p->irqtrace.hardirq_disable_ip = _THIS_IP_; |
|---|
| 2124 | + p->irqtrace.softirq_enable_ip = _THIS_IP_; |
|---|
| 2125 | + p->softirqs_enabled = 1; |
|---|
| 2126 | + p->softirq_context = 0; |
|---|
| 2019 | 2127 | #endif |
|---|
| 2020 | 2128 | |
|---|
| 2021 | 2129 | p->pagefault_disabled = 0; |
|---|
| 2022 | 2130 | |
|---|
| 2023 | 2131 | #ifdef CONFIG_LOCKDEP |
|---|
| 2024 | | - p->lockdep_depth = 0; /* no locks held yet */ |
|---|
| 2025 | | - p->curr_chain_key = 0; |
|---|
| 2026 | | - p->lockdep_recursion = 0; |
|---|
| 2027 | 2132 | lockdep_init_task(p); |
|---|
| 2028 | 2133 | #endif |
|---|
| 2029 | 2134 | |
|---|
| .. | .. |
|---|
| 2075 | 2180 | retval = copy_io(clone_flags, p); |
|---|
| 2076 | 2181 | if (retval) |
|---|
| 2077 | 2182 | goto bad_fork_cleanup_namespaces; |
|---|
| 2078 | | - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); |
|---|
| 2183 | + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); |
|---|
| 2079 | 2184 | if (retval) |
|---|
| 2080 | 2185 | goto bad_fork_cleanup_io; |
|---|
| 2081 | 2186 | |
|---|
| 2187 | + stackleak_task_init(p); |
|---|
| 2188 | + |
|---|
| 2082 | 2189 | if (pid != &init_struct_pid) { |
|---|
| 2083 | | - pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
|---|
| 2190 | + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, |
|---|
| 2191 | + args->set_tid_size); |
|---|
| 2084 | 2192 | if (IS_ERR(pid)) { |
|---|
| 2085 | 2193 | retval = PTR_ERR(pid); |
|---|
| 2086 | 2194 | goto bad_fork_cleanup_thread; |
|---|
| .. | .. |
|---|
| 2093 | 2201 | * if the fd table isn't shared). |
|---|
| 2094 | 2202 | */ |
|---|
| 2095 | 2203 | if (clone_flags & CLONE_PIDFD) { |
|---|
| 2096 | | - retval = pidfd_create(pid); |
|---|
| 2204 | + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); |
|---|
| 2097 | 2205 | if (retval < 0) |
|---|
| 2098 | 2206 | goto bad_fork_free_pid; |
|---|
| 2099 | 2207 | |
|---|
| 2100 | 2208 | pidfd = retval; |
|---|
| 2101 | | - retval = put_user(pidfd, parent_tidptr); |
|---|
| 2209 | + |
|---|
| 2210 | + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, |
|---|
| 2211 | + O_RDWR | O_CLOEXEC); |
|---|
| 2212 | + if (IS_ERR(pidfile)) { |
|---|
| 2213 | + put_unused_fd(pidfd); |
|---|
| 2214 | + retval = PTR_ERR(pidfile); |
|---|
| 2215 | + goto bad_fork_free_pid; |
|---|
| 2216 | + } |
|---|
| 2217 | + get_pid(pid); /* held by pidfile now */ |
|---|
| 2218 | + |
|---|
| 2219 | + retval = put_user(pidfd, args->pidfd); |
|---|
| 2102 | 2220 | if (retval) |
|---|
| 2103 | 2221 | goto bad_fork_put_pidfd; |
|---|
| 2104 | 2222 | } |
|---|
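With `CLONE_PIDFD` the descriptor is now built via `anon_inode_getfile()` and only `fd_install()`ed once the child is fully committed, so a partially set up task never leaks a live pidfd. From userspace the flag is most convenient through `clone3()`, where `args->pidfd` names the location the kernel `put_user()`s the fd into — a sketch, reusing the same assumed `clone_args_v0` layout as the CLONE_CLEAR_SIGHAND example above:

```c
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435
#endif
#define CLONE_PIDFD 0x00001000

struct clone_args_v0 {		/* first 8 fields of struct clone_args */
	unsigned long long flags, pidfd, child_tid, parent_tid;
	unsigned long long exit_signal, stack, stack_size, tls;
};

int main(void)
{
	int pidfd = -1;
	struct clone_args_v0 ca;

	memset(&ca, 0, sizeof(ca));
	ca.flags = CLONE_PIDFD;
	ca.pidfd = (unsigned long long)(unsigned long)&pidfd;
	ca.exit_signal = SIGCHLD;

	long pid = syscall(__NR_clone3, &ca, sizeof(ca));
	if (pid == 0)
		_exit(0);
	if (pid < 0) { perror("clone3"); return 1; }

	/* The kernel wrote the new fd into 'pidfd' before clone3 returned. */
	printf("child pid %ld, pidfd %d\n", pid, pidfd);
	waitpid(pid, NULL, 0);
	close(pidfd);
	return 0;
}
```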
| .. | .. |
|---|
| 2123 | 2241 | #ifdef TIF_SYSCALL_EMU |
|---|
| 2124 | 2242 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
|---|
| 2125 | 2243 | #endif |
|---|
| 2126 | | - clear_all_latency_tracing(p); |
|---|
| 2244 | + clear_tsk_latency_tracing(p); |
|---|
| 2127 | 2245 | |
|---|
| 2128 | 2246 | /* ok, now we should be set up.. */ |
|---|
| 2129 | 2247 | p->pid = pid_nr(pid); |
|---|
| .. | .. |
|---|
| 2142 | 2260 | p->pdeath_signal = 0; |
|---|
| 2143 | 2261 | INIT_LIST_HEAD(&p->thread_group); |
|---|
| 2144 | 2262 | p->task_works = NULL; |
|---|
| 2263 | + clear_posix_cputimers_work(p); |
|---|
| 2145 | 2264 | |
|---|
| 2146 | | - cgroup_threadgroup_change_begin(current); |
|---|
| 2147 | 2265 | /* |
|---|
| 2148 | 2266 | * Ensure that the cgroup subsystem policies allow the new process to be |
|---|
| 2149 | | - * forked. It should be noted the the new process's css_set can be changed |
|---|
| 2267 | + * forked. It should be noted that the new process's css_set can be changed |
|---|
| 2150 | 2268 | * between here and cgroup_post_fork() if an organisation operation is in |
|---|
| 2151 | 2269 | * progress. |
|---|
| 2152 | 2270 | */ |
|---|
| 2153 | | - retval = cgroup_can_fork(p); |
|---|
| 2271 | + retval = cgroup_can_fork(p, args); |
|---|
| 2154 | 2272 | if (retval) |
|---|
| 2155 | | - goto bad_fork_cgroup_threadgroup_change_end; |
|---|
| 2273 | + goto bad_fork_put_pidfd; |
|---|
| 2274 | + |
|---|
| 2275 | + /* |
|---|
| 2276 | + * Now that the cgroups are pinned, re-clone the parent cgroup and put |
|---|
| 2277 | + * the new task on the correct runqueue. All this *before* the task |
|---|
| 2278 | + * becomes visible. |
|---|
| 2279 | + * |
|---|
| 2280 | + * This isn't part of ->can_fork() because while the re-cloning is |
|---|
| 2281 | + * cgroup specific, it unconditionally needs to place the task on a |
|---|
| 2282 | + * runqueue. |
|---|
| 2283 | + */ |
|---|
| 2284 | + sched_cgroup_fork(p, args); |
|---|
| 2156 | 2285 | |
|---|
| 2157 | 2286 | /* |
|---|
| 2158 | 2287 | * From this point on we must avoid any synchronous user-space |
|---|
| .. | .. |
|---|
| 2163 | 2292 | */ |
|---|
| 2164 | 2293 | |
|---|
| 2165 | 2294 | p->start_time = ktime_get_ns(); |
|---|
| 2166 | | - p->real_start_time = ktime_get_boot_ns(); |
|---|
| 2295 | + p->start_boottime = ktime_get_boottime_ns(); |
|---|
| 2167 | 2296 | |
|---|
| 2168 | 2297 | /* |
|---|
| 2169 | 2298 | * Make it visible to the rest of the system, but don't wake it up yet. |
|---|
| .. | .. |
|---|
| 2182 | 2311 | } else { |
|---|
| 2183 | 2312 | p->real_parent = current; |
|---|
| 2184 | 2313 | p->parent_exec_id = current->self_exec_id; |
|---|
| 2185 | | - p->exit_signal = (clone_flags & CSIGNAL); |
|---|
| 2314 | + p->exit_signal = args->exit_signal; |
|---|
| 2186 | 2315 | } |
|---|
| 2187 | 2316 | |
|---|
| 2188 | 2317 | klp_copy_process(p); |
|---|
| 2189 | 2318 | |
|---|
| 2190 | 2319 | spin_lock(¤t->sighand->siglock); |
|---|
| 2191 | | - |
|---|
| 2192 | | - /* |
|---|
| 2193 | | - * Copy seccomp details explicitly here, in case they were changed |
|---|
| 2194 | | - * before holding sighand lock. |
|---|
| 2195 | | - */ |
|---|
| 2196 | | - copy_seccomp(p); |
|---|
| 2197 | 2320 | |
|---|
| 2198 | 2321 | rseq_fork(p, clone_flags); |
|---|
| 2199 | 2322 | |
|---|
| .. | .. |
|---|
| 2209 | 2332 | goto bad_fork_cancel_cgroup; |
|---|
| 2210 | 2333 | } |
|---|
| 2211 | 2334 | |
|---|
| 2335 | + /* No more failure paths after this point. */ |
|---|
| 2336 | + |
|---|
| 2337 | + /* |
|---|
| 2338 | + * Copy seccomp details explicitly here, in case they were changed |
|---|
| 2339 | + * before holding sighand lock. |
|---|
| 2340 | + */ |
|---|
| 2341 | + copy_seccomp(p); |
|---|
| 2212 | 2342 | |
|---|
| 2213 | 2343 | init_task_pid_links(p); |
|---|
| 2214 | 2344 | if (likely(p->pid)) { |
|---|
| .. | .. |
|---|
| 2242 | 2372 | } else { |
|---|
| 2243 | 2373 | current->signal->nr_threads++; |
|---|
| 2244 | 2374 | atomic_inc(¤t->signal->live); |
|---|
| 2245 | | - atomic_inc(¤t->signal->sigcnt); |
|---|
| 2375 | + refcount_inc(¤t->signal->sigcnt); |
|---|
| 2246 | 2376 | task_join_group_stop(p); |
|---|
| 2247 | 2377 | list_add_tail_rcu(&p->thread_group, |
|---|
| 2248 | 2378 | &p->group_leader->thread_group); |
|---|
| .. | .. |
|---|
| 2258 | 2388 | syscall_tracepoint_update(p); |
|---|
| 2259 | 2389 | write_unlock_irq(&tasklist_lock); |
|---|
| 2260 | 2390 | |
|---|
| 2391 | + if (pidfile) |
|---|
| 2392 | + fd_install(pidfd, pidfile); |
|---|
| 2393 | + |
|---|
| 2261 | 2394 | proc_fork_connector(p); |
|---|
| 2262 | | - cgroup_post_fork(p); |
|---|
| 2263 | | - cgroup_threadgroup_change_end(current); |
|---|
| 2395 | + sched_post_fork(p); |
|---|
| 2396 | + cgroup_post_fork(p, args); |
|---|
| 2264 | 2397 | perf_event_fork(p); |
|---|
| 2265 | 2398 | |
|---|
| 2266 | 2399 | trace_task_newtask(p, clone_flags); |
|---|
| .. | .. |
|---|
| 2273 | 2406 | bad_fork_cancel_cgroup: |
|---|
| 2274 | 2407 | spin_unlock(¤t->sighand->siglock); |
|---|
| 2275 | 2408 | write_unlock_irq(&tasklist_lock); |
|---|
| 2276 | | - cgroup_cancel_fork(p); |
|---|
| 2277 | | -bad_fork_cgroup_threadgroup_change_end: |
|---|
| 2278 | | - cgroup_threadgroup_change_end(current); |
|---|
| 2409 | + cgroup_cancel_fork(p, args); |
|---|
| 2279 | 2410 | bad_fork_put_pidfd: |
|---|
| 2280 | | - if (clone_flags & CLONE_PIDFD) |
|---|
| 2281 | | - ksys_close(pidfd); |
|---|
| 2411 | + if (clone_flags & CLONE_PIDFD) { |
|---|
| 2412 | + fput(pidfile); |
|---|
| 2413 | + put_unused_fd(pidfd); |
|---|
| 2414 | + } |
|---|
| 2282 | 2415 | bad_fork_free_pid: |
|---|
| 2283 | 2416 | if (pid != &init_struct_pid) |
|---|
| 2284 | 2417 | free_pid(pid); |
|---|
| .. | .. |
|---|
| 2342 | 2475 | } |
|---|
| 2343 | 2476 | } |
|---|
| 2344 | 2477 | |
|---|
| 2345 | | -struct task_struct *fork_idle(int cpu) |
|---|
| 2478 | +struct task_struct * __init fork_idle(int cpu) |
|---|
| 2346 | 2479 | { |
|---|
| 2347 | 2480 | struct task_struct *task; |
|---|
| 2348 | | - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, |
|---|
| 2349 | | - cpu_to_node(cpu)); |
|---|
| 2481 | + struct kernel_clone_args args = { |
|---|
| 2482 | + .flags = CLONE_VM, |
|---|
| 2483 | + }; |
|---|
| 2484 | + |
|---|
| 2485 | + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); |
|---|
| 2350 | 2486 | if (!IS_ERR(task)) { |
|---|
| 2351 | 2487 | init_idle_pids(task); |
|---|
| 2352 | 2488 | init_idle(task, cpu); |
|---|
| .. | .. |
|---|
| 2356 | 2492 | } |
|---|
| 2357 | 2493 | |
|---|
| 2358 | 2494 | /* |
|---|
| 2495 | + * This is like kernel_clone(), but shaved down and tailored to just |
|---|
| 2496 | + * creating io_uring workers. It returns a created task, or an error pointer. |
|---|
| 2497 | + * The returned task is inactive, and the caller must fire it up through |
|---|
| 2498 | + * wake_up_new_task(p). All signals are blocked in the created task. |
|---|
| 2499 | + */ |
|---|
| 2500 | +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) |
|---|
| 2501 | +{ |
|---|
| 2502 | + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| |
|---|
| 2503 | + CLONE_IO; |
|---|
| 2504 | + struct kernel_clone_args args = { |
|---|
| 2505 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
|---|
| 2506 | + CLONE_UNTRACED) & ~CSIGNAL), |
|---|
| 2507 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
|---|
| 2508 | + .stack = (unsigned long)fn, |
|---|
| 2509 | + .stack_size = (unsigned long)arg, |
|---|
| 2510 | + .io_thread = 1, |
|---|
| 2511 | + }; |
|---|
| 2512 | + |
|---|
| 2513 | + return copy_process(NULL, 0, node, &args); |
|---|
| 2514 | +} |
|---|
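
Per the comment above, the task returned by create_io_thread() is inactive and must be started explicitly; the thread function and its argument ride in args->stack/args->stack_size, the same trick kernel_thread() uses below. A hedged caller sketch, with `worker_fn` and `worker_data` as hypothetical placeholders:

```c
#include <linux/err.h>
#include <linux/numa.h>
#include <linux/sched/task.h>

/* Hypothetical caller; worker_fn and worker_data are placeholders. */
static int start_io_worker(int (*worker_fn)(void *), void *worker_data)
{
	struct task_struct *tsk;

	tsk = create_io_thread(worker_fn, worker_data, NUMA_NO_NODE);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* The new task sleeps until explicitly fired up, as required. */
	wake_up_new_task(tsk);
	return 0;
}
```
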
| 2515 | + |
|---|
| 2516 | +/* |
|---|
| 2359 | 2517 | * Ok, this is the main fork-routine. |
|---|
| 2360 | 2518 | * |
|---|
| 2361 | 2519 | * It copies the process, and if successful kick-starts |
|---|
| 2362 | 2520 | * it and waits for it to finish using the VM if required. |
|---|
| 2521 | + * |
|---|
| 2522 | + * args->exit_signal is expected to be checked for sanity by the caller. |
|---|
| 2363 | 2523 | */ |
|---|
| 2364 | | -long _do_fork(unsigned long clone_flags, |
|---|
| 2365 | | - unsigned long stack_start, |
|---|
| 2366 | | - unsigned long stack_size, |
|---|
| 2367 | | - int __user *parent_tidptr, |
|---|
| 2368 | | - int __user *child_tidptr, |
|---|
| 2369 | | - unsigned long tls) |
|---|
| 2524 | +pid_t kernel_clone(struct kernel_clone_args *args) |
|---|
| 2370 | 2525 | { |
|---|
| 2526 | + u64 clone_flags = args->flags; |
|---|
| 2371 | 2527 | struct completion vfork; |
|---|
| 2372 | 2528 | struct pid *pid; |
|---|
| 2373 | 2529 | struct task_struct *p; |
|---|
| 2374 | 2530 | int trace = 0; |
|---|
| 2375 | | - long nr; |
|---|
| 2531 | + pid_t nr; |
|---|
| 2532 | + |
|---|
| 2533 | + /* |
|---|
| 2534 | + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument |
|---|
| 2535 | + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are |
|---|
| 2536 | + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate |
|---|
| 2537 | + * field in struct clone_args and it still doesn't make sense to have |
|---|
| 2538 | + * them both point at the same memory location. Performing this check |
|---|
| 2539 | + * here has the advantage that we don't need to have a separate helper |
|---|
| 2540 | + * to check for legacy clone(). |
|---|
| 2541 | + */ |
|---|
| 2542 | + if ((args->flags & CLONE_PIDFD) && |
|---|
| 2543 | + (args->flags & CLONE_PARENT_SETTID) && |
|---|
| 2544 | + (args->pidfd == args->parent_tid)) |
|---|
| 2545 | + return -EINVAL; |
|---|
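
The check above means a legacy clone() that sets both flags always fails, because the legacy entry point wires args->pidfd and args->parent_tid to the same user pointer (see the sys_clone hunk further down). A hedged userspace sketch of triggering the -EINVAL; the raw clone() argument order shown is the x86_64 one, and the CLONE_PIDFD fallback define covers older headers:

```c
#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000
#endif

int main(void)
{
	int tid;
	/* x86_64 raw order: flags, stack, parent_tid, child_tid, tls. */
	long ret = syscall(SYS_clone,
			   CLONE_PIDFD | CLONE_PARENT_SETTID | SIGCHLD,
			   NULL, &tid, NULL, 0);

	if (ret == 0)
		_exit(0);	/* should never be reached: clone is rejected */
	if (ret < 0 && errno == EINVAL)
		printf("rejected: pidfd and parent_tid alias the same word\n");
	return 0;
}
```
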
| 2376 | 2546 | |
|---|
| 2377 | 2547 | /* |
|---|
| 2378 | 2548 | * Determine whether and which event to report to ptracer. When |
|---|
| .. | .. |
|---|
| 2383 | 2553 | if (!(clone_flags & CLONE_UNTRACED)) { |
|---|
| 2384 | 2554 | if (clone_flags & CLONE_VFORK) |
|---|
| 2385 | 2555 | trace = PTRACE_EVENT_VFORK; |
|---|
| 2386 | | - else if ((clone_flags & CSIGNAL) != SIGCHLD) |
|---|
| 2556 | + else if (args->exit_signal != SIGCHLD) |
|---|
| 2387 | 2557 | trace = PTRACE_EVENT_CLONE; |
|---|
| 2388 | 2558 | else |
|---|
| 2389 | 2559 | trace = PTRACE_EVENT_FORK; |
|---|
| .. | .. |
|---|
| 2392 | 2562 | trace = 0; |
|---|
| 2393 | 2563 | } |
|---|
| 2394 | 2564 | |
|---|
| 2395 | | - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, |
|---|
| 2396 | | - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
|---|
| 2565 | + p = copy_process(NULL, trace, NUMA_NO_NODE, args); |
|---|
| 2397 | 2566 | add_latent_entropy(); |
|---|
| 2398 | 2567 | |
|---|
| 2399 | 2568 | if (IS_ERR(p)) |
|---|
| .. | .. |
|---|
| 2411 | 2580 | nr = pid_vnr(pid); |
|---|
| 2412 | 2581 | |
|---|
| 2413 | 2582 | if (clone_flags & CLONE_PARENT_SETTID) |
|---|
| 2414 | | - put_user(nr, parent_tidptr); |
|---|
| 2583 | + put_user(nr, args->parent_tid); |
|---|
| 2415 | 2584 | |
|---|
| 2416 | 2585 | if (clone_flags & CLONE_VFORK) { |
|---|
| 2417 | 2586 | p->vfork_done = &vfork; |
|---|
| .. | .. |
|---|
| 2434 | 2603 | return nr; |
|---|
| 2435 | 2604 | } |
|---|
| 2436 | 2605 | |
|---|
| 2437 | | -#ifndef CONFIG_HAVE_COPY_THREAD_TLS |
|---|
| 2438 | | -/* For compatibility with architectures that call do_fork directly rather than |
|---|
| 2439 | | - * using the syscall entry points below. */ |
|---|
| 2440 | | -long do_fork(unsigned long clone_flags, |
|---|
| 2441 | | - unsigned long stack_start, |
|---|
| 2442 | | - unsigned long stack_size, |
|---|
| 2443 | | - int __user *parent_tidptr, |
|---|
| 2444 | | - int __user *child_tidptr) |
|---|
| 2445 | | -{ |
|---|
| 2446 | | - return _do_fork(clone_flags, stack_start, stack_size, |
|---|
| 2447 | | - parent_tidptr, child_tidptr, 0); |
|---|
| 2448 | | -} |
|---|
| 2449 | | -#endif |
|---|
| 2450 | | - |
|---|
| 2451 | 2606 | /* |
|---|
| 2452 | 2607 | * Create a kernel thread. |
|---|
| 2453 | 2608 | */ |
|---|
| 2454 | 2609 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
|---|
| 2455 | 2610 | { |
|---|
| 2456 | | - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
|---|
| 2457 | | - (unsigned long)arg, NULL, NULL, 0); |
|---|
| 2611 | + struct kernel_clone_args args = { |
|---|
| 2612 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
|---|
| 2613 | + CLONE_UNTRACED) & ~CSIGNAL), |
|---|
| 2614 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
|---|
| 2615 | + .stack = (unsigned long)fn, |
|---|
| 2616 | + .stack_size = (unsigned long)arg, |
|---|
| 2617 | + }; |
|---|
| 2618 | + |
|---|
| 2619 | + return kernel_clone(&args); |
|---|
| 2458 | 2620 | } |
|---|
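
The legacy flags word still carries the child's exit signal in its low byte (CSIGNAL), so the struct-based path splits it in two: the signal bits move to .exit_signal and everything else stays in .flags. An illustrative in-kernel sketch of the split; for `flags = CLONE_FS | SIGCHLD` it yields `.flags = CLONE_FS | CLONE_VM | CLONE_UNTRACED` and `.exit_signal = SIGCHLD`:

```c
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task.h>

/* Illustrative only: shows how a legacy flags word is decomposed. */
static void show_csignal_split(unsigned long flags)
{
	u64 kflags = (lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) &
		     ~CSIGNAL;
	int exit_signal = lower_32_bits(flags) & CSIGNAL;

	/*
	 * e.g. flags = CLONE_FS | SIGCHLD:
	 *   kflags      == CLONE_FS | CLONE_VM | CLONE_UNTRACED
	 *   exit_signal == SIGCHLD
	 */
	pr_info("flags=%#llx exit_signal=%d\n",
		(unsigned long long)kflags, exit_signal);
}
```
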
| 2459 | 2621 | |
|---|
| 2460 | 2622 | #ifdef __ARCH_WANT_SYS_FORK |
|---|
| 2461 | 2623 | SYSCALL_DEFINE0(fork) |
|---|
| 2462 | 2624 | { |
|---|
| 2463 | 2625 | #ifdef CONFIG_MMU |
|---|
| 2464 | | - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); |
|---|
| 2626 | + struct kernel_clone_args args = { |
|---|
| 2627 | + .exit_signal = SIGCHLD, |
|---|
| 2628 | + }; |
|---|
| 2629 | + |
|---|
| 2630 | + return kernel_clone(&args); |
|---|
| 2465 | 2631 | #else |
|---|
| 2466 | 2632 | /* fork() cannot be supported in nommu mode */ |
|---|
| 2467 | 2633 | return -EINVAL; |
|---|
| .. | .. |
|---|
| 2472 | 2638 | #ifdef __ARCH_WANT_SYS_VFORK |
|---|
| 2473 | 2639 | SYSCALL_DEFINE0(vfork) |
|---|
| 2474 | 2640 | { |
|---|
| 2475 | | - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, |
|---|
| 2476 | | - 0, NULL, NULL, 0); |
|---|
| 2641 | + struct kernel_clone_args args = { |
|---|
| 2642 | + .flags = CLONE_VFORK | CLONE_VM, |
|---|
| 2643 | + .exit_signal = SIGCHLD, |
|---|
| 2644 | + }; |
|---|
| 2645 | + |
|---|
| 2646 | + return kernel_clone(&args); |
|---|
| 2477 | 2647 | } |
|---|
| 2478 | 2648 | #endif |
|---|
| 2479 | 2649 | |
|---|
| .. | .. |
|---|
| 2501 | 2671 | unsigned long, tls) |
|---|
| 2502 | 2672 | #endif |
|---|
| 2503 | 2673 | { |
|---|
| 2504 | | - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); |
|---|
| 2674 | + struct kernel_clone_args args = { |
|---|
| 2675 | + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
|---|
| 2676 | + .pidfd = parent_tidptr, |
|---|
| 2677 | + .child_tid = child_tidptr, |
|---|
| 2678 | + .parent_tid = parent_tidptr, |
|---|
| 2679 | + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
|---|
| 2680 | + .stack = newsp, |
|---|
| 2681 | + .tls = tls, |
|---|
| 2682 | + }; |
|---|
| 2683 | + |
|---|
| 2684 | + return kernel_clone(&args); |
|---|
| 2685 | +} |
|---|
| 2686 | +#endif |
|---|
| 2687 | + |
|---|
| 2688 | +#ifdef __ARCH_WANT_SYS_CLONE3 |
|---|
| 2689 | + |
|---|
| 2690 | +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, |
|---|
| 2691 | + struct clone_args __user *uargs, |
|---|
| 2692 | + size_t usize) |
|---|
| 2693 | +{ |
|---|
| 2694 | + int err; |
|---|
| 2695 | + struct clone_args args; |
|---|
| 2696 | + pid_t *kset_tid = kargs->set_tid; |
|---|
| 2697 | + |
|---|
| 2698 | + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != |
|---|
| 2699 | + CLONE_ARGS_SIZE_VER0); |
|---|
| 2700 | + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != |
|---|
| 2701 | + CLONE_ARGS_SIZE_VER1); |
|---|
| 2702 | + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != |
|---|
| 2703 | + CLONE_ARGS_SIZE_VER2); |
|---|
| 2704 | + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); |
|---|
| 2705 | + |
|---|
| 2706 | + if (unlikely(usize > PAGE_SIZE)) |
|---|
| 2707 | + return -E2BIG; |
|---|
| 2708 | + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) |
|---|
| 2709 | + return -EINVAL; |
|---|
| 2710 | + |
|---|
| 2711 | + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); |
|---|
| 2712 | + if (err) |
|---|
| 2713 | + return err; |
|---|
| 2714 | + |
|---|
| 2715 | + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) |
|---|
| 2716 | + return -EINVAL; |
|---|
| 2717 | + |
|---|
| 2718 | + if (unlikely(!args.set_tid && args.set_tid_size > 0)) |
|---|
| 2719 | + return -EINVAL; |
|---|
| 2720 | + |
|---|
| 2721 | + if (unlikely(args.set_tid && args.set_tid_size == 0)) |
|---|
| 2722 | + return -EINVAL; |
|---|
| 2723 | + |
|---|
| 2724 | + /* |
|---|
| 2725 | + * Verify that the higher 32 bits of exit_signal are unset and that |
|---|
| 2726 | + * it is a valid signal |
|---|
| 2727 | + */ |
|---|
| 2728 | + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || |
|---|
| 2729 | + !valid_signal(args.exit_signal))) |
|---|
| 2730 | + return -EINVAL; |
|---|
| 2731 | + |
|---|
| 2732 | + if ((args.flags & CLONE_INTO_CGROUP) && |
|---|
| 2733 | + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) |
|---|
| 2734 | + return -EINVAL; |
|---|
| 2735 | + |
|---|
| 2736 | + *kargs = (struct kernel_clone_args){ |
|---|
| 2737 | + .flags = args.flags, |
|---|
| 2738 | + .pidfd = u64_to_user_ptr(args.pidfd), |
|---|
| 2739 | + .child_tid = u64_to_user_ptr(args.child_tid), |
|---|
| 2740 | + .parent_tid = u64_to_user_ptr(args.parent_tid), |
|---|
| 2741 | + .exit_signal = args.exit_signal, |
|---|
| 2742 | + .stack = args.stack, |
|---|
| 2743 | + .stack_size = args.stack_size, |
|---|
| 2744 | + .tls = args.tls, |
|---|
| 2745 | + .set_tid_size = args.set_tid_size, |
|---|
| 2746 | + .cgroup = args.cgroup, |
|---|
| 2747 | + }; |
|---|
| 2748 | + |
|---|
| 2749 | + if (args.set_tid && |
|---|
| 2750 | + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), |
|---|
| 2751 | + (kargs->set_tid_size * sizeof(pid_t)))) |
|---|
| 2752 | + return -EFAULT; |
|---|
| 2753 | + |
|---|
| 2754 | + kargs->set_tid = kset_tid; |
|---|
| 2755 | + |
|---|
| 2756 | + return 0; |
|---|
| 2757 | +} |
|---|
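
copy_struct_from_user() is what makes the size-as-version ABI above work: a shorter (older) struct is zero-extended into the kernel copy, and a longer (newer) struct is accepted only if every byte the kernel does not understand is zero. An illustrative restatement of that contract (the real helper lives in <linux/uaccess.h>; this sketch is not a replacement for it):

```c
#include <linux/errno.h>
#include <linux/minmax.h>
#include <linux/string.h>
#include <linux/uaccess.h>

/* Illustrative sketch of the copy_struct_from_user() contract. */
static int copy_versioned_struct(void *dst, size_t ksize,
				 const void __user *src, size_t usize)
{
	size_t size = min(ksize, usize);

	if (usize > ksize) {
		/* Newer userspace: unknown trailing bytes must be zero. */
		int ret = check_zeroed_user(src + ksize, usize - ksize);

		if (ret <= 0)
			return ret ?: -E2BIG;
	} else if (usize < ksize) {
		/* Older userspace: zero-fill the fields it doesn't know. */
		memset(dst + size, 0, ksize - size);
	}
	if (copy_from_user(dst, src, size))
		return -EFAULT;
	return 0;
}
```

This is why clone3() can grow new fields (set_tid at VER1, cgroup at VER2, as the BUILD_BUG_ONs above pin down) without breaking older callers.
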
| 2758 | + |
|---|
| 2759 | +/** |
|---|
| 2760 | + * clone3_stack_valid - check and prepare stack |
|---|
| 2761 | + * @kargs: kernel clone args |
|---|
| 2762 | + * |
|---|
| 2763 | + * Verify that the stack arguments userspace gave us are sane. |
|---|
| 2764 | + * In addition, set the stack direction for userspace since it's easy for us to |
|---|
| 2765 | + * determine. |
|---|
| 2766 | + */ |
|---|
| 2767 | +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) |
|---|
| 2768 | +{ |
|---|
| 2769 | + if (kargs->stack == 0) { |
|---|
| 2770 | + if (kargs->stack_size > 0) |
|---|
| 2771 | + return false; |
|---|
| 2772 | + } else { |
|---|
| 2773 | + if (kargs->stack_size == 0) |
|---|
| 2774 | + return false; |
|---|
| 2775 | + |
|---|
| 2776 | + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) |
|---|
| 2777 | + return false; |
|---|
| 2778 | + |
|---|
| 2779 | +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) |
|---|
| 2780 | + kargs->stack += kargs->stack_size; |
|---|
| 2781 | +#endif |
|---|
| 2782 | + } |
|---|
| 2783 | + |
|---|
| 2784 | + return true; |
|---|
| 2785 | +} |
|---|
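
Because clone3_stack_valid() adds stack_size to the base on stack-grows-down architectures, userspace hands clone3() the *lowest* address of the child stack plus its size, not a ready-made stack pointer. A hedged userspace sketch (struct clone_args from <linux/sched.h>, kernel 5.3+ headers assumed; STACK_SZ is an arbitrary choice):

```c
#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args */
#include <linux/types.h>
#include <stdint.h>
#include <sys/mman.h>

#define STACK_SZ (1024 * 1024)

/* Fill in the stack fields of a clone_args the way clone3() expects. */
static int fill_stack_args(struct clone_args *args)
{
	void *stack = mmap(NULL, STACK_SZ, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

	if (stack == MAP_FAILED)
		return -1;

	args->stack = (__u64)(uintptr_t)stack;	/* lowest address, not the top */
	args->stack_size = STACK_SZ;
	return 0;
}
```
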
| 2786 | + |
|---|
| 2787 | +static bool clone3_args_valid(struct kernel_clone_args *kargs) |
|---|
| 2788 | +{ |
|---|
| 2789 | + /* Verify that no unknown flags are passed along. */ |
|---|
| 2790 | + if (kargs->flags & |
|---|
| 2791 | + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) |
|---|
| 2792 | + return false; |
|---|
| 2793 | + |
|---|
| 2794 | + /* |
|---|
| 2795 | + * - make the CLONE_DETACHED bit reusable for clone3 |
|---|
| 2796 | + * - make the CSIGNAL bits reusable for clone3 |
|---|
| 2797 | + */ |
|---|
| 2798 | + if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) |
|---|
| 2799 | + return false; |
|---|
| 2800 | + |
|---|
| 2801 | + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == |
|---|
| 2802 | + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) |
|---|
| 2803 | + return false; |
|---|
| 2804 | + |
|---|
| 2805 | + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && |
|---|
| 2806 | + kargs->exit_signal) |
|---|
| 2807 | + return false; |
|---|
| 2808 | + |
|---|
| 2809 | + if (!clone3_stack_valid(kargs)) |
|---|
| 2810 | + return false; |
|---|
| 2811 | + |
|---|
| 2812 | + return true; |
|---|
| 2813 | +} |
|---|
| 2814 | + |
|---|
| 2815 | +/** |
|---|
| 2816 | + * clone3 - create a new process with specific properties |
|---|
| 2817 | + * @uargs: argument structure |
|---|
| 2818 | + * @size: size of @uargs |
|---|
| 2819 | + * |
|---|
| 2820 | + * clone3() is the extensible successor to clone()/clone2(). |
|---|
| 2821 | + * It takes a struct as argument that is versioned by its size. |
|---|
| 2822 | + * |
|---|
| 2823 | + * Return: On success, a positive PID for the child process. |
|---|
| 2824 | + * On error, a negative errno number. |
|---|
| 2825 | + */ |
|---|
| 2826 | +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) |
|---|
| 2827 | +{ |
|---|
| 2828 | + int err; |
|---|
| 2829 | + |
|---|
| 2830 | + struct kernel_clone_args kargs; |
|---|
| 2831 | + pid_t set_tid[MAX_PID_NS_LEVEL]; |
|---|
| 2832 | + |
|---|
| 2833 | + kargs.set_tid = set_tid; |
|---|
| 2834 | + |
|---|
| 2835 | + err = copy_clone_args_from_user(&kargs, uargs, size); |
|---|
| 2836 | + if (err) |
|---|
| 2837 | + return err; |
|---|
| 2838 | + |
|---|
| 2839 | + if (!clone3_args_valid(&kargs)) |
|---|
| 2840 | + return -EINVAL; |
|---|
| 2841 | + |
|---|
| 2842 | + return kernel_clone(&kargs); |
|---|
| 2505 | 2843 | } |
|---|
| 2506 | 2844 | #endif |
|---|
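
Putting the pieces together, a hedged userspace sketch of invoking clone3() directly; glibc provides no wrapper, struct clone_args comes from <linux/sched.h> (kernel 5.3+ headers), and the __NR_clone3 fallback covers older header sets:

```c
#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <linux/types.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435		/* same number on most architectures */
#endif

int main(void)
{
	int pidfd = -1;
	struct clone_args args = {
		.flags = CLONE_PIDFD,
		/* clone3 has a dedicated pidfd field, unlike legacy clone. */
		.pidfd = (__u64)(uintptr_t)&pidfd,
		.exit_signal = SIGCHLD,
	};
	/* The size argument is the ABI version of the struct. */
	long pid = syscall(__NR_clone3, &args, sizeof(args));

	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)
		_exit(0);	/* child */
	printf("child %ld, pidfd %d\n", pid, pidfd);
	return 0;
}
```
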
| 2507 | 2845 | |
|---|
| .. | .. |
|---|
| 2549 | 2887 | init_waitqueue_head(&sighand->signalfd_wqh); |
|---|
| 2550 | 2888 | } |
|---|
| 2551 | 2889 | |
|---|
| 2552 | | -void __init proc_caches_init(void) |
|---|
| 2890 | +void __init mm_cache_init(void) |
|---|
| 2553 | 2891 | { |
|---|
| 2554 | 2892 | unsigned int mm_size; |
|---|
| 2555 | 2893 | |
|---|
| 2894 | + /* |
|---|
| 2895 | + * The mm_cpumask is located at the end of mm_struct, and is |
|---|
| 2896 | + * dynamically sized based on the maximum CPU number this system |
|---|
| 2897 | + * can have, taking hotplug into account (nr_cpu_ids). |
|---|
| 2898 | + */ |
|---|
| 2899 | + mm_size = sizeof(struct mm_struct) + cpumask_size(); |
|---|
| 2900 | + |
|---|
| 2901 | + mm_cachep = kmem_cache_create_usercopy("mm_struct", |
|---|
| 2902 | + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, |
|---|
| 2903 | + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
|---|
| 2904 | + offsetof(struct mm_struct, saved_auxv), |
|---|
| 2905 | + sizeof_field(struct mm_struct, saved_auxv), |
|---|
| 2906 | + NULL); |
|---|
| 2907 | +} |
|---|
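
mm_cache_init() whitelists only the saved_auxv region of mm_struct for hardened usercopy: with CONFIG_HARDENED_USERCOPY, copies to or from userspace that touch any other part of the object are refused. A hedged sketch of the same API applied to a hypothetical structure:

```c
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>

/* Hypothetical object: only user_buf may be copied to/from userspace. */
struct example_obj {
	spinlock_t lock;	/* kernel-internal, outside the whitelist */
	char user_buf[64];	/* the whitelisted usercopy window */
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create_usercopy("example_obj",
			sizeof(struct example_obj), 0,
			SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
			offsetof(struct example_obj, user_buf),
			sizeof_field(struct example_obj, user_buf),
			NULL);
	return example_cachep ? 0 : -ENOMEM;
}
```
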
| 2908 | + |
|---|
| 2909 | +void __init proc_caches_init(void) |
|---|
| 2910 | +{ |
|---|
| 2556 | 2911 | sighand_cachep = kmem_cache_create("sighand_cache", |
|---|
| 2557 | 2912 | sizeof(struct sighand_struct), 0, |
|---|
| 2558 | 2913 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| |
|---|
| .. | .. |
|---|
| 2570 | 2925 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
|---|
| 2571 | 2926 | NULL); |
|---|
| 2572 | 2927 | |
|---|
| 2573 | | - /* |
|---|
| 2574 | | - * The mm_cpumask is located at the end of mm_struct, and is |
|---|
| 2575 | | - * dynamically sized based on the maximum CPU number this system |
|---|
| 2576 | | - * can have, taking hotplug into account (nr_cpu_ids). |
|---|
| 2577 | | - */ |
|---|
| 2578 | | - mm_size = sizeof(struct mm_struct) + cpumask_size(); |
|---|
| 2579 | | - |
|---|
| 2580 | | - mm_cachep = kmem_cache_create_usercopy("mm_struct", |
|---|
| 2581 | | - mm_size, ARCH_MIN_MMSTRUCT_ALIGN, |
|---|
| 2582 | | - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
|---|
| 2583 | | - offsetof(struct mm_struct, saved_auxv), |
|---|
| 2584 | | - sizeof_field(struct mm_struct, saved_auxv), |
|---|
| 2585 | | - NULL); |
|---|
| 2586 | 2928 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); |
|---|
| 2587 | 2929 | mmap_init(); |
|---|
| 2588 | 2930 | nsproxy_cache_init(); |
|---|
| .. | .. |
|---|
| 2596 | 2938 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
|---|
| 2597 | 2939 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
|---|
| 2598 | 2940 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
|---|
| 2599 | | - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) |
|---|
| 2941 | + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| |
|---|
| 2942 | + CLONE_NEWTIME)) |
|---|
| 2600 | 2943 | return -EINVAL; |
|---|
| 2601 | 2944 | /* |
|---|
| 2602 | 2945 | * Not implemented, but pretend it works if there is nothing |
|---|
| .. | .. |
|---|
| 2609 | 2952 | return -EINVAL; |
|---|
| 2610 | 2953 | } |
|---|
| 2611 | 2954 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { |
|---|
| 2612 | | - if (atomic_read(¤t->sighand->count) > 1) |
|---|
| 2955 | + if (refcount_read(¤t->sighand->count) > 1) |
|---|
| 2613 | 2956 | return -EINVAL; |
|---|
| 2614 | 2957 | } |
|---|
| 2615 | 2958 | if (unshare_flags & CLONE_VM) { |
|---|
| .. | .. |
|---|
| 2644 | 2987 | /* |
|---|
| 2645 | 2988 | * Unshare file descriptor table if it is being shared |
|---|
| 2646 | 2989 | */ |
|---|
| 2647 | | -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
|---|
| 2990 | +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, |
|---|
| 2991 | + struct files_struct **new_fdp) |
|---|
| 2648 | 2992 | { |
|---|
| 2649 | 2993 | struct files_struct *fd = current->files; |
|---|
| 2650 | 2994 | int error = 0; |
|---|
| 2651 | 2995 | |
|---|
| 2652 | 2996 | if ((unshare_flags & CLONE_FILES) && |
|---|
| 2653 | 2997 | (fd && atomic_read(&fd->count) > 1)) { |
|---|
| 2654 | | - *new_fdp = dup_fd(fd, &error); |
|---|
| 2998 | + *new_fdp = dup_fd(fd, max_fds, &error); |
|---|
| 2655 | 2999 | if (!*new_fdp) |
|---|
| 2656 | 3000 | return error; |
|---|
| 2657 | 3001 | } |
|---|
| .. | .. |
|---|
| 2662 | 3006 | /* |
|---|
| 2663 | 3007 | * unshare allows a process to 'unshare' part of the process |
|---|
| 2664 | 3008 | * context which was originally shared using clone. copy_* |
|---|
| 2665 | | - * functions used by do_fork() cannot be used here directly |
|---|
| 3009 | + * functions used by kernel_clone() cannot be used here directly |
|---|
| 2666 | 3010 | * because they modify an inactive task_struct that is being |
|---|
| 2667 | 3011 | * constructed. Here we are modifying the current, active, |
|---|
| 2668 | 3012 | * task_struct. |
|---|
| .. | .. |
|---|
| 2711 | 3055 | err = unshare_fs(unshare_flags, &new_fs); |
|---|
| 2712 | 3056 | if (err) |
|---|
| 2713 | 3057 | goto bad_unshare_out; |
|---|
| 2714 | | - err = unshare_fd(unshare_flags, &new_fd); |
|---|
| 3058 | + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); |
|---|
| 2715 | 3059 | if (err) |
|---|
| 2716 | 3060 | goto bad_unshare_cleanup_fs; |
|---|
| 2717 | 3061 | err = unshare_userns(unshare_flags, &new_cred); |
|---|
| .. | .. |
|---|
| 2800 | 3144 | struct files_struct *copy = NULL; |
|---|
| 2801 | 3145 | int error; |
|---|
| 2802 | 3146 | |
|---|
| 2803 | | - error = unshare_fd(CLONE_FILES, ©); |
|---|
| 3147 | + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); |
|---|
| 2804 | 3148 | if (error || !copy) { |
|---|
| 2805 | 3149 | *displaced = NULL; |
|---|
| 2806 | 3150 | return error; |
|---|
| .. | .. |
|---|
| 2813 | 3157 | } |
|---|
| 2814 | 3158 | |
|---|
| 2815 | 3159 | int sysctl_max_threads(struct ctl_table *table, int write, |
|---|
| 2816 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 3160 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 2817 | 3161 | { |
|---|
| 2818 | 3162 | struct ctl_table t; |
|---|
| 2819 | 3163 | int ret; |
|---|