.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/kernel/fork.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
39 | 40 | #include <linux/binfmts.h> |
---|
40 | 41 | #include <linux/mman.h> |
---|
41 | 42 | #include <linux/mmu_notifier.h> |
---|
42 | | -#include <linux/hmm.h> |
---|
43 | 43 | #include <linux/fs.h> |
---|
44 | 44 | #include <linux/mm.h> |
---|
45 | 45 | #include <linux/vmacache.h> |
---|
.. | .. |
---|
79 | 79 | #include <linux/blkdev.h> |
---|
80 | 80 | #include <linux/fs_struct.h> |
---|
81 | 81 | #include <linux/magic.h> |
---|
82 | | -#include <linux/sched/mm.h> |
---|
83 | 82 | #include <linux/perf_event.h> |
---|
84 | 83 | #include <linux/posix-timers.h> |
---|
85 | 84 | #include <linux/user-return-notifier.h> |
---|
.. | .. |
---|
93 | 92 | #include <linux/kcov.h> |
---|
94 | 93 | #include <linux/livepatch.h> |
---|
95 | 94 | #include <linux/thread_info.h> |
---|
96 | | -#include <linux/cpufreq_times.h> |
---|
| 95 | +#include <linux/stackleak.h> |
---|
| 96 | +#include <linux/kasan.h> |
---|
97 | 97 | #include <linux/scs.h> |
---|
| 98 | +#include <linux/io_uring.h> |
---|
| 99 | +#include <linux/cpufreq_times.h> |
---|
98 | 100 | |
---|
99 | | -#include <asm/pgtable.h> |
---|
100 | 101 | #include <asm/pgalloc.h> |
---|
101 | 102 | #include <linux/uaccess.h> |
---|
102 | 103 | #include <asm/mmu_context.h> |
---|
.. | .. |
---|
108 | 109 | #define CREATE_TRACE_POINTS |
---|
109 | 110 | #include <trace/events/task.h> |
---|
110 | 111 | |
---|
| 112 | +#undef CREATE_TRACE_POINTS |
---|
| 113 | +#include <trace/hooks/sched.h> |
---|
111 | 114 | /* |
---|
112 | 115 | * Minimum number of threads to boot the kernel |
---|
113 | 116 | */ |
---|
.. | .. |
---|
118 | 121 | */ |
---|
119 | 122 | #define MAX_THREADS FUTEX_TID_MASK |
---|
120 | 123 | |
---|
| 124 | +EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask); |
---|
| 125 | + |
---|
121 | 126 | /* |
---|
122 | 127 | * Protected counters by write_lock_irq(&tasklist_lock) |
---|
123 | 128 | */ |
---|
124 | 129 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
---|
125 | 130 | int nr_threads; /* The idle threads do not count.. */ |
---|
126 | 131 | |
---|
127 | | -int max_threads; /* tunable limit on nr_threads */ |
---|
| 132 | +static int max_threads; /* tunable limit on nr_threads */ |
---|
| 133 | + |
---|
| 134 | +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) |
---|
| 135 | + |
---|
| 136 | +static const char * const resident_page_types[] = { |
---|
| 137 | + NAMED_ARRAY_INDEX(MM_FILEPAGES), |
---|
| 138 | + NAMED_ARRAY_INDEX(MM_ANONPAGES), |
---|
| 139 | + NAMED_ARRAY_INDEX(MM_SWAPENTS), |
---|
| 140 | + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), |
---|
| 141 | +}; |
---|
128 | 142 | |
---|
129 | 143 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
---|
130 | 144 | |
---|
131 | 145 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
---|
| 146 | +EXPORT_SYMBOL_GPL(tasklist_lock); |
---|
132 | 147 | |
---|
133 | 148 | #ifdef CONFIG_PROVE_RCU |
---|
134 | 149 | int lockdep_tasklist_lock_is_held(void) |
---|
.. | .. |
---|
216 | 231 | if (!s) |
---|
217 | 232 | continue; |
---|
218 | 233 | |
---|
| 234 | + /* Mark stack accessible for KASAN. */ |
---|
| 235 | + kasan_unpoison_range(s->addr, THREAD_SIZE); |
---|
| 236 | + |
---|
219 | 237 | /* Clear stale pointers from reused stack. */ |
---|
220 | 238 | memset(s->addr, 0, THREAD_SIZE); |
---|
221 | 239 | |
---|
.. | .. |
---|
224 | 242 | return s->addr; |
---|
225 | 243 | } |
---|
226 | 244 | |
---|
| 245 | + /* |
---|
| 246 | + * Allocated stacks are cached and later reused by new threads, |
---|
| 247 | + * so memcg accounting is performed manually on assigning/releasing |
---|
| 248 | + * stacks to tasks. Drop __GFP_ACCOUNT. |
---|
| 249 | + */ |
---|
227 | 250 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, |
---|
228 | 251 | VMALLOC_START, VMALLOC_END, |
---|
229 | | - THREADINFO_GFP, |
---|
| 252 | + THREADINFO_GFP & ~__GFP_ACCOUNT, |
---|
230 | 253 | PAGE_KERNEL, |
---|
231 | 254 | 0, node, __builtin_return_address(0)); |
---|
232 | 255 | |
---|
.. | .. |
---|
245 | 268 | THREAD_SIZE_ORDER); |
---|
246 | 269 | |
---|
247 | 270 | if (likely(page)) { |
---|
248 | | - tsk->stack = page_address(page); |
---|
| 271 | + tsk->stack = kasan_reset_tag(page_address(page)); |
---|
249 | 272 | return tsk->stack; |
---|
250 | 273 | } |
---|
251 | 274 | return NULL; |
---|
.. | .. |
---|
255 | 278 | static inline void free_thread_stack(struct task_struct *tsk) |
---|
256 | 279 | { |
---|
257 | 280 | #ifdef CONFIG_VMAP_STACK |
---|
258 | | - if (task_stack_vm_area(tsk)) { |
---|
| 281 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 282 | + |
---|
| 283 | + if (vm) { |
---|
259 | 284 | int i; |
---|
| 285 | + |
---|
| 286 | + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) |
---|
| 287 | + memcg_kmem_uncharge_page(vm->pages[i], 0); |
---|
260 | 288 | |
---|
261 | 289 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
---|
262 | 290 | if (this_cpu_cmpxchg(cached_stacks[i], |
---|
.. | .. |
---|
281 | 309 | { |
---|
282 | 310 | unsigned long *stack; |
---|
283 | 311 | stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
---|
| 312 | + stack = kasan_reset_tag(stack); |
---|
284 | 313 | tsk->stack = stack; |
---|
285 | 314 | return stack; |
---|
286 | 315 | } |
---|
.. | .. |
---|
333 | 362 | struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
---|
334 | 363 | |
---|
335 | 364 | if (new) { |
---|
336 | | - *new = *orig; |
---|
337 | | - INIT_LIST_HEAD(&new->anon_vma_chain); |
---|
| 365 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); |
---|
| 366 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); |
---|
| 367 | + /* |
---|
| 368 | + * orig->shared.rb may be modified concurrently, but the clone |
---|
| 369 | + * will be reinitialized. |
---|
| 370 | + */ |
---|
| 371 | + *new = data_race(*orig); |
---|
| 372 | + INIT_VMA(new); |
---|
| 373 | + new->vm_next = new->vm_prev = NULL; |
---|
338 | 374 | } |
---|
339 | 375 | return new; |
---|
340 | 376 | } |
---|
.. | .. |
---|
349 | 385 | void *stack = task_stack_page(tsk); |
---|
350 | 386 | struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
351 | 387 | |
---|
| 388 | + |
---|
| 389 | + /* All stack pages are in the same node. */ |
---|
| 390 | + if (vm) |
---|
| 391 | + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, |
---|
| 392 | + account * (THREAD_SIZE / 1024)); |
---|
| 393 | + else |
---|
| 394 | + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, |
---|
| 395 | + account * (THREAD_SIZE / 1024)); |
---|
| 396 | +} |
---|
| 397 | + |
---|
| 398 | +static int memcg_charge_kernel_stack(struct task_struct *tsk) |
---|
| 399 | +{ |
---|
| 400 | +#ifdef CONFIG_VMAP_STACK |
---|
| 401 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 402 | + int ret; |
---|
| 403 | + |
---|
352 | 404 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); |
---|
353 | 405 | |
---|
354 | 406 | if (vm) { |
---|
.. | .. |
---|
357 | 409 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
---|
358 | 410 | |
---|
359 | 411 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
---|
360 | | - mod_zone_page_state(page_zone(vm->pages[i]), |
---|
361 | | - NR_KERNEL_STACK_KB, |
---|
362 | | - PAGE_SIZE / 1024 * account); |
---|
| 412 | + /* |
---|
| 413 | + * If memcg_kmem_charge_page() fails, page->mem_cgroup |
---|
| 414 | + * pointer is NULL, and memcg_kmem_uncharge_page() in |
---|
| 415 | + * free_thread_stack() will ignore this page. |
---|
| 416 | + */ |
---|
| 417 | + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, |
---|
| 418 | + 0); |
---|
| 419 | + if (ret) |
---|
| 420 | + return ret; |
---|
363 | 421 | } |
---|
364 | | - |
---|
365 | | - /* All stack pages belong to the same memcg. */ |
---|
366 | | - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
---|
367 | | - account * (THREAD_SIZE / 1024)); |
---|
368 | | - } else { |
---|
369 | | - /* |
---|
370 | | - * All stack pages are in the same zone and belong to the |
---|
371 | | - * same memcg. |
---|
372 | | - */ |
---|
373 | | - struct page *first_page = virt_to_page(stack); |
---|
374 | | - |
---|
375 | | - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
---|
376 | | - THREAD_SIZE / 1024 * account); |
---|
377 | | - |
---|
378 | | - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
---|
379 | | - account * (THREAD_SIZE / 1024)); |
---|
380 | 422 | } |
---|
| 423 | +#endif |
---|
| 424 | + return 0; |
---|
381 | 425 | } |
---|
382 | 426 | |
---|
383 | 427 | static void release_task_stack(struct task_struct *tsk) |
---|
.. | .. |
---|
396 | 440 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
397 | 441 | void put_task_stack(struct task_struct *tsk) |
---|
398 | 442 | { |
---|
399 | | - if (atomic_dec_and_test(&tsk->stack_refcount)) |
---|
| 443 | + if (refcount_dec_and_test(&tsk->stack_refcount)) |
---|
400 | 444 | release_task_stack(tsk); |
---|
401 | 445 | } |
---|
| 446 | +EXPORT_SYMBOL_GPL(put_task_stack); |
---|
402 | 447 | #endif |
---|
403 | 448 | |
---|
404 | 449 | void free_task(struct task_struct *tsk) |
---|
.. | .. |
---|
406 | 451 | cpufreq_task_times_exit(tsk); |
---|
407 | 452 | scs_release(tsk); |
---|
408 | 453 | |
---|
| 454 | + trace_android_vh_free_task(tsk); |
---|
409 | 455 | #ifndef CONFIG_THREAD_INFO_IN_TASK |
---|
410 | 456 | /* |
---|
411 | 457 | * The task is finally done with both the stack and thread_info, |
---|
.. | .. |
---|
417 | 463 | * If the task had a separate stack allocation, it should be gone |
---|
418 | 464 | * by now. |
---|
419 | 465 | */ |
---|
420 | | - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); |
---|
| 466 | + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); |
---|
421 | 467 | #endif |
---|
422 | 468 | rt_mutex_debug_task_free(tsk); |
---|
423 | 469 | ftrace_graph_exit_task(tsk); |
---|
424 | | - put_seccomp_filter(tsk); |
---|
425 | 470 | arch_release_task_struct(tsk); |
---|
426 | 471 | if (tsk->flags & PF_KTHREAD) |
---|
427 | 472 | free_kthread_struct(tsk); |
---|
.. | .. |
---|
433 | 478 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
---|
434 | 479 | struct mm_struct *oldmm) |
---|
435 | 480 | { |
---|
436 | | - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
---|
| 481 | + struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL; |
---|
437 | 482 | struct rb_node **rb_link, *rb_parent; |
---|
438 | 483 | int retval; |
---|
439 | 484 | unsigned long charge; |
---|
440 | 485 | LIST_HEAD(uf); |
---|
441 | 486 | |
---|
442 | 487 | uprobe_start_dup_mmap(); |
---|
443 | | - if (down_write_killable(&oldmm->mmap_sem)) { |
---|
| 488 | + if (mmap_write_lock_killable(oldmm)) { |
---|
444 | 489 | retval = -EINTR; |
---|
445 | 490 | goto fail_uprobe_end; |
---|
446 | 491 | } |
---|
.. | .. |
---|
449 | 494 | /* |
---|
450 | 495 | * Not linked in yet - no deadlock potential: |
---|
451 | 496 | */ |
---|
452 | | - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
---|
| 497 | + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
---|
453 | 498 | |
---|
454 | 499 | /* No ordering required: file already has been exposed. */ |
---|
455 | 500 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
.. | .. |
---|
504 | 549 | if (retval) |
---|
505 | 550 | goto fail_nomem_anon_vma_fork; |
---|
506 | 551 | if (tmp->vm_flags & VM_WIPEONFORK) { |
---|
507 | | - /* VM_WIPEONFORK gets a clean slate in the child. */ |
---|
| 552 | + /* |
---|
| 553 | + * VM_WIPEONFORK gets a clean slate in the child. |
---|
| 554 | + * Don't prepare anon_vma until fault since we don't |
---|
| 555 | + * copy page for current vma. |
---|
| 556 | + */ |
---|
508 | 557 | tmp->anon_vma = NULL; |
---|
509 | | - if (anon_vma_prepare(tmp)) |
---|
510 | | - goto fail_nomem_anon_vma_fork; |
---|
511 | 558 | } else if (anon_vma_fork(tmp, mpnt)) |
---|
512 | 559 | goto fail_nomem_anon_vma_fork; |
---|
513 | 560 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
---|
514 | | - tmp->vm_next = tmp->vm_prev = NULL; |
---|
515 | 561 | file = tmp->vm_file; |
---|
516 | 562 | if (file) { |
---|
517 | 563 | struct inode *inode = file_inode(file); |
---|
.. | .. |
---|
519 | 565 | |
---|
520 | 566 | get_file(file); |
---|
521 | 567 | if (tmp->vm_flags & VM_DENYWRITE) |
---|
522 | | - atomic_dec(&inode->i_writecount); |
---|
| 568 | + put_write_access(inode); |
---|
523 | 569 | i_mmap_lock_write(mapping); |
---|
524 | 570 | if (tmp->vm_flags & VM_SHARED) |
---|
525 | | - atomic_inc(&mapping->i_mmap_writable); |
---|
| 571 | + mapping_allow_writable(mapping); |
---|
526 | 572 | flush_dcache_mmap_lock(mapping); |
---|
527 | 573 | /* insert tmp into the share list, just after mpnt */ |
---|
528 | 574 | vma_interval_tree_insert_after(tmp, mpnt, |
---|
.. | .. |
---|
552 | 598 | rb_parent = &tmp->vm_rb; |
---|
553 | 599 | |
---|
554 | 600 | mm->map_count++; |
---|
555 | | - if (!(tmp->vm_flags & VM_WIPEONFORK)) |
---|
556 | | - retval = copy_page_range(mm, oldmm, mpnt); |
---|
| 601 | + if (!(tmp->vm_flags & VM_WIPEONFORK)) { |
---|
| 602 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 603 | + /* |
---|
| 604 | + * Mark this VMA as changing to prevent the |
---|
| 605 | + * speculative page fault handler from processing |
---|
| 606 | + * it until the TLBs are flushed below. |
---|
| 607 | + */ |
---|
| 608 | + last = mpnt; |
---|
| 609 | + vm_write_begin(mpnt); |
---|
| 610 | + } |
---|
| 611 | + retval = copy_page_range(tmp, mpnt); |
---|
| 612 | + } |
---|
557 | 613 | |
---|
558 | 614 | if (tmp->vm_ops && tmp->vm_ops->open) |
---|
559 | 615 | tmp->vm_ops->open(tmp); |
---|
.. | .. |
---|
564 | 620 | /* a new mm has just been created */ |
---|
565 | 621 | retval = arch_dup_mmap(oldmm, mm); |
---|
566 | 622 | out: |
---|
567 | | - up_write(&mm->mmap_sem); |
---|
| 623 | + mmap_write_unlock(mm); |
---|
568 | 624 | flush_tlb_mm(oldmm); |
---|
569 | | - up_write(&oldmm->mmap_sem); |
---|
| 625 | + |
---|
| 626 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 627 | + /* |
---|
| 628 | + * Since the TLB has been flushed, we can safely unmark the |
---|
| 629 | + * copied VMAs and allow the speculative page fault handler to |
---|
| 630 | + * process them again. |
---|
| 631 | + * Walk back the VMA list from the last marked VMA. |
---|
| 632 | + */ |
---|
| 633 | + for (; last; last = last->vm_prev) { |
---|
| 634 | + if (last->vm_flags & VM_DONTCOPY) |
---|
| 635 | + continue; |
---|
| 636 | + if (!(last->vm_flags & VM_WIPEONFORK)) |
---|
| 637 | + vm_write_end(last); |
---|
| 638 | + } |
---|
| 639 | + } |
---|
| 640 | + |
---|
| 641 | + mmap_write_unlock(oldmm); |
---|
570 | 642 | dup_userfaultfd_complete(&uf); |
---|
571 | 643 | fail_uprobe_end: |
---|
572 | 644 | uprobe_end_dup_mmap(); |
---|
.. | .. |
---|
596 | 668 | #else |
---|
597 | 669 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
---|
598 | 670 | { |
---|
599 | | - down_write(&oldmm->mmap_sem); |
---|
| 671 | + mmap_write_lock(oldmm); |
---|
600 | 672 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
601 | | - up_write(&oldmm->mmap_sem); |
---|
| 673 | + mmap_write_unlock(oldmm); |
---|
602 | 674 | return 0; |
---|
603 | 675 | } |
---|
604 | 676 | #define mm_alloc_pgd(mm) (0) |
---|
.. | .. |
---|
609 | 681 | { |
---|
610 | 682 | int i; |
---|
611 | 683 | |
---|
| 684 | + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, |
---|
| 685 | + "Please make sure 'struct resident_page_types[]' is updated as well"); |
---|
| 686 | + |
---|
612 | 687 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
---|
613 | 688 | long x = atomic_long_read(&mm->rss_stat.count[i]); |
---|
614 | 689 | |
---|
615 | 690 | if (unlikely(x)) |
---|
616 | | - printk(KERN_ALERT "BUG: Bad rss-counter state " |
---|
617 | | - "mm:%p idx:%d val:%ld\n", mm, i, x); |
---|
| 691 | + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", |
---|
| 692 | + mm, resident_page_types[i], x); |
---|
618 | 693 | } |
---|
619 | 694 | |
---|
620 | 695 | if (mm_pgtables_bytes(mm)) |
---|
.. | .. |
---|
641 | 716 | WARN_ON_ONCE(mm == current->active_mm); |
---|
642 | 717 | mm_free_pgd(mm); |
---|
643 | 718 | destroy_context(mm); |
---|
644 | | - hmm_mm_destroy(mm); |
---|
645 | | - mmu_notifier_mm_destroy(mm); |
---|
| 719 | + mmu_notifier_subscriptions_destroy(mm); |
---|
646 | 720 | check_mm(mm); |
---|
647 | 721 | put_user_ns(mm->user_ns); |
---|
648 | 722 | free_mm(mm); |
---|
.. | .. |
---|
680 | 754 | |
---|
681 | 755 | static inline void put_signal_struct(struct signal_struct *sig) |
---|
682 | 756 | { |
---|
683 | | - if (atomic_dec_and_test(&sig->sigcnt)) |
---|
| 757 | + if (refcount_dec_and_test(&sig->sigcnt)) |
---|
684 | 758 | free_signal_struct(sig); |
---|
685 | 759 | } |
---|
686 | 760 | |
---|
687 | 761 | void __put_task_struct(struct task_struct *tsk) |
---|
688 | 762 | { |
---|
689 | 763 | WARN_ON(!tsk->exit_state); |
---|
690 | | - WARN_ON(atomic_read(&tsk->usage)); |
---|
| 764 | + WARN_ON(refcount_read(&tsk->usage)); |
---|
691 | 765 | WARN_ON(tsk == current); |
---|
692 | 766 | |
---|
| 767 | + io_uring_free(tsk); |
---|
693 | 768 | cgroup_free(tsk); |
---|
694 | 769 | task_numa_free(tsk, true); |
---|
695 | 770 | security_task_free(tsk); |
---|
.. | .. |
---|
710 | 785 | static void set_max_threads(unsigned int max_threads_suggested) |
---|
711 | 786 | { |
---|
712 | 787 | u64 threads; |
---|
| 788 | + unsigned long nr_pages = totalram_pages(); |
---|
713 | 789 | |
---|
714 | 790 | /* |
---|
715 | 791 | * The number of threads shall be limited such that the thread |
---|
716 | 792 | * structures may only consume a small part of the available memory. |
---|
717 | 793 | */ |
---|
718 | | - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) |
---|
| 794 | + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) |
---|
719 | 795 | threads = MAX_THREADS; |
---|
720 | 796 | else |
---|
721 | | - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, |
---|
| 797 | + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, |
---|
722 | 798 | (u64) THREAD_SIZE * 8UL); |
---|
723 | 799 | |
---|
724 | 800 | if (threads > max_threads_suggested) |
---|
.. | .. |
---|
732 | 808 | int arch_task_struct_size __read_mostly; |
---|
733 | 809 | #endif |
---|
734 | 810 | |
---|
| 811 | +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
---|
735 | 812 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) |
---|
736 | 813 | { |
---|
737 | 814 | /* Fetch thread_struct whitelist for the architecture. */ |
---|
.. | .. |
---|
746 | 823 | else |
---|
747 | 824 | *offset += offsetof(struct task_struct, thread); |
---|
748 | 825 | } |
---|
| 826 | +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ |
---|
749 | 827 | |
---|
750 | 828 | void __init fork_init(void) |
---|
751 | 829 | { |
---|
.. | .. |
---|
787 | 865 | scs_init(); |
---|
788 | 866 | |
---|
789 | 867 | lockdep_init_task(&init_task); |
---|
| 868 | + uprobes_init(); |
---|
790 | 869 | } |
---|
791 | 870 | |
---|
792 | 871 | int __weak arch_dup_task_struct(struct task_struct *dst, |
---|
.. | .. |
---|
808 | 887 | { |
---|
809 | 888 | struct task_struct *tsk; |
---|
810 | 889 | unsigned long *stack; |
---|
811 | | - struct vm_struct *stack_vm_area; |
---|
| 890 | + struct vm_struct *stack_vm_area __maybe_unused; |
---|
812 | 891 | int err; |
---|
813 | 892 | |
---|
814 | 893 | if (node == NUMA_NO_NODE) |
---|
.. | .. |
---|
820 | 899 | stack = alloc_thread_stack_node(tsk, node); |
---|
821 | 900 | if (!stack) |
---|
822 | 901 | goto free_tsk; |
---|
| 902 | + |
---|
| 903 | + if (memcg_charge_kernel_stack(tsk)) |
---|
| 904 | + goto free_stack; |
---|
823 | 905 | |
---|
824 | 906 | stack_vm_area = task_stack_vm_area(tsk); |
---|
825 | 907 | |
---|
.. | .. |
---|
835 | 917 | tsk->stack_vm_area = stack_vm_area; |
---|
836 | 918 | #endif |
---|
837 | 919 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
838 | | - atomic_set(&tsk->stack_refcount, 1); |
---|
| 920 | + refcount_set(&tsk->stack_refcount, 1); |
---|
839 | 921 | #endif |
---|
840 | 922 | |
---|
841 | 923 | if (err) |
---|
.. | .. |
---|
863 | 945 | #ifdef CONFIG_STACKPROTECTOR |
---|
864 | 946 | tsk->stack_canary = get_random_canary(); |
---|
865 | 947 | #endif |
---|
| 948 | + if (orig->cpus_ptr == &orig->cpus_mask) |
---|
| 949 | + tsk->cpus_ptr = &tsk->cpus_mask; |
---|
866 | 950 | |
---|
867 | 951 | /* |
---|
868 | | - * One for us, one for whoever does the "release_task()" (usually |
---|
869 | | - * parent) |
---|
| 952 | + * One for the user space visible state that goes away when reaped. |
---|
| 953 | + * One for the scheduler. |
---|
870 | 954 | */ |
---|
871 | | - atomic_set(&tsk->usage, 2); |
---|
| 955 | + refcount_set(&tsk->rcu_users, 2); |
---|
| 956 | + /* One for the rcu users */ |
---|
| 957 | + refcount_set(&tsk->usage, 1); |
---|
872 | 958 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
---|
873 | 959 | tsk->btrace_seq = 0; |
---|
874 | 960 | #endif |
---|
875 | 961 | tsk->splice_pipe = NULL; |
---|
876 | 962 | tsk->task_frag.page = NULL; |
---|
877 | 963 | tsk->wake_q.next = NULL; |
---|
| 964 | + tsk->pf_io_worker = NULL; |
---|
878 | 965 | |
---|
879 | 966 | account_kernel_stack(tsk, 1); |
---|
880 | 967 | |
---|
.. | .. |
---|
892 | 979 | #ifdef CONFIG_MEMCG |
---|
893 | 980 | tsk->active_memcg = NULL; |
---|
894 | 981 | #endif |
---|
| 982 | + |
---|
| 983 | + android_init_vendor_data(tsk, 1); |
---|
| 984 | + android_init_oem_data(tsk, 1); |
---|
| 985 | + |
---|
| 986 | + trace_android_vh_dup_task_struct(tsk, orig); |
---|
895 | 987 | return tsk; |
---|
896 | 988 | |
---|
897 | 989 | free_stack: |
---|
.. | .. |
---|
941 | 1033 | #endif |
---|
942 | 1034 | } |
---|
943 | 1035 | |
---|
| 1036 | +static void mm_init_pasid(struct mm_struct *mm) |
---|
| 1037 | +{ |
---|
| 1038 | +#ifdef CONFIG_IOMMU_SUPPORT |
---|
| 1039 | + mm->pasid = INIT_PASID; |
---|
| 1040 | +#endif |
---|
| 1041 | +} |
---|
| 1042 | + |
---|
944 | 1043 | static void mm_init_uprobes_state(struct mm_struct *mm) |
---|
945 | 1044 | { |
---|
946 | 1045 | #ifdef CONFIG_UPROBES |
---|
.. | .. |
---|
954 | 1053 | mm->mmap = NULL; |
---|
955 | 1054 | mm->mm_rb = RB_ROOT; |
---|
956 | 1055 | mm->vmacache_seqnum = 0; |
---|
| 1056 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 1057 | + rwlock_init(&mm->mm_rb_lock); |
---|
| 1058 | +#endif |
---|
957 | 1059 | atomic_set(&mm->mm_users, 1); |
---|
958 | 1060 | atomic_set(&mm->mm_count, 1); |
---|
959 | | - init_rwsem(&mm->mmap_sem); |
---|
| 1061 | + seqcount_init(&mm->write_protect_seq); |
---|
| 1062 | + mmap_init_lock(mm); |
---|
960 | 1063 | INIT_LIST_HEAD(&mm->mmlist); |
---|
961 | 1064 | mm->core_state = NULL; |
---|
962 | 1065 | mm_pgtables_bytes_init(mm); |
---|
963 | 1066 | mm->map_count = 0; |
---|
964 | 1067 | mm->locked_vm = 0; |
---|
965 | | - mm->pinned_vm = 0; |
---|
| 1068 | + atomic_set(&mm->has_pinned, 0); |
---|
| 1069 | + atomic64_set(&mm->pinned_vm, 0); |
---|
966 | 1070 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
---|
967 | 1071 | spin_lock_init(&mm->page_table_lock); |
---|
968 | 1072 | spin_lock_init(&mm->arg_lock); |
---|
969 | 1073 | mm_init_cpumask(mm); |
---|
970 | 1074 | mm_init_aio(mm); |
---|
971 | 1075 | mm_init_owner(mm, p); |
---|
| 1076 | + mm_init_pasid(mm); |
---|
972 | 1077 | RCU_INIT_POINTER(mm->exe_file, NULL); |
---|
973 | | - mmu_notifier_mm_init(mm); |
---|
974 | | - hmm_mm_init(mm); |
---|
| 1078 | + if (!mmu_notifier_subscriptions_init(mm)) |
---|
| 1079 | + goto fail_nopgd; |
---|
975 | 1080 | init_tlb_flush_pending(mm); |
---|
976 | 1081 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
---|
977 | 1082 | mm->pmd_huge_pte = NULL; |
---|
.. | .. |
---|
1046 | 1151 | { |
---|
1047 | 1152 | might_sleep(); |
---|
1048 | 1153 | |
---|
1049 | | - if (atomic_dec_and_test(&mm->mm_users)) |
---|
| 1154 | + if (atomic_dec_and_test(&mm->mm_users)) { |
---|
| 1155 | + trace_android_vh_mmput(NULL); |
---|
1050 | 1156 | __mmput(mm); |
---|
| 1157 | + } |
---|
1051 | 1158 | } |
---|
1052 | 1159 | EXPORT_SYMBOL_GPL(mmput); |
---|
1053 | 1160 | |
---|
.. | .. |
---|
1067 | 1174 | schedule_work(&mm->async_put_work); |
---|
1068 | 1175 | } |
---|
1069 | 1176 | } |
---|
| 1177 | +EXPORT_SYMBOL_GPL(mmput_async); |
---|
1070 | 1178 | #endif |
---|
1071 | 1179 | |
---|
1072 | 1180 | /** |
---|
.. | .. |
---|
1171 | 1279 | struct mm_struct *mm; |
---|
1172 | 1280 | int err; |
---|
1173 | 1281 | |
---|
1174 | | - err = mutex_lock_killable(&task->signal->cred_guard_mutex); |
---|
| 1282 | + err = down_read_killable(&task->signal->exec_update_lock); |
---|
1175 | 1283 | if (err) |
---|
1176 | 1284 | return ERR_PTR(err); |
---|
1177 | 1285 | |
---|
.. | .. |
---|
1181 | 1289 | mmput(mm); |
---|
1182 | 1290 | mm = ERR_PTR(-EACCES); |
---|
1183 | 1291 | } |
---|
1184 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
---|
| 1292 | + up_read(&task->signal->exec_update_lock); |
---|
1185 | 1293 | |
---|
1186 | 1294 | return mm; |
---|
1187 | 1295 | } |
---|
.. | .. |
---|
1279 | 1387 | mm_release(tsk, mm); |
---|
1280 | 1388 | } |
---|
1281 | 1389 | |
---|
1282 | | -/* |
---|
1283 | | - * Allocate a new mm structure and copy contents from the |
---|
1284 | | - * mm structure of the passed in task structure. |
---|
| 1390 | +/** |
---|
| 1391 | + * dup_mm() - duplicates an existing mm structure |
---|
| 1392 | + * @tsk: the task_struct with which the new mm will be associated. |
---|
| 1393 | + * @oldmm: the mm to duplicate. |
---|
| 1394 | + * |
---|
| 1395 | + * Allocates a new mm structure and duplicates the provided @oldmm structure |
---|
| 1396 | + * content into it. |
---|
| 1397 | + * |
---|
| 1398 | + * Return: the duplicated mm or NULL on failure. |
---|
1285 | 1399 | */ |
---|
1286 | | -static struct mm_struct *dup_mm(struct task_struct *tsk) |
---|
| 1400 | +static struct mm_struct *dup_mm(struct task_struct *tsk, |
---|
| 1401 | + struct mm_struct *oldmm) |
---|
1287 | 1402 | { |
---|
1288 | | - struct mm_struct *mm, *oldmm = current->mm; |
---|
| 1403 | + struct mm_struct *mm; |
---|
1289 | 1404 | int err; |
---|
1290 | 1405 | |
---|
1291 | 1406 | mm = allocate_mm(); |
---|
.. | .. |
---|
1353 | 1468 | } |
---|
1354 | 1469 | |
---|
1355 | 1470 | retval = -ENOMEM; |
---|
1356 | | - mm = dup_mm(tsk); |
---|
| 1471 | + mm = dup_mm(tsk, current->mm); |
---|
1357 | 1472 | if (!mm) |
---|
1358 | 1473 | goto fail_nomem; |
---|
1359 | 1474 | |
---|
.. | .. |
---|
1403 | 1518 | goto out; |
---|
1404 | 1519 | } |
---|
1405 | 1520 | |
---|
1406 | | - newf = dup_fd(oldf, &error); |
---|
| 1521 | + newf = dup_fd(oldf, NR_OPEN_MAX, &error); |
---|
1407 | 1522 | if (!newf) |
---|
1408 | 1523 | goto out; |
---|
1409 | 1524 | |
---|
.. | .. |
---|
1444 | 1559 | struct sighand_struct *sig; |
---|
1445 | 1560 | |
---|
1446 | 1561 | if (clone_flags & CLONE_SIGHAND) { |
---|
1447 | | - atomic_inc(¤t->sighand->count); |
---|
| 1562 | + refcount_inc(¤t->sighand->count); |
---|
1448 | 1563 | return 0; |
---|
1449 | 1564 | } |
---|
1450 | 1565 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
---|
1451 | | - rcu_assign_pointer(tsk->sighand, sig); |
---|
| 1566 | + RCU_INIT_POINTER(tsk->sighand, sig); |
---|
1452 | 1567 | if (!sig) |
---|
1453 | 1568 | return -ENOMEM; |
---|
1454 | 1569 | |
---|
1455 | | - atomic_set(&sig->count, 1); |
---|
| 1570 | + refcount_set(&sig->count, 1); |
---|
1456 | 1571 | spin_lock_irq(¤t->sighand->siglock); |
---|
1457 | 1572 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
---|
1458 | 1573 | spin_unlock_irq(¤t->sighand->siglock); |
---|
| 1574 | + |
---|
| 1575 | + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ |
---|
| 1576 | + if (clone_flags & CLONE_CLEAR_SIGHAND) |
---|
| 1577 | + flush_signal_handlers(tsk, 0); |
---|
| 1578 | + |
---|
1459 | 1579 | return 0; |
---|
1460 | 1580 | } |
---|
1461 | 1581 | |
---|
1462 | 1582 | void __cleanup_sighand(struct sighand_struct *sighand) |
---|
1463 | 1583 | { |
---|
1464 | | - if (atomic_dec_and_test(&sighand->count)) { |
---|
| 1584 | + if (refcount_dec_and_test(&sighand->count)) { |
---|
1465 | 1585 | signalfd_cleanup(sighand); |
---|
1466 | 1586 | /* |
---|
1467 | 1587 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
---|
.. | .. |
---|
1471 | 1591 | } |
---|
1472 | 1592 | } |
---|
1473 | 1593 | |
---|
1474 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1475 | 1594 | /* |
---|
1476 | 1595 | * Initialize POSIX timer handling for a thread group. |
---|
1477 | 1596 | */ |
---|
1478 | 1597 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
---|
1479 | 1598 | { |
---|
| 1599 | + struct posix_cputimers *pct = &sig->posix_cputimers; |
---|
1480 | 1600 | unsigned long cpu_limit; |
---|
1481 | 1601 | |
---|
1482 | 1602 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
---|
1483 | | - if (cpu_limit != RLIM_INFINITY) { |
---|
1484 | | - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; |
---|
1485 | | - sig->cputimer.running = true; |
---|
1486 | | - } |
---|
1487 | | - |
---|
1488 | | - /* The timer lists. */ |
---|
1489 | | - INIT_LIST_HEAD(&sig->cpu_timers[0]); |
---|
1490 | | - INIT_LIST_HEAD(&sig->cpu_timers[1]); |
---|
1491 | | - INIT_LIST_HEAD(&sig->cpu_timers[2]); |
---|
| 1603 | + posix_cputimers_group_init(pct, cpu_limit); |
---|
1492 | 1604 | } |
---|
1493 | | -#else |
---|
1494 | | -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } |
---|
1495 | | -#endif |
---|
1496 | 1605 | |
---|
1497 | 1606 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) |
---|
1498 | 1607 | { |
---|
.. | .. |
---|
1508 | 1617 | |
---|
1509 | 1618 | sig->nr_threads = 1; |
---|
1510 | 1619 | atomic_set(&sig->live, 1); |
---|
1511 | | - atomic_set(&sig->sigcnt, 1); |
---|
| 1620 | + refcount_set(&sig->sigcnt, 1); |
---|
1512 | 1621 | |
---|
1513 | 1622 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ |
---|
1514 | 1623 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); |
---|
.. | .. |
---|
1540 | 1649 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
---|
1541 | 1650 | |
---|
1542 | 1651 | mutex_init(&sig->cred_guard_mutex); |
---|
| 1652 | + init_rwsem(&sig->exec_update_lock); |
---|
1543 | 1653 | |
---|
1544 | 1654 | return 0; |
---|
1545 | 1655 | } |
---|
.. | .. |
---|
1594 | 1704 | #endif |
---|
1595 | 1705 | } |
---|
1596 | 1706 | |
---|
1597 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1598 | | -/* |
---|
1599 | | - * Initialize POSIX timer handling for a single task. |
---|
1600 | | - */ |
---|
1601 | | -static void posix_cpu_timers_init(struct task_struct *tsk) |
---|
1602 | | -{ |
---|
1603 | | - tsk->cputime_expires.prof_exp = 0; |
---|
1604 | | - tsk->cputime_expires.virt_exp = 0; |
---|
1605 | | - tsk->cputime_expires.sched_exp = 0; |
---|
1606 | | - INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
---|
1607 | | - INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
---|
1608 | | - INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
---|
1609 | | -} |
---|
1610 | | -#else |
---|
1611 | | -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } |
---|
1612 | | -#endif |
---|
1613 | | - |
---|
1614 | 1707 | static inline void init_task_pid_links(struct task_struct *task) |
---|
1615 | 1708 | { |
---|
1616 | 1709 | enum pid_type type; |
---|
.. | .. |
---|
1642 | 1735 | INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); |
---|
1643 | 1736 | p->rcu_tasks_idle_cpu = -1; |
---|
1644 | 1737 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
---|
| 1738 | +#ifdef CONFIG_TASKS_TRACE_RCU |
---|
| 1739 | + p->trc_reader_nesting = 0; |
---|
| 1740 | + p->trc_reader_special.s = 0; |
---|
| 1741 | + INIT_LIST_HEAD(&p->trc_holdout_list); |
---|
| 1742 | +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
---|
1645 | 1743 | } |
---|
| 1744 | + |
---|
| 1745 | +struct pid *pidfd_pid(const struct file *file) |
---|
| 1746 | +{ |
---|
| 1747 | + if (file->f_op == &pidfd_fops) |
---|
| 1748 | + return file->private_data; |
---|
| 1749 | + |
---|
| 1750 | + return ERR_PTR(-EBADF); |
---|
| 1751 | +} |
---|
| 1752 | + |
---|
| 1753 | +static int pidfd_release(struct inode *inode, struct file *file) |
---|
| 1754 | +{ |
---|
| 1755 | + struct pid *pid = file->private_data; |
---|
| 1756 | + |
---|
| 1757 | + file->private_data = NULL; |
---|
| 1758 | + put_pid(pid); |
---|
| 1759 | + return 0; |
---|
| 1760 | +} |
---|
| 1761 | + |
---|
| 1762 | +#ifdef CONFIG_PROC_FS |
---|
| 1763 | +/** |
---|
| 1764 | + * pidfd_show_fdinfo - print information about a pidfd |
---|
| 1765 | + * @m: proc fdinfo file |
---|
| 1766 | + * @f: file referencing a pidfd |
---|
| 1767 | + * |
---|
| 1768 | + * Pid: |
---|
| 1769 | + * This function will print the pid that a given pidfd refers to in the |
---|
| 1770 | + * pid namespace of the procfs instance. |
---|
| 1771 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1772 | + * namespace of the procfs instance 0 will be shown as its pid. This is |
---|
| 1773 | + * similar to calling getppid() on a process whose parent is outside of |
---|
| 1774 | + * its pid namespace. |
---|
| 1775 | + * |
---|
| 1776 | + * NSpid: |
---|
| 1777 | + * If pid namespaces are supported then this function will also print |
---|
| 1778 | + * the pid that a given pidfd refers to for all descendant pid namespaces |
---|
| 1779 | + * starting from the current pid namespace of the instance, i.e. the |
---|
| 1780 | + * Pid field and the first entry in the NSpid field will be identical. |
---|
| 1781 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1782 | + * namespace of the procfs instance 0 will be shown as its first NSpid |
---|
| 1783 | + * entry and no others will be shown. |
---|
| 1784 | + * Note that this differs from the Pid and NSpid fields in |
---|
| 1785 | + * /proc/<pid>/status where Pid and NSpid are always shown relative to |
---|
| 1786 | + * the pid namespace of the procfs instance. The difference becomes |
---|
| 1787 | + * obvious when sending around a pidfd between pid namespaces from a |
---|
| 1788 | + * different branch of the tree, i.e. where no ancestoral relation is |
---|
| 1789 | + * present between the pid namespaces: |
---|
| 1790 | + * - create two new pid namespaces ns1 and ns2 in the initial pid |
---|
| 1791 | + * namespace (also take care to create new mount namespaces in the |
---|
| 1792 | + * new pid namespace and mount procfs) |
---|
| 1793 | + * - create a process with a pidfd in ns1 |
---|
| 1794 | + * - send pidfd from ns1 to ns2 |
---|
| 1795 | + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid |
---|
| 1796 | + * have exactly one entry, which is 0 |
---|
| 1797 | + */ |
---|
| 1798 | +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
| 1799 | +{ |
---|
| 1800 | + struct pid *pid = f->private_data; |
---|
| 1801 | + struct pid_namespace *ns; |
---|
| 1802 | + pid_t nr = -1; |
---|
| 1803 | + |
---|
| 1804 | + if (likely(pid_has_task(pid, PIDTYPE_PID))) { |
---|
| 1805 | + ns = proc_pid_ns(file_inode(m->file)->i_sb); |
---|
| 1806 | + nr = pid_nr_ns(pid, ns); |
---|
| 1807 | + } |
---|
| 1808 | + |
---|
| 1809 | + seq_put_decimal_ll(m, "Pid:\t", nr); |
---|
| 1810 | + |
---|
| 1811 | +#ifdef CONFIG_PID_NS |
---|
| 1812 | + seq_put_decimal_ll(m, "\nNSpid:\t", nr); |
---|
| 1813 | + if (nr > 0) { |
---|
| 1814 | + int i; |
---|
| 1815 | + |
---|
| 1816 | + /* If nr is non-zero it means that 'pid' is valid and that |
---|
| 1817 | + * ns, i.e. the pid namespace associated with the procfs |
---|
| 1818 | + * instance, is in the pid namespace hierarchy of pid. |
---|
| 1819 | + * Start at one below the already printed level. |
---|
| 1820 | + */ |
---|
| 1821 | + for (i = ns->level + 1; i <= pid->level; i++) |
---|
| 1822 | + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); |
---|
| 1823 | + } |
---|
| 1824 | +#endif |
---|
| 1825 | + seq_putc(m, '\n'); |
---|
| 1826 | +} |
---|
| 1827 | +#endif |
---|
| 1828 | + |
---|
| 1829 | +/* |
---|
| 1830 | + * Poll support for process exit notification. |
---|
| 1831 | + */ |
---|
| 1832 | +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
| 1833 | +{ |
---|
| 1834 | + struct pid *pid = file->private_data; |
---|
| 1835 | + __poll_t poll_flags = 0; |
---|
| 1836 | + |
---|
| 1837 | + poll_wait(file, &pid->wait_pidfd, pts); |
---|
| 1838 | + |
---|
| 1839 | + /* |
---|
| 1840 | + * Inform pollers only when the whole thread group exits. |
---|
| 1841 | + * If the thread group leader exits before all other threads in the |
---|
| 1842 | + * group, then poll(2) should block, similar to the wait(2) family. |
---|
| 1843 | + */ |
---|
| 1844 | + if (thread_group_exited(pid)) |
---|
| 1845 | + poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
| 1846 | + |
---|
| 1847 | + return poll_flags; |
---|
| 1848 | +} |
---|
| 1849 | + |
---|
| 1850 | +const struct file_operations pidfd_fops = { |
---|
| 1851 | + .release = pidfd_release, |
---|
| 1852 | + .poll = pidfd_poll, |
---|
| 1853 | +#ifdef CONFIG_PROC_FS |
---|
| 1854 | + .show_fdinfo = pidfd_show_fdinfo, |
---|
| 1855 | +#endif |
---|
| 1856 | +}; |
---|
1646 | 1857 | |
---|
1647 | 1858 | static void __delayed_free_task(struct rcu_head *rhp) |
---|
1648 | 1859 | { |
---|
.. | .. |
---|
1657 | 1868 | call_rcu(&tsk->rcu, __delayed_free_task); |
---|
1658 | 1869 | else |
---|
1659 | 1870 | free_task(tsk); |
---|
1660 | | -} |
---|
1661 | | - |
---|
1662 | | -static int pidfd_release(struct inode *inode, struct file *file) |
---|
1663 | | -{ |
---|
1664 | | - struct pid *pid = file->private_data; |
---|
1665 | | - |
---|
1666 | | - file->private_data = NULL; |
---|
1667 | | - put_pid(pid); |
---|
1668 | | - return 0; |
---|
1669 | | -} |
---|
1670 | | - |
---|
1671 | | -#ifdef CONFIG_PROC_FS |
---|
1672 | | -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
1673 | | -{ |
---|
1674 | | - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); |
---|
1675 | | - struct pid *pid = f->private_data; |
---|
1676 | | - |
---|
1677 | | - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); |
---|
1678 | | - seq_putc(m, '\n'); |
---|
1679 | | -} |
---|
1680 | | -#endif |
---|
1681 | | - |
---|
1682 | | -/* |
---|
1683 | | - * Poll support for process exit notification. |
---|
1684 | | - */ |
---|
1685 | | -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
1686 | | -{ |
---|
1687 | | - struct task_struct *task; |
---|
1688 | | - struct pid *pid = file->private_data; |
---|
1689 | | - __poll_t poll_flags = 0; |
---|
1690 | | - |
---|
1691 | | - poll_wait(file, &pid->wait_pidfd, pts); |
---|
1692 | | - |
---|
1693 | | - rcu_read_lock(); |
---|
1694 | | - task = pid_task(pid, PIDTYPE_PID); |
---|
1695 | | - /* |
---|
1696 | | - * Inform pollers only when the whole thread group exits. |
---|
1697 | | - * If the thread group leader exits before all other threads in the |
---|
1698 | | - * group, then poll(2) should block, similar to the wait(2) family. |
---|
1699 | | - */ |
---|
1700 | | - if (!task || (task->exit_state && thread_group_empty(task))) |
---|
1701 | | - poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
1702 | | - rcu_read_unlock(); |
---|
1703 | | - |
---|
1704 | | - return poll_flags; |
---|
1705 | | -} |
---|
1706 | | - |
---|
1707 | | -const struct file_operations pidfd_fops = { |
---|
1708 | | - .release = pidfd_release, |
---|
1709 | | - .poll = pidfd_poll, |
---|
1710 | | -#ifdef CONFIG_PROC_FS |
---|
1711 | | - .show_fdinfo = pidfd_show_fdinfo, |
---|
1712 | | -#endif |
---|
1713 | | -}; |
---|
1714 | | - |
---|
1715 | | -/** |
---|
1716 | | - * pidfd_create() - Create a new pid file descriptor. |
---|
1717 | | - * |
---|
1718 | | - * @pid: struct pid that the pidfd will reference |
---|
1719 | | - * |
---|
1720 | | - * This creates a new pid file descriptor with the O_CLOEXEC flag set. |
---|
1721 | | - * |
---|
1722 | | - * Note, that this function can only be called after the fd table has |
---|
1723 | | - * been unshared to avoid leaking the pidfd to the new process. |
---|
1724 | | - * |
---|
1725 | | - * Return: On success, a cloexec pidfd is returned. |
---|
1726 | | - * On error, a negative errno number will be returned. |
---|
1727 | | - */ |
---|
1728 | | -static int pidfd_create(struct pid *pid) |
---|
1729 | | -{ |
---|
1730 | | - int fd; |
---|
1731 | | - |
---|
1732 | | - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), |
---|
1733 | | - O_RDWR | O_CLOEXEC); |
---|
1734 | | - if (fd < 0) |
---|
1735 | | - put_pid(pid); |
---|
1736 | | - |
---|
1737 | | - return fd; |
---|
1738 | 1871 | } |
---|
1739 | 1872 | |
---|
1740 | 1873 | static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) |
---|
.. | .. |
---|
1765 | 1898 | * flags). The actual kick-off is left to the caller. |
---|
1766 | 1899 | */ |
---|
1767 | 1900 | static __latent_entropy struct task_struct *copy_process( |
---|
1768 | | - unsigned long clone_flags, |
---|
1769 | | - unsigned long stack_start, |
---|
1770 | | - unsigned long stack_size, |
---|
1771 | | - int __user *parent_tidptr, |
---|
1772 | | - int __user *child_tidptr, |
---|
1773 | 1901 | struct pid *pid, |
---|
1774 | 1902 | int trace, |
---|
1775 | | - unsigned long tls, |
---|
1776 | | - int node) |
---|
| 1903 | + int node, |
---|
| 1904 | + struct kernel_clone_args *args) |
---|
1777 | 1905 | { |
---|
1778 | 1906 | int pidfd = -1, retval; |
---|
1779 | 1907 | struct task_struct *p; |
---|
1780 | 1908 | struct multiprocess_signals delayed; |
---|
| 1909 | + struct file *pidfile = NULL; |
---|
| 1910 | + u64 clone_flags = args->flags; |
---|
| 1911 | + struct nsproxy *nsp = current->nsproxy; |
---|
1781 | 1912 | |
---|
1782 | 1913 | /* |
---|
1783 | 1914 | * Don't allow sharing the root directory with processes in a different |
---|
.. | .. |
---|
1820 | 1951 | */ |
---|
1821 | 1952 | if (clone_flags & CLONE_THREAD) { |
---|
1822 | 1953 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
---|
1823 | | - (task_active_pid_ns(current) != |
---|
1824 | | - current->nsproxy->pid_ns_for_children)) |
---|
| 1954 | + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) |
---|
| 1955 | + return ERR_PTR(-EINVAL); |
---|
| 1956 | + } |
---|
| 1957 | + |
---|
| 1958 | + /* |
---|
| 1959 | + * If the new process will be in a different time namespace |
---|
| 1960 | + * do not allow it to share VM or a thread group with the forking task. |
---|
| 1961 | + */ |
---|
| 1962 | + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { |
---|
| 1963 | + if (nsp->time_ns != nsp->time_ns_for_children) |
---|
1825 | 1964 | return ERR_PTR(-EINVAL); |
---|
1826 | 1965 | } |
---|
1827 | 1966 | |
---|
1828 | 1967 | if (clone_flags & CLONE_PIDFD) { |
---|
1829 | 1968 | /* |
---|
1830 | | - * - CLONE_PARENT_SETTID is useless for pidfds and also |
---|
1831 | | - * parent_tidptr is used to return pidfds. |
---|
1832 | 1969 | * - CLONE_DETACHED is blocked so that we can potentially |
---|
1833 | 1970 | * reuse it later for CLONE_PIDFD. |
---|
1834 | 1971 | * - CLONE_THREAD is blocked until someone really needs it. |
---|
1835 | 1972 | */ |
---|
1836 | | - if (clone_flags & |
---|
1837 | | - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) |
---|
| 1973 | + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) |
---|
1838 | 1974 | return ERR_PTR(-EINVAL); |
---|
1839 | 1975 | } |
---|
1840 | 1976 | |
---|
.. | .. |
---|
1853 | 1989 | recalc_sigpending(); |
---|
1854 | 1990 | spin_unlock_irq(¤t->sighand->siglock); |
---|
1855 | 1991 | retval = -ERESTARTNOINTR; |
---|
1856 | | - if (signal_pending(current)) |
---|
| 1992 | + if (task_sigpending(current)) |
---|
1857 | 1993 | goto fork_out; |
---|
1858 | 1994 | |
---|
1859 | 1995 | retval = -ENOMEM; |
---|
1860 | 1996 | p = dup_task_struct(current, node); |
---|
1861 | 1997 | if (!p) |
---|
1862 | 1998 | goto fork_out; |
---|
| 1999 | + if (args->io_thread) { |
---|
| 2000 | + /* |
---|
| 2001 | + * Mark us an IO worker, and block any signal that isn't |
---|
| 2002 | + * fatal or STOP |
---|
| 2003 | + */ |
---|
| 2004 | + p->flags |= PF_IO_WORKER; |
---|
| 2005 | + siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
---|
| 2006 | + } |
---|
1863 | 2007 | |
---|
1864 | 2008 | cpufreq_task_times_init(p); |
---|
1865 | 2009 | |
---|
.. | .. |
---|
1869 | 2013 | * p->set_child_tid which is (ab)used as a kthread's data pointer for |
---|
1870 | 2014 | * kernel threads (PF_KTHREAD). |
---|
1871 | 2015 | */ |
---|
1872 | | - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
---|
| 2016 | + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; |
---|
1873 | 2017 | /* |
---|
1874 | 2018 | * Clear TID on mm_release()? |
---|
1875 | 2019 | */ |
---|
1876 | | - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
---|
| 2020 | + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; |
---|
1877 | 2021 | |
---|
1878 | 2022 | ftrace_graph_init_task(p); |
---|
1879 | 2023 | |
---|
1880 | 2024 | rt_mutex_init_task(p); |
---|
1881 | 2025 | |
---|
| 2026 | + lockdep_assert_irqs_enabled(); |
---|
1882 | 2027 | #ifdef CONFIG_PROVE_LOCKING |
---|
1883 | | - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
---|
1884 | 2028 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); |
---|
1885 | 2029 | #endif |
---|
1886 | 2030 | retval = -EAGAIN; |
---|
.. | .. |
---|
1902 | 2046 | * to stop root fork bombs. |
---|
1903 | 2047 | */ |
---|
1904 | 2048 | retval = -EAGAIN; |
---|
1905 | | - if (nr_threads >= max_threads) |
---|
| 2049 | + if (data_race(nr_threads >= max_threads)) |
---|
1906 | 2050 | goto bad_fork_cleanup_count; |
---|
1907 | 2051 | |
---|
1908 | 2052 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
---|
.. | .. |
---|
1928 | 2072 | p->vtime.state = VTIME_INACTIVE; |
---|
1929 | 2073 | #endif |
---|
1930 | 2074 | |
---|
| 2075 | +#ifdef CONFIG_IO_URING |
---|
| 2076 | + p->io_uring = NULL; |
---|
| 2077 | +#endif |
---|
| 2078 | + |
---|
1931 | 2079 | #if defined(SPLIT_RSS_COUNTING) |
---|
1932 | 2080 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
---|
1933 | 2081 | #endif |
---|
.. | .. |
---|
1941 | 2089 | task_io_accounting_init(&p->ioac); |
---|
1942 | 2090 | acct_clear_integrals(p); |
---|
1943 | 2091 | |
---|
1944 | | - posix_cpu_timers_init(p); |
---|
| 2092 | + posix_cputimers_init(&p->posix_cputimers); |
---|
1945 | 2093 | |
---|
1946 | 2094 | p->io_context = NULL; |
---|
1947 | 2095 | audit_set_context(p, NULL); |
---|
.. | .. |
---|
1957 | 2105 | #ifdef CONFIG_CPUSETS |
---|
1958 | 2106 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
---|
1959 | 2107 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
---|
1960 | | - seqcount_init(&p->mems_allowed_seq); |
---|
| 2108 | + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); |
---|
1961 | 2109 | #endif |
---|
1962 | 2110 | #ifdef CONFIG_TRACE_IRQFLAGS |
---|
1963 | | - p->irq_events = 0; |
---|
1964 | | - p->hardirqs_enabled = 0; |
---|
1965 | | - p->hardirq_enable_ip = 0; |
---|
1966 | | - p->hardirq_enable_event = 0; |
---|
1967 | | - p->hardirq_disable_ip = _THIS_IP_; |
---|
1968 | | - p->hardirq_disable_event = 0; |
---|
1969 | | - p->softirqs_enabled = 1; |
---|
1970 | | - p->softirq_enable_ip = _THIS_IP_; |
---|
1971 | | - p->softirq_enable_event = 0; |
---|
1972 | | - p->softirq_disable_ip = 0; |
---|
1973 | | - p->softirq_disable_event = 0; |
---|
1974 | | - p->hardirq_context = 0; |
---|
1975 | | - p->softirq_context = 0; |
---|
| 2111 | + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); |
---|
| 2112 | + p->irqtrace.hardirq_disable_ip = _THIS_IP_; |
---|
| 2113 | + p->irqtrace.softirq_enable_ip = _THIS_IP_; |
---|
| 2114 | + p->softirqs_enabled = 1; |
---|
| 2115 | + p->softirq_context = 0; |
---|
1976 | 2116 | #endif |
---|
1977 | 2117 | |
---|
1978 | 2118 | p->pagefault_disabled = 0; |
---|
1979 | 2119 | |
---|
1980 | 2120 | #ifdef CONFIG_LOCKDEP |
---|
1981 | | - p->lockdep_depth = 0; /* no locks held yet */ |
---|
1982 | | - p->curr_chain_key = 0; |
---|
1983 | | - p->lockdep_recursion = 0; |
---|
1984 | 2121 | lockdep_init_task(p); |
---|
1985 | 2122 | #endif |
---|
1986 | 2123 | |
---|
.. | .. |
---|
2032 | 2169 | retval = copy_io(clone_flags, p); |
---|
2033 | 2170 | if (retval) |
---|
2034 | 2171 | goto bad_fork_cleanup_namespaces; |
---|
2035 | | - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); |
---|
| 2172 | + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); |
---|
2036 | 2173 | if (retval) |
---|
2037 | 2174 | goto bad_fork_cleanup_io; |
---|
2038 | 2175 | |
---|
| 2176 | + stackleak_task_init(p); |
---|
| 2177 | + |
---|
2039 | 2178 | if (pid != &init_struct_pid) { |
---|
2040 | | - pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
---|
| 2179 | + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, |
---|
| 2180 | + args->set_tid_size); |
---|
2041 | 2181 | if (IS_ERR(pid)) { |
---|
2042 | 2182 | retval = PTR_ERR(pid); |
---|
2043 | 2183 | goto bad_fork_cleanup_thread; |
---|
.. | .. |
---|
2050 | 2190 | * if the fd table isn't shared). |
---|
2051 | 2191 | */ |
---|
2052 | 2192 | if (clone_flags & CLONE_PIDFD) { |
---|
2053 | | - retval = pidfd_create(pid); |
---|
| 2193 | + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); |
---|
2054 | 2194 | if (retval < 0) |
---|
2055 | 2195 | goto bad_fork_free_pid; |
---|
2056 | 2196 | |
---|
2057 | 2197 | pidfd = retval; |
---|
2058 | | - retval = put_user(pidfd, parent_tidptr); |
---|
| 2198 | + |
---|
| 2199 | + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, |
---|
| 2200 | + O_RDWR | O_CLOEXEC); |
---|
| 2201 | + if (IS_ERR(pidfile)) { |
---|
| 2202 | + put_unused_fd(pidfd); |
---|
| 2203 | + retval = PTR_ERR(pidfile); |
---|
| 2204 | + goto bad_fork_free_pid; |
---|
| 2205 | + } |
---|
| 2206 | + get_pid(pid); /* held by pidfile now */ |
---|
| 2207 | + |
---|
| 2208 | + retval = put_user(pidfd, args->pidfd); |
---|
2059 | 2209 | if (retval) |
---|
2060 | 2210 | goto bad_fork_put_pidfd; |
---|
2061 | 2211 | } |
---|
.. | .. |
---|
2080 | 2230 | #ifdef TIF_SYSCALL_EMU |
---|
2081 | 2231 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
---|
2082 | 2232 | #endif |
---|
2083 | | - clear_all_latency_tracing(p); |
---|
| 2233 | + clear_tsk_latency_tracing(p); |
---|
2084 | 2234 | |
---|
2085 | 2235 | /* ok, now we should be set up.. */ |
---|
2086 | 2236 | p->pid = pid_nr(pid); |
---|
.. | .. |
---|
2099 | 2249 | p->pdeath_signal = 0; |
---|
2100 | 2250 | INIT_LIST_HEAD(&p->thread_group); |
---|
2101 | 2251 | p->task_works = NULL; |
---|
| 2252 | + clear_posix_cputimers_work(p); |
---|
2102 | 2253 | |
---|
2103 | | - cgroup_threadgroup_change_begin(current); |
---|
2104 | 2254 | /* |
---|
2105 | 2255 | * Ensure that the cgroup subsystem policies allow the new process to be |
---|
2106 | | - * forked. It should be noted the the new process's css_set can be changed |
---|
| 2256 | + * forked. It should be noted that the new process's css_set can be changed |
---|
2107 | 2257 | * between here and cgroup_post_fork() if an organisation operation is in |
---|
2108 | 2258 | * progress. |
---|
2109 | 2259 | */ |
---|
2110 | | - retval = cgroup_can_fork(p); |
---|
| 2260 | + retval = cgroup_can_fork(p, args); |
---|
2111 | 2261 | if (retval) |
---|
2112 | | - goto bad_fork_cgroup_threadgroup_change_end; |
---|
| 2262 | + goto bad_fork_put_pidfd; |
---|
| 2263 | + |
---|
| 2264 | + /* |
---|
| 2265 | + * Now that the cgroups are pinned, re-clone the parent cgroup and put |
---|
| 2266 | + * the new task on the correct runqueue. All this *before* the task |
---|
| 2267 | + * becomes visible. |
---|
| 2268 | + * |
---|
| 2269 | + * This isn't part of ->can_fork() because while the re-cloning is |
---|
| 2270 | + * cgroup specific, it unconditionally needs to place the task on a |
---|
| 2271 | + * runqueue. |
---|
| 2272 | + */ |
---|
| 2273 | + sched_cgroup_fork(p, args); |
---|
2113 | 2274 | |
---|
2114 | 2275 | /* |
---|
2115 | 2276 | * From this point on we must avoid any synchronous user-space |
---|
.. | .. |
---|
2120 | 2281 | */ |
---|
2121 | 2282 | |
---|
2122 | 2283 | p->start_time = ktime_get_ns(); |
---|
2123 | | - p->real_start_time = ktime_get_boot_ns(); |
---|
| 2284 | + p->start_boottime = ktime_get_boottime_ns(); |
---|
2124 | 2285 | |
---|
2125 | 2286 | /* |
---|
2126 | 2287 | * Make it visible to the rest of the system, but dont wake it up yet. |
---|
.. | .. |
---|
2139 | 2300 | } else { |
---|
2140 | 2301 | p->real_parent = current; |
---|
2141 | 2302 | p->parent_exec_id = current->self_exec_id; |
---|
2142 | | - p->exit_signal = (clone_flags & CSIGNAL); |
---|
| 2303 | + p->exit_signal = args->exit_signal; |
---|
2143 | 2304 | } |
---|
2144 | 2305 | |
---|
2145 | 2306 | klp_copy_process(p); |
---|
.. | .. |
---|
2165 | 2326 | retval = -EINTR; |
---|
2166 | 2327 | goto bad_fork_cancel_cgroup; |
---|
2167 | 2328 | } |
---|
2168 | | - |
---|
2169 | 2329 | |
---|
2170 | 2330 | init_task_pid_links(p); |
---|
2171 | 2331 | if (likely(p->pid)) { |
---|
.. | .. |
---|
2199 | 2359 | } else { |
---|
2200 | 2360 | current->signal->nr_threads++; |
---|
2201 | 2361 | atomic_inc(¤t->signal->live); |
---|
2202 | | - atomic_inc(¤t->signal->sigcnt); |
---|
| 2362 | + refcount_inc(¤t->signal->sigcnt); |
---|
2203 | 2363 | task_join_group_stop(p); |
---|
2204 | 2364 | list_add_tail_rcu(&p->thread_group, |
---|
2205 | 2365 | &p->group_leader->thread_group); |
---|
.. | .. |
---|
2215 | 2375 | syscall_tracepoint_update(p); |
---|
2216 | 2376 | write_unlock_irq(&tasklist_lock); |
---|
2217 | 2377 | |
---|
| 2378 | + if (pidfile) |
---|
| 2379 | + fd_install(pidfd, pidfile); |
---|
| 2380 | + |
---|
2218 | 2381 | proc_fork_connector(p); |
---|
2219 | | - cgroup_post_fork(p); |
---|
2220 | | - cgroup_threadgroup_change_end(current); |
---|
| 2382 | + sched_post_fork(p); |
---|
| 2383 | + cgroup_post_fork(p, args); |
---|
2221 | 2384 | perf_event_fork(p); |
---|
2222 | 2385 | |
---|
2223 | 2386 | trace_task_newtask(p, clone_flags); |
---|
.. | .. |
---|
2230 | 2393 | bad_fork_cancel_cgroup: |
---|
2231 | 2394 | spin_unlock(¤t->sighand->siglock); |
---|
2232 | 2395 | write_unlock_irq(&tasklist_lock); |
---|
2233 | | - cgroup_cancel_fork(p); |
---|
2234 | | -bad_fork_cgroup_threadgroup_change_end: |
---|
2235 | | - cgroup_threadgroup_change_end(current); |
---|
| 2396 | + cgroup_cancel_fork(p, args); |
---|
2236 | 2397 | bad_fork_put_pidfd: |
---|
2237 | | - if (clone_flags & CLONE_PIDFD) |
---|
2238 | | - ksys_close(pidfd); |
---|
| 2398 | + if (clone_flags & CLONE_PIDFD) { |
---|
| 2399 | + fput(pidfile); |
---|
| 2400 | + put_unused_fd(pidfd); |
---|
| 2401 | + } |
---|
2239 | 2402 | bad_fork_free_pid: |
---|
2240 | 2403 | if (pid != &init_struct_pid) |
---|
2241 | 2404 | free_pid(pid); |
---|
.. | .. |
---|
2299 | 2462 | } |
---|
2300 | 2463 | } |
---|
2301 | 2464 | |
---|
2302 | | -struct task_struct *fork_idle(int cpu) |
---|
| 2465 | +struct task_struct * __init fork_idle(int cpu) |
---|
2303 | 2466 | { |
---|
2304 | 2467 | struct task_struct *task; |
---|
2305 | | - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, |
---|
2306 | | - cpu_to_node(cpu)); |
---|
| 2468 | + struct kernel_clone_args args = { |
---|
| 2469 | + .flags = CLONE_VM, |
---|
| 2470 | + }; |
---|
| 2471 | + |
---|
| 2472 | + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); |
---|
2307 | 2473 | if (!IS_ERR(task)) { |
---|
2308 | 2474 | init_idle_pids(task); |
---|
2309 | 2475 | init_idle(task, cpu); |
---|
.. | .. |
---|
2312 | 2478 | return task; |
---|
2313 | 2479 | } |
---|
2314 | 2480 | |
---|
| 2481 | +struct mm_struct *copy_init_mm(void) |
---|
| 2482 | +{ |
---|
| 2483 | + return dup_mm(NULL, &init_mm); |
---|
| 2484 | +} |
---|
| 2485 | + |
---|
| 2486 | +/* |
---|
| 2487 | + * This is like kernel_clone(), but shaved down and tailored to just |
---|
| 2488 | + * creating io_uring workers. It returns a created task, or an error pointer. |
---|
| 2489 | + * The returned task is inactive, and the caller must fire it up through |
---|
| 2490 | + * wake_up_new_task(p). All signals are blocked in the created task. |
---|
| 2491 | + */ |
---|
| 2492 | +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) |
---|
| 2493 | +{ |
---|
| 2494 | + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| |
---|
| 2495 | + CLONE_IO; |
---|
| 2496 | + struct kernel_clone_args args = { |
---|
| 2497 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2498 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2499 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2500 | + .stack = (unsigned long)fn, |
---|
| 2501 | + .stack_size = (unsigned long)arg, |
---|
| 2502 | + .io_thread = 1, |
---|
| 2503 | + }; |
---|
| 2504 | + |
---|
| 2505 | + return copy_process(NULL, 0, node, &args); |
---|
| 2506 | +} |
---|
| 2507 | + |
---|
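The comment above spells out the contract: the caller gets back an inactive task and is responsible for waking it. A minimal in-kernel sketch of that caller pattern (io_worker_fn and start_io_worker are made-up names, not functions from this file):

static int io_worker_fn(void *data)
{
	/* ... service requests until asked to stop ... */
	return 0;
}

static int start_io_worker(void *data)
{
	struct task_struct *tsk;

	tsk = create_io_thread(io_worker_fn, data, NUMA_NO_NODE);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* The returned task is inactive until explicitly woken. */
	wake_up_new_task(tsk);
	return 0;
}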
2315 | 2508 | /* |
---|
2316 | 2509 | * Ok, this is the main fork-routine. |
---|
2317 | 2510 | * |
---|
2318 | 2511 | * It copies the process, and if successful kick-starts |
---|
2319 | 2512 | * it and waits for it to finish using the VM if required. |
---|
| 2513 | + * |
---|
| 2514 | + * args->exit_signal is expected to be checked for sanity by the caller. |
---|
2320 | 2515 | */ |
---|
2321 | | -long _do_fork(unsigned long clone_flags, |
---|
2322 | | - unsigned long stack_start, |
---|
2323 | | - unsigned long stack_size, |
---|
2324 | | - int __user *parent_tidptr, |
---|
2325 | | - int __user *child_tidptr, |
---|
2326 | | - unsigned long tls) |
---|
| 2516 | +pid_t kernel_clone(struct kernel_clone_args *args) |
---|
2327 | 2517 | { |
---|
| 2518 | + u64 clone_flags = args->flags; |
---|
2328 | 2519 | struct completion vfork; |
---|
2329 | 2520 | struct pid *pid; |
---|
2330 | 2521 | struct task_struct *p; |
---|
2331 | 2522 | int trace = 0; |
---|
2332 | | - long nr; |
---|
| 2523 | + pid_t nr; |
---|
| 2524 | + |
---|
| 2525 | + /* |
---|
| 2526 | + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument |
---|
| 2527 | + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are |
---|
| 2528 | + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate |
---|
| 2529 | + * field in struct clone_args and it still doesn't make sense to have |
---|
| 2530 | + * them both point at the same memory location. Performing this check |
---|
| 2531 | + * here has the advantage that we don't need to have a separate helper |
---|
| 2532 | + * to check for legacy clone(). |
---|
| 2533 | + */ |
---|
| 2534 | + if ((args->flags & CLONE_PIDFD) && |
---|
| 2535 | + (args->flags & CLONE_PARENT_SETTID) && |
---|
| 2536 | + (args->pidfd == args->parent_tid)) |
---|
| 2537 | + return -EINVAL; |
---|
2333 | 2538 | |
---|
2334 | 2539 | /* |
---|
2335 | 2540 | * Determine whether and which event to report to ptracer. When |
---|
.. | .. |
---|
2340 | 2545 | if (!(clone_flags & CLONE_UNTRACED)) { |
---|
2341 | 2546 | if (clone_flags & CLONE_VFORK) |
---|
2342 | 2547 | trace = PTRACE_EVENT_VFORK; |
---|
2343 | | - else if ((clone_flags & CSIGNAL) != SIGCHLD) |
---|
| 2548 | + else if (args->exit_signal != SIGCHLD) |
---|
2344 | 2549 | trace = PTRACE_EVENT_CLONE; |
---|
2345 | 2550 | else |
---|
2346 | 2551 | trace = PTRACE_EVENT_FORK; |
---|
.. | .. |
---|
2349 | 2554 | trace = 0; |
---|
2350 | 2555 | } |
---|
2351 | 2556 | |
---|
2352 | | - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, |
---|
2353 | | - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
---|
| 2557 | + p = copy_process(NULL, trace, NUMA_NO_NODE, args); |
---|
2354 | 2558 | add_latent_entropy(); |
---|
2355 | 2559 | |
---|
2356 | 2560 | if (IS_ERR(p)) |
---|
.. | .. |
---|
2368 | 2572 | nr = pid_vnr(pid); |
---|
2369 | 2573 | |
---|
2370 | 2574 | if (clone_flags & CLONE_PARENT_SETTID) |
---|
2371 | | - put_user(nr, parent_tidptr); |
---|
| 2575 | + put_user(nr, args->parent_tid); |
---|
2372 | 2576 | |
---|
2373 | 2577 | if (clone_flags & CLONE_VFORK) { |
---|
2374 | 2578 | p->vfork_done = &vfork; |
---|
.. | .. |
---|
2391 | 2595 | return nr; |
---|
2392 | 2596 | } |
---|
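For context on the CLONE_PIDFD check at the top of kernel_clone(): with the legacy clone() entry point the new pidfd is written back through the parent_tid pointer, which is exactly why combining it with CLONE_PARENT_SETTID is rejected. An illustrative userspace sketch (child_fn and spawn_with_pidfd are made-up names; error handling trimmed):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>

static int child_fn(void *arg)
{
	return 0;
}

static int spawn_with_pidfd(void)
{
	int pidfd = -1;
	size_t stack_size = 1024 * 1024;
	char *stack = malloc(stack_size);

	/* CLONE_PIDFD reuses the parent_tid slot to return the pidfd. */
	if (clone(child_fn, stack + stack_size, CLONE_PIDFD | SIGCHLD,
		  NULL, &pidfd) < 0)
		return -1;

	return pidfd;
}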
2393 | 2597 | |
---|
2394 | | -#ifndef CONFIG_HAVE_COPY_THREAD_TLS |
---|
2395 | | -/* For compatibility with architectures that call do_fork directly rather than |
---|
2396 | | - * using the syscall entry points below. */ |
---|
2397 | | -long do_fork(unsigned long clone_flags, |
---|
2398 | | - unsigned long stack_start, |
---|
2399 | | - unsigned long stack_size, |
---|
2400 | | - int __user *parent_tidptr, |
---|
2401 | | - int __user *child_tidptr) |
---|
2402 | | -{ |
---|
2403 | | - return _do_fork(clone_flags, stack_start, stack_size, |
---|
2404 | | - parent_tidptr, child_tidptr, 0); |
---|
2405 | | -} |
---|
2406 | | -#endif |
---|
2407 | | - |
---|
2408 | 2598 | /* |
---|
2409 | 2599 | * Create a kernel thread. |
---|
2410 | 2600 | */ |
---|
2411 | 2601 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
---|
2412 | 2602 | { |
---|
2413 | | - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
---|
2414 | | - (unsigned long)arg, NULL, NULL, 0); |
---|
| 2603 | + struct kernel_clone_args args = { |
---|
| 2604 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2605 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2606 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2607 | + .stack = (unsigned long)fn, |
---|
| 2608 | + .stack_size = (unsigned long)arg, |
---|
| 2609 | + }; |
---|
| 2610 | + |
---|
| 2611 | + return kernel_clone(&args); |
---|
2415 | 2612 | } |
---|
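kernel_thread() keeps its old calling convention and simply packs the arguments into struct kernel_clone_args. An illustrative in-kernel caller (my_worker and start_worker are made-up names):

static int my_worker(void *unused)
{
	/* runs once in its own task, then exits */
	return 0;
}

static void start_worker(void)
{
	/* The CSIGNAL bits of the flags become the child's exit signal. */
	pid_t pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES | SIGCHLD);

	if (pid < 0)
		pr_warn("worker thread creation failed: %d\n", pid);
}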
2416 | 2613 | |
---|
2417 | 2614 | #ifdef __ARCH_WANT_SYS_FORK |
---|
2418 | 2615 | SYSCALL_DEFINE0(fork) |
---|
2419 | 2616 | { |
---|
2420 | 2617 | #ifdef CONFIG_MMU |
---|
2421 | | - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); |
---|
| 2618 | + struct kernel_clone_args args = { |
---|
| 2619 | + .exit_signal = SIGCHLD, |
---|
| 2620 | + }; |
---|
| 2621 | + |
---|
| 2622 | + return kernel_clone(&args); |
---|
2422 | 2623 | #else |
---|
2423 | 2624 | /* not supported in nommu mode */ |
---|
2424 | 2625 | return -EINVAL; |
---|
.. | .. |
---|
2429 | 2630 | #ifdef __ARCH_WANT_SYS_VFORK |
---|
2430 | 2631 | SYSCALL_DEFINE0(vfork) |
---|
2431 | 2632 | { |
---|
2432 | | - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, |
---|
2433 | | - 0, NULL, NULL, 0); |
---|
| 2633 | + struct kernel_clone_args args = { |
---|
| 2634 | + .flags = CLONE_VFORK | CLONE_VM, |
---|
| 2635 | + .exit_signal = SIGCHLD, |
---|
| 2636 | + }; |
---|
| 2637 | + |
---|
| 2638 | + return kernel_clone(&args); |
---|
2434 | 2639 | } |
---|
2435 | 2640 | #endif |
---|
2436 | 2641 | |
---|
.. | .. |
---|
2458 | 2663 | unsigned long, tls) |
---|
2459 | 2664 | #endif |
---|
2460 | 2665 | { |
---|
2461 | | - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); |
---|
| 2666 | + struct kernel_clone_args args = { |
---|
| 2667 | + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
---|
| 2668 | + .pidfd = parent_tidptr, |
---|
| 2669 | + .child_tid = child_tidptr, |
---|
| 2670 | + .parent_tid = parent_tidptr, |
---|
| 2671 | + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
---|
| 2672 | + .stack = newsp, |
---|
| 2673 | + .tls = tls, |
---|
| 2674 | + }; |
---|
| 2675 | + |
---|
| 2676 | + return kernel_clone(&args); |
---|
| 2677 | +} |
---|
| 2678 | +#endif |
---|
| 2679 | + |
---|
| 2680 | +#ifdef __ARCH_WANT_SYS_CLONE3 |
---|
| 2681 | + |
---|
| 2682 | +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, |
---|
| 2683 | + struct clone_args __user *uargs, |
---|
| 2684 | + size_t usize) |
---|
| 2685 | +{ |
---|
| 2686 | + int err; |
---|
| 2687 | + struct clone_args args; |
---|
| 2688 | + pid_t *kset_tid = kargs->set_tid; |
---|
| 2689 | + |
---|
| 2690 | + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != |
---|
| 2691 | + CLONE_ARGS_SIZE_VER0); |
---|
| 2692 | + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != |
---|
| 2693 | + CLONE_ARGS_SIZE_VER1); |
---|
| 2694 | + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != |
---|
| 2695 | + CLONE_ARGS_SIZE_VER2); |
---|
| 2696 | + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); |
---|
| 2697 | + |
---|
| 2698 | + if (unlikely(usize > PAGE_SIZE)) |
---|
| 2699 | + return -E2BIG; |
---|
| 2700 | + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) |
---|
| 2701 | + return -EINVAL; |
---|
| 2702 | + |
---|
| 2703 | + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); |
---|
| 2704 | + if (err) |
---|
| 2705 | + return err; |
---|
| 2706 | + |
---|
| 2707 | + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) |
---|
| 2708 | + return -EINVAL; |
---|
| 2709 | + |
---|
| 2710 | + if (unlikely(!args.set_tid && args.set_tid_size > 0)) |
---|
| 2711 | + return -EINVAL; |
---|
| 2712 | + |
---|
| 2713 | + if (unlikely(args.set_tid && args.set_tid_size == 0)) |
---|
| 2714 | + return -EINVAL; |
---|
| 2715 | + |
---|
| 2716 | + /* |
---|
| 2717 | + * Verify that the higher 32 bits of exit_signal are unset and that |
---|
| 2718 | + * it is a valid signal |
---|
| 2719 | + */ |
---|
| 2720 | + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || |
---|
| 2721 | + !valid_signal(args.exit_signal))) |
---|
| 2722 | + return -EINVAL; |
---|
| 2723 | + |
---|
| 2724 | + if ((args.flags & CLONE_INTO_CGROUP) && |
---|
| 2725 | + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) |
---|
| 2726 | + return -EINVAL; |
---|
| 2727 | + |
---|
| 2728 | + *kargs = (struct kernel_clone_args){ |
---|
| 2729 | + .flags = args.flags, |
---|
| 2730 | + .pidfd = u64_to_user_ptr(args.pidfd), |
---|
| 2731 | + .child_tid = u64_to_user_ptr(args.child_tid), |
---|
| 2732 | + .parent_tid = u64_to_user_ptr(args.parent_tid), |
---|
| 2733 | + .exit_signal = args.exit_signal, |
---|
| 2734 | + .stack = args.stack, |
---|
| 2735 | + .stack_size = args.stack_size, |
---|
| 2736 | + .tls = args.tls, |
---|
| 2737 | + .set_tid_size = args.set_tid_size, |
---|
| 2738 | + .cgroup = args.cgroup, |
---|
| 2739 | + }; |
---|
| 2740 | + |
---|
| 2741 | + if (args.set_tid && |
---|
| 2742 | + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), |
---|
| 2743 | + (kargs->set_tid_size * sizeof(pid_t)))) |
---|
| 2744 | + return -EFAULT; |
---|
| 2745 | + |
---|
| 2746 | + kargs->set_tid = kset_tid; |
---|
| 2747 | + |
---|
| 2748 | + return 0; |
---|
| 2749 | +} |
---|
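copy_struct_from_user() is what makes struct clone_args extensible by size: a smaller struct (down to CLONE_ARGS_SIZE_VER0) has its missing tail zero-filled by the kernel, while a larger one is accepted only if the bytes the kernel does not know about are zero. An illustrative old-ABI caller (clone3_ver0 is a made-up name):

#include <linux/sched.h>	/* struct clone_args, CLONE_ARGS_SIZE_VER0 */
#include <sys/syscall.h>
#include <unistd.h>

static long clone3_ver0(struct clone_args *args)
{
	/* Only the fields up to and including tls are passed in. */
	return syscall(__NR_clone3, args, CLONE_ARGS_SIZE_VER0);
}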
| 2750 | + |
---|
| 2751 | +/** |
---|
| 2752 | + * clone3_stack_valid - check and prepare stack |
---|
| 2753 | + * @kargs: kernel clone args |
---|
| 2754 | + * |
---|
| 2755 | + * Verify that the stack arguments userspace gave us are sane. |
---|
| 2756 | + * In addition, set the stack direction for userspace since it's easy for us to |
---|
| 2757 | + * determine. |
---|
| 2758 | + */ |
---|
| 2759 | +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) |
---|
| 2760 | +{ |
---|
| 2761 | + if (kargs->stack == 0) { |
---|
| 2762 | + if (kargs->stack_size > 0) |
---|
| 2763 | + return false; |
---|
| 2764 | + } else { |
---|
| 2765 | + if (kargs->stack_size == 0) |
---|
| 2766 | + return false; |
---|
| 2767 | + |
---|
| 2768 | + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) |
---|
| 2769 | + return false; |
---|
| 2770 | + |
---|
| 2771 | +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) |
---|
| 2772 | + kargs->stack += kargs->stack_size; |
---|
| 2773 | +#endif |
---|
| 2774 | + } |
---|
| 2775 | + |
---|
| 2776 | + return true; |
---|
| 2777 | +} |
---|
| 2778 | + |
---|
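Note the direction handling: clone3() takes the stack as its lowest address plus a size, and on stack-grows-down architectures the kernel itself moves the pointer to the top of that range. An illustrative userspace sketch of preparing such a stack (setup_child_stack is a made-up name):

#include <stdint.h>
#include <sys/mman.h>
#include <linux/sched.h>	/* struct clone_args */

#define CHILD_STACK_SIZE (1024 * 1024)

static int setup_child_stack(struct clone_args *args)
{
	void *stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
	if (stack == MAP_FAILED)
		return -1;

	/* Pass the base and the size; the kernel picks the right end. */
	args->stack = (__u64)(uintptr_t)stack;
	args->stack_size = CHILD_STACK_SIZE;
	return 0;
}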
| 2779 | +static bool clone3_args_valid(struct kernel_clone_args *kargs) |
---|
| 2780 | +{ |
---|
| 2781 | + /* Verify that no unknown flags are passed along. */ |
---|
| 2782 | + if (kargs->flags & |
---|
| 2783 | + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) |
---|
| 2784 | + return false; |
---|
| 2785 | + |
---|
| 2786 | + /* |
---|
| 2787 | + * - make the CLONE_DETACHED bit reusable for clone3 |
---|
| 2788 | + * - make the CSIGNAL bits reusable for clone3 |
---|
| 2789 | + */ |
---|
| 2790 | + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) |
---|
| 2791 | + return false; |
---|
| 2792 | + |
---|
| 2793 | + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == |
---|
| 2794 | + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) |
---|
| 2795 | + return false; |
---|
| 2796 | + |
---|
| 2797 | + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && |
---|
| 2798 | + kargs->exit_signal) |
---|
| 2799 | + return false; |
---|
| 2800 | + |
---|
| 2801 | + if (!clone3_stack_valid(kargs)) |
---|
| 2802 | + return false; |
---|
| 2803 | + |
---|
| 2804 | + return true; |
---|
| 2805 | +} |
---|
| 2806 | + |
---|
| 2807 | +/** |
---|
| 2808 | + * clone3 - create a new process with specific properties |
---|
| 2809 | + * @uargs: argument structure |
---|
| 2810 | + * @size: size of @uargs |
---|
| 2811 | + * |
---|
| 2812 | + * clone3() is the extensible successor to clone()/clone2(). |
---|
| 2813 | + * It takes a struct as argument that is versioned by its size. |
---|
| 2814 | + * |
---|
| 2815 | + * Return: On success, a positive PID for the child process. |
---|
| 2816 | + * On error, a negative errno number. |
---|
| 2817 | + */ |
---|
| 2818 | +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) |
---|
| 2819 | +{ |
---|
| 2820 | + int err; |
---|
| 2821 | + |
---|
| 2822 | + struct kernel_clone_args kargs; |
---|
| 2823 | + pid_t set_tid[MAX_PID_NS_LEVEL]; |
---|
| 2824 | + |
---|
| 2825 | + kargs.set_tid = set_tid; |
---|
| 2826 | + |
---|
| 2827 | + err = copy_clone_args_from_user(&kargs, uargs, size); |
---|
| 2828 | + if (err) |
---|
| 2829 | + return err; |
---|
| 2830 | + |
---|
| 2831 | + if (!clone3_args_valid(&kargs)) |
---|
| 2832 | + return -EINVAL; |
---|
| 2833 | + |
---|
| 2834 | + return kernel_clone(&kargs); |
---|
2462 | 2835 | } |
---|
2463 | 2836 | #endif |
---|
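Putting the pieces together, an illustrative fork-style clone3() call that also asks for a pidfd (fork_with_pidfd is a made-up name; a real caller would handle errors and eventually close the pidfd):

#include <stdint.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */

static pid_t fork_with_pidfd(int *pidfd)
{
	struct clone_args args = {
		.flags       = CLONE_PIDFD,
		.pidfd       = (__u64)(uintptr_t)pidfd,
		.exit_signal = SIGCHLD,
		/* stack/stack_size left at 0: the child runs on a copy of
		 * the parent's stack, just like plain fork(). */
	};

	return syscall(__NR_clone3, &args, sizeof(args));
}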
2464 | 2837 | |
---|
.. | .. |
---|
2553 | 2926 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
---|
2554 | 2927 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
---|
2555 | 2928 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
---|
2556 | | - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) |
---|
| 2929 | + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| |
---|
| 2930 | + CLONE_NEWTIME)) |
---|
2557 | 2931 | return -EINVAL; |
---|
2558 | 2932 | /* |
---|
2559 | 2933 | * Not implemented, but pretend it works if there is nothing |
---|
.. | .. |
---|
2566 | 2940 | return -EINVAL; |
---|
2567 | 2941 | } |
---|
2568 | 2942 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { |
---|
2569 | | - if (atomic_read(&current->sighand->count) > 1) |
---|
| 2943 | + if (refcount_read(&current->sighand->count) > 1) |
---|
2570 | 2944 | return -EINVAL; |
---|
2571 | 2945 | } |
---|
2572 | 2946 | if (unshare_flags & CLONE_VM) { |
---|
.. | .. |
---|
2601 | 2975 | /* |
---|
2602 | 2976 | * Unshare file descriptor table if it is being shared |
---|
2603 | 2977 | */ |
---|
2604 | | -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
---|
| 2978 | +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, |
---|
| 2979 | + struct files_struct **new_fdp) |
---|
2605 | 2980 | { |
---|
2606 | 2981 | struct files_struct *fd = current->files; |
---|
2607 | 2982 | int error = 0; |
---|
2608 | 2983 | |
---|
2609 | 2984 | if ((unshare_flags & CLONE_FILES) && |
---|
2610 | 2985 | (fd && atomic_read(&fd->count) > 1)) { |
---|
2611 | | - *new_fdp = dup_fd(fd, &error); |
---|
| 2986 | + *new_fdp = dup_fd(fd, max_fds, &error); |
---|
2612 | 2987 | if (!*new_fdp) |
---|
2613 | 2988 | return error; |
---|
2614 | 2989 | } |
---|
.. | .. |
---|
2619 | 2994 | /* |
---|
2620 | 2995 | * unshare allows a process to 'unshare' part of the process |
---|
2621 | 2996 | * context which was originally shared using clone. copy_* |
---|
2622 | | - * functions used by do_fork() cannot be used here directly |
---|
| 2997 | + * functions used by kernel_clone() cannot be used here directly |
---|
2623 | 2998 | * because they modify an inactive task_struct that is being |
---|
2624 | 2999 | * constructed. Here we are modifying the current, active, |
---|
2625 | 3000 | * task_struct. |
---|
.. | .. |
---|
2668 | 3043 | err = unshare_fs(unshare_flags, &new_fs); |
---|
2669 | 3044 | if (err) |
---|
2670 | 3045 | goto bad_unshare_out; |
---|
2671 | | - err = unshare_fd(unshare_flags, &new_fd); |
---|
| 3046 | + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); |
---|
2672 | 3047 | if (err) |
---|
2673 | 3048 | goto bad_unshare_cleanup_fs; |
---|
2674 | 3049 | err = unshare_userns(unshare_flags, &new_cred); |
---|
.. | .. |
---|
2757 | 3132 | struct files_struct *copy = NULL; |
---|
2758 | 3133 | int error; |
---|
2759 | 3134 | |
---|
2760 | | - error = unshare_fd(CLONE_FILES, &copy); |
---|
| 3135 | + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy); |
---|
2761 | 3136 | if (error || !copy) { |
---|
2762 | 3137 | *displaced = NULL; |
---|
2763 | 3138 | return error; |
---|
.. | .. |
---|
2770 | 3145 | } |
---|
2771 | 3146 | |
---|
2772 | 3147 | int sysctl_max_threads(struct ctl_table *table, int write, |
---|
2773 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 3148 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
2774 | 3149 | { |
---|
2775 | 3150 | struct ctl_table t; |
---|
2776 | 3151 | int ret; |
---|