.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * linux/kernel/fork.c |
---|
3 | 4 | * |
---|
.. | .. |
---|
39 | 40 | #include <linux/binfmts.h> |
---|
40 | 41 | #include <linux/mman.h> |
---|
41 | 42 | #include <linux/mmu_notifier.h> |
---|
42 | | -#include <linux/hmm.h> |
---|
43 | 43 | #include <linux/fs.h> |
---|
44 | 44 | #include <linux/mm.h> |
---|
| 45 | +#include <linux/kprobes.h> |
---|
45 | 46 | #include <linux/vmacache.h> |
---|
46 | 47 | #include <linux/nsproxy.h> |
---|
47 | 48 | #include <linux/capability.h> |
---|
.. | .. |
---|
79 | 80 | #include <linux/blkdev.h> |
---|
80 | 81 | #include <linux/fs_struct.h> |
---|
81 | 82 | #include <linux/magic.h> |
---|
82 | | -#include <linux/sched/mm.h> |
---|
83 | 83 | #include <linux/perf_event.h> |
---|
84 | 84 | #include <linux/posix-timers.h> |
---|
85 | 85 | #include <linux/user-return-notifier.h> |
---|
.. | .. |
---|
93 | 93 | #include <linux/kcov.h> |
---|
94 | 94 | #include <linux/livepatch.h> |
---|
95 | 95 | #include <linux/thread_info.h> |
---|
96 | | -#include <linux/cpufreq_times.h> |
---|
| 96 | +#include <linux/stackleak.h> |
---|
| 97 | +#include <linux/kasan.h> |
---|
97 | 98 | #include <linux/scs.h> |
---|
| 99 | +#include <linux/io_uring.h> |
---|
| 100 | +#include <linux/cpufreq_times.h> |
---|
98 | 101 | |
---|
99 | | -#include <asm/pgtable.h> |
---|
100 | 102 | #include <asm/pgalloc.h> |
---|
101 | 103 | #include <linux/uaccess.h> |
---|
102 | 104 | #include <asm/mmu_context.h> |
---|
.. | .. |
---|
108 | 110 | #define CREATE_TRACE_POINTS |
---|
109 | 111 | #include <trace/events/task.h> |
---|
110 | 112 | |
---|
| 113 | +#undef CREATE_TRACE_POINTS |
---|
| 114 | +#include <trace/hooks/sched.h> |
---|
111 | 115 | /* |
---|
112 | 116 | * Minimum number of threads to boot the kernel |
---|
113 | 117 | */ |
---|
.. | .. |
---|
118 | 122 | */ |
---|
119 | 123 | #define MAX_THREADS FUTEX_TID_MASK |
---|
120 | 124 | |
---|
| 125 | +EXPORT_TRACEPOINT_SYMBOL_GPL(task_newtask); |
---|
| 126 | + |
---|
121 | 127 | /* |
---|
122 | 128 | * Protected counters by write_lock_irq(&tasklist_lock) |
---|
123 | 129 | */ |
---|
124 | 130 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
---|
125 | 131 | int nr_threads; /* The idle threads do not count.. */ |
---|
126 | 132 | |
---|
127 | | -int max_threads; /* tunable limit on nr_threads */ |
---|
| 133 | +static int max_threads; /* tunable limit on nr_threads */ |
---|
| 134 | + |
---|
| 135 | +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) |
---|
| 136 | + |
---|
| 137 | +static const char * const resident_page_types[] = { |
---|
| 138 | + NAMED_ARRAY_INDEX(MM_FILEPAGES), |
---|
| 139 | + NAMED_ARRAY_INDEX(MM_ANONPAGES), |
---|
| 140 | + NAMED_ARRAY_INDEX(MM_SWAPENTS), |
---|
| 141 | + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), |
---|
| 142 | +}; |
---|
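For reference: NAMED_ARRAY_INDEX() pairs a designated array initializer with __stringify(), so every MM counter indexes its own name string. For example, NAMED_ARRAY_INDEX(MM_FILEPAGES) expands to:

    [MM_FILEPAGES] = "MM_FILEPAGES",

which is what check_mm() later prints instead of a bare numeric index.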
128 | 143 | |
---|
129 | 144 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
---|
130 | 145 | |
---|
131 | 146 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
---|
| 147 | +EXPORT_SYMBOL_GPL(tasklist_lock); |
---|
132 | 148 | |
---|
133 | 149 | #ifdef CONFIG_PROVE_RCU |
---|
134 | 150 | int lockdep_tasklist_lock_is_held(void) |
---|
.. | .. |
---|
216 | 232 | if (!s) |
---|
217 | 233 | continue; |
---|
218 | 234 | |
---|
| 235 | + /* Mark stack accessible for KASAN. */ |
---|
| 236 | + kasan_unpoison_range(s->addr, THREAD_SIZE); |
---|
| 237 | + |
---|
219 | 238 | /* Clear stale pointers from reused stack. */ |
---|
220 | 239 | memset(s->addr, 0, THREAD_SIZE); |
---|
221 | 240 | |
---|
.. | .. |
---|
224 | 243 | return s->addr; |
---|
225 | 244 | } |
---|
226 | 245 | |
---|
| 246 | + /* |
---|
| 247 | + * Allocated stacks are cached and later reused by new threads, |
---|
| 248 | + * so memcg accounting is performed manually on assigning/releasing |
---|
| 249 | + * stacks to tasks. Drop __GFP_ACCOUNT. |
---|
| 250 | + */ |
---|
227 | 251 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, |
---|
228 | 252 | VMALLOC_START, VMALLOC_END, |
---|
229 | | - THREADINFO_GFP, |
---|
| 253 | + THREADINFO_GFP & ~__GFP_ACCOUNT, |
---|
230 | 254 | PAGE_KERNEL, |
---|
231 | 255 | 0, node, __builtin_return_address(0)); |
---|
232 | 256 | |
---|
.. | .. |
---|
245 | 269 | THREAD_SIZE_ORDER); |
---|
246 | 270 | |
---|
247 | 271 | if (likely(page)) { |
---|
248 | | - tsk->stack = page_address(page); |
---|
| 272 | + tsk->stack = kasan_reset_tag(page_address(page)); |
---|
249 | 273 | return tsk->stack; |
---|
250 | 274 | } |
---|
251 | 275 | return NULL; |
---|
.. | .. |
---|
255 | 279 | static inline void free_thread_stack(struct task_struct *tsk) |
---|
256 | 280 | { |
---|
257 | 281 | #ifdef CONFIG_VMAP_STACK |
---|
258 | | - if (task_stack_vm_area(tsk)) { |
---|
| 282 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 283 | + |
---|
| 284 | + if (vm) { |
---|
259 | 285 | int i; |
---|
| 286 | + |
---|
| 287 | + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) |
---|
| 288 | + memcg_kmem_uncharge_page(vm->pages[i], 0); |
---|
260 | 289 | |
---|
261 | 290 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
---|
262 | 291 | if (this_cpu_cmpxchg(cached_stacks[i], |
---|
.. | .. |
---|
266 | 295 | return; |
---|
267 | 296 | } |
---|
268 | 297 | |
---|
269 | | - vfree_atomic(tsk->stack); |
---|
| 298 | + vfree(tsk->stack); |
---|
270 | 299 | return; |
---|
271 | 300 | } |
---|
272 | 301 | #endif |
---|
.. | .. |
---|
281 | 310 | { |
---|
282 | 311 | unsigned long *stack; |
---|
283 | 312 | stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); |
---|
| 313 | + stack = kasan_reset_tag(stack); |
---|
284 | 314 | tsk->stack = stack; |
---|
285 | 315 | return stack; |
---|
286 | 316 | } |
---|
.. | .. |
---|
333 | 363 | struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
---|
334 | 364 | |
---|
335 | 365 | if (new) { |
---|
336 | | - *new = *orig; |
---|
337 | | - INIT_LIST_HEAD(&new->anon_vma_chain); |
---|
| 366 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); |
---|
| 367 | + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); |
---|
| 368 | + /* |
---|
| 369 | + * orig->shared.rb may be modified concurrently, but the clone |
---|
| 370 | + * will be reinitialized. |
---|
| 371 | + */ |
---|
| 372 | + *new = data_race(*orig); |
---|
| 373 | + INIT_VMA(new); |
---|
| 374 | + new->vm_next = new->vm_prev = NULL; |
---|
338 | 375 | } |
---|
339 | 376 | return new; |
---|
340 | 377 | } |
---|
.. | .. |
---|
349 | 386 | void *stack = task_stack_page(tsk); |
---|
350 | 387 | struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
351 | 388 | |
---|
| 389 | + |
---|
| 390 | + /* All stack pages are in the same node. */ |
---|
| 391 | + if (vm) |
---|
| 392 | + mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, |
---|
| 393 | + account * (THREAD_SIZE / 1024)); |
---|
| 394 | + else |
---|
| 395 | + mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, |
---|
| 396 | + account * (THREAD_SIZE / 1024)); |
---|
| 397 | +} |
---|
| 398 | + |
---|
| 399 | +static int memcg_charge_kernel_stack(struct task_struct *tsk) |
---|
| 400 | +{ |
---|
| 401 | +#ifdef CONFIG_VMAP_STACK |
---|
| 402 | + struct vm_struct *vm = task_stack_vm_area(tsk); |
---|
| 403 | + int ret; |
---|
| 404 | + |
---|
352 | 405 | BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); |
---|
353 | 406 | |
---|
354 | 407 | if (vm) { |
---|
.. | .. |
---|
357 | 410 | BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); |
---|
358 | 411 | |
---|
359 | 412 | for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { |
---|
360 | | - mod_zone_page_state(page_zone(vm->pages[i]), |
---|
361 | | - NR_KERNEL_STACK_KB, |
---|
362 | | - PAGE_SIZE / 1024 * account); |
---|
| 413 | + /* |
---|
| 414 | + * If memcg_kmem_charge_page() fails, page->mem_cgroup |
---|
| 415 | + * pointer is NULL, and memcg_kmem_uncharge_page() in |
---|
| 416 | + * free_thread_stack() will ignore this page. |
---|
| 417 | + */ |
---|
| 418 | + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, |
---|
| 419 | + 0); |
---|
| 420 | + if (ret) |
---|
| 421 | + return ret; |
---|
363 | 422 | } |
---|
364 | | - |
---|
365 | | - /* All stack pages belong to the same memcg. */ |
---|
366 | | - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
---|
367 | | - account * (THREAD_SIZE / 1024)); |
---|
368 | | - } else { |
---|
369 | | - /* |
---|
370 | | - * All stack pages are in the same zone and belong to the |
---|
371 | | - * same memcg. |
---|
372 | | - */ |
---|
373 | | - struct page *first_page = virt_to_page(stack); |
---|
374 | | - |
---|
375 | | - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
---|
376 | | - THREAD_SIZE / 1024 * account); |
---|
377 | | - |
---|
378 | | - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
---|
379 | | - account * (THREAD_SIZE / 1024)); |
---|
380 | 423 | } |
---|
| 424 | +#endif |
---|
| 425 | + return 0; |
---|
381 | 426 | } |
---|
382 | 427 | |
---|
383 | 428 | static void release_task_stack(struct task_struct *tsk) |
---|
.. | .. |
---|
396 | 441 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
397 | 442 | void put_task_stack(struct task_struct *tsk) |
---|
398 | 443 | { |
---|
399 | | - if (atomic_dec_and_test(&tsk->stack_refcount)) |
---|
| 444 | + if (refcount_dec_and_test(&tsk->stack_refcount)) |
---|
400 | 445 | release_task_stack(tsk); |
---|
401 | 446 | } |
---|
| 447 | +EXPORT_SYMBOL_GPL(put_task_stack); |
---|
402 | 448 | #endif |
---|
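The atomic_t to refcount_t conversions in this file (stack_refcount, usage, sigcnt, sighand->count) all follow the same pattern; a minimal sketch of the lifetime it enforces (illustrative only, not part of the patch — unlike a raw atomic_t, refcount_t saturates and WARNs on over- and underflow):

    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct foo {
            refcount_t ref;
    };

    static void foo_get(struct foo *f)
    {
            refcount_inc(&f->ref);                  /* WARNs if the count was already 0 */
    }

    static void foo_put(struct foo *f)
    {
            if (refcount_dec_and_test(&f->ref))     /* true only for the last reference */
                    kfree(f);
    }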
403 | 449 | |
---|
404 | 450 | void free_task(struct task_struct *tsk) |
---|
.. | .. |
---|
406 | 452 | cpufreq_task_times_exit(tsk); |
---|
407 | 453 | scs_release(tsk); |
---|
408 | 454 | |
---|
| 455 | + trace_android_vh_free_task(tsk); |
---|
409 | 456 | #ifndef CONFIG_THREAD_INFO_IN_TASK |
---|
410 | 457 | /* |
---|
411 | 458 | * The task is finally done with both the stack and thread_info, |
---|
.. | .. |
---|
417 | 464 | * If the task had a separate stack allocation, it should be gone |
---|
418 | 465 | * by now. |
---|
419 | 466 | */ |
---|
420 | | - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); |
---|
| 467 | + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); |
---|
421 | 468 | #endif |
---|
422 | 469 | rt_mutex_debug_task_free(tsk); |
---|
423 | 470 | ftrace_graph_exit_task(tsk); |
---|
424 | | - put_seccomp_filter(tsk); |
---|
425 | 471 | arch_release_task_struct(tsk); |
---|
426 | 472 | if (tsk->flags & PF_KTHREAD) |
---|
427 | 473 | free_kthread_struct(tsk); |
---|
.. | .. |
---|
433 | 479 | static __latent_entropy int dup_mmap(struct mm_struct *mm, |
---|
434 | 480 | struct mm_struct *oldmm) |
---|
435 | 481 | { |
---|
436 | | - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
---|
| 482 | + struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL; |
---|
437 | 483 | struct rb_node **rb_link, *rb_parent; |
---|
438 | 484 | int retval; |
---|
439 | 485 | unsigned long charge; |
---|
440 | 486 | LIST_HEAD(uf); |
---|
441 | 487 | |
---|
442 | 488 | uprobe_start_dup_mmap(); |
---|
443 | | - if (down_write_killable(&oldmm->mmap_sem)) { |
---|
| 489 | + if (mmap_write_lock_killable(oldmm)) { |
---|
444 | 490 | retval = -EINTR; |
---|
445 | 491 | goto fail_uprobe_end; |
---|
446 | 492 | } |
---|
.. | .. |
---|
449 | 495 | /* |
---|
450 | 496 | * Not linked in yet - no deadlock potential: |
---|
451 | 497 | */ |
---|
452 | | - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
---|
| 498 | + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); |
---|
453 | 499 | |
---|
454 | 500 | /* No ordering required: file already has been exposed. */ |
---|
455 | 501 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
.. | .. |
---|
504 | 550 | if (retval) |
---|
505 | 551 | goto fail_nomem_anon_vma_fork; |
---|
506 | 552 | if (tmp->vm_flags & VM_WIPEONFORK) { |
---|
507 | | - /* VM_WIPEONFORK gets a clean slate in the child. */ |
---|
| 553 | + /* |
---|
| 554 | + * VM_WIPEONFORK gets a clean slate in the child. |
---|
| 555 | + * Don't prepare anon_vma until fault since we don't |
---|
| 556 | + * copy page for current vma. |
---|
| 557 | + */ |
---|
508 | 558 | tmp->anon_vma = NULL; |
---|
509 | | - if (anon_vma_prepare(tmp)) |
---|
510 | | - goto fail_nomem_anon_vma_fork; |
---|
511 | 559 | } else if (anon_vma_fork(tmp, mpnt)) |
---|
512 | 560 | goto fail_nomem_anon_vma_fork; |
---|
513 | 561 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); |
---|
514 | | - tmp->vm_next = tmp->vm_prev = NULL; |
---|
515 | 562 | file = tmp->vm_file; |
---|
516 | 563 | if (file) { |
---|
517 | 564 | struct inode *inode = file_inode(file); |
---|
.. | .. |
---|
519 | 566 | |
---|
520 | 567 | get_file(file); |
---|
521 | 568 | if (tmp->vm_flags & VM_DENYWRITE) |
---|
522 | | - atomic_dec(&inode->i_writecount); |
---|
| 569 | + put_write_access(inode); |
---|
523 | 570 | i_mmap_lock_write(mapping); |
---|
524 | 571 | if (tmp->vm_flags & VM_SHARED) |
---|
525 | | - atomic_inc(&mapping->i_mmap_writable); |
---|
| 572 | + mapping_allow_writable(mapping); |
---|
526 | 573 | flush_dcache_mmap_lock(mapping); |
---|
527 | 574 | /* insert tmp into the share list, just after mpnt */ |
---|
528 | 575 | vma_interval_tree_insert_after(tmp, mpnt, |
---|
.. | .. |
---|
552 | 599 | rb_parent = &tmp->vm_rb; |
---|
553 | 600 | |
---|
554 | 601 | mm->map_count++; |
---|
555 | | - if (!(tmp->vm_flags & VM_WIPEONFORK)) |
---|
556 | | - retval = copy_page_range(mm, oldmm, mpnt); |
---|
| 602 | + if (!(tmp->vm_flags & VM_WIPEONFORK)) { |
---|
| 603 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 604 | + /* |
---|
| 605 | + * Mark this VMA as changing to prevent the |
---|
| 606 | + * speculative page fault hanlder to process |
---|
| 607 | + * it until the TLB are flushed below. |
---|
| 608 | + */ |
---|
| 609 | + last = mpnt; |
---|
| 610 | + vm_write_begin(mpnt); |
---|
| 611 | + } |
---|
| 612 | + retval = copy_page_range(tmp, mpnt); |
---|
| 613 | + } |
---|
557 | 614 | |
---|
558 | 615 | if (tmp->vm_ops && tmp->vm_ops->open) |
---|
559 | 616 | tmp->vm_ops->open(tmp); |
---|
.. | .. |
---|
564 | 621 | /* a new mm has just been created */ |
---|
565 | 622 | retval = arch_dup_mmap(oldmm, mm); |
---|
566 | 623 | out: |
---|
567 | | - up_write(&mm->mmap_sem); |
---|
| 624 | + mmap_write_unlock(mm); |
---|
568 | 625 | flush_tlb_mm(oldmm); |
---|
569 | | - up_write(&oldmm->mmap_sem); |
---|
| 626 | + |
---|
| 627 | + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) { |
---|
| 628 | + /* |
---|
| 629 | + * Since the TLB has been flush, we can safely unmark the |
---|
| 630 | + * copied VMAs and allows the speculative page fault handler to |
---|
| 631 | + * process them again. |
---|
| 632 | + * Walk back the VMA list from the last marked VMA. |
---|
| 633 | + */ |
---|
| 634 | + for (; last; last = last->vm_prev) { |
---|
| 635 | + if (last->vm_flags & VM_DONTCOPY) |
---|
| 636 | + continue; |
---|
| 637 | + if (!(last->vm_flags & VM_WIPEONFORK)) |
---|
| 638 | + vm_write_end(last); |
---|
| 639 | + } |
---|
| 640 | + } |
---|
| 641 | + |
---|
| 642 | + mmap_write_unlock(oldmm); |
---|
570 | 643 | dup_userfaultfd_complete(&uf); |
---|
571 | 644 | fail_uprobe_end: |
---|
572 | 645 | uprobe_end_dup_mmap(); |
---|
.. | .. |
---|
596 | 669 | #else |
---|
597 | 670 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
---|
598 | 671 | { |
---|
599 | | - down_write(&oldmm->mmap_sem); |
---|
| 672 | + mmap_write_lock(oldmm); |
---|
600 | 673 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); |
---|
601 | | - up_write(&oldmm->mmap_sem); |
---|
| 674 | + mmap_write_unlock(oldmm); |
---|
602 | 675 | return 0; |
---|
603 | 676 | } |
---|
604 | 677 | #define mm_alloc_pgd(mm) (0) |
---|
.. | .. |
---|
609 | 682 | { |
---|
610 | 683 | int i; |
---|
611 | 684 | |
---|
| 685 | + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, |
---|
| 686 | + "Please make sure 'struct resident_page_types[]' is updated as well"); |
---|
| 687 | + |
---|
612 | 688 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
---|
613 | 689 | long x = atomic_long_read(&mm->rss_stat.count[i]); |
---|
614 | 690 | |
---|
615 | 691 | if (unlikely(x)) |
---|
616 | | - printk(KERN_ALERT "BUG: Bad rss-counter state " |
---|
617 | | - "mm:%p idx:%d val:%ld\n", mm, i, x); |
---|
| 692 | + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", |
---|
| 693 | + mm, resident_page_types[i], x); |
---|
618 | 694 | } |
---|
619 | 695 | |
---|
620 | 696 | if (mm_pgtables_bytes(mm)) |
---|
.. | .. |
---|
641 | 717 | WARN_ON_ONCE(mm == current->active_mm); |
---|
642 | 718 | mm_free_pgd(mm); |
---|
643 | 719 | destroy_context(mm); |
---|
644 | | - hmm_mm_destroy(mm); |
---|
645 | | - mmu_notifier_mm_destroy(mm); |
---|
| 720 | + mmu_notifier_subscriptions_destroy(mm); |
---|
646 | 721 | check_mm(mm); |
---|
647 | 722 | put_user_ns(mm->user_ns); |
---|
648 | 723 | free_mm(mm); |
---|
649 | 724 | } |
---|
650 | 725 | EXPORT_SYMBOL_GPL(__mmdrop); |
---|
| 726 | + |
---|
| 727 | +#ifdef CONFIG_PREEMPT_RT |
---|
| 728 | +/* |
---|
| 729 | + * RCU callback for delayed mm drop. Not strictly RCU, but we don't |
---|
| 730 | + * want to introduce another facility just for this. |
---|
| 731 | + */ |
---|
| 732 | +void __mmdrop_delayed(struct rcu_head *rhp) |
---|
| 733 | +{ |
---|
| 734 | + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); |
---|
| 735 | + |
---|
| 736 | + __mmdrop(mm); |
---|
| 737 | +} |
---|
| 738 | +#endif |
---|
651 | 739 | |
---|
652 | 740 | static void mmdrop_async_fn(struct work_struct *work) |
---|
653 | 741 | { |
---|
.. | .. |
---|
680 | 768 | |
---|
681 | 769 | static inline void put_signal_struct(struct signal_struct *sig) |
---|
682 | 770 | { |
---|
683 | | - if (atomic_dec_and_test(&sig->sigcnt)) |
---|
| 771 | + if (refcount_dec_and_test(&sig->sigcnt)) |
---|
684 | 772 | free_signal_struct(sig); |
---|
685 | 773 | } |
---|
686 | 774 | |
---|
687 | 775 | void __put_task_struct(struct task_struct *tsk) |
---|
688 | 776 | { |
---|
689 | 777 | WARN_ON(!tsk->exit_state); |
---|
690 | | - WARN_ON(atomic_read(&tsk->usage)); |
---|
| 778 | + WARN_ON(refcount_read(&tsk->usage)); |
---|
691 | 779 | WARN_ON(tsk == current); |
---|
692 | 780 | |
---|
| 781 | + /* |
---|
| 782 | + * Remove function-return probe instances associated with this |
---|
| 783 | + * task and put them back on the free list. |
---|
| 784 | + */ |
---|
| 785 | + kprobe_flush_task(tsk); |
---|
| 786 | + |
---|
| 787 | + /* Task is done with its stack. */ |
---|
| 788 | + put_task_stack(tsk); |
---|
| 789 | + |
---|
| 790 | + io_uring_free(tsk); |
---|
693 | 791 | cgroup_free(tsk); |
---|
694 | 792 | task_numa_free(tsk, true); |
---|
695 | 793 | security_task_free(tsk); |
---|
.. | .. |
---|
710 | 808 | static void set_max_threads(unsigned int max_threads_suggested) |
---|
711 | 809 | { |
---|
712 | 810 | u64 threads; |
---|
| 811 | + unsigned long nr_pages = totalram_pages(); |
---|
713 | 812 | |
---|
714 | 813 | /* |
---|
715 | 814 | * The number of threads shall be limited such that the thread |
---|
716 | 815 | * structures may only consume a small part of the available memory. |
---|
717 | 816 | */ |
---|
718 | | - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) |
---|
| 817 | + if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) |
---|
719 | 818 | threads = MAX_THREADS; |
---|
720 | 819 | else |
---|
721 | | - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, |
---|
| 820 | + threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, |
---|
722 | 821 | (u64) THREAD_SIZE * 8UL); |
---|
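A worked example of this limit, with illustrative figures (4 GiB of RAM, 4 KiB pages, 16 KiB kernel stacks):

    threads = (1048576 * 4096) / (16384 * 8) = 32768

so thread structures are capped at roughly 1/8 of available memory, before the result is clamped against MAX_THREADS and the suggested maximum handled below.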
723 | 822 | |
---|
724 | 823 | if (threads > max_threads_suggested) |
---|
.. | .. |
---|
732 | 831 | int arch_task_struct_size __read_mostly; |
---|
733 | 832 | #endif |
---|
734 | 833 | |
---|
| 834 | +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
---|
735 | 835 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) |
---|
736 | 836 | { |
---|
737 | 837 | /* Fetch thread_struct whitelist for the architecture. */ |
---|
.. | .. |
---|
746 | 846 | else |
---|
747 | 847 | *offset += offsetof(struct task_struct, thread); |
---|
748 | 848 | } |
---|
| 849 | +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ |
---|
749 | 850 | |
---|
750 | 851 | void __init fork_init(void) |
---|
751 | 852 | { |
---|
.. | .. |
---|
787 | 888 | scs_init(); |
---|
788 | 889 | |
---|
789 | 890 | lockdep_init_task(&init_task); |
---|
| 891 | + uprobes_init(); |
---|
790 | 892 | } |
---|
791 | 893 | |
---|
792 | 894 | int __weak arch_dup_task_struct(struct task_struct *dst, |
---|
.. | .. |
---|
808 | 910 | { |
---|
809 | 911 | struct task_struct *tsk; |
---|
810 | 912 | unsigned long *stack; |
---|
811 | | - struct vm_struct *stack_vm_area; |
---|
| 913 | + struct vm_struct *stack_vm_area __maybe_unused; |
---|
812 | 914 | int err; |
---|
813 | 915 | |
---|
814 | 916 | if (node == NUMA_NO_NODE) |
---|
.. | .. |
---|
820 | 922 | stack = alloc_thread_stack_node(tsk, node); |
---|
821 | 923 | if (!stack) |
---|
822 | 924 | goto free_tsk; |
---|
| 925 | + |
---|
| 926 | + if (memcg_charge_kernel_stack(tsk)) |
---|
| 927 | + goto free_stack; |
---|
823 | 928 | |
---|
824 | 929 | stack_vm_area = task_stack_vm_area(tsk); |
---|
825 | 930 | |
---|
.. | .. |
---|
835 | 940 | tsk->stack_vm_area = stack_vm_area; |
---|
836 | 941 | #endif |
---|
837 | 942 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
---|
838 | | - atomic_set(&tsk->stack_refcount, 1); |
---|
| 943 | + refcount_set(&tsk->stack_refcount, 1); |
---|
839 | 944 | #endif |
---|
840 | 945 | |
---|
841 | 946 | if (err) |
---|
.. | .. |
---|
863 | 968 | #ifdef CONFIG_STACKPROTECTOR |
---|
864 | 969 | tsk->stack_canary = get_random_canary(); |
---|
865 | 970 | #endif |
---|
| 971 | + if (orig->cpus_ptr == &orig->cpus_mask) |
---|
| 972 | + tsk->cpus_ptr = &tsk->cpus_mask; |
---|
866 | 973 | |
---|
867 | 974 | /* |
---|
868 | | - * One for us, one for whoever does the "release_task()" (usually |
---|
869 | | - * parent) |
---|
| 975 | + * One for the user space visible state that goes away when reaped. |
---|
| 976 | + * One for the scheduler. |
---|
870 | 977 | */ |
---|
871 | | - atomic_set(&tsk->usage, 2); |
---|
| 978 | + refcount_set(&tsk->rcu_users, 2); |
---|
| 979 | + /* One for the rcu users */ |
---|
| 980 | + refcount_set(&tsk->usage, 1); |
---|
872 | 981 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
---|
873 | 982 | tsk->btrace_seq = 0; |
---|
874 | 983 | #endif |
---|
875 | 984 | tsk->splice_pipe = NULL; |
---|
876 | 985 | tsk->task_frag.page = NULL; |
---|
877 | 986 | tsk->wake_q.next = NULL; |
---|
| 987 | + tsk->wake_q_sleeper.next = NULL; |
---|
| 988 | + tsk->pf_io_worker = NULL; |
---|
878 | 989 | |
---|
879 | 990 | account_kernel_stack(tsk, 1); |
---|
880 | 991 | |
---|
881 | 992 | kcov_task_init(tsk); |
---|
| 993 | + kmap_local_fork(tsk); |
---|
882 | 994 | |
---|
883 | 995 | #ifdef CONFIG_FAULT_INJECTION |
---|
884 | 996 | tsk->fail_nth = 0; |
---|
.. | .. |
---|
892 | 1004 | #ifdef CONFIG_MEMCG |
---|
893 | 1005 | tsk->active_memcg = NULL; |
---|
894 | 1006 | #endif |
---|
| 1007 | + |
---|
| 1008 | + android_init_vendor_data(tsk, 1); |
---|
| 1009 | + android_init_oem_data(tsk, 1); |
---|
| 1010 | + |
---|
| 1011 | + trace_android_vh_dup_task_struct(tsk, orig); |
---|
895 | 1012 | return tsk; |
---|
896 | 1013 | |
---|
897 | 1014 | free_stack: |
---|
.. | .. |
---|
941 | 1058 | #endif |
---|
942 | 1059 | } |
---|
943 | 1060 | |
---|
| 1061 | +static void mm_init_pasid(struct mm_struct *mm) |
---|
| 1062 | +{ |
---|
| 1063 | +#ifdef CONFIG_IOMMU_SUPPORT |
---|
| 1064 | + mm->pasid = INIT_PASID; |
---|
| 1065 | +#endif |
---|
| 1066 | +} |
---|
| 1067 | + |
---|
944 | 1068 | static void mm_init_uprobes_state(struct mm_struct *mm) |
---|
945 | 1069 | { |
---|
946 | 1070 | #ifdef CONFIG_UPROBES |
---|
.. | .. |
---|
954 | 1078 | mm->mmap = NULL; |
---|
955 | 1079 | mm->mm_rb = RB_ROOT; |
---|
956 | 1080 | mm->vmacache_seqnum = 0; |
---|
| 1081 | +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT |
---|
| 1082 | + rwlock_init(&mm->mm_rb_lock); |
---|
| 1083 | +#endif |
---|
957 | 1084 | atomic_set(&mm->mm_users, 1); |
---|
958 | 1085 | atomic_set(&mm->mm_count, 1); |
---|
959 | | - init_rwsem(&mm->mmap_sem); |
---|
| 1086 | + seqcount_init(&mm->write_protect_seq); |
---|
| 1087 | + mmap_init_lock(mm); |
---|
960 | 1088 | INIT_LIST_HEAD(&mm->mmlist); |
---|
961 | 1089 | mm->core_state = NULL; |
---|
962 | 1090 | mm_pgtables_bytes_init(mm); |
---|
963 | 1091 | mm->map_count = 0; |
---|
964 | 1092 | mm->locked_vm = 0; |
---|
965 | | - mm->pinned_vm = 0; |
---|
| 1093 | + atomic_set(&mm->has_pinned, 0); |
---|
| 1094 | + atomic64_set(&mm->pinned_vm, 0); |
---|
966 | 1095 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
---|
967 | 1096 | spin_lock_init(&mm->page_table_lock); |
---|
968 | 1097 | spin_lock_init(&mm->arg_lock); |
---|
969 | 1098 | mm_init_cpumask(mm); |
---|
970 | 1099 | mm_init_aio(mm); |
---|
971 | 1100 | mm_init_owner(mm, p); |
---|
| 1101 | + mm_init_pasid(mm); |
---|
972 | 1102 | RCU_INIT_POINTER(mm->exe_file, NULL); |
---|
973 | | - mmu_notifier_mm_init(mm); |
---|
974 | | - hmm_mm_init(mm); |
---|
| 1103 | + if (!mmu_notifier_subscriptions_init(mm)) |
---|
| 1104 | + goto fail_nopgd; |
---|
975 | 1105 | init_tlb_flush_pending(mm); |
---|
976 | 1106 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
---|
977 | 1107 | mm->pmd_huge_pte = NULL; |
---|
.. | .. |
---|
1046 | 1176 | { |
---|
1047 | 1177 | might_sleep(); |
---|
1048 | 1178 | |
---|
1049 | | - if (atomic_dec_and_test(&mm->mm_users)) |
---|
| 1179 | + if (atomic_dec_and_test(&mm->mm_users)) { |
---|
| 1180 | + trace_android_vh_mmput(NULL); |
---|
1050 | 1181 | __mmput(mm); |
---|
| 1182 | + } |
---|
1051 | 1183 | } |
---|
1052 | 1184 | EXPORT_SYMBOL_GPL(mmput); |
---|
1053 | 1185 | |
---|
.. | .. |
---|
1067 | 1199 | schedule_work(&mm->async_put_work); |
---|
1068 | 1200 | } |
---|
1069 | 1201 | } |
---|
| 1202 | +EXPORT_SYMBOL_GPL(mmput_async); |
---|
1070 | 1203 | #endif |
---|
1071 | 1204 | |
---|
1072 | 1205 | /** |
---|
.. | .. |
---|
1171 | 1304 | struct mm_struct *mm; |
---|
1172 | 1305 | int err; |
---|
1173 | 1306 | |
---|
1174 | | - err = mutex_lock_killable(&task->signal->cred_guard_mutex); |
---|
| 1307 | + err = down_read_killable(&task->signal->exec_update_lock); |
---|
1175 | 1308 | if (err) |
---|
1176 | 1309 | return ERR_PTR(err); |
---|
1177 | 1310 | |
---|
.. | .. |
---|
1181 | 1314 | mmput(mm); |
---|
1182 | 1315 | mm = ERR_PTR(-EACCES); |
---|
1183 | 1316 | } |
---|
1184 | | - mutex_unlock(&task->signal->cred_guard_mutex); |
---|
| 1317 | + up_read(&task->signal->exec_update_lock); |
---|
1185 | 1318 | |
---|
1186 | 1319 | return mm; |
---|
1187 | 1320 | } |
---|
.. | .. |
---|
1279 | 1412 | mm_release(tsk, mm); |
---|
1280 | 1413 | } |
---|
1281 | 1414 | |
---|
1282 | | -/* |
---|
1283 | | - * Allocate a new mm structure and copy contents from the |
---|
1284 | | - * mm structure of the passed in task structure. |
---|
| 1415 | +/** |
---|
| 1416 | + * dup_mm() - duplicates an existing mm structure |
---|
| 1417 | + * @tsk: the task_struct with which the new mm will be associated. |
---|
| 1418 | + * @oldmm: the mm to duplicate. |
---|
| 1419 | + * |
---|
| 1420 | + * Allocates a new mm structure and duplicates the provided @oldmm structure |
---|
| 1421 | + * content into it. |
---|
| 1422 | + * |
---|
| 1423 | + * Return: the duplicated mm or NULL on failure. |
---|
1285 | 1424 | */ |
---|
1286 | | -static struct mm_struct *dup_mm(struct task_struct *tsk) |
---|
| 1425 | +static struct mm_struct *dup_mm(struct task_struct *tsk, |
---|
| 1426 | + struct mm_struct *oldmm) |
---|
1287 | 1427 | { |
---|
1288 | | - struct mm_struct *mm, *oldmm = current->mm; |
---|
| 1428 | + struct mm_struct *mm; |
---|
1289 | 1429 | int err; |
---|
1290 | 1430 | |
---|
1291 | 1431 | mm = allocate_mm(); |
---|
.. | .. |
---|
1353 | 1493 | } |
---|
1354 | 1494 | |
---|
1355 | 1495 | retval = -ENOMEM; |
---|
1356 | | - mm = dup_mm(tsk); |
---|
| 1496 | + mm = dup_mm(tsk, current->mm); |
---|
1357 | 1497 | if (!mm) |
---|
1358 | 1498 | goto fail_nomem; |
---|
1359 | 1499 | |
---|
.. | .. |
---|
1403 | 1543 | goto out; |
---|
1404 | 1544 | } |
---|
1405 | 1545 | |
---|
1406 | | - newf = dup_fd(oldf, &error); |
---|
| 1546 | + newf = dup_fd(oldf, NR_OPEN_MAX, &error); |
---|
1407 | 1547 | if (!newf) |
---|
1408 | 1548 | goto out; |
---|
1409 | 1549 | |
---|
.. | .. |
---|
1444 | 1584 | struct sighand_struct *sig; |
---|
1445 | 1585 | |
---|
1446 | 1586 | if (clone_flags & CLONE_SIGHAND) { |
---|
1447 | | - atomic_inc(¤t->sighand->count); |
---|
| 1587 | + refcount_inc(¤t->sighand->count); |
---|
1448 | 1588 | return 0; |
---|
1449 | 1589 | } |
---|
1450 | 1590 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
---|
1451 | | - rcu_assign_pointer(tsk->sighand, sig); |
---|
| 1591 | + RCU_INIT_POINTER(tsk->sighand, sig); |
---|
1452 | 1592 | if (!sig) |
---|
1453 | 1593 | return -ENOMEM; |
---|
1454 | 1594 | |
---|
1455 | | - atomic_set(&sig->count, 1); |
---|
| 1595 | + refcount_set(&sig->count, 1); |
---|
1456 | 1596 | spin_lock_irq(¤t->sighand->siglock); |
---|
1457 | 1597 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
---|
1458 | 1598 | spin_unlock_irq(¤t->sighand->siglock); |
---|
| 1599 | + |
---|
| 1600 | + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ |
---|
| 1601 | + if (clone_flags & CLONE_CLEAR_SIGHAND) |
---|
| 1602 | + flush_signal_handlers(tsk, 0); |
---|
| 1603 | + |
---|
1459 | 1604 | return 0; |
---|
1460 | 1605 | } |
---|
1461 | 1606 | |
---|
1462 | 1607 | void __cleanup_sighand(struct sighand_struct *sighand) |
---|
1463 | 1608 | { |
---|
1464 | | - if (atomic_dec_and_test(&sighand->count)) { |
---|
| 1609 | + if (refcount_dec_and_test(&sighand->count)) { |
---|
1465 | 1610 | signalfd_cleanup(sighand); |
---|
1466 | 1611 | /* |
---|
1467 | 1612 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
---|
.. | .. |
---|
1471 | 1616 | } |
---|
1472 | 1617 | } |
---|
1473 | 1618 | |
---|
1474 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1475 | 1619 | /* |
---|
1476 | 1620 | * Initialize POSIX timer handling for a thread group. |
---|
1477 | 1621 | */ |
---|
1478 | 1622 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
---|
1479 | 1623 | { |
---|
| 1624 | + struct posix_cputimers *pct = &sig->posix_cputimers; |
---|
1480 | 1625 | unsigned long cpu_limit; |
---|
1481 | 1626 | |
---|
1482 | 1627 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
---|
1483 | | - if (cpu_limit != RLIM_INFINITY) { |
---|
1484 | | - sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; |
---|
1485 | | - sig->cputimer.running = true; |
---|
1486 | | - } |
---|
1487 | | - |
---|
1488 | | - /* The timer lists. */ |
---|
1489 | | - INIT_LIST_HEAD(&sig->cpu_timers[0]); |
---|
1490 | | - INIT_LIST_HEAD(&sig->cpu_timers[1]); |
---|
1491 | | - INIT_LIST_HEAD(&sig->cpu_timers[2]); |
---|
| 1628 | + posix_cputimers_group_init(pct, cpu_limit); |
---|
1492 | 1629 | } |
---|
1493 | | -#else |
---|
1494 | | -static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { } |
---|
1495 | | -#endif |
---|
1496 | 1630 | |
---|
1497 | 1631 | static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) |
---|
1498 | 1632 | { |
---|
.. | .. |
---|
1508 | 1642 | |
---|
1509 | 1643 | sig->nr_threads = 1; |
---|
1510 | 1644 | atomic_set(&sig->live, 1); |
---|
1511 | | - atomic_set(&sig->sigcnt, 1); |
---|
| 1645 | + refcount_set(&sig->sigcnt, 1); |
---|
1512 | 1646 | |
---|
1513 | 1647 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ |
---|
1514 | 1648 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); |
---|
.. | .. |
---|
1540 | 1674 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
---|
1541 | 1675 | |
---|
1542 | 1676 | mutex_init(&sig->cred_guard_mutex); |
---|
| 1677 | + init_rwsem(&sig->exec_update_lock); |
---|
1543 | 1678 | |
---|
1544 | 1679 | return 0; |
---|
1545 | 1680 | } |
---|
.. | .. |
---|
1594 | 1729 | #endif |
---|
1595 | 1730 | } |
---|
1596 | 1731 | |
---|
1597 | | -#ifdef CONFIG_POSIX_TIMERS |
---|
1598 | | -/* |
---|
1599 | | - * Initialize POSIX timer handling for a single task. |
---|
1600 | | - */ |
---|
1601 | | -static void posix_cpu_timers_init(struct task_struct *tsk) |
---|
1602 | | -{ |
---|
1603 | | - tsk->cputime_expires.prof_exp = 0; |
---|
1604 | | - tsk->cputime_expires.virt_exp = 0; |
---|
1605 | | - tsk->cputime_expires.sched_exp = 0; |
---|
1606 | | - INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
---|
1607 | | - INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
---|
1608 | | - INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
---|
1609 | | -} |
---|
1610 | | -#else |
---|
1611 | | -static inline void posix_cpu_timers_init(struct task_struct *tsk) { } |
---|
1612 | | -#endif |
---|
1613 | | - |
---|
1614 | 1732 | static inline void init_task_pid_links(struct task_struct *task) |
---|
1615 | 1733 | { |
---|
1616 | 1734 | enum pid_type type; |
---|
.. | .. |
---|
1642 | 1760 | INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); |
---|
1643 | 1761 | p->rcu_tasks_idle_cpu = -1; |
---|
1644 | 1762 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
---|
| 1763 | +#ifdef CONFIG_TASKS_TRACE_RCU |
---|
| 1764 | + p->trc_reader_nesting = 0; |
---|
| 1765 | + p->trc_reader_special.s = 0; |
---|
| 1766 | + INIT_LIST_HEAD(&p->trc_holdout_list); |
---|
| 1767 | +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
---|
1645 | 1768 | } |
---|
| 1769 | + |
---|
| 1770 | +struct pid *pidfd_pid(const struct file *file) |
---|
| 1771 | +{ |
---|
| 1772 | + if (file->f_op == &pidfd_fops) |
---|
| 1773 | + return file->private_data; |
---|
| 1774 | + |
---|
| 1775 | + return ERR_PTR(-EBADF); |
---|
| 1776 | +} |
---|
| 1777 | + |
---|
| 1778 | +static int pidfd_release(struct inode *inode, struct file *file) |
---|
| 1779 | +{ |
---|
| 1780 | + struct pid *pid = file->private_data; |
---|
| 1781 | + |
---|
| 1782 | + file->private_data = NULL; |
---|
| 1783 | + put_pid(pid); |
---|
| 1784 | + return 0; |
---|
| 1785 | +} |
---|
| 1786 | + |
---|
| 1787 | +#ifdef CONFIG_PROC_FS |
---|
| 1788 | +/** |
---|
| 1789 | + * pidfd_show_fdinfo - print information about a pidfd |
---|
| 1790 | + * @m: proc fdinfo file |
---|
| 1791 | + * @f: file referencing a pidfd |
---|
| 1792 | + * |
---|
| 1793 | + * Pid: |
---|
| 1794 | + * This function will print the pid that a given pidfd refers to in the |
---|
| 1795 | + * pid namespace of the procfs instance. |
---|
| 1796 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1797 | + * namespace of the procfs instance 0 will be shown as its pid. This is |
---|
| 1798 | + * similar to calling getppid() on a process whose parent is outside of |
---|
| 1799 | + * its pid namespace. |
---|
| 1800 | + * |
---|
| 1801 | + * NSpid: |
---|
| 1802 | + * If pid namespaces are supported then this function will also print |
---|
| 1803 | + * the pid a given pidfd refers to for all descendant pid namespaces |
---|
| 1804 | + * starting from the current pid namespace of the instance, i.e. the |
---|
| 1805 | + * Pid field and the first entry in the NSpid field will be identical. |
---|
| 1806 | + * If the pid namespace of the process is not a descendant of the pid |
---|
| 1807 | + * namespace of the procfs instance 0 will be shown as its first NSpid |
---|
| 1808 | + * entry and no others will be shown. |
---|
| 1809 | + * Note that this differs from the Pid and NSpid fields in |
---|
| 1810 | + * /proc/<pid>/status where Pid and NSpid are always shown relative to |
---|
| 1811 | + * the pid namespace of the procfs instance. The difference becomes |
---|
| 1812 | + * obvious when sending around a pidfd between pid namespaces from a |
---|
| 1813 | + * different branch of the tree, i.e. where no ancestral relation is |
---|
| 1814 | + * present between the pid namespaces: |
---|
| 1815 | + * - create two new pid namespaces ns1 and ns2 in the initial pid |
---|
| 1816 | + * namespace (also take care to create new mount namespaces in the |
---|
| 1817 | + * new pid namespace and mount procfs) |
---|
| 1818 | + * - create a process with a pidfd in ns1 |
---|
| 1819 | + * - send pidfd from ns1 to ns2 |
---|
| 1820 | + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid |
---|
| 1821 | + * have exactly one entry, which is 0 |
---|
| 1822 | + */ |
---|
| 1823 | +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
| 1824 | +{ |
---|
| 1825 | + struct pid *pid = f->private_data; |
---|
| 1826 | + struct pid_namespace *ns; |
---|
| 1827 | + pid_t nr = -1; |
---|
| 1828 | + |
---|
| 1829 | + if (likely(pid_has_task(pid, PIDTYPE_PID))) { |
---|
| 1830 | + ns = proc_pid_ns(file_inode(m->file)->i_sb); |
---|
| 1831 | + nr = pid_nr_ns(pid, ns); |
---|
| 1832 | + } |
---|
| 1833 | + |
---|
| 1834 | + seq_put_decimal_ll(m, "Pid:\t", nr); |
---|
| 1835 | + |
---|
| 1836 | +#ifdef CONFIG_PID_NS |
---|
| 1837 | + seq_put_decimal_ll(m, "\nNSpid:\t", nr); |
---|
| 1838 | + if (nr > 0) { |
---|
| 1839 | + int i; |
---|
| 1840 | + |
---|
| 1841 | + /* If nr is non-zero it means that 'pid' is valid and that |
---|
| 1842 | + * ns, i.e. the pid namespace associated with the procfs |
---|
| 1843 | + * instance, is in the pid namespace hierarchy of pid. |
---|
| 1844 | + * Start at one below the already printed level. |
---|
| 1845 | + */ |
---|
| 1846 | + for (i = ns->level + 1; i <= pid->level; i++) |
---|
| 1847 | + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); |
---|
| 1848 | + } |
---|
| 1849 | +#endif |
---|
| 1850 | + seq_putc(m, '\n'); |
---|
| 1851 | +} |
---|
| 1852 | +#endif |
---|
| 1853 | + |
---|
| 1854 | +/* |
---|
| 1855 | + * Poll support for process exit notification. |
---|
| 1856 | + */ |
---|
| 1857 | +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
| 1858 | +{ |
---|
| 1859 | + struct pid *pid = file->private_data; |
---|
| 1860 | + __poll_t poll_flags = 0; |
---|
| 1861 | + |
---|
| 1862 | + poll_wait(file, &pid->wait_pidfd, pts); |
---|
| 1863 | + |
---|
| 1864 | + /* |
---|
| 1865 | + * Inform pollers only when the whole thread group exits. |
---|
| 1866 | + * If the thread group leader exits before all other threads in the |
---|
| 1867 | + * group, then poll(2) should block, similar to the wait(2) family. |
---|
| 1868 | + */ |
---|
| 1869 | + if (thread_group_exited(pid)) |
---|
| 1870 | + poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
| 1871 | + |
---|
| 1872 | + return poll_flags; |
---|
| 1873 | +} |
---|
| 1874 | + |
---|
| 1875 | +const struct file_operations pidfd_fops = { |
---|
| 1876 | + .release = pidfd_release, |
---|
| 1877 | + .poll = pidfd_poll, |
---|
| 1878 | +#ifdef CONFIG_PROC_FS |
---|
| 1879 | + .show_fdinfo = pidfd_show_fdinfo, |
---|
| 1880 | +#endif |
---|
| 1881 | +}; |
---|
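For context, a minimal userspace sketch (not part of the patch) of how a pidfd obtained with CLONE_PIDFD is typically consumed; poll() reports EPOLLIN once the whole thread group has exited, matching pidfd_poll() above. This assumes a kernel and libc exposing clone3(); error handling is omitted:

    #define _GNU_SOURCE
    #include <linux/sched.h>        /* struct clone_args, CLONE_PIDFD */
    #include <sys/syscall.h>
    #include <signal.h>
    #include <poll.h>
    #include <unistd.h>

    int main(void)
    {
            int pidfd = -1;
            struct clone_args args = {
                    .flags       = CLONE_PIDFD,
                    .pidfd       = (__u64)(unsigned long)&pidfd,
                    .exit_signal = SIGCHLD,
            };
            long pid = syscall(__NR_clone3, &args, sizeof(args));

            if (pid == 0)
                    _exit(0);                       /* child: exit immediately */

            struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
            poll(&pfd, 1, -1);                      /* returns once the child has exited */
            return 0;
    }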
1646 | 1882 | |
---|
1647 | 1883 | static void __delayed_free_task(struct rcu_head *rhp) |
---|
1648 | 1884 | { |
---|
.. | .. |
---|
1657 | 1893 | call_rcu(&tsk->rcu, __delayed_free_task); |
---|
1658 | 1894 | else |
---|
1659 | 1895 | free_task(tsk); |
---|
1660 | | -} |
---|
1661 | | - |
---|
1662 | | -static int pidfd_release(struct inode *inode, struct file *file) |
---|
1663 | | -{ |
---|
1664 | | - struct pid *pid = file->private_data; |
---|
1665 | | - |
---|
1666 | | - file->private_data = NULL; |
---|
1667 | | - put_pid(pid); |
---|
1668 | | - return 0; |
---|
1669 | | -} |
---|
1670 | | - |
---|
1671 | | -#ifdef CONFIG_PROC_FS |
---|
1672 | | -static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) |
---|
1673 | | -{ |
---|
1674 | | - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); |
---|
1675 | | - struct pid *pid = f->private_data; |
---|
1676 | | - |
---|
1677 | | - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); |
---|
1678 | | - seq_putc(m, '\n'); |
---|
1679 | | -} |
---|
1680 | | -#endif |
---|
1681 | | - |
---|
1682 | | -/* |
---|
1683 | | - * Poll support for process exit notification. |
---|
1684 | | - */ |
---|
1685 | | -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) |
---|
1686 | | -{ |
---|
1687 | | - struct task_struct *task; |
---|
1688 | | - struct pid *pid = file->private_data; |
---|
1689 | | - __poll_t poll_flags = 0; |
---|
1690 | | - |
---|
1691 | | - poll_wait(file, &pid->wait_pidfd, pts); |
---|
1692 | | - |
---|
1693 | | - rcu_read_lock(); |
---|
1694 | | - task = pid_task(pid, PIDTYPE_PID); |
---|
1695 | | - /* |
---|
1696 | | - * Inform pollers only when the whole thread group exits. |
---|
1697 | | - * If the thread group leader exits before all other threads in the |
---|
1698 | | - * group, then poll(2) should block, similar to the wait(2) family. |
---|
1699 | | - */ |
---|
1700 | | - if (!task || (task->exit_state && thread_group_empty(task))) |
---|
1701 | | - poll_flags = EPOLLIN | EPOLLRDNORM; |
---|
1702 | | - rcu_read_unlock(); |
---|
1703 | | - |
---|
1704 | | - return poll_flags; |
---|
1705 | | -} |
---|
1706 | | - |
---|
1707 | | -const struct file_operations pidfd_fops = { |
---|
1708 | | - .release = pidfd_release, |
---|
1709 | | - .poll = pidfd_poll, |
---|
1710 | | -#ifdef CONFIG_PROC_FS |
---|
1711 | | - .show_fdinfo = pidfd_show_fdinfo, |
---|
1712 | | -#endif |
---|
1713 | | -}; |
---|
1714 | | - |
---|
1715 | | -/** |
---|
1716 | | - * pidfd_create() - Create a new pid file descriptor. |
---|
1717 | | - * |
---|
1718 | | - * @pid: struct pid that the pidfd will reference |
---|
1719 | | - * |
---|
1720 | | - * This creates a new pid file descriptor with the O_CLOEXEC flag set. |
---|
1721 | | - * |
---|
1722 | | - * Note, that this function can only be called after the fd table has |
---|
1723 | | - * been unshared to avoid leaking the pidfd to the new process. |
---|
1724 | | - * |
---|
1725 | | - * Return: On success, a cloexec pidfd is returned. |
---|
1726 | | - * On error, a negative errno number will be returned. |
---|
1727 | | - */ |
---|
1728 | | -static int pidfd_create(struct pid *pid) |
---|
1729 | | -{ |
---|
1730 | | - int fd; |
---|
1731 | | - |
---|
1732 | | - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), |
---|
1733 | | - O_RDWR | O_CLOEXEC); |
---|
1734 | | - if (fd < 0) |
---|
1735 | | - put_pid(pid); |
---|
1736 | | - |
---|
1737 | | - return fd; |
---|
1738 | 1896 | } |
---|
1739 | 1897 | |
---|
1740 | 1898 | static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) |
---|
.. | .. |
---|
1765 | 1923 | * flags). The actual kick-off is left to the caller. |
---|
1766 | 1924 | */ |
---|
1767 | 1925 | static __latent_entropy struct task_struct *copy_process( |
---|
1768 | | - unsigned long clone_flags, |
---|
1769 | | - unsigned long stack_start, |
---|
1770 | | - unsigned long stack_size, |
---|
1771 | | - int __user *parent_tidptr, |
---|
1772 | | - int __user *child_tidptr, |
---|
1773 | 1926 | struct pid *pid, |
---|
1774 | 1927 | int trace, |
---|
1775 | | - unsigned long tls, |
---|
1776 | | - int node) |
---|
| 1928 | + int node, |
---|
| 1929 | + struct kernel_clone_args *args) |
---|
1777 | 1930 | { |
---|
1778 | 1931 | int pidfd = -1, retval; |
---|
1779 | 1932 | struct task_struct *p; |
---|
1780 | 1933 | struct multiprocess_signals delayed; |
---|
| 1934 | + struct file *pidfile = NULL; |
---|
| 1935 | + u64 clone_flags = args->flags; |
---|
| 1936 | + struct nsproxy *nsp = current->nsproxy; |
---|
1781 | 1937 | |
---|
1782 | 1938 | /* |
---|
1783 | 1939 | * Don't allow sharing the root directory with processes in a different |
---|
.. | .. |
---|
1820 | 1976 | */ |
---|
1821 | 1977 | if (clone_flags & CLONE_THREAD) { |
---|
1822 | 1978 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
---|
1823 | | - (task_active_pid_ns(current) != |
---|
1824 | | - current->nsproxy->pid_ns_for_children)) |
---|
| 1979 | + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) |
---|
| 1980 | + return ERR_PTR(-EINVAL); |
---|
| 1981 | + } |
---|
| 1982 | + |
---|
| 1983 | + /* |
---|
| 1984 | + * If the new process will be in a different time namespace |
---|
| 1985 | + * do not allow it to share VM or a thread group with the forking task. |
---|
| 1986 | + */ |
---|
| 1987 | + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { |
---|
| 1988 | + if (nsp->time_ns != nsp->time_ns_for_children) |
---|
1825 | 1989 | return ERR_PTR(-EINVAL); |
---|
1826 | 1990 | } |
---|
1827 | 1991 | |
---|
1828 | 1992 | if (clone_flags & CLONE_PIDFD) { |
---|
1829 | 1993 | /* |
---|
1830 | | - * - CLONE_PARENT_SETTID is useless for pidfds and also |
---|
1831 | | - * parent_tidptr is used to return pidfds. |
---|
1832 | 1994 | * - CLONE_DETACHED is blocked so that we can potentially |
---|
1833 | 1995 | * reuse it later for CLONE_PIDFD. |
---|
1834 | 1996 | * - CLONE_THREAD is blocked until someone really needs it. |
---|
1835 | 1997 | */ |
---|
1836 | | - if (clone_flags & |
---|
1837 | | - (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) |
---|
| 1998 | + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) |
---|
1838 | 1999 | return ERR_PTR(-EINVAL); |
---|
1839 | 2000 | } |
---|
1840 | 2001 | |
---|
.. | .. |
---|
1853 | 2014 | recalc_sigpending(); |
---|
1854 | 2015 | spin_unlock_irq(¤t->sighand->siglock); |
---|
1855 | 2016 | retval = -ERESTARTNOINTR; |
---|
1856 | | - if (signal_pending(current)) |
---|
| 2017 | + if (task_sigpending(current)) |
---|
1857 | 2018 | goto fork_out; |
---|
1858 | 2019 | |
---|
1859 | 2020 | retval = -ENOMEM; |
---|
1860 | 2021 | p = dup_task_struct(current, node); |
---|
1861 | 2022 | if (!p) |
---|
1862 | 2023 | goto fork_out; |
---|
| 2024 | + if (args->io_thread) { |
---|
| 2025 | + /* |
---|
| 2026 | + * Mark us an IO worker, and block any signal that isn't |
---|
| 2027 | + * fatal or STOP |
---|
| 2028 | + */ |
---|
| 2029 | + p->flags |= PF_IO_WORKER; |
---|
| 2030 | + siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
---|
| 2031 | + } |
---|
1863 | 2032 | |
---|
1864 | 2033 | cpufreq_task_times_init(p); |
---|
1865 | 2034 | |
---|
.. | .. |
---|
1869 | 2038 | * p->set_child_tid which is (ab)used as a kthread's data pointer for |
---|
1870 | 2039 | * kernel threads (PF_KTHREAD). |
---|
1871 | 2040 | */ |
---|
1872 | | - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
---|
| 2041 | + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; |
---|
1873 | 2042 | /* |
---|
1874 | 2043 | * Clear TID on mm_release()? |
---|
1875 | 2044 | */ |
---|
1876 | | - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
---|
| 2045 | + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; |
---|
1877 | 2046 | |
---|
1878 | 2047 | ftrace_graph_init_task(p); |
---|
1879 | 2048 | |
---|
1880 | 2049 | rt_mutex_init_task(p); |
---|
1881 | 2050 | |
---|
| 2051 | + lockdep_assert_irqs_enabled(); |
---|
1882 | 2052 | #ifdef CONFIG_PROVE_LOCKING |
---|
1883 | | - DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
---|
1884 | 2053 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); |
---|
1885 | 2054 | #endif |
---|
1886 | 2055 | retval = -EAGAIN; |
---|
.. | .. |
---|
1902 | 2071 | * to stop root fork bombs. |
---|
1903 | 2072 | */ |
---|
1904 | 2073 | retval = -EAGAIN; |
---|
1905 | | - if (nr_threads >= max_threads) |
---|
| 2074 | + if (data_race(nr_threads >= max_threads)) |
---|
1906 | 2075 | goto bad_fork_cleanup_count; |
---|
1907 | 2076 | |
---|
1908 | 2077 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
---|
.. | .. |
---|
1915 | 2084 | spin_lock_init(&p->alloc_lock); |
---|
1916 | 2085 | |
---|
1917 | 2086 | init_sigpending(&p->pending); |
---|
| 2087 | + p->sigqueue_cache = NULL; |
---|
1918 | 2088 | |
---|
1919 | 2089 | p->utime = p->stime = p->gtime = 0; |
---|
1920 | 2090 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME |
---|
.. | .. |
---|
1926 | 2096 | seqcount_init(&p->vtime.seqcount); |
---|
1927 | 2097 | p->vtime.starttime = 0; |
---|
1928 | 2098 | p->vtime.state = VTIME_INACTIVE; |
---|
| 2099 | +#endif |
---|
| 2100 | + |
---|
| 2101 | +#ifdef CONFIG_IO_URING |
---|
| 2102 | + p->io_uring = NULL; |
---|
1929 | 2103 | #endif |
---|
1930 | 2104 | |
---|
1931 | 2105 | #if defined(SPLIT_RSS_COUNTING) |
---|
.. | .. |
---|
1941 | 2115 | task_io_accounting_init(&p->ioac); |
---|
1942 | 2116 | acct_clear_integrals(p); |
---|
1943 | 2117 | |
---|
1944 | | - posix_cpu_timers_init(p); |
---|
| 2118 | + posix_cputimers_init(&p->posix_cputimers); |
---|
1945 | 2119 | |
---|
1946 | 2120 | p->io_context = NULL; |
---|
1947 | 2121 | audit_set_context(p, NULL); |
---|
.. | .. |
---|
1957 | 2131 | #ifdef CONFIG_CPUSETS |
---|
1958 | 2132 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; |
---|
1959 | 2133 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; |
---|
1960 | | - seqcount_init(&p->mems_allowed_seq); |
---|
| 2134 | + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); |
---|
1961 | 2135 | #endif |
---|
1962 | 2136 | #ifdef CONFIG_TRACE_IRQFLAGS |
---|
1963 | | - p->irq_events = 0; |
---|
1964 | | - p->hardirqs_enabled = 0; |
---|
1965 | | - p->hardirq_enable_ip = 0; |
---|
1966 | | - p->hardirq_enable_event = 0; |
---|
1967 | | - p->hardirq_disable_ip = _THIS_IP_; |
---|
1968 | | - p->hardirq_disable_event = 0; |
---|
1969 | | - p->softirqs_enabled = 1; |
---|
1970 | | - p->softirq_enable_ip = _THIS_IP_; |
---|
1971 | | - p->softirq_enable_event = 0; |
---|
1972 | | - p->softirq_disable_ip = 0; |
---|
1973 | | - p->softirq_disable_event = 0; |
---|
1974 | | - p->hardirq_context = 0; |
---|
1975 | | - p->softirq_context = 0; |
---|
| 2137 | + memset(&p->irqtrace, 0, sizeof(p->irqtrace)); |
---|
| 2138 | + p->irqtrace.hardirq_disable_ip = _THIS_IP_; |
---|
| 2139 | + p->irqtrace.softirq_enable_ip = _THIS_IP_; |
---|
| 2140 | + p->softirqs_enabled = 1; |
---|
| 2141 | + p->softirq_context = 0; |
---|
1976 | 2142 | #endif |
---|
1977 | 2143 | |
---|
1978 | 2144 | p->pagefault_disabled = 0; |
---|
1979 | 2145 | |
---|
1980 | 2146 | #ifdef CONFIG_LOCKDEP |
---|
1981 | | - p->lockdep_depth = 0; /* no locks held yet */ |
---|
1982 | | - p->curr_chain_key = 0; |
---|
1983 | | - p->lockdep_recursion = 0; |
---|
1984 | 2147 | lockdep_init_task(p); |
---|
1985 | 2148 | #endif |
---|
1986 | 2149 | |
---|
.. | .. |
---|
2032 | 2195 | retval = copy_io(clone_flags, p); |
---|
2033 | 2196 | if (retval) |
---|
2034 | 2197 | goto bad_fork_cleanup_namespaces; |
---|
2035 | | - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); |
---|
| 2198 | + retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); |
---|
2036 | 2199 | if (retval) |
---|
2037 | 2200 | goto bad_fork_cleanup_io; |
---|
2038 | 2201 | |
---|
| 2202 | + stackleak_task_init(p); |
---|
| 2203 | + |
---|
2039 | 2204 | if (pid != &init_struct_pid) { |
---|
2040 | | - pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
---|
| 2205 | + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, |
---|
| 2206 | + args->set_tid_size); |
---|
2041 | 2207 | if (IS_ERR(pid)) { |
---|
2042 | 2208 | retval = PTR_ERR(pid); |
---|
2043 | 2209 | goto bad_fork_cleanup_thread; |
---|
.. | .. |
---|
2050 | 2216 | * if the fd table isn't shared). |
---|
2051 | 2217 | */ |
---|
2052 | 2218 | if (clone_flags & CLONE_PIDFD) { |
---|
2053 | | - retval = pidfd_create(pid); |
---|
| 2219 | + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); |
---|
2054 | 2220 | if (retval < 0) |
---|
2055 | 2221 | goto bad_fork_free_pid; |
---|
2056 | 2222 | |
---|
2057 | 2223 | pidfd = retval; |
---|
2058 | | - retval = put_user(pidfd, parent_tidptr); |
---|
| 2224 | + |
---|
| 2225 | + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, |
---|
| 2226 | + O_RDWR | O_CLOEXEC); |
---|
| 2227 | + if (IS_ERR(pidfile)) { |
---|
| 2228 | + put_unused_fd(pidfd); |
---|
| 2229 | + retval = PTR_ERR(pidfile); |
---|
| 2230 | + goto bad_fork_free_pid; |
---|
| 2231 | + } |
---|
| 2232 | + get_pid(pid); /* held by pidfile now */ |
---|
| 2233 | + |
---|
| 2234 | + retval = put_user(pidfd, args->pidfd); |
---|
2059 | 2235 | if (retval) |
---|
2060 | 2236 | goto bad_fork_put_pidfd; |
---|
2061 | 2237 | } |
---|
.. | .. |
---|
2080 | 2256 | #ifdef TIF_SYSCALL_EMU |
---|
2081 | 2257 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
---|
2082 | 2258 | #endif |
---|
2083 | | - clear_all_latency_tracing(p); |
---|
| 2259 | + clear_tsk_latency_tracing(p); |
---|
2084 | 2260 | |
---|
2085 | 2261 | /* ok, now we should be set up.. */ |
---|
2086 | 2262 | p->pid = pid_nr(pid); |
---|
.. | .. |
---|
2099 | 2275 | p->pdeath_signal = 0; |
---|
2100 | 2276 | INIT_LIST_HEAD(&p->thread_group); |
---|
2101 | 2277 | p->task_works = NULL; |
---|
| 2278 | + clear_posix_cputimers_work(p); |
---|
2102 | 2279 | |
---|
2103 | | - cgroup_threadgroup_change_begin(current); |
---|
2104 | 2280 | /* |
---|
2105 | 2281 | * Ensure that the cgroup subsystem policies allow the new process to be |
---|
2106 | | - * forked. It should be noted the the new process's css_set can be changed |
---|
| 2282 | + * forked. It should be noted that the new process's css_set can be changed |
---|
2107 | 2283 | * between here and cgroup_post_fork() if an organisation operation is in |
---|
2108 | 2284 | * progress. |
---|
2109 | 2285 | */ |
---|
2110 | | - retval = cgroup_can_fork(p); |
---|
| 2286 | + retval = cgroup_can_fork(p, args); |
---|
2111 | 2287 | if (retval) |
---|
2112 | | - goto bad_fork_cgroup_threadgroup_change_end; |
---|
| 2288 | + goto bad_fork_put_pidfd; |
---|
| 2289 | + |
---|
| 2290 | + /* |
---|
| 2291 | + * Now that the cgroups are pinned, re-clone the parent cgroup and put |
---|
| 2292 | + * the new task on the correct runqueue. All this *before* the task |
---|
| 2293 | + * becomes visible. |
---|
| 2294 | + * |
---|
| 2295 | + * This isn't part of ->can_fork() because while the re-cloning is |
---|
| 2296 | + * cgroup specific, it unconditionally needs to place the task on a |
---|
| 2297 | + * runqueue. |
---|
| 2298 | + */ |
---|
| 2299 | + sched_cgroup_fork(p, args); |
---|
2113 | 2300 | |
---|
2114 | 2301 | /* |
---|
2115 | 2302 | * From this point on we must avoid any synchronous user-space |
---|
.. | .. |
---|
2120 | 2307 | */ |
---|
2121 | 2308 | |
---|
2122 | 2309 | p->start_time = ktime_get_ns(); |
---|
2123 | | - p->real_start_time = ktime_get_boot_ns(); |
---|
| 2310 | + p->start_boottime = ktime_get_boottime_ns(); |
---|
2124 | 2311 | |
---|
2125 | 2312 | /* |
---|
2126 | 2313 | * Make it visible to the rest of the system, but dont wake it up yet. |
---|
.. | .. |
---|
2139 | 2326 | } else { |
---|
2140 | 2327 | p->real_parent = current; |
---|
2141 | 2328 | p->parent_exec_id = current->self_exec_id; |
---|
2142 | | - p->exit_signal = (clone_flags & CSIGNAL); |
---|
| 2329 | + p->exit_signal = args->exit_signal; |
---|
2143 | 2330 | } |
---|
2144 | 2331 | |
---|
2145 | 2332 | klp_copy_process(p); |
---|
.. | .. |
---|
2165 | 2352 | retval = -EINTR; |
---|
2166 | 2353 | goto bad_fork_cancel_cgroup; |
---|
2167 | 2354 | } |
---|
2168 | | - |
---|
2169 | 2355 | |
---|
2170 | 2356 | init_task_pid_links(p); |
---|
2171 | 2357 | if (likely(p->pid)) { |
---|
.. | .. |
---|
2199 | 2385 | } else { |
---|
2200 | 2386 | current->signal->nr_threads++; |
---|
2201 | 2387 | atomic_inc(¤t->signal->live); |
---|
2202 | | - atomic_inc(¤t->signal->sigcnt); |
---|
| 2388 | + refcount_inc(¤t->signal->sigcnt); |
---|
2203 | 2389 | task_join_group_stop(p); |
---|
2204 | 2390 | list_add_tail_rcu(&p->thread_group, |
---|
2205 | 2391 | &p->group_leader->thread_group); |
---|
.. | .. |
---|
2215 | 2401 | syscall_tracepoint_update(p); |
---|
2216 | 2402 | write_unlock_irq(&tasklist_lock); |
---|
2217 | 2403 | |
---|
| 2404 | + if (pidfile) |
---|
| 2405 | + fd_install(pidfd, pidfile); |
---|
| 2406 | + |
---|
2218 | 2407 | proc_fork_connector(p); |
---|
2219 | | - cgroup_post_fork(p); |
---|
2220 | | - cgroup_threadgroup_change_end(current); |
---|
| 2408 | + sched_post_fork(p); |
---|
| 2409 | + cgroup_post_fork(p, args); |
---|
2221 | 2410 | perf_event_fork(p); |
---|
2222 | 2411 | |
---|
2223 | 2412 | trace_task_newtask(p, clone_flags); |
---|
.. | .. |
---|
2230 | 2419 | bad_fork_cancel_cgroup: |
---|
2231 | 2420 | spin_unlock(¤t->sighand->siglock); |
---|
2232 | 2421 | write_unlock_irq(&tasklist_lock); |
---|
2233 | | - cgroup_cancel_fork(p); |
---|
2234 | | -bad_fork_cgroup_threadgroup_change_end: |
---|
2235 | | - cgroup_threadgroup_change_end(current); |
---|
| 2422 | + cgroup_cancel_fork(p, args); |
---|
2236 | 2423 | bad_fork_put_pidfd: |
---|
2237 | | - if (clone_flags & CLONE_PIDFD) |
---|
2238 | | - ksys_close(pidfd); |
---|
| 2424 | + if (clone_flags & CLONE_PIDFD) { |
---|
| 2425 | + fput(pidfile); |
---|
| 2426 | + put_unused_fd(pidfd); |
---|
| 2427 | + } |
---|
2239 | 2428 | bad_fork_free_pid: |
---|
2240 | 2429 | if (pid != &init_struct_pid) |
---|
2241 | 2430 | free_pid(pid); |
---|
.. | .. |
---|
2299 | 2488 | } |
---|
2300 | 2489 | } |
---|
2301 | 2490 | |
---|
2302 | | -struct task_struct *fork_idle(int cpu) |
---|
| 2491 | +struct task_struct * __init fork_idle(int cpu) |
---|
2303 | 2492 | { |
---|
2304 | 2493 | struct task_struct *task; |
---|
2305 | | - task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, |
---|
2306 | | - cpu_to_node(cpu)); |
---|
| 2494 | + struct kernel_clone_args args = { |
---|
| 2495 | + .flags = CLONE_VM, |
---|
| 2496 | + }; |
---|
| 2497 | + |
---|
| 2498 | + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); |
---|
2307 | 2499 | if (!IS_ERR(task)) { |
---|
2308 | 2500 | init_idle_pids(task); |
---|
2309 | 2501 | init_idle(task, cpu); |
---|
.. | .. |
---|
2312 | 2504 | return task; |
---|
2313 | 2505 | } |
---|
2314 | 2506 | |
---|
| 2507 | +struct mm_struct *copy_init_mm(void) |
---|
| 2508 | +{ |
---|
| 2509 | + return dup_mm(NULL, &init_mm); |
---|
| 2510 | +} |
---|
| 2511 | + |
---|
| 2512 | +/* |
---|
| 2513 | + * This is like kernel_clone(), but shaved down and tailored to just |
---|
| 2514 | + * creating io_uring workers. It returns a created task, or an error pointer. |
---|
| 2515 | + * The returned task is inactive, and the caller must fire it up through |
---|
| 2516 | + * wake_up_new_task(p). All signals are blocked in the created task. |
---|
| 2517 | + */ |
---|
| 2518 | +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) |
---|
| 2519 | +{ |
---|
| 2520 | + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| |
---|
| 2521 | + CLONE_IO; |
---|
| 2522 | + struct kernel_clone_args args = { |
---|
| 2523 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2524 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2525 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2526 | + .stack = (unsigned long)fn, |
---|
| 2527 | + .stack_size = (unsigned long)arg, |
---|
| 2528 | + .io_thread = 1, |
---|
| 2529 | + }; |
---|
| 2530 | + |
---|
| 2531 | + return copy_process(NULL, 0, node, &args); |
---|
| 2532 | +} |
---|
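The comment above spells out the caller contract. A minimal kernel-side sketch of that contract might look like the following; start_io_worker() is a hypothetical helper, and real callers such as io_uring do additional worker setup before waking the task.

static int start_io_worker(int (*fn)(void *), void *data)
{
	struct task_struct *tsk;

	/* returns an inactive task with all signals blocked, or ERR_PTR() */
	tsk = create_io_thread(fn, data, NUMA_NO_NODE);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* nothing runs until the caller explicitly fires the task up */
	wake_up_new_task(tsk);
	return 0;
}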
| 2533 | + |
---|
2315 | 2534 | /* |
---|
2316 | 2535 | * Ok, this is the main fork-routine. |
---|
2317 | 2536 | * |
---|
2318 | 2537 | * It copies the process, and if successful kick-starts |
---|
2319 | 2538 | * it and waits for it to finish using the VM if required. |
---|
| 2539 | + * |
---|
| 2540 | + * args->exit_signal is expected to be checked for sanity by the caller. |
---|
2320 | 2541 | */ |
---|
2321 | | -long _do_fork(unsigned long clone_flags, |
---|
2322 | | - unsigned long stack_start, |
---|
2323 | | - unsigned long stack_size, |
---|
2324 | | - int __user *parent_tidptr, |
---|
2325 | | - int __user *child_tidptr, |
---|
2326 | | - unsigned long tls) |
---|
| 2542 | +pid_t kernel_clone(struct kernel_clone_args *args) |
---|
2327 | 2543 | { |
---|
| 2544 | + u64 clone_flags = args->flags; |
---|
2328 | 2545 | struct completion vfork; |
---|
2329 | 2546 | struct pid *pid; |
---|
2330 | 2547 | struct task_struct *p; |
---|
2331 | 2548 | int trace = 0; |
---|
2332 | | - long nr; |
---|
| 2549 | + pid_t nr; |
---|
| 2550 | + |
---|
| 2551 | + /* |
---|
| 2552 | + * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument |
---|
| 2553 | + * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are |
---|
| 2554 | + * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate |
---|
| 2555 | + * field in struct clone_args and it still doesn't make sense to have |
---|
| 2556 | + * them both point at the same memory location. Performing this check |
---|
| 2557 | + * here has the advantage that we don't need to have a separate helper |
---|
| 2558 | + * to check for legacy clone(). |
---|
| 2559 | + */ |
---|
| 2560 | + if ((args->flags & CLONE_PIDFD) && |
---|
| 2561 | + (args->flags & CLONE_PARENT_SETTID) && |
---|
| 2562 | + (args->pidfd == args->parent_tid)) |
---|
| 2563 | + return -EINVAL; |
---|
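To see why the two flags cannot share that slot, here is a hedged userspace sketch of the legacy calling convention: with plain clone(), the parent_tid argument doubles as the pidfd return. The raw-syscall argument order shown is the x86_64 one (other architectures differ), and get_child_pidfd() is a hypothetical helper, not part of this file.

#define _GNU_SOURCE
#include <linux/sched.h>	/* CLONE_PIDFD */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int get_child_pidfd(void)
{
	int pidfd = -1;
	/* the parent_tid slot returns the pidfd, so it cannot also take the TID */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0UL, &pidfd, NULL, 0UL);

	if (pid < 0)
		return -1;
	if (pid == 0)
		_exit(0);		/* child */
	printf("child %d, pidfd %d\n", pid, pidfd);
	return pidfd;
}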
2333 | 2564 | |
---|
2334 | 2565 | /* |
---|
2335 | 2566 | * Determine whether and which event to report to ptracer. When |
---|
.. | .. |
---|
2340 | 2571 | if (!(clone_flags & CLONE_UNTRACED)) { |
---|
2341 | 2572 | if (clone_flags & CLONE_VFORK) |
---|
2342 | 2573 | trace = PTRACE_EVENT_VFORK; |
---|
2343 | | - else if ((clone_flags & CSIGNAL) != SIGCHLD) |
---|
| 2574 | + else if (args->exit_signal != SIGCHLD) |
---|
2344 | 2575 | trace = PTRACE_EVENT_CLONE; |
---|
2345 | 2576 | else |
---|
2346 | 2577 | trace = PTRACE_EVENT_FORK; |
---|
.. | .. |
---|
2349 | 2580 | trace = 0; |
---|
2350 | 2581 | } |
---|
2351 | 2582 | |
---|
2352 | | - p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, |
---|
2353 | | - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
---|
| 2583 | + p = copy_process(NULL, trace, NUMA_NO_NODE, args); |
---|
2354 | 2584 | add_latent_entropy(); |
---|
2355 | 2585 | |
---|
2356 | 2586 | if (IS_ERR(p)) |
---|
.. | .. |
---|
2368 | 2598 | nr = pid_vnr(pid); |
---|
2369 | 2599 | |
---|
2370 | 2600 | if (clone_flags & CLONE_PARENT_SETTID) |
---|
2371 | | - put_user(nr, parent_tidptr); |
---|
| 2601 | + put_user(nr, args->parent_tid); |
---|
2372 | 2602 | |
---|
2373 | 2603 | if (clone_flags & CLONE_VFORK) { |
---|
2374 | 2604 | p->vfork_done = &vfork; |
---|
.. | .. |
---|
2391 | 2621 | return nr; |
---|
2392 | 2622 | } |
---|
2393 | 2623 | |
---|
2394 | | -#ifndef CONFIG_HAVE_COPY_THREAD_TLS |
---|
2395 | | -/* For compatibility with architectures that call do_fork directly rather than |
---|
2396 | | - * using the syscall entry points below. */ |
---|
2397 | | -long do_fork(unsigned long clone_flags, |
---|
2398 | | - unsigned long stack_start, |
---|
2399 | | - unsigned long stack_size, |
---|
2400 | | - int __user *parent_tidptr, |
---|
2401 | | - int __user *child_tidptr) |
---|
2402 | | -{ |
---|
2403 | | - return _do_fork(clone_flags, stack_start, stack_size, |
---|
2404 | | - parent_tidptr, child_tidptr, 0); |
---|
2405 | | -} |
---|
2406 | | -#endif |
---|
2407 | | - |
---|
2408 | 2624 | /* |
---|
2409 | 2625 | * Create a kernel thread. |
---|
2410 | 2626 | */ |
---|
2411 | 2627 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
---|
2412 | 2628 | { |
---|
2413 | | - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, |
---|
2414 | | - (unsigned long)arg, NULL, NULL, 0); |
---|
| 2629 | + struct kernel_clone_args args = { |
---|
| 2630 | + .flags = ((lower_32_bits(flags) | CLONE_VM | |
---|
| 2631 | + CLONE_UNTRACED) & ~CSIGNAL), |
---|
| 2632 | + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
---|
| 2633 | + .stack = (unsigned long)fn, |
---|
| 2634 | + .stack_size = (unsigned long)arg, |
---|
| 2635 | + }; |
---|
| 2636 | + |
---|
| 2637 | + return kernel_clone(&args); |
---|
2415 | 2638 | } |
---|
2416 | 2639 | |
---|
2417 | 2640 | #ifdef __ARCH_WANT_SYS_FORK |
---|
2418 | 2641 | SYSCALL_DEFINE0(fork) |
---|
2419 | 2642 | { |
---|
2420 | 2643 | #ifdef CONFIG_MMU |
---|
2421 | | - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); |
---|
| 2644 | + struct kernel_clone_args args = { |
---|
| 2645 | + .exit_signal = SIGCHLD, |
---|
| 2646 | + }; |
---|
| 2647 | + |
---|
| 2648 | + return kernel_clone(&args); |
---|
2422 | 2649 | #else |
---|
2423 | 2650 | /* cannot support in nommu mode */ |
---|
2424 | 2651 | return -EINVAL; |
---|
.. | .. |
---|
2429 | 2656 | #ifdef __ARCH_WANT_SYS_VFORK |
---|
2430 | 2657 | SYSCALL_DEFINE0(vfork) |
---|
2431 | 2658 | { |
---|
2432 | | - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, |
---|
2433 | | - 0, NULL, NULL, 0); |
---|
| 2659 | + struct kernel_clone_args args = { |
---|
| 2660 | + .flags = CLONE_VFORK | CLONE_VM, |
---|
| 2661 | + .exit_signal = SIGCHLD, |
---|
| 2662 | + }; |
---|
| 2663 | + |
---|
| 2664 | + return kernel_clone(&args); |
---|
2434 | 2665 | } |
---|
2435 | 2666 | #endif |
---|
2436 | 2667 | |
---|
.. | .. |
---|
2458 | 2689 | unsigned long, tls) |
---|
2459 | 2690 | #endif |
---|
2460 | 2691 | { |
---|
2461 | | - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); |
---|
| 2692 | + struct kernel_clone_args args = { |
---|
| 2693 | + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
---|
| 2694 | + .pidfd = parent_tidptr, |
---|
| 2695 | + .child_tid = child_tidptr, |
---|
| 2696 | + .parent_tid = parent_tidptr, |
---|
| 2697 | + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
---|
| 2698 | + .stack = newsp, |
---|
| 2699 | + .tls = tls, |
---|
| 2700 | + }; |
---|
| 2701 | + |
---|
| 2702 | + return kernel_clone(&args); |
---|
| 2703 | +} |
---|
| 2704 | +#endif |
---|
| 2705 | + |
---|
| 2706 | +#ifdef __ARCH_WANT_SYS_CLONE3 |
---|
| 2707 | + |
---|
| 2708 | +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, |
---|
| 2709 | + struct clone_args __user *uargs, |
---|
| 2710 | + size_t usize) |
---|
| 2711 | +{ |
---|
| 2712 | + int err; |
---|
| 2713 | + struct clone_args args; |
---|
| 2714 | + pid_t *kset_tid = kargs->set_tid; |
---|
| 2715 | + |
---|
| 2716 | + BUILD_BUG_ON(offsetofend(struct clone_args, tls) != |
---|
| 2717 | + CLONE_ARGS_SIZE_VER0); |
---|
| 2718 | + BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != |
---|
| 2719 | + CLONE_ARGS_SIZE_VER1); |
---|
| 2720 | + BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != |
---|
| 2721 | + CLONE_ARGS_SIZE_VER2); |
---|
| 2722 | + BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); |
---|
| 2723 | + |
---|
| 2724 | + if (unlikely(usize > PAGE_SIZE)) |
---|
| 2725 | + return -E2BIG; |
---|
| 2726 | + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) |
---|
| 2727 | + return -EINVAL; |
---|
| 2728 | + |
---|
| 2729 | + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); |
---|
| 2730 | + if (err) |
---|
| 2731 | + return err; |
---|
| 2732 | + |
---|
| 2733 | + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) |
---|
| 2734 | + return -EINVAL; |
---|
| 2735 | + |
---|
| 2736 | + if (unlikely(!args.set_tid && args.set_tid_size > 0)) |
---|
| 2737 | + return -EINVAL; |
---|
| 2738 | + |
---|
| 2739 | + if (unlikely(args.set_tid && args.set_tid_size == 0)) |
---|
| 2740 | + return -EINVAL; |
---|
| 2741 | + |
---|
| 2742 | + /* |
---|
| 2743 | + * Verify that the higher 32 bits of exit_signal are unset and that |
---|
| 2744 | + * it is a valid signal |
---|
| 2745 | + */ |
---|
| 2746 | + if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || |
---|
| 2747 | + !valid_signal(args.exit_signal))) |
---|
| 2748 | + return -EINVAL; |
---|
| 2749 | + |
---|
| 2750 | + if ((args.flags & CLONE_INTO_CGROUP) && |
---|
| 2751 | + (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) |
---|
| 2752 | + return -EINVAL; |
---|
| 2753 | + |
---|
| 2754 | + *kargs = (struct kernel_clone_args){ |
---|
| 2755 | + .flags = args.flags, |
---|
| 2756 | + .pidfd = u64_to_user_ptr(args.pidfd), |
---|
| 2757 | + .child_tid = u64_to_user_ptr(args.child_tid), |
---|
| 2758 | + .parent_tid = u64_to_user_ptr(args.parent_tid), |
---|
| 2759 | + .exit_signal = args.exit_signal, |
---|
| 2760 | + .stack = args.stack, |
---|
| 2761 | + .stack_size = args.stack_size, |
---|
| 2762 | + .tls = args.tls, |
---|
| 2763 | + .set_tid_size = args.set_tid_size, |
---|
| 2764 | + .cgroup = args.cgroup, |
---|
| 2765 | + }; |
---|
| 2766 | + |
---|
| 2767 | + if (args.set_tid && |
---|
| 2768 | + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), |
---|
| 2769 | + (kargs->set_tid_size * sizeof(pid_t)))) |
---|
| 2770 | + return -EFAULT; |
---|
| 2771 | + |
---|
| 2772 | + kargs->set_tid = kset_tid; |
---|
| 2773 | + |
---|
| 2774 | + return 0; |
---|
| 2775 | +} |
---|
| 2776 | + |
---|
| 2777 | +/** |
---|
| 2778 | + * clone3_stack_valid - check and prepare stack |
---|
| 2779 | + * @kargs: kernel clone args |
---|
| 2780 | + * |
---|
| 2781 | + * Verify that the stack arguments userspace gave us are sane. |
---|
| 2782 | + * In addition, set the stack direction for userspace since it's easy for us to |
---|
| 2783 | + * determine. |
---|
| 2784 | + */ |
---|
| 2785 | +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) |
---|
| 2786 | +{ |
---|
| 2787 | + if (kargs->stack == 0) { |
---|
| 2788 | + if (kargs->stack_size > 0) |
---|
| 2789 | + return false; |
---|
| 2790 | + } else { |
---|
| 2791 | + if (kargs->stack_size == 0) |
---|
| 2792 | + return false; |
---|
| 2793 | + |
---|
| 2794 | + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) |
---|
| 2795 | + return false; |
---|
| 2796 | + |
---|
| 2797 | +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) |
---|
| 2798 | + kargs->stack += kargs->stack_size; |
---|
| 2799 | +#endif |
---|
| 2800 | + } |
---|
| 2801 | + |
---|
| 2802 | + return true; |
---|
| 2803 | +} |
---|
| 2804 | + |
---|
| 2805 | +static bool clone3_args_valid(struct kernel_clone_args *kargs) |
---|
| 2806 | +{ |
---|
| 2807 | + /* Verify that no unknown flags are passed along. */ |
---|
| 2808 | + if (kargs->flags & |
---|
| 2809 | + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) |
---|
| 2810 | + return false; |
---|
| 2811 | + |
---|
| 2812 | + /* |
---|
| 2813 | + * - make the CLONE_DETACHED bit reusable for clone3 |
---|
| 2814 | + * - make the CSIGNAL bits reusable for clone3 |
---|
| 2815 | + */ |
---|
| 2816 | + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) |
---|
| 2817 | + return false; |
---|
| 2818 | + |
---|
| 2819 | + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == |
---|
| 2820 | + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) |
---|
| 2821 | + return false; |
---|
| 2822 | + |
---|
| 2823 | + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && |
---|
| 2824 | + kargs->exit_signal) |
---|
| 2825 | + return false; |
---|
| 2826 | + |
---|
| 2827 | + if (!clone3_stack_valid(kargs)) |
---|
| 2828 | + return false; |
---|
| 2829 | + |
---|
| 2830 | + return true; |
---|
| 2831 | +} |
---|
| 2832 | + |
---|
| 2833 | +/** |
---|
| 2834 | + * clone3 - create a new process with specific properties |
---|
| 2835 | + * @uargs: argument structure |
---|
| 2836 | + * @size: size of @uargs |
---|
| 2837 | + * |
---|
| 2838 | + * clone3() is the extensible successor to clone()/clone2(). |
---|
| 2839 | + * It takes a struct as argument that is versioned by its size. |
---|
| 2840 | + * |
---|
| 2841 | + * Return: On success, a positive PID for the child process. |
---|
| 2842 | + * On error, a negative errno number. |
---|
| 2843 | + */ |
---|
| 2844 | +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) |
---|
| 2845 | +{ |
---|
| 2846 | + int err; |
---|
| 2847 | + |
---|
| 2848 | + struct kernel_clone_args kargs; |
---|
| 2849 | + pid_t set_tid[MAX_PID_NS_LEVEL]; |
---|
| 2850 | + |
---|
| 2851 | + kargs.set_tid = set_tid; |
---|
| 2852 | + |
---|
| 2853 | + err = copy_clone_args_from_user(&kargs, uargs, size); |
---|
| 2854 | + if (err) |
---|
| 2855 | + return err; |
---|
| 2856 | + |
---|
| 2857 | + if (!clone3_args_valid(&kargs)) |
---|
| 2858 | + return -EINVAL; |
---|
| 2859 | + |
---|
| 2860 | + return kernel_clone(&kargs); |
---|
2462 | 2861 | } |
---|
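As a hedged userspace illustration of the size-versioned argument struct (and of the dedicated pidfd field, in contrast to the parent_tid trick used by legacy clone()), a minimal caller could look like the following; glibc provides no clone3() wrapper, so the raw syscall is used, and fork_via_clone3() is a hypothetical helper, not part of this file.

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <linux/types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static pid_t fork_via_clone3(int *pidfd_out)
{
	struct clone_args args;
	pid_t pid;

	/* unknown trailing fields must stay zero; the size conveys the version */
	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;
	args.pidfd = (__u64)(uintptr_t)pidfd_out;	/* dedicated field */
	args.exit_signal = SIGCHLD;

	pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid == 0)
		_exit(0);		/* child: fork-like, no new stack requested */
	return pid;
}

Passing a smaller size (e.g. CLONE_ARGS_SIZE_VER0) also remains valid, since copy_clone_args_from_user() above zero-extends short copies via copy_struct_from_user().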
2463 | 2862 | #endif |
---|
2464 | 2863 | |
---|
.. | .. |
---|
2553 | 2952 | if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| |
---|
2554 | 2953 | CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| |
---|
2555 | 2954 | CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| |
---|
2556 | | - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) |
---|
| 2955 | + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| |
---|
| 2956 | + CLONE_NEWTIME)) |
---|
2557 | 2957 | return -EINVAL; |
---|
2558 | 2958 | /* |
---|
2559 | 2959 | * Not implemented, but pretend it works if there is nothing |
---|
.. | .. |
---|
2566 | 2966 | return -EINVAL; |
---|
2567 | 2967 | } |
---|
2568 | 2968 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { |
---|
2569 | | - if (atomic_read(¤t->sighand->count) > 1) |
---|
| 2969 | + if (refcount_read(¤t->sighand->count) > 1) |
---|
2570 | 2970 | return -EINVAL; |
---|
2571 | 2971 | } |
---|
2572 | 2972 | if (unshare_flags & CLONE_VM) { |
---|
.. | .. |
---|
2601 | 3001 | /* |
---|
2602 | 3002 | * Unshare file descriptor table if it is being shared |
---|
2603 | 3003 | */ |
---|
2604 | | -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) |
---|
| 3004 | +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, |
---|
| 3005 | + struct files_struct **new_fdp) |
---|
2605 | 3006 | { |
---|
2606 | 3007 | struct files_struct *fd = current->files; |
---|
2607 | 3008 | int error = 0; |
---|
2608 | 3009 | |
---|
2609 | 3010 | if ((unshare_flags & CLONE_FILES) && |
---|
2610 | 3011 | (fd && atomic_read(&fd->count) > 1)) { |
---|
2611 | | - *new_fdp = dup_fd(fd, &error); |
---|
| 3012 | + *new_fdp = dup_fd(fd, max_fds, &error); |
---|
2612 | 3013 | if (!*new_fdp) |
---|
2613 | 3014 | return error; |
---|
2614 | 3015 | } |
---|
.. | .. |
---|
2619 | 3020 | /* |
---|
2620 | 3021 | * unshare allows a process to 'unshare' part of the process |
---|
2621 | 3022 | * context which was originally shared using clone. copy_* |
---|
2622 | | - * functions used by do_fork() cannot be used here directly |
---|
| 3023 | + * functions used by kernel_clone() cannot be used here directly |
---|
2623 | 3024 | * because they modify an inactive task_struct that is being |
---|
2624 | 3025 | * constructed. Here we are modifying the current, active, |
---|
2625 | 3026 | * task_struct. |
---|
.. | .. |
---|
2668 | 3069 | err = unshare_fs(unshare_flags, &new_fs); |
---|
2669 | 3070 | if (err) |
---|
2670 | 3071 | goto bad_unshare_out; |
---|
2671 | | - err = unshare_fd(unshare_flags, &new_fd); |
---|
| 3072 | + err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); |
---|
2672 | 3073 | if (err) |
---|
2673 | 3074 | goto bad_unshare_cleanup_fs; |
---|
2674 | 3075 | err = unshare_userns(unshare_flags, &new_cred); |
---|
.. | .. |
---|
2757 | 3158 | struct files_struct *copy = NULL; |
---|
2758 | 3159 | int error; |
---|
2759 | 3160 | |
---|
2760 | | - error = unshare_fd(CLONE_FILES, ©); |
---|
| 3161 | + error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); |
---|
2761 | 3162 | if (error || !copy) { |
---|
2762 | 3163 | *displaced = NULL; |
---|
2763 | 3164 | return error; |
---|
.. | .. |
---|
2770 | 3171 | } |
---|
2771 | 3172 | |
---|
2772 | 3173 | int sysctl_max_threads(struct ctl_table *table, int write, |
---|
2773 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 3174 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
2774 | 3175 | { |
---|
2775 | 3176 | struct ctl_table t; |
---|
2776 | 3177 | int ret; |
---|