.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Generic pidhash and scalable, time-bounded PID allocator |
---|
3 | 4 | * |
---|
.. | .. |
---|
31 | 32 | #include <linux/slab.h> |
---|
32 | 33 | #include <linux/init.h> |
---|
33 | 34 | #include <linux/rculist.h> |
---|
34 | | -#include <linux/bootmem.h> |
---|
35 | | -#include <linux/hash.h> |
---|
| 35 | +#include <linux/memblock.h> |
---|
36 | 36 | #include <linux/pid_namespace.h> |
---|
37 | 37 | #include <linux/init_task.h> |
---|
38 | 38 | #include <linux/syscalls.h> |
---|
39 | 39 | #include <linux/proc_ns.h> |
---|
40 | | -#include <linux/proc_fs.h> |
---|
| 40 | +#include <linux/refcount.h> |
---|
41 | 41 | #include <linux/anon_inodes.h> |
---|
42 | 42 | #include <linux/sched/signal.h> |
---|
43 | 43 | #include <linux/sched/task.h> |
---|
44 | 44 | #include <linux/idr.h> |
---|
| 45 | +#include <net/sock.h> |
---|
| 46 | +#include <uapi/linux/pidfd.h> |
---|
45 | 47 | |
---|
46 | 48 | struct pid init_struct_pid = { |
---|
47 | | - .count = ATOMIC_INIT(1), |
---|
| 49 | + .count = REFCOUNT_INIT(1), |
---|
48 | 50 | .tasks = { |
---|
49 | 51 | { .first = NULL }, |
---|
50 | 52 | { .first = NULL }, |
---|
.. | .. |
---|
108 | 110 | return; |
---|
109 | 111 | |
---|
110 | 112 | ns = pid->numbers[pid->level].ns; |
---|
111 | | - if ((atomic_read(&pid->count) == 1) || |
---|
112 | | - atomic_dec_and_test(&pid->count)) { |
---|
| 113 | + if (refcount_dec_and_test(&pid->count)) { |
---|
113 | 114 | kmem_cache_free(ns->pid_cachep, pid); |
---|
114 | 115 | put_pid_ns(ns); |
---|
115 | 116 | } |
---|
.. | .. |
---|
145 | 146 | /* Handle a fork failure of the first process */ |
---|
146 | 147 | WARN_ON(ns->child_reaper); |
---|
147 | 148 | ns->pid_allocated = 0; |
---|
148 | | - /* fall through */ |
---|
149 | | - case 0: |
---|
150 | | - schedule_work(&ns->proc_work); |
---|
151 | 149 | break; |
---|
152 | 150 | } |
---|
153 | 151 | |
---|
.. | .. |
---|
158 | 156 | call_rcu(&pid->rcu, delayed_put_pid); |
---|
159 | 157 | } |
---|
160 | 158 | |
---|
161 | | -struct pid *alloc_pid(struct pid_namespace *ns) |
---|
| 159 | +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, |
---|
| 160 | + size_t set_tid_size) |
---|
162 | 161 | { |
---|
163 | 162 | struct pid *pid; |
---|
164 | 163 | enum pid_type type; |
---|
.. | .. |
---|
166 | 165 | struct pid_namespace *tmp; |
---|
167 | 166 | struct upid *upid; |
---|
168 | 167 | int retval = -ENOMEM; |
---|
| 168 | + |
---|
| 169 | + /* |
---|
| 170 | + * set_tid_size contains the size of the set_tid array. Starting at |
---|
| 171 | + * the most nested currently active PID namespace it tells alloc_pid() |
---|
| 172 | + * which PID to set for a process in that most nested PID namespace |
---|
| 173 | + * up to set_tid_size PID namespaces. It does not have to set the PID |
---|
| 174 | + * for a process in all nested PID namespaces but set_tid_size must |
---|
| 175 | + * never be greater than the current ns->level + 1. |
---|
| 176 | + */ |
---|
| 177 | + if (set_tid_size > ns->level + 1) |
---|
| 178 | + return ERR_PTR(-EINVAL); |
---|
169 | 179 | |
---|
170 | 180 | pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); |
---|
171 | 181 | if (!pid) |
---|
.. | .. |
---|
175 | 185 | pid->level = ns->level; |
---|
176 | 186 | |
---|
177 | 187 | for (i = ns->level; i >= 0; i--) { |
---|
178 | | - int pid_min = 1; |
---|
| 188 | + int tid = 0; |
---|
| 189 | + |
---|
| 190 | + if (set_tid_size) { |
---|
| 191 | + tid = set_tid[ns->level - i]; |
---|
| 192 | + |
---|
| 193 | + retval = -EINVAL; |
---|
| 194 | + if (tid < 1 || tid >= pid_max) |
---|
| 195 | + goto out_free; |
---|
| 196 | + /* |
---|
| 197 | + * Also fail if a PID != 1 is requested and |
---|
| 198 | + * no PID 1 exists. |
---|
| 199 | + */ |
---|
| 200 | + if (tid != 1 && !tmp->child_reaper) |
---|
| 201 | + goto out_free; |
---|
| 202 | + retval = -EPERM; |
---|
| 203 | + if (!checkpoint_restore_ns_capable(tmp->user_ns)) |
---|
| 204 | + goto out_free; |
---|
| 205 | + set_tid_size--; |
---|
| 206 | + } |
---|
179 | 207 | |
---|
180 | 208 | idr_preload(GFP_KERNEL); |
---|
181 | 209 | spin_lock_irq(&pidmap_lock); |
---|
182 | 210 | |
---|
183 | | - /* |
---|
184 | | - * init really needs pid 1, but after reaching the maximum |
---|
185 | | - * wrap back to RESERVED_PIDS |
---|
186 | | - */ |
---|
187 | | - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) |
---|
188 | | - pid_min = RESERVED_PIDS; |
---|
| 211 | + if (tid) { |
---|
| 212 | + nr = idr_alloc(&tmp->idr, NULL, tid, |
---|
| 213 | + tid + 1, GFP_ATOMIC); |
---|
| 214 | + /* |
---|
| 215 | + * If ENOSPC is returned it means that the PID is |
---|
| 216 | + * already in use. Return EEXIST in that case. |
---|
| 217 | + */ |
---|
| 218 | + if (nr == -ENOSPC) |
---|
| 219 | + nr = -EEXIST; |
---|
| 220 | + } else { |
---|
| 221 | + int pid_min = 1; |
---|
| 222 | + /* |
---|
| 223 | + * init really needs pid 1, but after reaching the |
---|
| 224 | + * maximum wrap back to RESERVED_PIDS |
---|
| 225 | + */ |
---|
| 226 | + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) |
---|
| 227 | + pid_min = RESERVED_PIDS; |
---|
189 | 228 | |
---|
190 | | - /* |
---|
191 | | - * Store a null pointer so find_pid_ns does not find |
---|
192 | | - * a partially initialized PID (see below). |
---|
193 | | - */ |
---|
194 | | - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, |
---|
195 | | - pid_max, GFP_ATOMIC); |
---|
| 229 | + /* |
---|
| 230 | + * Store a null pointer so find_pid_ns does not find |
---|
| 231 | + * a partially initialized PID (see below). |
---|
| 232 | + */ |
---|
| 233 | + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, |
---|
| 234 | + pid_max, GFP_ATOMIC); |
---|
| 235 | + } |
---|
196 | 236 | spin_unlock_irq(&pidmap_lock); |
---|
197 | 237 | idr_preload_end(); |
---|
198 | 238 | |
---|
.. | .. |
---|
206 | 246 | tmp = tmp->parent; |
---|
207 | 247 | } |
---|
208 | 248 | |
---|
209 | | - if (unlikely(is_child_reaper(pid))) { |
---|
210 | | - if (pid_ns_prepare_proc(ns)) |
---|
211 | | - goto out_free; |
---|
212 | | - } |
---|
| 249 | + /* |
---|
| 250 | + * ENOMEM is not the most obvious choice especially for the case |
---|
| 251 | + * where the child subreaper has already exited and the pid |
---|
| 252 | + * namespace denies the creation of any new processes. But ENOMEM |
---|
| 253 | + * is what we have exposed to userspace for a long time and it is |
---|
| 254 | + * documented behavior for pid namespaces. So we can't easily |
---|
| 255 | + * change it even if there were an error code better suited. |
---|
| 256 | + */ |
---|
| 257 | + retval = -ENOMEM; |
---|
213 | 258 | |
---|
214 | 259 | get_pid_ns(ns); |
---|
215 | | - atomic_set(&pid->count, 1); |
---|
| 260 | + refcount_set(&pid->count, 1); |
---|
| 261 | + spin_lock_init(&pid->lock); |
---|
216 | 262 | for (type = 0; type < PIDTYPE_MAX; ++type) |
---|
217 | 263 | INIT_HLIST_HEAD(&pid->tasks[type]); |
---|
218 | 264 | |
---|
219 | 265 | init_waitqueue_head(&pid->wait_pidfd); |
---|
| 266 | + INIT_HLIST_HEAD(&pid->inodes); |
---|
220 | 267 | |
---|
221 | 268 | upid = pid->numbers + ns->level; |
---|
222 | 269 | spin_lock_irq(&pidmap_lock); |
---|
.. | .. |
---|
300 | 347 | *pid_ptr = new; |
---|
301 | 348 | |
---|
302 | 349 | for (tmp = PIDTYPE_MAX; --tmp >= 0; ) |
---|
303 | | - if (!hlist_empty(&pid->tasks[tmp])) |
---|
| 350 | + if (pid_has_task(pid, tmp)) |
---|
304 | 351 | return; |
---|
305 | 352 | |
---|
306 | 353 | free_pid(pid); |
---|
.. | .. |
---|
316 | 363 | { |
---|
317 | 364 | __change_pid(task, type, pid); |
---|
318 | 365 | attach_pid(task, type); |
---|
| 366 | +} |
---|
| 367 | + |
---|
| 368 | +void exchange_tids(struct task_struct *left, struct task_struct *right) |
---|
| 369 | +{ |
---|
| 370 | + struct pid *pid1 = left->thread_pid; |
---|
| 371 | + struct pid *pid2 = right->thread_pid; |
---|
| 372 | + struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; |
---|
| 373 | + struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; |
---|
| 374 | + |
---|
| 375 | + /* Swap the single entry tid lists */ |
---|
| 376 | + hlists_swap_heads_rcu(head1, head2); |
---|
| 377 | + |
---|
| 378 | + /* Swap the per task_struct pid */ |
---|
| 379 | + rcu_assign_pointer(left->thread_pid, pid2); |
---|
| 380 | + rcu_assign_pointer(right->thread_pid, pid1); |
---|
| 381 | + |
---|
| 382 | + /* Swap the cached value */ |
---|
| 383 | + WRITE_ONCE(left->pid, pid_nr(pid2)); |
---|
| 384 | + WRITE_ONCE(right->pid, pid_nr(pid1)); |
---|
319 | 385 | } |
---|
320 | 386 | |
---|
321 | 387 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ |
---|
.. | .. |
---|
355 | 421 | { |
---|
356 | 422 | return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); |
---|
357 | 423 | } |
---|
| 424 | +EXPORT_SYMBOL_GPL(find_task_by_vpid); |
---|
358 | 425 | |
---|
359 | 426 | struct task_struct *find_get_task_by_vpid(pid_t nr) |
---|
360 | 427 | { |
---|
.. | .. |
---|
431 | 498 | rcu_read_lock(); |
---|
432 | 499 | if (!ns) |
---|
433 | 500 | ns = task_active_pid_ns(current); |
---|
434 | | - if (likely(pid_alive(task))) |
---|
435 | | - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); |
---|
| 501 | + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); |
---|
436 | 502 | rcu_read_unlock(); |
---|
437 | 503 | |
---|
438 | 504 | return nr; |
---|
.. | .. |
---|
455 | 521 | return idr_get_next(&ns->idr, &nr); |
---|
456 | 522 | } |
---|
457 | 523 | |
---|
| 524 | +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) |
---|
| 525 | +{ |
---|
| 526 | + struct fd f; |
---|
| 527 | + struct pid *pid; |
---|
| 528 | + |
---|
| 529 | + f = fdget(fd); |
---|
| 530 | + if (!f.file) |
---|
| 531 | + return ERR_PTR(-EBADF); |
---|
| 532 | + |
---|
| 533 | + pid = pidfd_pid(f.file); |
---|
| 534 | + if (!IS_ERR(pid)) { |
---|
| 535 | + get_pid(pid); |
---|
| 536 | + *flags = f.file->f_flags; |
---|
| 537 | + } |
---|
| 538 | + |
---|
| 539 | + fdput(f); |
---|
| 540 | + return pid; |
---|
| 541 | +} |
---|
| 542 | + |
---|
458 | 543 | /** |
---|
459 | 544 | * pidfd_create() - Create a new pid file descriptor. |
---|
460 | 545 | * |
---|
461 | | - * @pid: struct pid that the pidfd will reference |
---|
| 546 | + * @pid: struct pid that the pidfd will reference |
---|
| 547 | + * @flags: flags to pass to the new pidfd |
---|
462 | 548 | * |
---|
463 | 549 | * This creates a new pid file descriptor with the O_CLOEXEC flag set. |
---|
464 | 550 | * |
---|
.. | .. |
---|
468 | 554 | * Return: On success, a cloexec pidfd is returned. |
---|
469 | 555 | * On error, a negative errno number will be returned. |
---|
470 | 556 | */ |
---|
471 | | -static int pidfd_create(struct pid *pid) |
---|
| 557 | +static int pidfd_create(struct pid *pid, unsigned int flags) |
---|
472 | 558 | { |
---|
473 | 559 | int fd; |
---|
474 | 560 | |
---|
475 | 561 | fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), |
---|
476 | | - O_RDWR | O_CLOEXEC); |
---|
| 562 | + flags | O_RDWR | O_CLOEXEC); |
---|
477 | 563 | if (fd < 0) |
---|
478 | 564 | put_pid(pid); |
---|
479 | 565 | |
---|
.. | .. |
---|
498 | 584 | */ |
---|
499 | 585 | SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) |
---|
500 | 586 | { |
---|
501 | | - int fd, ret; |
---|
| 587 | + int fd; |
---|
502 | 588 | struct pid *p; |
---|
503 | 589 | |
---|
504 | | - if (flags) |
---|
| 590 | + if (flags & ~PIDFD_NONBLOCK) |
---|
505 | 591 | return -EINVAL; |
---|
506 | 592 | |
---|
507 | 593 | if (pid <= 0) |
---|
.. | .. |
---|
511 | 597 | if (!p) |
---|
512 | 598 | return -ESRCH; |
---|
513 | 599 | |
---|
514 | | - ret = 0; |
---|
515 | | - rcu_read_lock(); |
---|
516 | | - if (!pid_task(p, PIDTYPE_TGID)) |
---|
517 | | - ret = -EINVAL; |
---|
518 | | - rcu_read_unlock(); |
---|
| 600 | + if (pid_has_task(p, PIDTYPE_TGID)) |
---|
| 601 | + fd = pidfd_create(p, flags); |
---|
| 602 | + else |
---|
| 603 | + fd = -EINVAL; |
---|
519 | 604 | |
---|
520 | | - fd = ret ?: pidfd_create(p); |
---|
521 | 605 | put_pid(p); |
---|
522 | 606 | return fd; |
---|
523 | 607 | } |
---|
.. | .. |
---|
539 | 623 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
---|
540 | 624 | SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); |
---|
541 | 625 | } |
---|
| 626 | + |
---|
| 627 | +static struct file *__pidfd_fget(struct task_struct *task, int fd) |
---|
| 628 | +{ |
---|
| 629 | + struct file *file; |
---|
| 630 | + int ret; |
---|
| 631 | + |
---|
| 632 | + ret = down_read_killable(&task->signal->exec_update_lock); |
---|
| 633 | + if (ret) |
---|
| 634 | + return ERR_PTR(ret); |
---|
| 635 | + |
---|
| 636 | + if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) |
---|
| 637 | + file = fget_task(task, fd); |
---|
| 638 | + else |
---|
| 639 | + file = ERR_PTR(-EPERM); |
---|
| 640 | + |
---|
| 641 | + up_read(&task->signal->exec_update_lock); |
---|
| 642 | + |
---|
| 643 | + return file ?: ERR_PTR(-EBADF); |
---|
| 644 | +} |
---|
| 645 | + |
---|
| 646 | +static int pidfd_getfd(struct pid *pid, int fd) |
---|
| 647 | +{ |
---|
| 648 | + struct task_struct *task; |
---|
| 649 | + struct file *file; |
---|
| 650 | + int ret; |
---|
| 651 | + |
---|
| 652 | + task = get_pid_task(pid, PIDTYPE_PID); |
---|
| 653 | + if (!task) |
---|
| 654 | + return -ESRCH; |
---|
| 655 | + |
---|
| 656 | + file = __pidfd_fget(task, fd); |
---|
| 657 | + put_task_struct(task); |
---|
| 658 | + if (IS_ERR(file)) |
---|
| 659 | + return PTR_ERR(file); |
---|
| 660 | + |
---|
| 661 | + ret = receive_fd(file, O_CLOEXEC); |
---|
| 662 | + fput(file); |
---|
| 663 | + |
---|
| 664 | + return ret; |
---|
| 665 | +} |
---|
| 666 | + |
---|
| 667 | +/** |
---|
| 668 | + * sys_pidfd_getfd() - Get a file descriptor from another process |
---|
| 669 | + * |
---|
| 670 | + * @pidfd: the pidfd file descriptor of the process |
---|
| 671 | + * @fd: the file descriptor number to get |
---|
| 672 | + * @flags: flags on how to get the fd (reserved) |
---|
| 673 | + * |
---|
| 674 | + * This syscall gets a copy of a file descriptor from another process |
---|
| 675 | + * based on the pidfd, and file descriptor number. It requires that |
---|
| 676 | + * the calling process has the ability to ptrace the process represented |
---|
| 677 | + * by the pidfd. The process which is having its file descriptor copied |
---|
| 678 | + * is otherwise unaffected. |
---|
| 679 | + * |
---|
| 680 | + * Return: On success, a cloexec file descriptor is returned. |
---|
| 681 | + * On error, a negative errno number will be returned. |
---|
| 682 | + */ |
---|
| 683 | +SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, |
---|
| 684 | + unsigned int, flags) |
---|
| 685 | +{ |
---|
| 686 | + struct pid *pid; |
---|
| 687 | + struct fd f; |
---|
| 688 | + int ret; |
---|
| 689 | + |
---|
| 690 | + /* flags is currently unused - make sure it's unset */ |
---|
| 691 | + if (flags) |
---|
| 692 | + return -EINVAL; |
---|
| 693 | + |
---|
| 694 | + f = fdget(pidfd); |
---|
| 695 | + if (!f.file) |
---|
| 696 | + return -EBADF; |
---|
| 697 | + |
---|
| 698 | + pid = pidfd_pid(f.file); |
---|
| 699 | + if (IS_ERR(pid)) |
---|
| 700 | + ret = PTR_ERR(pid); |
---|
| 701 | + else |
---|
| 702 | + ret = pidfd_getfd(pid, fd); |
---|
| 703 | + |
---|
| 704 | + fdput(f); |
---|
| 705 | + return ret; |
---|
| 706 | +} |
---|