hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/kernel/pid.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Generic pidhash and scalable, time-bounded PID allocator
34 *
....@@ -31,20 +32,21 @@
3132 #include <linux/slab.h>
3233 #include <linux/init.h>
3334 #include <linux/rculist.h>
34
-#include <linux/bootmem.h>
35
-#include <linux/hash.h>
35
+#include <linux/memblock.h>
3636 #include <linux/pid_namespace.h>
3737 #include <linux/init_task.h>
3838 #include <linux/syscalls.h>
3939 #include <linux/proc_ns.h>
40
-#include <linux/proc_fs.h>
40
+#include <linux/refcount.h>
4141 #include <linux/anon_inodes.h>
4242 #include <linux/sched/signal.h>
4343 #include <linux/sched/task.h>
4444 #include <linux/idr.h>
45
+#include <net/sock.h>
46
+#include <uapi/linux/pidfd.h>
4547
4648 struct pid init_struct_pid = {
47
- .count = ATOMIC_INIT(1),
49
+ .count = REFCOUNT_INIT(1),
4850 .tasks = {
4951 { .first = NULL },
5052 { .first = NULL },
....@@ -108,8 +110,7 @@
108110 return;
109111
110112 ns = pid->numbers[pid->level].ns;
111
- if ((atomic_read(&pid->count) == 1) ||
112
- atomic_dec_and_test(&pid->count)) {
113
+ if (refcount_dec_and_test(&pid->count)) {
113114 kmem_cache_free(ns->pid_cachep, pid);
114115 put_pid_ns(ns);
115116 }
....@@ -145,9 +146,6 @@
145146 /* Handle a fork failure of the first process */
146147 WARN_ON(ns->child_reaper);
147148 ns->pid_allocated = 0;
148
- /* fall through */
149
- case 0:
150
- schedule_work(&ns->proc_work);
151149 break;
152150 }
153151
....@@ -158,7 +156,8 @@
158156 call_rcu(&pid->rcu, delayed_put_pid);
159157 }
160158
161
-struct pid *alloc_pid(struct pid_namespace *ns)
159
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
160
+ size_t set_tid_size)
162161 {
163162 struct pid *pid;
164163 enum pid_type type;
....@@ -166,6 +165,17 @@
166165 struct pid_namespace *tmp;
167166 struct upid *upid;
168167 int retval = -ENOMEM;
168
+
169
+ /*
170
+ * set_tid_size contains the size of the set_tid array. Starting at
171
+ * the most nested currently active PID namespace it tells alloc_pid()
172
+ * which PID to set for a process in that most nested PID namespace
173
+ * up to set_tid_size PID namespaces. It does not have to set the PID
174
+ * for a process in all nested PID namespaces but set_tid_size must
175
+ * never be greater than the current ns->level + 1.
176
+ */
177
+ if (set_tid_size > ns->level + 1)
178
+ return ERR_PTR(-EINVAL);
169179
170180 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
171181 if (!pid)
....@@ -175,24 +185,54 @@
175185 pid->level = ns->level;
176186
177187 for (i = ns->level; i >= 0; i--) {
178
- int pid_min = 1;
188
+ int tid = 0;
189
+
190
+ if (set_tid_size) {
191
+ tid = set_tid[ns->level - i];
192
+
193
+ retval = -EINVAL;
194
+ if (tid < 1 || tid >= pid_max)
195
+ goto out_free;
196
+ /*
197
+ * Also fail if a PID != 1 is requested and
198
+ * no PID 1 exists.
199
+ */
200
+ if (tid != 1 && !tmp->child_reaper)
201
+ goto out_free;
202
+ retval = -EPERM;
203
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
204
+ goto out_free;
205
+ set_tid_size--;
206
+ }
179207
180208 idr_preload(GFP_KERNEL);
181209 spin_lock_irq(&pidmap_lock);
182210
183
- /*
184
- * init really needs pid 1, but after reaching the maximum
185
- * wrap back to RESERVED_PIDS
186
- */
187
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
188
- pid_min = RESERVED_PIDS;
211
+ if (tid) {
212
+ nr = idr_alloc(&tmp->idr, NULL, tid,
213
+ tid + 1, GFP_ATOMIC);
214
+ /*
215
+ * If ENOSPC is returned it means that the PID is
216
+ * already in use. Return EEXIST in that case.
217
+ */
218
+ if (nr == -ENOSPC)
219
+ nr = -EEXIST;
220
+ } else {
221
+ int pid_min = 1;
222
+ /*
223
+ * init really needs pid 1, but after reaching the
224
+ * maximum wrap back to RESERVED_PIDS
225
+ */
226
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
227
+ pid_min = RESERVED_PIDS;
189228
190
- /*
191
- * Store a null pointer so find_pid_ns does not find
192
- * a partially initialized PID (see below).
193
- */
194
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
195
- pid_max, GFP_ATOMIC);
229
+ /*
230
+ * Store a null pointer so find_pid_ns does not find
231
+ * a partially initialized PID (see below).
232
+ */
233
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
234
+ pid_max, GFP_ATOMIC);
235
+ }
196236 spin_unlock_irq(&pidmap_lock);
197237 idr_preload_end();
198238
....@@ -206,17 +246,24 @@
206246 tmp = tmp->parent;
207247 }
208248
209
- if (unlikely(is_child_reaper(pid))) {
210
- if (pid_ns_prepare_proc(ns))
211
- goto out_free;
212
- }
249
+ /*
250
+ * ENOMEM is not the most obvious choice especially for the case
251
+ * where the child subreaper has already exited and the pid
252
+ * namespace denies the creation of any new processes. But ENOMEM
253
+ * is what we have exposed to userspace for a long time and it is
254
+ * documented behavior for pid namespaces. So we can't easily
255
+ * change it even if there were an error code better suited.
256
+ */
257
+ retval = -ENOMEM;
213258
214259 get_pid_ns(ns);
215
- atomic_set(&pid->count, 1);
260
+ refcount_set(&pid->count, 1);
261
+ spin_lock_init(&pid->lock);
216262 for (type = 0; type < PIDTYPE_MAX; ++type)
217263 INIT_HLIST_HEAD(&pid->tasks[type]);
218264
219265 init_waitqueue_head(&pid->wait_pidfd);
266
+ INIT_HLIST_HEAD(&pid->inodes);
220267
221268 upid = pid->numbers + ns->level;
222269 spin_lock_irq(&pidmap_lock);
....@@ -300,7 +347,7 @@
300347 *pid_ptr = new;
301348
302349 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
303
- if (!hlist_empty(&pid->tasks[tmp]))
350
+ if (pid_has_task(pid, tmp))
304351 return;
305352
306353 free_pid(pid);
....@@ -316,6 +363,25 @@
316363 {
317364 __change_pid(task, type, pid);
318365 attach_pid(task, type);
366
+}
367
+
368
+void exchange_tids(struct task_struct *left, struct task_struct *right)
369
+{
370
+ struct pid *pid1 = left->thread_pid;
371
+ struct pid *pid2 = right->thread_pid;
372
+ struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
373
+ struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
374
+
375
+ /* Swap the single entry tid lists */
376
+ hlists_swap_heads_rcu(head1, head2);
377
+
378
+ /* Swap the per task_struct pid */
379
+ rcu_assign_pointer(left->thread_pid, pid2);
380
+ rcu_assign_pointer(right->thread_pid, pid1);
381
+
382
+ /* Swap the cached value */
383
+ WRITE_ONCE(left->pid, pid_nr(pid2));
384
+ WRITE_ONCE(right->pid, pid_nr(pid1));
319385 }
320386
321387 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
....@@ -355,6 +421,7 @@
355421 {
356422 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
357423 }
424
+EXPORT_SYMBOL_GPL(find_task_by_vpid);
358425
359426 struct task_struct *find_get_task_by_vpid(pid_t nr)
360427 {
....@@ -431,8 +498,7 @@
431498 rcu_read_lock();
432499 if (!ns)
433500 ns = task_active_pid_ns(current);
434
- if (likely(pid_alive(task)))
435
- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
501
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
436502 rcu_read_unlock();
437503
438504 return nr;
....@@ -455,10 +521,30 @@
455521 return idr_get_next(&ns->idr, &nr);
456522 }
457523
524
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
525
+{
526
+ struct fd f;
527
+ struct pid *pid;
528
+
529
+ f = fdget(fd);
530
+ if (!f.file)
531
+ return ERR_PTR(-EBADF);
532
+
533
+ pid = pidfd_pid(f.file);
534
+ if (!IS_ERR(pid)) {
535
+ get_pid(pid);
536
+ *flags = f.file->f_flags;
537
+ }
538
+
539
+ fdput(f);
540
+ return pid;
541
+}
542
+
458543 /**
459544 * pidfd_create() - Create a new pid file descriptor.
460545 *
461
- * @pid: struct pid that the pidfd will reference
546
+ * @pid: struct pid that the pidfd will reference
547
+ * @flags: flags to pass
462548 *
463549 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
464550 *
....@@ -468,12 +554,12 @@
468554 * Return: On success, a cloexec pidfd is returned.
469555 * On error, a negative errno number will be returned.
470556 */
471
-static int pidfd_create(struct pid *pid)
557
+static int pidfd_create(struct pid *pid, unsigned int flags)
472558 {
473559 int fd;
474560
475561 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
476
- O_RDWR | O_CLOEXEC);
562
+ flags | O_RDWR | O_CLOEXEC);
477563 if (fd < 0)
478564 put_pid(pid);
479565
....@@ -498,10 +584,10 @@
498584 */
499585 SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
500586 {
501
- int fd, ret;
587
+ int fd;
502588 struct pid *p;
503589
504
- if (flags)
590
+ if (flags & ~PIDFD_NONBLOCK)
505591 return -EINVAL;
506592
507593 if (pid <= 0)
....@@ -511,13 +597,11 @@
511597 if (!p)
512598 return -ESRCH;
513599
514
- ret = 0;
515
- rcu_read_lock();
516
- if (!pid_task(p, PIDTYPE_TGID))
517
- ret = -EINVAL;
518
- rcu_read_unlock();
600
+ if (pid_has_task(p, PIDTYPE_TGID))
601
+ fd = pidfd_create(p, flags);
602
+ else
603
+ fd = -EINVAL;
519604
520
- fd = ret ?: pidfd_create(p);
521605 put_pid(p);
522606 return fd;
523607 }
....@@ -539,3 +623,84 @@
539623 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
540624 SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
541625 }
626
+
627
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
628
+{
629
+ struct file *file;
630
+ int ret;
631
+
632
+ ret = down_read_killable(&task->signal->exec_update_lock);
633
+ if (ret)
634
+ return ERR_PTR(ret);
635
+
636
+ if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
637
+ file = fget_task(task, fd);
638
+ else
639
+ file = ERR_PTR(-EPERM);
640
+
641
+ up_read(&task->signal->exec_update_lock);
642
+
643
+ return file ?: ERR_PTR(-EBADF);
644
+}
645
+
646
+static int pidfd_getfd(struct pid *pid, int fd)
647
+{
648
+ struct task_struct *task;
649
+ struct file *file;
650
+ int ret;
651
+
652
+ task = get_pid_task(pid, PIDTYPE_PID);
653
+ if (!task)
654
+ return -ESRCH;
655
+
656
+ file = __pidfd_fget(task, fd);
657
+ put_task_struct(task);
658
+ if (IS_ERR(file))
659
+ return PTR_ERR(file);
660
+
661
+ ret = receive_fd(file, O_CLOEXEC);
662
+ fput(file);
663
+
664
+ return ret;
665
+}
666
+
667
+/**
668
+ * sys_pidfd_getfd() - Get a file descriptor from another process
669
+ *
670
+ * @pidfd: the pidfd file descriptor of the process
671
+ * @fd: the file descriptor number to get
672
+ * @flags: flags on how to get the fd (reserved)
673
+ *
674
+ * This syscall gets a copy of a file descriptor from another process
675
+ * based on the pidfd, and file descriptor number. It requires that
676
+ * the calling process has the ability to ptrace the process represented
677
+ * by the pidfd. The process which is having its file descriptor copied
678
+ * is otherwise unaffected.
679
+ *
680
+ * Return: On success, a cloexec file descriptor is returned.
681
+ * On error, a negative errno number will be returned.
682
+ */
683
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
684
+ unsigned int, flags)
685
+{
686
+ struct pid *pid;
687
+ struct fd f;
688
+ int ret;
689
+
690
+ /* flags is currently unused - make sure it's unset */
691
+ if (flags)
692
+ return -EINVAL;
693
+
694
+ f = fdget(pidfd);
695
+ if (!f.file)
696
+ return -EBADF;
697
+
698
+ pid = pidfd_pid(f.file);
699
+ if (IS_ERR(pid))
700
+ ret = PTR_ERR(pid);
701
+ else
702
+ ret = pidfd_getfd(pid, fd);
703
+
704
+ fdput(f);
705
+ return ret;
706
+}