hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/kernel/pid_namespace.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Pid namespaces
34 *
....@@ -25,8 +26,6 @@
2526
2627 static DEFINE_MUTEX(pid_caches_mutex);
2728 static struct kmem_cache *pid_ns_cachep;
28
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
29
-#define MAX_PID_NS_LEVEL 32
3029 /* Write once array, filled from the beginning. */
3130 static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
3231
....@@ -57,12 +56,6 @@
5756 mutex_unlock(&pid_caches_mutex);
5857 /* current can fail, but someone else can succeed. */
5958 return READ_ONCE(*pkc);
60
-}
61
-
62
-static void proc_cleanup_work(struct work_struct *work)
63
-{
64
- struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
65
- pid_ns_release_proc(ns);
6659 }
6760
6861 static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
....@@ -116,7 +109,6 @@
116109 ns->user_ns = get_user_ns(user_ns);
117110 ns->ucounts = ucounts;
118111 ns->pid_allocated = PIDNS_ADDING;
119
- INIT_WORK(&ns->proc_work, proc_cleanup_work);
120112
121113 return ns;
122114
....@@ -217,7 +209,7 @@
217209 idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
218210 task = pid_task(pid, PIDTYPE_PID);
219211 if (task && !__fatal_signal_pending(task))
220
- send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
212
+ group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
221213 }
222214 read_unlock(&tasklist_lock);
223215 rcu_read_unlock();
....@@ -233,26 +225,50 @@
233225 } while (rc != -ECHILD);
234226
235227 /*
236
- * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
237
- * really care, we could reparent them to the global init. We could
238
- * exit and reap ->child_reaper even if it is not the last thread in
239
- * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
240
- * pid_ns can not go away until proc_kill_sb() drops the reference.
228
+ * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
229
+ * processes whose parents are outside of the pid
230
+ * namespace. Such processes are created with setns()+fork().
241231 *
242
- * But this ns can also have other tasks injected by setns()+fork().
243
- * Again, ignoring the user visible semantics we do not really need
244
- * to wait until they are all reaped, but they can be reparented to
245
- * us and thus we need to ensure that pid->child_reaper stays valid
246
- * until they all go away. See free_pid()->wake_up_process().
232
+ * If those EXIT_ZOMBIE processes are not reaped by their
233
+ * parents before their parents exit, they will be reparented
234
+ * to pid_ns->child_reaper. Thus pid_ns->child_reaper needs to
235
+ * stay valid until they all go away.
247236 *
248
- * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
249
- * if reparented.
237
+ * The code relies on the pid_ns->child_reaper ignoring
238
+ * SIGCHLD to cause those EXIT_ZOMBIE processes to be
239
+ * autoreaped if reparented.
240
+ *
241
+ * Semantically it is also desirable to wait for EXIT_ZOMBIE
242
+ * processes before allowing the child_reaper to be reaped, as
243
+ * that gives the invariant that when the init process of a
244
+ * pid namespace is reaped all of the processes in the pid
245
+ * namespace are gone.
246
+ *
247
+ * Once all of the other tasks are gone from the pid_namespace
248
+ * free_pid() will awaken this task.
250249 */
251250 for (;;) {
252251 set_current_state(TASK_INTERRUPTIBLE);
253252 if (pid_ns->pid_allocated == init_pids)
254253 break;
254
+ /*
255
+ * Release tasks_rcu_exit_srcu to avoid following deadlock:
256
+ *
257
+ * 1) TASK A unshare(CLONE_NEWPID)
258
+ * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
259
+ * and TASK C
260
+ * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
261
+ * 4) TASK A calls synchronize_rcu_tasks()
262
+ * -> synchronize_srcu(tasks_rcu_exit_srcu)
263
+ * 5) *DEADLOCK*
264
+ *
265
+ * It is considered safe to release tasks_rcu_exit_srcu here
266
+ * because we assume the current task can not be concurrently
267
+ * reaped at this point.
268
+ */
269
+ exit_tasks_rcu_stop();
255270 schedule();
271
+ exit_tasks_rcu_start();
256272 }
257273 __set_current_state(TASK_RUNNING);
258274
....@@ -265,13 +281,13 @@
265281
266282 #ifdef CONFIG_CHECKPOINT_RESTORE
267283 static int pid_ns_ctl_handler(struct ctl_table *table, int write,
268
- void __user *buffer, size_t *lenp, loff_t *ppos)
284
+ void *buffer, size_t *lenp, loff_t *ppos)
269285 {
270286 struct pid_namespace *pid_ns = task_active_pid_ns(current);
271287 struct ctl_table tmp = *table;
272288 int ret, next;
273289
274
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
290
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
275291 return -EPERM;
276292
277293 /*
....@@ -291,14 +307,13 @@
291307 }
292308
293309 extern int pid_max;
294
-static int zero = 0;
295310 static struct ctl_table pid_ns_ctl_table[] = {
296311 {
297312 .procname = "ns_last_pid",
298313 .maxlen = sizeof(int),
299314 .mode = 0666, /* permissions are checked in the handler */
300315 .proc_handler = pid_ns_ctl_handler,
301
- .extra1 = &zero,
316
+ .extra1 = SYSCTL_ZERO,
302317 .extra2 = &pid_max,
303318 },
304319 { }
....@@ -381,13 +396,14 @@
381396 put_pid_ns(to_pid_ns(ns));
382397 }
383398
384
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
399
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
385400 {
401
+ struct nsproxy *nsproxy = nsset->nsproxy;
386402 struct pid_namespace *active = task_active_pid_ns(current);
387403 struct pid_namespace *ancestor, *new = to_pid_ns(ns);
388404
389405 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
390
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
406
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
391407 return -EPERM;
392408
393409 /*