.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Pid namespaces |
---|
3 | 4 | * |
---|
.. | .. |
---|
25 | 26 | |
---|
26 | 27 | static DEFINE_MUTEX(pid_caches_mutex); |
---|
27 | 28 | static struct kmem_cache *pid_ns_cachep; |
---|
28 | | -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
---|
29 | | -#define MAX_PID_NS_LEVEL 32 |
---|
30 | 29 | /* Write once array, filled from the beginning. */ |
---|
31 | 30 | static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; |
---|
32 | 31 | |
---|
.. | .. |
---|
57 | 56 | mutex_unlock(&pid_caches_mutex); |
---|
58 | 57 | /* current can fail, but someone else can succeed. */ |
---|
59 | 58 | return READ_ONCE(*pkc); |
---|
60 | | -} |
---|
61 | | - |
---|
62 | | -static void proc_cleanup_work(struct work_struct *work) |
---|
63 | | -{ |
---|
64 | | - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); |
---|
65 | | - pid_ns_release_proc(ns); |
---|
66 | 59 | } |
---|
67 | 60 | |
---|
68 | 61 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) |
---|
.. | .. |
---|
116 | 109 | ns->user_ns = get_user_ns(user_ns); |
---|
117 | 110 | ns->ucounts = ucounts; |
---|
118 | 111 | ns->pid_allocated = PIDNS_ADDING; |
---|
119 | | - INIT_WORK(&ns->proc_work, proc_cleanup_work); |
---|
120 | 112 | |
---|
121 | 113 | return ns; |
---|
122 | 114 | |
---|
.. | .. |
---|
217 | 209 | idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { |
---|
218 | 210 | task = pid_task(pid, PIDTYPE_PID); |
---|
219 | 211 | if (task && !__fatal_signal_pending(task)) |
---|
220 | | - send_sig_info(SIGKILL, SEND_SIG_FORCED, task); |
---|
| 212 | + group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); |
---|
221 | 213 | } |
---|
222 | 214 | read_unlock(&tasklist_lock); |
---|
223 | 215 | rcu_read_unlock(); |
---|
.. | .. |
---|
233 | 225 | } while (rc != -ECHILD); |
---|
234 | 226 | |
---|
235 | 227 | /* |
---|
236 | | - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not |
---|
237 | | - * really care, we could reparent them to the global init. We could |
---|
238 | | - * exit and reap ->child_reaper even if it is not the last thread in |
---|
239 | | - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), |
---|
240 | | - * pid_ns can not go away until proc_kill_sb() drops the reference. |
---|
| 228 | + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE |
---|
| 229 | + * processes whose parent processes are outside of the pid |
---|
| 230 | + * namespace. Such processes are created with setns()+fork(). |
---|
241 | 231 | * |
---|
242 | | - * But this ns can also have other tasks injected by setns()+fork(). |
---|
243 | | - * Again, ignoring the user visible semantics we do not really need |
---|
244 | | - * to wait until they are all reaped, but they can be reparented to |
---|
245 | | - * us and thus we need to ensure that pid->child_reaper stays valid |
---|
246 | | - * until they all go away. See free_pid()->wake_up_process(). |
---|
| 232 | + * If those EXIT_ZOMBIE processes are not reaped by their |
---|
| 233 | + * parents before their parents exit, they will be reparented |
---|
| 234 | + * to pid_ns->child_reaper. Thus pid_ns->child_reaper needs to |
---|
| 235 | + * stay valid until they all go away. |
---|
247 | 236 | * |
---|
248 | | - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped |
---|
249 | | - * if reparented. |
---|
| 237 | + * The code relies on the pid_ns->child_reaper ignoring |
---|
| 238 | + * SIGCHLD to cause those EXIT_ZOMBIE processes to be |
---|
| 239 | + * autoreaped if reparented. |
---|
| 240 | + * |
---|
| 241 | + * Semantically it is also desirable to wait for EXIT_ZOMBIE |
---|
| 242 | + * processes before allowing the child_reaper to be reaped, as |
---|
| 243 | + * that gives the invariant that when the init process of a |
---|
| 244 | + * pid namespace is reaped all of the processes in the pid |
---|
| 245 | + * namespace are gone. |
---|
| 246 | + * |
---|
| 247 | + * Once all of the other tasks are gone from the pid_namespace |
---|
| 248 | + * free_pid() will awaken this task. |
---|
250 | 249 | */ |
---|
251 | 250 | for (;;) { |
---|
252 | 251 | set_current_state(TASK_INTERRUPTIBLE); |
---|
.. | .. |
---|
265 | 264 | |
---|
266 | 265 | #ifdef CONFIG_CHECKPOINT_RESTORE |
---|
267 | 266 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
---|
268 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 267 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
269 | 268 | { |
---|
270 | 269 | struct pid_namespace *pid_ns = task_active_pid_ns(current); |
---|
271 | 270 | struct ctl_table tmp = *table; |
---|
272 | 271 | int ret, next; |
---|
273 | 272 | |
---|
274 | | - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) |
---|
| 273 | + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) |
---|
275 | 274 | return -EPERM; |
---|
276 | 275 | |
---|
277 | 276 | /* |
---|
.. | .. |
---|
291 | 290 | } |
---|
292 | 291 | |
---|
293 | 292 | extern int pid_max; |
---|
294 | | -static int zero = 0; |
---|
295 | 293 | static struct ctl_table pid_ns_ctl_table[] = { |
---|
296 | 294 | { |
---|
297 | 295 | .procname = "ns_last_pid", |
---|
298 | 296 | .maxlen = sizeof(int), |
---|
299 | 297 | .mode = 0666, /* permissions are checked in the handler */ |
---|
300 | 298 | .proc_handler = pid_ns_ctl_handler, |
---|
301 | | - .extra1 = &zero, |
---|
| 299 | + .extra1 = SYSCTL_ZERO, |
---|
302 | 300 | .extra2 = &pid_max, |
---|
303 | 301 | }, |
---|
304 | 302 | { } |
---|
.. | .. |
---|
381 | 379 | put_pid_ns(to_pid_ns(ns)); |
---|
382 | 380 | } |
---|
383 | 381 | |
---|
384 | | -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
---|
| 382 | +static int pidns_install(struct nsset *nsset, struct ns_common *ns) |
---|
385 | 383 | { |
---|
| 384 | + struct nsproxy *nsproxy = nsset->nsproxy; |
---|
386 | 385 | struct pid_namespace *active = task_active_pid_ns(current); |
---|
387 | 386 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); |
---|
388 | 387 | |
---|
389 | 388 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
---|
390 | | - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
---|
| 389 | + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) |
---|
391 | 390 | return -EPERM; |
---|
392 | 391 | |
---|
393 | 392 | /* |
---|