.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Pid namespaces |
---|
3 | 4 | * |
---|
.. | .. |
---|
25 | 26 | |
---|
26 | 27 | static DEFINE_MUTEX(pid_caches_mutex); |
---|
27 | 28 | static struct kmem_cache *pid_ns_cachep; |
---|
28 | | -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
---|
29 | | -#define MAX_PID_NS_LEVEL 32 |
---|
30 | 29 | /* Write once array, filled from the beginning. */ |
---|
31 | 30 | static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; |
---|
32 | 31 | |
---|
.. | .. |
---|
57 | 56 | mutex_unlock(&pid_caches_mutex); |
---|
58 | 57 | /* current can fail, but someone else can succeed. */ |
---|
59 | 58 | return READ_ONCE(*pkc); |
---|
60 | | -} |
---|
61 | | - |
---|
62 | | -static void proc_cleanup_work(struct work_struct *work) |
---|
63 | | -{ |
---|
64 | | - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); |
---|
65 | | - pid_ns_release_proc(ns); |
---|
66 | 59 | } |
---|
67 | 60 | |
---|
68 | 61 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) |
---|
.. | .. |
---|
116 | 109 | ns->user_ns = get_user_ns(user_ns); |
---|
117 | 110 | ns->ucounts = ucounts; |
---|
118 | 111 | ns->pid_allocated = PIDNS_ADDING; |
---|
119 | | - INIT_WORK(&ns->proc_work, proc_cleanup_work); |
---|
120 | 112 | |
---|
121 | 113 | return ns; |
---|
122 | 114 | |
---|
.. | .. |
---|
217 | 209 | idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { |
---|
218 | 210 | task = pid_task(pid, PIDTYPE_PID); |
---|
219 | 211 | if (task && !__fatal_signal_pending(task)) |
---|
220 | | - send_sig_info(SIGKILL, SEND_SIG_FORCED, task); |
---|
| 212 | + group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); |
---|
221 | 213 | } |
---|
222 | 214 | read_unlock(&tasklist_lock); |
---|
223 | 215 | rcu_read_unlock(); |
---|
.. | .. |
---|
233 | 225 | } while (rc != -ECHILD); |
---|
234 | 226 | |
---|
235 | 227 | /* |
---|
236 | | - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not |
---|
237 | | - * really care, we could reparent them to the global init. We could |
---|
238 | | - * exit and reap ->child_reaper even if it is not the last thread in |
---|
239 | | - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), |
---|
240 | | - * pid_ns can not go away until proc_kill_sb() drops the reference. |
---|
| 228 | + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE |
---|
| 229 | + * processes whose parent processes are outside of the pid |
---|
| 230 | + * namespace. Such processes are created with setns()+fork(). |
---|
241 | 231 | * |
---|
242 | | - * But this ns can also have other tasks injected by setns()+fork(). |
---|
243 | | - * Again, ignoring the user visible semantics we do not really need |
---|
244 | | - * to wait until they are all reaped, but they can be reparented to |
---|
245 | | - * us and thus we need to ensure that pid->child_reaper stays valid |
---|
246 | | - * until they all go away. See free_pid()->wake_up_process(). |
---|
| 232 | + * If those EXIT_ZOMBIE processes are not reaped by their |
---|
| 233 | + * parents before their parents exit, they will be reparented |
---|
| 234 | + * to pid_ns->child_reaper. Thus pid_ns->child_reaper needs to |
---|
| 235 | + * stay valid until they all go away. |
---|
247 | 236 | * |
---|
248 | | - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped |
---|
249 | | - * if reparented. |
---|
| 237 | + * The code relies on the pid_ns->child_reaper ignoring |
---|
| 238 | + * SIGCHLD to cause those EXIT_ZOMBIE processes to be |
---|
| 239 | + * autoreaped if reparented. |
---|
| 240 | + * |
---|
| 241 | + * Semantically it is also desirable to wait for EXIT_ZOMBIE |
---|
| 242 | + * processes before allowing the child_reaper to be reaped, as |
---|
| 243 | + * that gives the invariant that when the init process of a |
---|
| 244 | + * pid namespace is reaped all of the processes in the pid |
---|
| 245 | + * namespace are gone. |
---|
| 246 | + * |
---|
| 247 | + * Once all of the other tasks are gone from the pid_namespace |
---|
| 248 | + * free_pid() will awaken this task. |
---|
250 | 249 | */ |
---|
251 | 250 | for (;;) { |
---|
252 | 251 | set_current_state(TASK_INTERRUPTIBLE); |
---|
253 | 252 | if (pid_ns->pid_allocated == init_pids) |
---|
254 | 253 | break; |
---|
| 254 | + /* |
---|
| 255 | + * Release tasks_rcu_exit_srcu to avoid the following deadlock: |
---|
| 256 | + * |
---|
| 257 | + * 1) TASK A unshare(CLONE_NEWPID) |
---|
| 258 | + * 2) TASK A fork() twice -> TASK B (child reaper for new ns) |
---|
| 259 | + * and TASK C |
---|
| 260 | + * 3) TASK B exits, kills TASK C, waits for TASK A to reap it |
---|
| 261 | + * 4) TASK A calls synchronize_rcu_tasks() |
---|
| 262 | + * -> synchronize_srcu(tasks_rcu_exit_srcu) |
---|
| 263 | + * 5) *DEADLOCK* |
---|
| 264 | + * |
---|
| 265 | + * It is considered safe to release tasks_rcu_exit_srcu here |
---|
| 266 | + * because we assume the current task can not be concurrently |
---|
| 267 | + * reaped at this point. |
---|
| 268 | + */ |
---|
| 269 | + exit_tasks_rcu_stop(); |
---|
255 | 270 | schedule(); |
---|
| 271 | + exit_tasks_rcu_start(); |
---|
256 | 272 | } |
---|
257 | 273 | __set_current_state(TASK_RUNNING); |
---|
258 | 274 | |
---|
.. | .. |
---|
265 | 281 | |
---|
266 | 282 | #ifdef CONFIG_CHECKPOINT_RESTORE |
---|
267 | 283 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
---|
268 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
---|
| 284 | + void *buffer, size_t *lenp, loff_t *ppos) |
---|
269 | 285 | { |
---|
270 | 286 | struct pid_namespace *pid_ns = task_active_pid_ns(current); |
---|
271 | 287 | struct ctl_table tmp = *table; |
---|
272 | 288 | int ret, next; |
---|
273 | 289 | |
---|
274 | | - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) |
---|
| 290 | + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) |
---|
275 | 291 | return -EPERM; |
---|
276 | 292 | |
---|
277 | 293 | /* |
---|
.. | .. |
---|
291 | 307 | } |
---|
292 | 308 | |
---|
293 | 309 | extern int pid_max; |
---|
294 | | -static int zero = 0; |
---|
295 | 310 | static struct ctl_table pid_ns_ctl_table[] = { |
---|
296 | 311 | { |
---|
297 | 312 | .procname = "ns_last_pid", |
---|
298 | 313 | .maxlen = sizeof(int), |
---|
299 | 314 | .mode = 0666, /* permissions are checked in the handler */ |
---|
300 | 315 | .proc_handler = pid_ns_ctl_handler, |
---|
301 | | - .extra1 = &zero, |
---|
| 316 | + .extra1 = SYSCTL_ZERO, |
---|
302 | 317 | .extra2 = &pid_max, |
---|
303 | 318 | }, |
---|
304 | 319 | { } |
---|
.. | .. |
---|
381 | 396 | put_pid_ns(to_pid_ns(ns)); |
---|
382 | 397 | } |
---|
383 | 398 | |
---|
384 | | -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
---|
| 399 | +static int pidns_install(struct nsset *nsset, struct ns_common *ns) |
---|
385 | 400 | { |
---|
| 401 | + struct nsproxy *nsproxy = nsset->nsproxy; |
---|
386 | 402 | struct pid_namespace *active = task_active_pid_ns(current); |
---|
387 | 403 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); |
---|
388 | 404 | |
---|
389 | 405 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
---|
390 | | - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
---|
| 406 | + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) |
---|
391 | 407 | return -EPERM; |
---|
392 | 408 | |
---|
393 | 409 | /* |
---|