| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * Pid namespaces |
|---|
| 3 | 4 | * |
|---|
| .. | .. |
|---|
| 25 | 26 | |
|---|
| 26 | 27 | static DEFINE_MUTEX(pid_caches_mutex); |
|---|
| 27 | 28 | static struct kmem_cache *pid_ns_cachep; |
|---|
| 28 | | -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ |
|---|
| 29 | | -#define MAX_PID_NS_LEVEL 32 |
|---|
| 30 | 29 | /* Write once array, filled from the beginning. */ |
|---|
| 31 | 30 | static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; |
|---|
| 32 | 31 | |
|---|
| .. | .. |
|---|
| 57 | 56 | mutex_unlock(&pid_caches_mutex); |
|---|
| 58 | 57 | /* current can fail, but someone else can succeed. */ |
|---|
| 59 | 58 | return READ_ONCE(*pkc); |
|---|
| 60 | | -} |
|---|
| 61 | | - |
|---|
| 62 | | -static void proc_cleanup_work(struct work_struct *work) |
|---|
| 63 | | -{ |
|---|
| 64 | | - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); |
|---|
| 65 | | - pid_ns_release_proc(ns); |
|---|
| 66 | 59 | } |
|---|
| 67 | 60 | |
|---|
| 68 | 61 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) |
|---|
| .. | .. |
|---|
| 116 | 109 | ns->user_ns = get_user_ns(user_ns); |
|---|
| 117 | 110 | ns->ucounts = ucounts; |
|---|
| 118 | 111 | ns->pid_allocated = PIDNS_ADDING; |
|---|
| 119 | | - INIT_WORK(&ns->proc_work, proc_cleanup_work); |
|---|
| 120 | 112 | |
|---|
| 121 | 113 | return ns; |
|---|
| 122 | 114 | |
|---|
| .. | .. |
|---|
| 217 | 209 | idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { |
|---|
| 218 | 210 | task = pid_task(pid, PIDTYPE_PID); |
|---|
| 219 | 211 | if (task && !__fatal_signal_pending(task)) |
|---|
| 220 | | - send_sig_info(SIGKILL, SEND_SIG_FORCED, task); |
|---|
| 212 | + group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); |
|---|
| 221 | 213 | } |
|---|
| 222 | 214 | read_unlock(&tasklist_lock); |
|---|
| 223 | 215 | rcu_read_unlock(); |
|---|
| .. | .. |
|---|
| 233 | 225 | } while (rc != -ECHILD); |
|---|
| 234 | 226 | |
|---|
| 235 | 227 | /* |
|---|
| 236 | | - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not |
|---|
| 237 | | - * really care, we could reparent them to the global init. We could |
|---|
| 238 | | - * exit and reap ->child_reaper even if it is not the last thread in |
|---|
| 239 | | - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), |
|---|
| 240 | | - * pid_ns can not go away until proc_kill_sb() drops the reference. |
|---|
| 228 | + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE |
|---|
| 229 | + * processes whose parents are outside of the pid |
|---|
| 230 | + * namespace. Such processes are created with setns()+fork(). |
|---|
| 241 | 231 | * |
|---|
| 242 | | - * But this ns can also have other tasks injected by setns()+fork(). |
|---|
| 243 | | - * Again, ignoring the user visible semantics we do not really need |
|---|
| 244 | | - * to wait until they are all reaped, but they can be reparented to |
|---|
| 245 | | - * us and thus we need to ensure that pid->child_reaper stays valid |
|---|
| 246 | | - * until they all go away. See free_pid()->wake_up_process(). |
|---|
| 232 | + * If those EXIT_ZOMBIE processes are not reaped by their |
|---|
| 233 | + * parents before their parents exit, they will be reparented |
|---|
| 234 | + * to pid_ns->child_reaper. Thus pid_ns->child_reaper needs to |
|---|
| 235 | + * stay valid until they all go away. |
|---|
| 247 | 236 | * |
|---|
| 248 | | - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped |
|---|
| 249 | | - * if reparented. |
|---|
| 237 | + * The code relies on the pid_ns->child_reaper ignoring |
|---|
| 238 | + * SIGCHLD to cause those EXIT_ZOMBIE processes to be |
|---|
| 239 | + * autoreaped if reparented. |
|---|
| 240 | + * |
|---|
| 241 | + * Semantically it is also desirable to wait for EXIT_ZOMBIE |
|---|
| 242 | + * processes before allowing the child_reaper to be reaped, as |
|---|
| 243 | + * that gives the invariant that when the init process of a |
|---|
| 244 | + * pid namespace is reaped all of the processes in the pid |
|---|
| 245 | + * namespace are gone. |
|---|
| 246 | + * |
|---|
| 247 | + * Once all of the other tasks are gone from the pid_namespace |
|---|
| 248 | + * free_pid() will awaken this task. |
|---|
| 250 | 249 | */ |
|---|
| 251 | 250 | for (;;) { |
|---|
| 252 | 251 | set_current_state(TASK_INTERRUPTIBLE); |
|---|
| 253 | 252 | if (pid_ns->pid_allocated == init_pids) |
|---|
| 254 | 253 | break; |
|---|
| 254 | + /* |
|---|
| 255 | + * Release tasks_rcu_exit_srcu to avoid following deadlock: |
|---|
| 256 | + * |
|---|
| 257 | + * 1) TASK A unshare(CLONE_NEWPID) |
|---|
| 258 | + * 2) TASK A fork() twice -> TASK B (child reaper for new ns) |
|---|
| 259 | + * and TASK C |
|---|
| 260 | + * 3) TASK B exits, kills TASK C, waits for TASK A to reap it |
|---|
| 261 | + * 4) TASK A calls synchronize_rcu_tasks() |
|---|
| 262 | + * -> synchronize_srcu(tasks_rcu_exit_srcu) |
|---|
| 263 | + * 5) *DEADLOCK* |
|---|
| 264 | + * |
|---|
| 265 | + * It is considered safe to release tasks_rcu_exit_srcu here |
|---|
| 266 | + * because we assume the current task can not be concurrently |
|---|
| 267 | + * reaped at this point. |
|---|
| 268 | + */ |
|---|
| 269 | + exit_tasks_rcu_stop(); |
|---|
| 255 | 270 | schedule(); |
|---|
| 271 | + exit_tasks_rcu_start(); |
|---|
| 256 | 272 | } |
|---|
| 257 | 273 | __set_current_state(TASK_RUNNING); |
|---|
| 258 | 274 | |
|---|
| .. | .. |
|---|
| 265 | 281 | |
|---|
| 266 | 282 | #ifdef CONFIG_CHECKPOINT_RESTORE |
|---|
| 267 | 283 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, |
|---|
| 268 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 284 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 269 | 285 | { |
|---|
| 270 | 286 | struct pid_namespace *pid_ns = task_active_pid_ns(current); |
|---|
| 271 | 287 | struct ctl_table tmp = *table; |
|---|
| 272 | 288 | int ret, next; |
|---|
| 273 | 289 | |
|---|
| 274 | | - if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) |
|---|
| 290 | + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) |
|---|
| 275 | 291 | return -EPERM; |
|---|
| 276 | 292 | |
|---|
| 277 | 293 | /* |
|---|
| .. | .. |
|---|
| 291 | 307 | } |
|---|
| 292 | 308 | |
|---|
| 293 | 309 | extern int pid_max; |
|---|
| 294 | | -static int zero = 0; |
|---|
| 295 | 310 | static struct ctl_table pid_ns_ctl_table[] = { |
|---|
| 296 | 311 | { |
|---|
| 297 | 312 | .procname = "ns_last_pid", |
|---|
| 298 | 313 | .maxlen = sizeof(int), |
|---|
| 299 | 314 | .mode = 0666, /* permissions are checked in the handler */ |
|---|
| 300 | 315 | .proc_handler = pid_ns_ctl_handler, |
|---|
| 301 | | - .extra1 = &zero, |
|---|
| 316 | + .extra1 = SYSCTL_ZERO, |
|---|
| 302 | 317 | .extra2 = &pid_max, |
|---|
| 303 | 318 | }, |
|---|
| 304 | 319 | { } |
|---|
| .. | .. |
|---|
| 381 | 396 | put_pid_ns(to_pid_ns(ns)); |
|---|
| 382 | 397 | } |
|---|
| 383 | 398 | |
|---|
| 384 | | -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) |
|---|
| 399 | +static int pidns_install(struct nsset *nsset, struct ns_common *ns) |
|---|
| 385 | 400 | { |
|---|
| 401 | + struct nsproxy *nsproxy = nsset->nsproxy; |
|---|
| 386 | 402 | struct pid_namespace *active = task_active_pid_ns(current); |
|---|
| 387 | 403 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); |
|---|
| 388 | 404 | |
|---|
| 389 | 405 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
|---|
| 390 | | - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
|---|
| 406 | + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) |
|---|
| 391 | 407 | return -EPERM; |
|---|
| 392 | 408 | |
|---|
| 393 | 409 | /* |
|---|