| .. | .. | 
|---|
|  | 1 | +// SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 1 | 2 | /* | 
|---|
| 2 | 3 | * Pid namespaces | 
|---|
| 3 | 4 | * | 
|---|
| .. | .. | 
|---|
| 25 | 26 |  | 
|---|
| 26 | 27 | static DEFINE_MUTEX(pid_caches_mutex); | 
|---|
| 27 | 28 | static struct kmem_cache *pid_ns_cachep; | 
|---|
| 28 |  | -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | 
|---|
| 29 |  | -#define MAX_PID_NS_LEVEL 32 | 
|---|
| 30 | 29 | /* Write once array, filled from the beginning. */ | 
|---|
| 31 | 30 | static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; | 
|---|
| 32 | 31 |  | 
|---|
| .. | .. | 
|---|
| 57 | 56 | mutex_unlock(&pid_caches_mutex); | 
|---|
| 58 | 57 | /* current can fail, but someone else can succeed. */ | 
|---|
| 59 | 58 | return READ_ONCE(*pkc); | 
|---|
| 60 |  | -} | 
|---|
| 61 |  | - | 
|---|
| 62 |  | -static void proc_cleanup_work(struct work_struct *work) | 
|---|
| 63 |  | -{ | 
|---|
| 64 |  | -	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); | 
|---|
| 65 |  | -	pid_ns_release_proc(ns); | 
|---|
| 66 | 59 | } | 
|---|
| 67 | 60 |  | 
|---|
| 68 | 61 | static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) | 
|---|
| .. | .. | 
|---|
| 116 | 109 | ns->user_ns = get_user_ns(user_ns); | 
|---|
| 117 | 110 | ns->ucounts = ucounts; | 
|---|
| 118 | 111 | ns->pid_allocated = PIDNS_ADDING; | 
|---|
| 119 |  | -	INIT_WORK(&ns->proc_work, proc_cleanup_work); | 
|---|
| 120 | 112 |  | 
|---|
| 121 | 113 | return ns; | 
|---|
| 122 | 114 |  | 
|---|
| .. | .. | 
|---|
| 217 | 209 | idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { | 
|---|
| 218 | 210 | task = pid_task(pid, PIDTYPE_PID); | 
|---|
| 219 | 211 | if (task && !__fatal_signal_pending(task)) | 
|---|
| 220 |  | -			send_sig_info(SIGKILL, SEND_SIG_FORCED, task); | 
|---|
|  | 212 | +			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); | 
|---|
| 221 | 213 | } | 
|---|
| 222 | 214 | read_unlock(&tasklist_lock); | 
|---|
| 223 | 215 | rcu_read_unlock(); | 
|---|
| .. | .. | 
|---|
| 233 | 225 | } while (rc != -ECHILD); | 
|---|
| 234 | 226 |  | 
|---|
| 235 | 227 | /* | 
|---|
| 236 |  | -	 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not | 
|---|
| 237 |  | -	 * really care, we could reparent them to the global init. We could | 
|---|
| 238 |  | -	 * exit and reap ->child_reaper even if it is not the last thread in | 
|---|
| 239 |  | -	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), | 
|---|
| 240 |  | -	 * pid_ns can not go away until proc_kill_sb() drops the reference. | 
|---|
|  | 228 | +	 * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE | 
|---|
|  | 229 | +	 * processes whose parent processes are outside of the pid | 
|---|
|  | 230 | +	 * namespace.  Such processes are created with setns()+fork(). | 
|---|
| 241 | 231 | * | 
|---|
| 242 |  | -	 * But this ns can also have other tasks injected by setns()+fork(). | 
|---|
| 243 |  | -	 * Again, ignoring the user visible semantics we do not really need | 
|---|
| 244 |  | -	 * to wait until they are all reaped, but they can be reparented to | 
|---|
| 245 |  | -	 * us and thus we need to ensure that pid->child_reaper stays valid | 
|---|
| 246 |  | -	 * until they all go away. See free_pid()->wake_up_process(). | 
|---|
|  | 232 | +	 * If those EXIT_ZOMBIE processes are not reaped by their | 
|---|
|  | 233 | +	 * parents before their parents exit, they will be reparented | 
|---|
|  | 234 | +	 * to pid_ns->child_reaper.  Thus pid_ns->child_reaper needs to | 
|---|
|  | 235 | +	 * stay valid until they all go away. | 
|---|
| 247 | 236 | * | 
|---|
| 248 |  | -	 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped | 
|---|
| 249 |  | -	 * if reparented. | 
|---|
|  | 237 | +	 * The code relies on the pid_ns->child_reaper ignoring | 
|---|
|  | 238 | +	 * SIGCHLD to cause those EXIT_ZOMBIE processes to be | 
|---|
|  | 239 | +	 * autoreaped if reparented. | 
|---|
|  | 240 | +	 * | 
|---|
|  | 241 | +	 * Semantically it is also desirable to wait for EXIT_ZOMBIE | 
|---|
|  | 242 | +	 * processes before allowing the child_reaper to be reaped, as | 
|---|
|  | 243 | +	 * that gives the invariant that when the init process of a | 
|---|
|  | 244 | +	 * pid namespace is reaped all of the processes in the pid | 
|---|
|  | 245 | +	 * namespace are gone. | 
|---|
|  | 246 | +	 * | 
|---|
|  | 247 | +	 * Once all of the other tasks are gone from the pid_namespace | 
|---|
|  | 248 | +	 * free_pid() will awaken this task. | 
|---|
| 250 | 249 | */ | 
|---|
| 251 | 250 | for (;;) { | 
|---|
| 252 | 251 | set_current_state(TASK_INTERRUPTIBLE); | 
|---|
| 253 | 252 | if (pid_ns->pid_allocated == init_pids) | 
|---|
| 254 | 253 | break; | 
|---|
|  | 254 | +		/* | 
|---|
|  | 255 | +		 * Release tasks_rcu_exit_srcu to avoid the following deadlock: | 
|---|
|  | 256 | +		 * | 
|---|
|  | 257 | +		 * 1) TASK A unshare(CLONE_NEWPID) | 
|---|
|  | 258 | +		 * 2) TASK A fork() twice -> TASK B (child reaper for new ns) | 
|---|
|  | 259 | +		 *    and TASK C | 
|---|
|  | 260 | +		 * 3) TASK B exits, kills TASK C, waits for TASK A to reap it | 
|---|
|  | 261 | +		 * 4) TASK A calls synchronize_rcu_tasks() | 
|---|
|  | 262 | +		 *                   -> synchronize_srcu(tasks_rcu_exit_srcu) | 
|---|
|  | 263 | +		 * 5) *DEADLOCK* | 
|---|
|  | 264 | +		 * | 
|---|
|  | 265 | +		 * It is considered safe to release tasks_rcu_exit_srcu here | 
|---|
|  | 266 | +		 * because we assume the current task can not be concurrently | 
|---|
|  | 267 | +		 * reaped at this point. | 
|---|
|  | 268 | +		 */ | 
|---|
|  | 269 | +		exit_tasks_rcu_stop(); | 
|---|
| 255 | 270 | schedule(); | 
|---|
|  | 271 | +		exit_tasks_rcu_start(); | 
|---|
| 256 | 272 | } | 
|---|
| 257 | 273 | __set_current_state(TASK_RUNNING); | 
|---|
| 258 | 274 |  | 
|---|
| .. | .. | 
|---|
| 265 | 281 |  | 
|---|
| 266 | 282 | #ifdef CONFIG_CHECKPOINT_RESTORE | 
|---|
| 267 | 283 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | 
|---|
| 268 |  | -		void __user *buffer, size_t *lenp, loff_t *ppos) | 
|---|
|  | 284 | +		void *buffer, size_t *lenp, loff_t *ppos) | 
|---|
| 269 | 285 | { | 
|---|
| 270 | 286 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | 
|---|
| 271 | 287 | struct ctl_table tmp = *table; | 
|---|
| 272 | 288 | int ret, next; | 
|---|
| 273 | 289 |  | 
|---|
| 274 |  | -	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) | 
|---|
|  | 290 | +	if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) | 
|---|
| 275 | 291 | return -EPERM; | 
|---|
| 276 | 292 |  | 
|---|
| 277 | 293 | /* | 
|---|
| .. | .. | 
|---|
| 291 | 307 | } | 
|---|
| 292 | 308 |  | 
|---|
| 293 | 309 | extern int pid_max; | 
|---|
| 294 |  | -static int zero = 0; | 
|---|
| 295 | 310 | static struct ctl_table pid_ns_ctl_table[] = { | 
|---|
| 296 | 311 | { | 
|---|
| 297 | 312 | .procname = "ns_last_pid", | 
|---|
| 298 | 313 | .maxlen = sizeof(int), | 
|---|
| 299 | 314 | .mode = 0666, /* permissions are checked in the handler */ | 
|---|
| 300 | 315 | .proc_handler = pid_ns_ctl_handler, | 
|---|
| 301 |  | -		.extra1 = &zero, | 
|---|
|  | 316 | +		.extra1 = SYSCTL_ZERO, | 
|---|
| 302 | 317 | .extra2 = &pid_max, | 
|---|
| 303 | 318 | }, | 
|---|
| 304 | 319 | { } | 
|---|
| .. | .. | 
|---|
| 381 | 396 | put_pid_ns(to_pid_ns(ns)); | 
|---|
| 382 | 397 | } | 
|---|
| 383 | 398 |  | 
|---|
| 384 |  | -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) | 
|---|
|  | 399 | +static int pidns_install(struct nsset *nsset, struct ns_common *ns) | 
|---|
| 385 | 400 | { | 
|---|
|  | 401 | +	struct nsproxy *nsproxy = nsset->nsproxy; | 
|---|
| 386 | 402 | struct pid_namespace *active = task_active_pid_ns(current); | 
|---|
| 387 | 403 | struct pid_namespace *ancestor, *new = to_pid_ns(ns); | 
|---|
| 388 | 404 |  | 
|---|
| 389 | 405 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | 
|---|
| 390 |  | -	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | 
|---|
|  | 406 | +	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) | 
|---|
| 391 | 407 | return -EPERM; | 
|---|
| 392 | 408 |  | 
|---|
| 393 | 409 | /* | 
|---|