2024-01-05 071106ecf68c401173c58808b1cf5f68cc50d390
kernel/mm/oom_kill.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/oom_kill.c
  *
@@ -25,7 +26,9 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
+#include <linux/sched/debug.h>
 #include <linux/swap.h>
+#include <linux/syscalls.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
@@ -49,6 +52,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -65,21 +71,33 @@
 /* Serializes oom_score_adj and oom_score_adj_min updates */
 DEFINE_MUTEX(oom_adj_mutex);
 
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+	return oc->memcg != NULL;
+}
+
 #ifdef CONFIG_NUMA
 /**
- * has_intersects_mems_allowed() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligiblity for kill
  * @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Task eligibility is determined by whether or not a candidate task, @tsk,
  * shares the same mempolicy nodes as current if it is bound by such a policy
  * and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function is assuming oom-killer context and 'current' has triggered
+ * the oom-killer.
  */
-static bool has_intersects_mems_allowed(struct task_struct *start,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *start,
+				struct oom_control *oc)
 {
 	struct task_struct *tsk;
 	bool ret = false;
+	const nodemask_t *mask = oc->nodemask;
+
+	if (is_memcg_oom(oc))
+		return true;
 
 	rcu_read_lock();
 	for_each_thread(start, tsk) {
@@ -106,8 +124,7 @@
 	return ret;
 }
 #else
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
 {
 	return true;
 }
@@ -115,7 +132,7 @@
 
 /*
  * The process p may have detached its own ->mm while exiting or through
- * use_mm(), but one or more of its subthreads may still have a valid
+ * kthread_use_mm(), but one or more of its subthreads may still have a valid
  * pointer. Return p, or any of its subthreads with a valid ->mm, with
  * task_lock() held.
  */
@@ -147,28 +164,13 @@
 	return oc->order == -1;
 }
 
-static inline bool is_memcg_oom(struct oom_control *oc)
-{
-	return oc->memcg != NULL;
-}
-
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
-
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
-
-	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
-		return true;
-
 	return false;
 }
 
@@ -188,32 +190,29 @@
 		global_node_page_state(NR_ISOLATED_FILE) +
 		global_node_page_state(NR_UNEVICTABLE);
 
-	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
 }
 
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
  * @totalpages: total present RAM allowed for page allocation
- * @memcg: task's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * The heuristic for determining which task to kill is made to be as simple and
  * predictable as possible. The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
-		return 0;
+	if (oom_unkillable_task(p))
+		return LONG_MIN;
 
 	p = find_lock_task_mm(p);
 	if (!p)
-		return 0;
+		return LONG_MIN;
 
 	/*
 	 * Do not even consider tasks which are explicitly marked oom
@@ -225,7 +224,7 @@
 			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
 			in_vfork(p)) {
 		task_unlock(p);
-		return 0;
+		return LONG_MIN;
 	}
 
 	/*
@@ -240,18 +239,14 @@
 	adj *= totalpages / 1000;
 	points += adj;
 
-	/*
-	 * Never return 0 for an eligible task regardless of the root bonus and
-	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
-	 */
-	return points > 0 ? points : 1;
+	return points;
 }
 
-enum oom_constraint {
-	CONSTRAINT_NONE,
-	CONSTRAINT_CPUSET,
-	CONSTRAINT_MEMORY_POLICY,
-	CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
 };
 
 /*
@@ -261,7 +256,7 @@
 {
 	struct zone *zone;
 	struct zoneref *z;
-	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
+	enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
 	bool cpuset_limited = false;
 	int nid;
 
@@ -271,7 +266,7 @@
 	}
 
 	/* Default to all available memory */
-	oc->totalpages = totalram_pages + total_swap_pages;
+	oc->totalpages = totalram_pages() + total_swap_pages;
 
 	if (!IS_ENABLED(CONFIG_NUMA))
 		return CONSTRAINT_NONE;
@@ -295,20 +290,20 @@
 	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, *oc->nodemask)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
 	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
-			high_zoneidx, oc->nodemask)
+			highest_zoneidx, oc->nodemask)
 		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
 			cpuset_limited = true;
 
 	if (cpuset_limited) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, cpuset_current_mems_allowed)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_CPUSET;
 	}
 	return CONSTRAINT_NONE;
@@ -317,9 +312,13 @@
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
-	unsigned long points;
+	long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task))
+		goto next;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
 		goto next;
 
 	/*
@@ -339,17 +338,31 @@
 	 * killed first if it triggers an oom, then select it.
 	 */
 	if (oom_task_origin(task)) {
-		points = ULONG_MAX;
+		points = LONG_MAX;
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
-	if (!points || points < oc->chosen_points)
+	points = oom_badness(task, oc->totalpages);
+
+	if (points == LONG_MIN)
 		goto next;
 
-	/* Prefer thread group leaders for display purposes */
-	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+	/*
+	 * Check to see if this is the worst task with a non-negative
+	 * ADJ score seen so far
+	 */
+	if (task->signal->oom_score_adj >= 0 &&
+	    points > oc->chosen_non_negative_adj_points) {
+		if (oc->chosen_non_negative_adj)
+			put_task_struct(oc->chosen_non_negative_adj);
+		get_task_struct(task);
+		oc->chosen_non_negative_adj = task;
+		oc->chosen_non_negative_adj_points = points;
+	}
+
+	if (points < oc->chosen_points)
 		goto next;
+
 select:
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
@@ -359,8 +372,11 @@
 next:
 	return 0;
 abort:
+	if (oc->chosen_non_negative_adj)
+		put_task_struct(oc->chosen_non_negative_adj);
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
+	oc->chosen_non_negative_adj = NULL;
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -371,6 +387,10 @@
  */
 static void select_bad_process(struct oom_control *oc)
 {
+	oc->chosen_points = LONG_MIN;
+	oc->chosen_non_negative_adj_points = LONG_MIN;
+	oc->chosen_non_negative_adj = NULL;
+
 	if (is_memcg_oom(oc))
 		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
 	else {
@@ -383,13 +403,57 @@
 		rcu_read_unlock();
 	}
 
-	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
+	if (oc->chosen_non_negative_adj) {
+		/*
+		 * If oc->chosen has a negative ADJ, and we found a task with
+		 * a postive ADJ to kill, kill the task with the positive ADJ
+		 * instead.
+		 */
+		if (oc->chosen && oc->chosen->signal->oom_score_adj < 0) {
+			put_task_struct(oc->chosen);
+			oc->chosen = oc->chosen_non_negative_adj;
+			oc->chosen_points = oc->chosen_non_negative_adj_points;
+		} else
+			put_task_struct(oc->chosen_non_negative_adj);
+	}
+}
+
+static int dump_task(struct task_struct *p, void *arg)
+{
+	struct oom_control *oc = arg;
+	struct task_struct *task;
+
+	if (oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+		return 0;
+
+	task = find_lock_task_mm(p);
+	if (!task) {
+		/*
+		 * This is a kthread or all of p's threads have already
+		 * detached their mm's. There's no need to report
+		 * them; they can't be oom killed anyway.
+		 */
+		return 0;
+	}
+
+	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+		task->pid, from_kuid(&init_user_ns, task_uid(task)),
+		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+		mm_pgtables_bytes(task->mm),
+		get_mm_counter(task->mm, MM_SWAPENTS),
+		task->signal->oom_score_adj, task->comm);
+	task_unlock(task);
+
+	return 0;
 }
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @memcg: current's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Dumps the current memory state of all eligible tasks. Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -397,59 +461,55 @@
  * State information includes task's pid, uid, tgid, vm size, rss,
  * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct oom_control *oc)
 {
-	struct task_struct *p;
-	struct task_struct *task;
-
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
 
-		task = find_lock_task_mm(p);
-		if (!task) {
-			/*
-			 * This is a kthread or all of p's threads have already
-			 * detached their mm's. There's no need to report
-			 * them; they can't be oom killed anyway.
-			 */
-			continue;
-		}
+	if (is_memcg_oom(oc))
+		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+	else {
+		struct task_struct *p;
 
-		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
-			task->pid, from_kuid(&init_user_ns, task_uid(task)),
-			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_pgtables_bytes(task->mm),
-			get_mm_counter(task->mm, MM_SWAPENTS),
-			task->signal->oom_score_adj, task->comm);
-		task_unlock(task);
+		rcu_read_lock();
+		for_each_process(p)
+			dump_task(p, oc);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
+}
+
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+	/* one line summary of the oom killer context. */
+	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+			oom_constraint_text[oc->constraint],
+			nodemask_pr_args(oc->nodemask));
+	cpuset_print_current_mems_allowed();
+	mem_cgroup_print_oom_context(oc->memcg, victim);
+	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+		from_kuid(&init_user_ns, task_uid(victim)));
 }
 
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask,
-		nodemask_pr_args(oc->nodemask), oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
 		current->signal->oom_score_adj);
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
 
-	cpuset_print_current_mems_allowed();
 	dump_stack();
 	if (is_memcg_oom(oc))
-		mem_cgroup_print_oom_info(oc->memcg, p);
+		mem_cgroup_print_oom_meminfo(oc->memcg);
 	else {
 		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
 		if (is_dump_unreclaim_slabs())
 			dump_unreclaimable_slab();
 	}
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(oc->memcg, oc->nodemask);
+		dump_tasks(oc);
+	if (p)
+		dump_oom_summary(oc, p);
 }
 
 /*
@@ -504,7 +564,7 @@
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
@@ -518,19 +578,21 @@
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
+			struct mmu_notifier_range range;
 			struct mmu_gather tlb;
 
-			tlb_gather_mmu(&tlb, mm, start, end);
-			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
-				tlb_finish_mmu(&tlb, start, end);
+			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+						vma, mm, vma->vm_start,
+						vma->vm_end);
+			tlb_gather_mmu(&tlb, mm, range.start, range.end);
+			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+				tlb_finish_mmu(&tlb, range.start, range.end);
 				ret = false;
 				continue;
 			}
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
+			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+			mmu_notifier_invalidate_range_end(&range);
+			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
 
@@ -547,7 +609,7 @@
 {
 	bool ret = true;
 
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (!mmap_read_trylock(mm)) {
 		trace_skip_task_reaping(tsk->pid);
 		return false;
 	}
@@ -555,8 +617,8 @@
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write();up_write() cycle in exit_mmap().
+	 * under mmap_lock for reading because it serializes against the
+	 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
	 */
 	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
 		trace_skip_task_reaping(tsk->pid);
@@ -578,7 +640,7 @@
 out_finish:
 	trace_finish_task_reaping(tsk->pid);
 out_unlock:
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 
 	return ret;
 }
@@ -589,7 +651,7 @@
 	int attempts = 0;
 	struct mm_struct *mm = tsk->signal->oom_mm;
 
-	/* Retry the down_read_trylock(mmap_sem) a few times */
+	/* Retry the mmap_read_trylock(mm) a few times */
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
@@ -599,6 +661,7 @@
 
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
+	sched_show_task(tsk);
 	debug_show_all_locks();
 
 done:
@@ -606,7 +669,7 @@
 
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
+	 * somebody can't call mmap_write_unlock(mm).
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
@@ -663,6 +726,20 @@
 #endif /* CONFIG_MMU */
 
 /**
+ * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
+ * under task_lock or operate on the current).
+ */
+static void __mark_oom_victim(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+		mmgrab(tsk->signal->oom_mm);
+		set_bit(MMF_OOM_VICTIM, &mm->flags);
+	}
+}
+
+/**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
@@ -674,18 +751,13 @@
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
-	struct mm_struct *mm = tsk->mm;
-
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
-		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}
+	__mark_oom_victim(tsk);
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
@@ -832,7 +904,7 @@
 	return ret;
 }
 
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
 {
 	struct task_struct *p;
 	struct mm_struct *mm;
@@ -840,6 +912,8 @@
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
+		pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+			message, task_pid_nr(victim), victim->comm);
 		put_task_struct(victim);
 		return;
 	} else if (victim != p) {
@@ -861,19 +935,21 @@
 	 * in order to prevent the OOM victim from depleting the memory
 	 * reserves from the user space under its control.
 	 */
-	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
 	mark_oom_victim(victim);
-	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
-		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
-		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
-		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)),
+		from_kuid(&init_user_ns, task_uid(victim)),
+		mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
 	task_unlock(victim);
 
 	/*
 	 * Kill all user processes sharing victim->mm in other thread groups, if
 	 * any. They don't get access to memory reserves, though, to avoid
-	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
+	 * depletion of all memory. This prevents mm->mmap_lock livelock when an
 	 * oom killed thread cannot exit because it requires the semaphore and
 	 * its contended by another thread trying to allocate memory itself.
 	 * That thread will now get access to memory reserves since it has a
894970 continue;
895971 }
896972 /*
897
- * No use_mm() user needs to read from the userspace so we are
898
- * ok to reap it.
973
+ * No kthead_use_mm() user needs to read from the userspace so
974
+ * we are ok to reap it.
899975 */
900976 if (unlikely(p->flags & PF_KTHREAD))
901977 continue;
902
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
978
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
903979 }
904980 rcu_read_unlock();
905981
@@ -915,25 +991,20 @@
  * Kill provided task unless it's secured by setting
  * oom_score_adj to OOM_SCORE_ADJ_MIN.
  */
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
 	    !is_global_init(task)) {
 		get_task_struct(task);
-		__oom_kill_process(task);
+		__oom_kill_process(task, message);
 	}
 	return 0;
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
-	struct task_struct *p = oc->chosen;
-	unsigned int points = oc->chosen_points;
-	struct task_struct *victim = p;
-	struct task_struct *child;
-	struct task_struct *t;
+	struct task_struct *victim = oc->chosen;
 	struct mem_cgroup *oom_group;
-	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
@@ -942,57 +1013,18 @@
 	 * its children or threads, just give it access to memory reserves
 	 * so it can die quickly
 	 */
-	task_lock(p);
-	if (task_will_free_mem(p)) {
-		mark_oom_victim(p);
-		wake_oom_reaper(p);
-		task_unlock(p);
-		put_task_struct(p);
+	task_lock(victim);
+	if (task_will_free_mem(victim)) {
+		mark_oom_victim(victim);
+		wake_oom_reaper(victim);
+		task_unlock(victim);
+		put_task_struct(victim);
 		return;
 	}
-	task_unlock(p);
+	task_unlock(victim);
 
 	if (__ratelimit(&oom_rs))
-		dump_header(oc, p);
-
-	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-
-	/*
-	 * If any of p's children has a different mm and is eligible for kill,
-	 * the one with the highest oom_badness() score is sacrificed for its
-	 * parent. This attempts to lose the minimal amount of work done while
-	 * still freeing memory.
-	 */
-	read_lock(&tasklist_lock);
-
-	/*
-	 * The task 'p' might have already exited before reaching here. The
-	 * put_task_struct() will free task_struct 'p' while the loop still try
-	 * to access the field of 'p', so, get an extra reference.
-	 */
-	get_task_struct(p);
-	for_each_thread(p, t) {
-		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
-
-			if (process_shares_mm(child, p->mm))
-				continue;
-			/*
-			 * oom_badness() returns 0 if the thread is unkillable
-			 */
-			child_points = oom_badness(child,
-				oc->memcg, oc->nodemask, oc->totalpages);
-			if (child_points > victim_points) {
-				put_task_struct(victim);
-				victim = child;
-				victim_points = child_points;
-				get_task_struct(victim);
-			}
-		}
-	}
-	put_task_struct(p);
-	read_unlock(&tasklist_lock);
+		dump_header(oc, victim);
 
 	/*
 	 * Do we need to kill the entire memory cgroup?
@@ -1001,14 +1033,15 @@
 	 */
 	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
 
-	__oom_kill_process(victim);
+	__oom_kill_process(victim, message);
 
 	/*
 	 * If necessary, kill all tasks in the selected memory cgroup.
 	 */
 	if (oom_group) {
 		mem_cgroup_print_oom_group(oom_group);
-		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+				      (void*)message);
 		mem_cgroup_put(oom_group);
 	}
 }
@@ -1016,8 +1049,7 @@
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-static void check_panic_on_oom(struct oom_control *oc,
-			       enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc)
 {
 	if (likely(!sysctl_panic_on_oom))
 		return;
@@ -1027,7 +1059,7 @@
 		 * does not panic for cpuset, mempolicy, or memcg allocation
 		 * failures.
 		 */
-		if (constraint != CONSTRAINT_NONE)
+		if (oc->constraint != CONSTRAINT_NONE)
			return;
 	}
 	/* Do not panic for oom kills triggered by sysrq */
@@ -1064,7 +1096,6 @@
 bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
-	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1101,13 +1132,14 @@
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA and memcg) that may require different handling.
 	 */
-	constraint = constrained_alloc(oc);
-	if (constraint != CONSTRAINT_MEMORY_POLICY)
+	oc->constraint = constrained_alloc(oc);
+	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
 		oc->nodemask = NULL;
-	check_panic_on_oom(oc, constraint);
+	check_panic_on_oom(oc);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current) &&
+	    oom_cpuset_eligible(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
@@ -1118,6 +1150,12 @@
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		int ret = false;
+
+		trace_android_vh_oom_check_panic(oc, &ret);
+		if (ret)
+			return true;
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
@@ -1154,3 +1192,97 @@
 	if (__ratelimit(&pfoom_rs))
 		pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
 }
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	struct task_struct *p;
+	unsigned int f_flags;
+	bool reap = false;
+	struct pid *pid;
+	long ret = 0;
+
+	if (flags)
+		return -EINVAL;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid))
+		return PTR_ERR(pid);
+
+	task = get_pid_task(pid, PIDTYPE_TGID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/*
+	 * Make sure to choose a thread which still has a reference to mm
+	 * during the group exit
+	 */
+	p = find_lock_task_mm(task);
+	if (!p) {
+		ret = -ESRCH;
+		goto put_task;
+	}
+
+	mm = p->mm;
+	mmgrab(mm);
+
+	/*
+	 * If we are too late and exit_mmap already checked mm_is_oom_victim
+	 * then will block on mmap_read_lock until exit_mmap releases mmap_lock
+	 */
+	set_bit(MMF_OOM_VICTIM, &mm->flags);
+
+	if (task_will_free_mem(p))
+		reap = true;
+	else {
+		/* Error only if the work has not been done already */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+			ret = -EINVAL;
+	}
+	task_unlock(p);
+
+	if (!reap)
+		goto drop_mm;
+
+	if (mmap_read_lock_killable(mm)) {
+		ret = -EINTR;
+		goto drop_mm;
+	}
+	/*
+	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
+	 * possible change in exit_mmap is seen
+	 */
+	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+		ret = -EAGAIN;
+	mmap_read_unlock(mm);
+
+drop_mm:
+	mmdrop(mm);
+put_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+	return ret;
+#else
+	return -ENOSYS;
+#endif /* CONFIG_MMU */
+}
+
+void add_to_oom_reaper(struct task_struct *p)
+{
+	p = find_lock_task_mm(p);
+	if (!p)
+		return;
+
+	get_task_struct(p);
+	if (task_will_free_mem(p)) {
+		__mark_oom_victim(p);
+		wake_oom_reaper(p);
+	}
+	task_unlock(p);
+	put_task_struct(p);
+}
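The final hunk adds a process_mrelease() syscall so a userspace low-memory killer can reclaim a dying task's address space without waiting for the victim to finish exiting on its own. The kernel side returns -EINVAL unless the target is already on its way out (task_will_free_mem()) or has been reaped, so the caller is expected to deliver SIGKILL first and then ask for the release. Below is a minimal userspace sketch of that flow; it is an illustration only, and the syscall numbers used (434 for pidfd_open, 424 for pidfd_send_signal, 448 for process_mrelease) are the mainline assignments and are an assumption here, since a vendor kernel built from this tree may number them differently.

/*
 * Sketch: kill a process by pid and immediately release its memory.
 * Assumes a kernel with CONFIG_MMU and the mainline syscall numbers
 * below; build with any recent glibc via raw syscall(2) wrappers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434	/* assumption: mainline number */
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal	424	/* assumption: mainline number */
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease	448	/* assumption: mainline number */
#endif

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* Take a pidfd first so the pid cannot be recycled under us. */
	int pidfd = (int)syscall(__NR_pidfd_open, (pid_t)atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Deliver SIGKILL through the pidfd; the victim is now "dying". */
	if (syscall(__NR_pidfd_send_signal, pidfd, SIGKILL, NULL, 0) < 0) {
		perror("pidfd_send_signal");
		return 1;
	}

	/*
	 * Ask the kernel to reap the victim's anonymous memory right away
	 * instead of waiting for the dying task to tear down its own mm.
	 * flags must be 0, matching the SYSCALL_DEFINE2 in the patch above.
	 */
	if (syscall(__NR_process_mrelease, pidfd, 0) < 0)
		perror("process_mrelease");

	close(pidfd);
	return 0;
}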