2024-01-05 071106ecf68c401173c58808b1cf5f68cc50d390
kernel/mm/oom_kill.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * linux/mm/oom_kill.c
  *
@@ -25,7 +26,9 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
+#include <linux/sched/debug.h>
 #include <linux/swap.h>
+#include <linux/syscalls.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
@@ -49,6 +52,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+#undef CREATE_TRACE_POINTS
+#include <trace/hooks/mm.h>
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -65,21 +71,33 @@
 /* Serializes oom_score_adj and oom_score_adj_min updates */
 DEFINE_MUTEX(oom_adj_mutex);
 
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+	return oc->memcg != NULL;
+}
+
 #ifdef CONFIG_NUMA
 /**
- * has_intersects_mems_allowed() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligiblity for kill
  * @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Task eligibility is determined by whether or not a candidate task, @tsk,
  * shares the same mempolicy nodes as current if it is bound by such a policy
  * and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function is assuming oom-killer context and 'current' has triggered
+ * the oom-killer.
  */
-static bool has_intersects_mems_allowed(struct task_struct *start,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *start,
+				struct oom_control *oc)
 {
 	struct task_struct *tsk;
 	bool ret = false;
+	const nodemask_t *mask = oc->nodemask;
+
+	if (is_memcg_oom(oc))
+		return true;
 
 	rcu_read_lock();
 	for_each_thread(start, tsk) {
@@ -106,8 +124,7 @@
 	return ret;
 }
 #else
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
-					const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
 {
 	return true;
 }
@@ -115,7 +132,7 @@
 
 /*
  * The process p may have detached its own ->mm while exiting or through
- * use_mm(), but one or more of its subthreads may still have a valid
+ * kthread_use_mm(), but one or more of its subthreads may still have a valid
  * pointer. Return p, or any of its subthreads with a valid ->mm, with
  * task_lock() held.
  */
@@ -147,28 +164,13 @@
 	return oc->order == -1;
 }
 
-static inline bool is_memcg_oom(struct oom_control *oc)
-{
-	return oc->memcg != NULL;
-}
-
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
-
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
-
-	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
-		return true;
-
 	return false;
 }
 
@@ -188,32 +190,29 @@
 		global_node_page_state(NR_ISOLATED_FILE) +
 		global_node_page_state(NR_UNEVICTABLE);
 
-	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
 }
 
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
  * @totalpages: total present RAM allowed for page allocation
- * @memcg: task's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * The heuristic for determining which task to kill is made to be as simple and
  * predictable as possible. The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
-		return 0;
+	if (oom_unkillable_task(p))
+		return LONG_MIN;
 
 	p = find_lock_task_mm(p);
 	if (!p)
-		return 0;
+		return LONG_MIN;
 
 	/*
 	 * Do not even consider tasks which are explicitly marked oom
@@ -225,7 +224,7 @@
 			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
 			in_vfork(p)) {
 		task_unlock(p);
-		return 0;
+		return LONG_MIN;
 	}
 
 	/*
@@ -240,18 +239,14 @@
 	adj *= totalpages / 1000;
 	points += adj;
 
-	/*
-	 * Never return 0 for an eligible task regardless of the root bonus and
-	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
-	 */
-	return points > 0 ? points : 1;
+	return points;
 }
 
-enum oom_constraint {
-	CONSTRAINT_NONE,
-	CONSTRAINT_CPUSET,
-	CONSTRAINT_MEMORY_POLICY,
-	CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
 };
 
 /*
@@ -261,7 +256,7 @@
 {
 	struct zone *zone;
 	struct zoneref *z;
-	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
+	enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
 	bool cpuset_limited = false;
 	int nid;
 
@@ -271,7 +266,7 @@
 	}
 
 	/* Default to all available memory */
-	oc->totalpages = totalram_pages + total_swap_pages;
+	oc->totalpages = totalram_pages() + total_swap_pages;
 
 	if (!IS_ENABLED(CONFIG_NUMA))
 		return CONSTRAINT_NONE;
@@ -295,20 +290,20 @@
 	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, *oc->nodemask)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
 	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
-			high_zoneidx, oc->nodemask)
+			highest_zoneidx, oc->nodemask)
 		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
 			cpuset_limited = true;
 
 	if (cpuset_limited) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, cpuset_current_mems_allowed)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_CPUSET;
 	}
 	return CONSTRAINT_NONE;
@@ -317,9 +312,13 @@
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
-	unsigned long points;
+	long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task))
+		goto next;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
 		goto next;
 
 	/*
@@ -339,17 +338,31 @@
 	 * killed first if it triggers an oom, then select it.
 	 */
 	if (oom_task_origin(task)) {
-		points = ULONG_MAX;
+		points = LONG_MAX;
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
-	if (!points || points < oc->chosen_points)
+	points = oom_badness(task, oc->totalpages);
+
+	if (points == LONG_MIN)
 		goto next;
 
-	/* Prefer thread group leaders for display purposes */
-	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+	/*
+	 * Check to see if this is the worst task with a non-negative
+	 * ADJ score seen so far
+	 */
+	if (task->signal->oom_score_adj >= 0 &&
+	    points > oc->chosen_non_negative_adj_points) {
+		if (oc->chosen_non_negative_adj)
+			put_task_struct(oc->chosen_non_negative_adj);
+		get_task_struct(task);
+		oc->chosen_non_negative_adj = task;
+		oc->chosen_non_negative_adj_points = points;
+	}
+
+	if (points < oc->chosen_points)
 		goto next;
+
 select:
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
@@ -359,8 +372,11 @@
 next:
 	return 0;
 abort:
+	if (oc->chosen_non_negative_adj)
+		put_task_struct(oc->chosen_non_negative_adj);
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
+	oc->chosen_non_negative_adj = NULL;
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -371,6 +387,10 @@
  */
 static void select_bad_process(struct oom_control *oc)
 {
+	oc->chosen_points = LONG_MIN;
+	oc->chosen_non_negative_adj_points = LONG_MIN;
+	oc->chosen_non_negative_adj = NULL;
+
 	if (is_memcg_oom(oc))
 		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
 	else {
@@ -383,13 +403,57 @@
 		rcu_read_unlock();
 	}
 
-	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
+	if (oc->chosen_non_negative_adj) {
+		/*
+		 * If oc->chosen has a negative ADJ, and we found a task with
+		 * a postive ADJ to kill, kill the task with the positive ADJ
+		 * instead.
+		 */
+		if (oc->chosen && oc->chosen->signal->oom_score_adj < 0) {
+			put_task_struct(oc->chosen);
+			oc->chosen = oc->chosen_non_negative_adj;
+			oc->chosen_points = oc->chosen_non_negative_adj_points;
+		} else
+			put_task_struct(oc->chosen_non_negative_adj);
+	}
+}
+
+static int dump_task(struct task_struct *p, void *arg)
+{
+	struct oom_control *oc = arg;
+	struct task_struct *task;
+
+	if (oom_unkillable_task(p))
+		return 0;
+
+	/* p may not have freeable memory in nodemask */
+	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+		return 0;
+
+	task = find_lock_task_mm(p);
+	if (!task) {
+		/*
+		 * This is a kthread or all of p's threads have already
+		 * detached their mm's. There's no need to report
+		 * them; they can't be oom killed anyway.
+		 */
+		return 0;
+	}
+
+	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+		task->pid, from_kuid(&init_user_ns, task_uid(task)),
+		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+		mm_pgtables_bytes(task->mm),
+		get_mm_counter(task->mm, MM_SWAPENTS),
+		task->signal->oom_score_adj, task->comm);
+	task_unlock(task);
+
+	return 0;
 }
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @memcg: current's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Dumps the current memory state of all eligible tasks. Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -397,59 +461,55 @@
  * State information includes task's pid, uid, tgid, vm size, rss,
  * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct oom_control *oc)
 {
-	struct task_struct *p;
-	struct task_struct *task;
-
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
 
-		task = find_lock_task_mm(p);
-		if (!task) {
-			/*
-			 * This is a kthread or all of p's threads have already
-			 * detached their mm's. There's no need to report
-			 * them; they can't be oom killed anyway.
-			 */
-			continue;
-		}
+	if (is_memcg_oom(oc))
+		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+	else {
+		struct task_struct *p;
 
-		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
-			task->pid, from_kuid(&init_user_ns, task_uid(task)),
-			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_pgtables_bytes(task->mm),
-			get_mm_counter(task->mm, MM_SWAPENTS),
-			task->signal->oom_score_adj, task->comm);
-		task_unlock(task);
+		rcu_read_lock();
+		for_each_process(p)
+			dump_task(p, oc);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
+}
+
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+	/* one line summary of the oom killer context. */
+	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+			oom_constraint_text[oc->constraint],
+			nodemask_pr_args(oc->nodemask));
+	cpuset_print_current_mems_allowed();
+	mem_cgroup_print_oom_context(oc->memcg, victim);
+	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+		from_kuid(&init_user_ns, task_uid(victim)));
 }
 
 static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
-		current->comm, oc->gfp_mask, &oc->gfp_mask,
-		nodemask_pr_args(oc->nodemask), oc->order,
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
 		current->signal->oom_score_adj);
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
 
-	cpuset_print_current_mems_allowed();
 	dump_stack();
 	if (is_memcg_oom(oc))
-		mem_cgroup_print_oom_info(oc->memcg, p);
+		mem_cgroup_print_oom_meminfo(oc->memcg);
 	else {
 		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
 		if (is_dump_unreclaim_slabs())
 			dump_unreclaimable_slab();
 	}
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(oc->memcg, oc->nodemask);
+		dump_tasks(oc);
+	if (p)
+		dump_oom_summary(oc, p);
 }
 
 /*
@@ -504,7 +564,7 @@
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
@@ -518,19 +578,21 @@
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			const unsigned long start = vma->vm_start;
-			const unsigned long end = vma->vm_end;
+			struct mmu_notifier_range range;
 			struct mmu_gather tlb;
 
-			tlb_gather_mmu(&tlb, mm, start, end);
-			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
-				tlb_finish_mmu(&tlb, start, end);
+			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+						vma, mm, vma->vm_start,
+						vma->vm_end);
+			tlb_gather_mmu(&tlb, mm, range.start, range.end);
+			if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+				tlb_finish_mmu(&tlb, range.start, range.end);
 				ret = false;
 				continue;
 			}
-			unmap_page_range(&tlb, vma, start, end, NULL);
-			mmu_notifier_invalidate_range_end(mm, start, end);
-			tlb_finish_mmu(&tlb, start, end);
+			unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+			mmu_notifier_invalidate_range_end(&range);
+			tlb_finish_mmu(&tlb, range.start, range.end);
 		}
 	}
 
@@ -547,7 +609,7 @@
 {
 	bool ret = true;
 
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (!mmap_read_trylock(mm)) {
 		trace_skip_task_reaping(tsk->pid);
 		return false;
 	}
@@ -555,8 +617,8 @@
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-	 * under mmap_sem for reading because it serializes against the
-	 * down_write();up_write() cycle in exit_mmap().
+	 * under mmap_lock for reading because it serializes against the
+	 * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
	 */
 	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
 		trace_skip_task_reaping(tsk->pid);
@@ -578,7 +640,7 @@
 out_finish:
 	trace_finish_task_reaping(tsk->pid);
 out_unlock:
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 
 	return ret;
 }
@@ -589,7 +651,7 @@
 	int attempts = 0;
 	struct mm_struct *mm = tsk->signal->oom_mm;
 
-	/* Retry the down_read_trylock(mmap_sem) a few times */
+	/* Retry the mmap_read_trylock(mm) a few times */
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
@@ -599,6 +661,7 @@
 
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
+	sched_show_task(tsk);
 	debug_show_all_locks();
 
 done:
@@ -606,7 +669,7 @@
 
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
-	 * somebody can't call up_write(mmap_sem).
+	 * somebody can't call mmap_write_unlock(mm).
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
@@ -663,6 +726,20 @@
 #endif /* CONFIG_MMU */
 
 /**
+ * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
+ * under task_lock or operate on the current).
+ */
+static void __mark_oom_victim(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+
+	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+		mmgrab(tsk->signal->oom_mm);
+		set_bit(MMF_OOM_VICTIM, &mm->flags);
+	}
+}
+
+/**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
@@ -674,18 +751,13 @@
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
-	struct mm_struct *mm = tsk->mm;
-
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
 
 	/* oom_mm is bound to the signal struct life time. */
-	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
-		mmgrab(tsk->signal->oom_mm);
-		set_bit(MMF_OOM_VICTIM, &mm->flags);
-	}
+	__mark_oom_victim(tsk);
 
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
@@ -832,7 +904,7 @@
 	return ret;
 }
 
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
 {
 	struct task_struct *p;
 	struct mm_struct *mm;
@@ -840,6 +912,8 @@
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
+		pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+			message, task_pid_nr(victim), victim->comm);
 		put_task_struct(victim);
 		return;
 	} else if (victim != p) {
@@ -861,19 +935,21 @@
 	 * in order to prevent the OOM victim from depleting the memory
 	 * reserves from the user space under its control.
 	 */
-	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
+	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
 	mark_oom_victim(victim);
-	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
-		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
-		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
-		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)),
+		from_kuid(&init_user_ns, task_uid(victim)),
+		mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
 	task_unlock(victim);
 
 	/*
 	 * Kill all user processes sharing victim->mm in other thread groups, if
 	 * any. They don't get access to memory reserves, though, to avoid
-	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
+	 * depletion of all memory. This prevents mm->mmap_lock livelock when an
 	 * oom killed thread cannot exit because it requires the semaphore and
 	 * its contended by another thread trying to allocate memory itself.
 	 * That thread will now get access to memory reserves since it has a
894970 continue;
895971 }
896972 /*
897
- * No use_mm() user needs to read from the userspace so we are
898
- * ok to reap it.
973
+ * No kthead_use_mm() user needs to read from the userspace so
974
+ * we are ok to reap it.
899975 */
900976 if (unlikely(p->flags & PF_KTHREAD))
901977 continue;
902
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
978
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
903979 }
904980 rcu_read_unlock();
905981
@@ -915,25 +991,20 @@
  * Kill provided task unless it's secured by setting
  * oom_score_adj to OOM_SCORE_ADJ_MIN.
  */
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
 	    !is_global_init(task)) {
 		get_task_struct(task);
-		__oom_kill_process(task);
+		__oom_kill_process(task, message);
 	}
 	return 0;
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
-	struct task_struct *p = oc->chosen;
-	unsigned int points = oc->chosen_points;
-	struct task_struct *victim = p;
-	struct task_struct *child;
-	struct task_struct *t;
+	struct task_struct *victim = oc->chosen;
 	struct mem_cgroup *oom_group;
-	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
@@ -942,57 +1013,18 @@
 	 * its children or threads, just give it access to memory reserves
 	 * so it can die quickly
 	 */
-	task_lock(p);
-	if (task_will_free_mem(p)) {
-		mark_oom_victim(p);
-		wake_oom_reaper(p);
-		task_unlock(p);
-		put_task_struct(p);
+	task_lock(victim);
+	if (task_will_free_mem(victim)) {
+		mark_oom_victim(victim);
+		wake_oom_reaper(victim);
+		task_unlock(victim);
+		put_task_struct(victim);
 		return;
 	}
-	task_unlock(p);
+	task_unlock(victim);
 
 	if (__ratelimit(&oom_rs))
-		dump_header(oc, p);
-
-	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-
-	/*
-	 * If any of p's children has a different mm and is eligible for kill,
-	 * the one with the highest oom_badness() score is sacrificed for its
-	 * parent. This attempts to lose the minimal amount of work done while
-	 * still freeing memory.
-	 */
-	read_lock(&tasklist_lock);
-
-	/*
-	 * The task 'p' might have already exited before reaching here. The
-	 * put_task_struct() will free task_struct 'p' while the loop still try
-	 * to access the field of 'p', so, get an extra reference.
-	 */
-	get_task_struct(p);
-	for_each_thread(p, t) {
-		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
-
-			if (process_shares_mm(child, p->mm))
-				continue;
-			/*
-			 * oom_badness() returns 0 if the thread is unkillable
-			 */
-			child_points = oom_badness(child,
-				oc->memcg, oc->nodemask, oc->totalpages);
-			if (child_points > victim_points) {
-				put_task_struct(victim);
-				victim = child;
-				victim_points = child_points;
-				get_task_struct(victim);
-			}
-		}
-	}
-	put_task_struct(p);
-	read_unlock(&tasklist_lock);
+		dump_header(oc, victim);
 
 	/*
 	 * Do we need to kill the entire memory cgroup?
@@ -1001,14 +1033,15 @@
 	 */
 	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
 
-	__oom_kill_process(victim);
+	__oom_kill_process(victim, message);
 
 	/*
 	 * If necessary, kill all tasks in the selected memory cgroup.
 	 */
 	if (oom_group) {
 		mem_cgroup_print_oom_group(oom_group);
-		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+				      (void*)message);
 		mem_cgroup_put(oom_group);
 	}
 }
@@ -1016,8 +1049,7 @@
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-static void check_panic_on_oom(struct oom_control *oc,
-			       enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc)
 {
 	if (likely(!sysctl_panic_on_oom))
 		return;
@@ -1027,7 +1059,7 @@
 		 * does not panic for cpuset, mempolicy, or memcg allocation
 		 * failures.
 		 */
-		if (constraint != CONSTRAINT_NONE)
+		if (oc->constraint != CONSTRAINT_NONE)
			return;
 	}
 	/* Do not panic for oom kills triggered by sysrq */
@@ -1064,7 +1096,6 @@
 bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
-	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1101,13 +1132,14 @@
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA and memcg) that may require different handling.
 	 */
-	constraint = constrained_alloc(oc);
-	if (constraint != CONSTRAINT_MEMORY_POLICY)
+	oc->constraint = constrained_alloc(oc);
+	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
 		oc->nodemask = NULL;
-	check_panic_on_oom(oc, constraint);
+	check_panic_on_oom(oc);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current) &&
+	    oom_cpuset_eligible(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
@@ -1118,6 +1150,12 @@
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
+		int ret = false;
+
+		trace_android_vh_oom_check_panic(oc, &ret);
+		if (ret)
+			return true;
+
 		dump_header(oc, NULL);
 		pr_warn("Out of memory and no killable processes...\n");
 		/*
@@ -1154,3 +1192,97 @@
 	if (__ratelimit(&pfoom_rs))
 		pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
 }
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	struct task_struct *p;
+	unsigned int f_flags;
+	bool reap = false;
+	struct pid *pid;
+	long ret = 0;
+
+	if (flags)
+		return -EINVAL;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid))
+		return PTR_ERR(pid);
+
+	task = get_pid_task(pid, PIDTYPE_TGID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/*
+	 * Make sure to choose a thread which still has a reference to mm
+	 * during the group exit
+	 */
+	p = find_lock_task_mm(task);
+	if (!p) {
+		ret = -ESRCH;
+		goto put_task;
+	}
+
+	mm = p->mm;
+	mmgrab(mm);
+
+	/*
+	 * If we are too late and exit_mmap already checked mm_is_oom_victim
+	 * then will block on mmap_read_lock until exit_mmap releases mmap_lock
+	 */
+	set_bit(MMF_OOM_VICTIM, &mm->flags);
+
+	if (task_will_free_mem(p))
+		reap = true;
+	else {
+		/* Error only if the work has not been done already */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+			ret = -EINVAL;
+	}
+	task_unlock(p);
+
+	if (!reap)
+		goto drop_mm;
+
+	if (mmap_read_lock_killable(mm)) {
+		ret = -EINTR;
+		goto drop_mm;
+	}
+	/*
+	 * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
+	 * possible change in exit_mmap is seen
+	 */
+	if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
+		ret = -EAGAIN;
+	mmap_read_unlock(mm);
+
+drop_mm:
+	mmdrop(mm);
+put_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+	return ret;
+#else
+	return -ENOSYS;
+#endif /* CONFIG_MMU */
+}
+
+void add_to_oom_reaper(struct task_struct *p)
+{
+	p = find_lock_task_mm(p);
+	if (!p)
+		return;
+
+	get_task_struct(p);
+	if (task_will_free_mem(p)) {
+		__mark_oom_victim(p);
+		wake_oom_reaper(p);
+	}
+	task_unlock(p);
+	put_task_struct(p);
+}
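The final hunk adds a process_mrelease() syscall so a userspace low-memory killer can reclaim a dying task's address space without waiting for the victim to finish exiting on its own. The kernel side returns -EINVAL unless the target is already on its way out (task_will_free_mem()) or has been reaped, so the caller is expected to deliver SIGKILL first and then ask for the release. Below is a minimal userspace sketch of that flow; it is an illustration only, and the syscall numbers used (434 for pidfd_open, 424 for pidfd_send_signal, 448 for process_mrelease) are the mainline assignments and are an assumption here, since a vendor kernel built from this tree may number them differently.

/*
 * Sketch: kill a process by pid and immediately release its memory.
 * Assumes a kernel with CONFIG_MMU and the mainline syscall numbers
 * below; build with any recent glibc via raw syscall(2) wrappers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434	/* assumption: mainline number */
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal	424	/* assumption: mainline number */
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease	448	/* assumption: mainline number */
#endif

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* Take a pidfd first so the pid cannot be recycled under us. */
	int pidfd = (int)syscall(__NR_pidfd_open, (pid_t)atoi(argv[1]), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Deliver SIGKILL through the pidfd; the victim is now "dying". */
	if (syscall(__NR_pidfd_send_signal, pidfd, SIGKILL, NULL, 0) < 0) {
		perror("pidfd_send_signal");
		return 1;
	}

	/*
	 * Ask the kernel to reap the victim's anonymous memory right away
	 * instead of waiting for the dying task to tear down its own mm.
	 * flags must be 0, matching the SYSCALL_DEFINE2 in the patch above.
	 */
	if (syscall(__NR_process_mrelease, pidfd, 0) < 0)
		perror("process_mrelease");

	close(pidfd);
	return 0;
}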