hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/memcontrol.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /* memcontrol.c - Memory Controller
23 *
34 * Copyright IBM Corporation, 2007
....@@ -19,26 +20,17 @@
1920 * Lockless page tracking & accounting
2021 * Unified hierarchy configuration model
2122 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22
- *
23
- * This program is free software; you can redistribute it and/or modify
24
- * it under the terms of the GNU General Public License as published by
25
- * the Free Software Foundation; either version 2 of the License, or
26
- * (at your option) any later version.
27
- *
28
- * This program is distributed in the hope that it will be useful,
29
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
30
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31
- * GNU General Public License for more details.
3223 */
3324
3425 #include <linux/page_counter.h>
3526 #include <linux/memcontrol.h>
3627 #include <linux/cgroup.h>
37
-#include <linux/mm.h>
28
+#include <linux/pagewalk.h>
3829 #include <linux/sched/mm.h>
3930 #include <linux/shmem_fs.h>
4031 #include <linux/hugetlb.h>
4132 #include <linux/pagemap.h>
33
+#include <linux/vm_event_item.h>
4234 #include <linux/smp.h>
4335 #include <linux/page-flags.h>
4436 #include <linux/backing-dev.h>
....@@ -65,21 +57,26 @@
6557 #include <linux/lockdep.h>
6658 #include <linux/file.h>
6759 #include <linux/tracehook.h>
60
+#include <linux/psi.h>
61
+#include <linux/seq_buf.h>
6862 #include "internal.h"
6963 #include <net/sock.h>
7064 #include <net/ip.h>
7165 #include "slab.h"
66
+#include <linux/local_lock.h>
7267
7368 #include <linux/uaccess.h>
7469
7570 #include <trace/events/vmscan.h>
71
+#include <trace/hooks/mm.h>
7672
7773 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
7874 EXPORT_SYMBOL(memory_cgrp_subsys);
7975
8076 struct mem_cgroup *root_mem_cgroup __read_mostly;
8177
82
-#define MEM_CGROUP_RECLAIM_RETRIES 5
78
+/* Active memory cgroup to use from an interrupt context */
79
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
8380
8481 /* Socket memory accounting disabled? */
8582 static bool cgroup_memory_nosocket;
....@@ -89,28 +86,30 @@
8986
9087 /* Whether the swap controller is active */
9188 #ifdef CONFIG_MEMCG_SWAP
92
-int do_swap_account __read_mostly;
89
+bool cgroup_memory_noswap __read_mostly;
9390 #else
94
-#define do_swap_account 0
91
+#define cgroup_memory_noswap 1
9592 #endif
93
+
94
+#ifdef CONFIG_CGROUP_WRITEBACK
95
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
96
+#endif
97
+
98
+struct event_lock {
99
+ local_lock_t l;
100
+};
101
+static DEFINE_PER_CPU(struct event_lock, event_lock) = {
102
+ .l = INIT_LOCAL_LOCK(l),
103
+};
96104
97105 /* Whether legacy memory+swap accounting is active */
98106 static bool do_memsw_account(void)
99107 {
100
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
108
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
101109 }
102
-
103
-static const char *const mem_cgroup_lru_names[] = {
104
- "inactive_anon",
105
- "active_anon",
106
- "inactive_file",
107
- "active_file",
108
- "unevictable",
109
-};
110110
111111 #define THRESHOLDS_EVENTS_TARGET 128
112112 #define SOFTLIMIT_EVENTS_TARGET 1024
113
-#define NUMAINFO_EVENTS_TARGET 1024
114113
115114 /*
116115 * Cgroups above their limits are maintained in a RB-Tree, independent of
....@@ -210,14 +209,6 @@
210209 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
211210 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
212211
213
-enum charge_type {
214
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
215
- MEM_CGROUP_CHARGE_TYPE_ANON,
216
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
217
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
218
- NR_CHARGE_TYPE,
219
-};
220
-
221212 /* for encoding cft->private value on file */
222213 enum res_type {
223214 _MEM,
....@@ -248,7 +239,7 @@
248239 iter != NULL; \
249240 iter = mem_cgroup_iter(NULL, iter, NULL))
250241
251
-static inline bool should_force_charge(void)
242
+static inline bool task_is_dying(void)
252243 {
253244 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
254245 (current->flags & PF_EXITING);
....@@ -268,8 +259,100 @@
268259 }
269260
270261 #ifdef CONFIG_MEMCG_KMEM
262
+static DEFINE_SPINLOCK(objcg_lock);
263
+
264
+static void obj_cgroup_release(struct percpu_ref *ref)
265
+{
266
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
267
+ struct mem_cgroup *memcg;
268
+ unsigned int nr_bytes;
269
+ unsigned int nr_pages;
270
+ unsigned long flags;
271
+
272
+ /*
273
+ * At this point all allocated objects are freed, and
274
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
275
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
276
+ *
277
+ * The following sequence can lead to it:
278
+ * 1) CPU0: objcg == stock->cached_objcg
279
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
280
+ * PAGE_SIZE bytes are charged
281
+ * 3) CPU1: a process from another memcg is allocating something,
282
+	 *    the stock is flushed,
283
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
284
+	 * 4) CPU0: we release this object,
285
+	 *    92 bytes are added to stock->nr_bytes
286
+	 * 5) CPU0: the stock is flushed,
287
+ * 92 bytes are added to objcg->nr_charged_bytes
288
+ *
289
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
290
+ * This page will be uncharged in obj_cgroup_release().
291
+ */
292
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
293
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
294
+ nr_pages = nr_bytes >> PAGE_SHIFT;
295
+
296
+ spin_lock_irqsave(&objcg_lock, flags);
297
+ memcg = obj_cgroup_memcg(objcg);
298
+ if (nr_pages)
299
+ __memcg_kmem_uncharge(memcg, nr_pages);
300
+ list_del(&objcg->list);
301
+ mem_cgroup_put(memcg);
302
+ spin_unlock_irqrestore(&objcg_lock, flags);
303
+
304
+ percpu_ref_exit(ref);
305
+ kfree_rcu(objcg, rcu);
306
+}
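To make the race described in the comment above concrete, here is the same sequence worked through with numbers (assuming PAGE_SIZE is 4096; the 92-byte object is the comment's own example):

  - CPU1 charges a whole page for the 92-byte allocation: the objcg is charged
    4096 bytes, and 4096 - 92 = 4004 bytes remain in CPU1's per-cpu stock.
  - CPU1's stock is flushed on behalf of another memcg, so the leftover lands
    in objcg->nr_charged_bytes = 4004.
  - CPU0 frees the 92-byte object; those 92 bytes go into CPU0's stock.
  - CPU0's stock is flushed: objcg->nr_charged_bytes = 4004 + 92 = 4096, i.e.
    exactly PAGE_SIZE.
  - obj_cgroup_release() therefore sees a whole number of pages (nr_pages = 1),
    the WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)) stays quiet, and the page is
    uncharged via __memcg_kmem_uncharge().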
307
+
308
+static struct obj_cgroup *obj_cgroup_alloc(void)
309
+{
310
+ struct obj_cgroup *objcg;
311
+ int ret;
312
+
313
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
314
+ if (!objcg)
315
+ return NULL;
316
+
317
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
318
+ GFP_KERNEL);
319
+ if (ret) {
320
+ kfree(objcg);
321
+ return NULL;
322
+ }
323
+ INIT_LIST_HEAD(&objcg->list);
324
+ return objcg;
325
+}
326
+
327
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
328
+ struct mem_cgroup *parent)
329
+{
330
+ struct obj_cgroup *objcg, *iter;
331
+
332
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
333
+
334
+ spin_lock_irq(&objcg_lock);
335
+
336
+ /* Move active objcg to the parent's list */
337
+ xchg(&objcg->memcg, parent);
338
+ css_get(&parent->css);
339
+ list_add(&objcg->list, &parent->objcg_list);
340
+
341
+ /* Move already reparented objcgs to the parent's list */
342
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
343
+ css_get(&parent->css);
344
+ xchg(&iter->memcg, parent);
345
+ css_put(&memcg->css);
346
+ }
347
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
348
+
349
+ spin_unlock_irq(&objcg_lock);
350
+
351
+ percpu_ref_kill(&objcg->refcnt);
352
+}
353
+
271354 /*
272
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
355
+ * This will be used as a shrinker list's index.
273356 * The main reason for not using cgroup id for this:
274357 * this works better in sparse environments, where we have a lot of memcgs,
275358 * but only a few kmem-limited. Or also, if we have, for instance, 200
....@@ -312,14 +395,13 @@
312395
313396 /*
314397 * A lot of the calls to the cache allocation functions are expected to be
315
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
398
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
316399 * conditional to this static branch, we'll have to allow modules that does
317400 * kmem_cache_alloc and the such to see this symbol as well
318401 */
319402 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
320403 EXPORT_SYMBOL(memcg_kmem_enabled_key);
321
-
322
-struct workqueue_struct *memcg_kmem_cache_wq;
404
+#endif
323405
324406 static int memcg_shrinker_map_size;
325407 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
....@@ -344,7 +426,7 @@
344426 if (!old)
345427 return 0;
346428
347
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
429
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
348430 if (!new)
349431 return -ENOMEM;
350432
....@@ -388,7 +470,7 @@
388470 mutex_lock(&memcg_shrinker_map_mutex);
389471 size = memcg_shrinker_map_size;
390472 for_each_node(nid) {
391
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
473
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
392474 if (!map) {
393475 memcg_free_shrinker_maps(memcg);
394476 ret = -ENOMEM;
....@@ -445,14 +527,6 @@
445527 }
446528 }
447529
448
-#else /* CONFIG_MEMCG_KMEM */
449
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
450
-{
451
- return 0;
452
-}
453
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
454
-#endif /* CONFIG_MEMCG_KMEM */
455
-
456530 /**
457531 * mem_cgroup_css_from_page - css of the memcg associated with a page
458532 * @page: page of interest
....@@ -495,7 +569,17 @@
495569 unsigned long ino = 0;
496570
497571 rcu_read_lock();
498
- memcg = READ_ONCE(page->mem_cgroup);
572
+ memcg = page->mem_cgroup;
573
+
574
+ /*
575
+ * The lowest bit set means that memcg isn't a valid
576
+ * memcg pointer, but a obj_cgroups pointer.
577
+ * In this case the page is shared and doesn't belong
578
+ * to any specific memory cgroup.
579
+ */
580
+ if ((unsigned long) memcg & 0x1UL)
581
+ memcg = NULL;
582
+
499583 while (memcg && !(memcg->css.flags & CSS_ONLINE))
500584 memcg = parent_mem_cgroup(memcg);
501585 if (memcg)
....@@ -671,7 +755,7 @@
671755 */
672756 __mem_cgroup_remove_exceeded(mz, mctz);
673757 if (!soft_limit_excess(mz->memcg) ||
674
- !css_tryget_online(&mz->memcg->css))
758
+ !css_tryget(&mz->memcg->css))
675759 goto retry;
676760 done:
677761 return mz;
....@@ -688,33 +772,187 @@
688772 return mz;
689773 }
690774
691
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
692
- int event)
775
+/**
776
+ * __mod_memcg_state - update cgroup memory statistics
777
+ * @memcg: the memory cgroup
778
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
779
+ * @val: delta to add to the counter, can be negative
780
+ */
781
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
693782 {
694
- return atomic_long_read(&memcg->events[event]);
783
+ long x, threshold = MEMCG_CHARGE_BATCH;
784
+
785
+ if (mem_cgroup_disabled())
786
+ return;
787
+
788
+ if (memcg_stat_item_in_bytes(idx))
789
+ threshold <<= PAGE_SHIFT;
790
+
791
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
792
+ if (unlikely(abs(x) > threshold)) {
793
+ struct mem_cgroup *mi;
794
+
795
+ /*
796
+ * Batch local counters to keep them in sync with
797
+ * the hierarchical ones.
798
+ */
799
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
800
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
801
+ atomic_long_add(x, &mi->vmstats[idx]);
802
+ x = 0;
803
+ }
804
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
805
+}
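The batching above is easier to see outside the kernel. Below is a minimal userspace C sketch of the same idea (not kernel code: NR_CPUS, THRESHOLD and the percpu_delta[] array are illustrative stand-ins for the real per-cpu machinery and MEMCG_CHARGE_BATCH): updates accumulate in a per-CPU delta and are folded into the shared counter only once their absolute value crosses the threshold, so the shared counter can lag by at most NR_CPUS * THRESHOLD.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS   4
#define THRESHOLD 64			/* stands in for MEMCG_CHARGE_BATCH */

static atomic_long shared;		/* stands in for memcg->vmstats[idx] */
static long percpu_delta[NR_CPUS];	/* stands in for vmstats_percpu->stat[idx] */

static void mod_state(int cpu, long val)
{
	long x = percpu_delta[cpu] + val;

	if (labs(x) > THRESHOLD) {
		/* flush the batched delta into the shared counter */
		atomic_fetch_add(&shared, x);
		x = 0;
	}
	percpu_delta[cpu] = x;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_state(i % NR_CPUS, 1);

	/* prints a value within NR_CPUS * THRESHOLD of 1000 */
	printf("shared = %ld\n", atomic_load(&shared));
	return 0;
}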
806
+
807
+static struct mem_cgroup_per_node *
808
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
809
+{
810
+ struct mem_cgroup *parent;
811
+
812
+ parent = parent_mem_cgroup(pn->memcg);
813
+ if (!parent)
814
+ return NULL;
815
+ return mem_cgroup_nodeinfo(parent, nid);
816
+}
817
+
818
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
819
+ int val)
820
+{
821
+ struct mem_cgroup_per_node *pn;
822
+ struct mem_cgroup *memcg;
823
+ long x, threshold = MEMCG_CHARGE_BATCH;
824
+
825
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
826
+ memcg = pn->memcg;
827
+
828
+ preempt_disable_rt();
829
+ /* Update memcg */
830
+ __mod_memcg_state(memcg, idx, val);
831
+
832
+ /* Update lruvec */
833
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
834
+
835
+ if (vmstat_item_in_bytes(idx))
836
+ threshold <<= PAGE_SHIFT;
837
+
838
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
839
+ if (unlikely(abs(x) > threshold)) {
840
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
841
+ struct mem_cgroup_per_node *pi;
842
+
843
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
844
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
845
+ x = 0;
846
+ }
847
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
848
+ preempt_enable_rt();
849
+}
850
+
851
+/**
852
+ * __mod_lruvec_state - update lruvec memory statistics
853
+ * @lruvec: the lruvec
854
+ * @idx: the stat item
855
+ * @val: delta to add to the counter, can be negative
856
+ *
857
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
858
+ * function updates all three counters that are affected by a
859
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
860
+ */
861
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
862
+ int val)
863
+{
864
+ /* Update node */
865
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
866
+
867
+ /* Update memcg and lruvec */
868
+ if (!mem_cgroup_disabled())
869
+ __mod_memcg_lruvec_state(lruvec, idx, val);
870
+}
871
+
872
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
873
+{
874
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
875
+ struct mem_cgroup *memcg;
876
+ struct lruvec *lruvec;
877
+
878
+ rcu_read_lock();
879
+ memcg = mem_cgroup_from_obj(p);
880
+
881
+ /*
882
+ * Untracked pages have no memcg, no lruvec. Update only the
883
+ * node. If we reparent the slab objects to the root memcg,
884
+ * when we free the slab object, we need to update the per-memcg
885
+ * vmstats to keep it correct for the root memcg.
886
+ */
887
+ if (!memcg) {
888
+ __mod_node_page_state(pgdat, idx, val);
889
+ } else {
890
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
891
+ __mod_lruvec_state(lruvec, idx, val);
892
+ }
893
+ rcu_read_unlock();
894
+}
895
+
896
+void mod_memcg_obj_state(void *p, int idx, int val)
897
+{
898
+ struct mem_cgroup *memcg;
899
+
900
+ rcu_read_lock();
901
+ memcg = mem_cgroup_from_obj(p);
902
+ if (memcg)
903
+ mod_memcg_state(memcg, idx, val);
904
+ rcu_read_unlock();
905
+}
906
+
907
+/**
908
+ * __count_memcg_events - account VM events in a cgroup
909
+ * @memcg: the memory cgroup
910
+ * @idx: the event item
911
+ * @count: the number of events that occurred
912
+ */
913
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
914
+ unsigned long count)
915
+{
916
+ unsigned long x;
917
+
918
+ if (mem_cgroup_disabled())
919
+ return;
920
+
921
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
922
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
923
+ struct mem_cgroup *mi;
924
+
925
+ /*
926
+ * Batch local counters to keep them in sync with
927
+ * the hierarchical ones.
928
+ */
929
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
930
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
931
+ atomic_long_add(x, &mi->vmevents[idx]);
932
+ x = 0;
933
+ }
934
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
935
+}
936
+
937
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
938
+{
939
+ return atomic_long_read(&memcg->vmevents[event]);
940
+}
941
+
942
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
943
+{
944
+ long x = 0;
945
+ int cpu;
946
+
947
+ for_each_possible_cpu(cpu)
948
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
949
+ return x;
695950 }
696951
697952 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
698953 struct page *page,
699
- bool compound, int nr_pages)
954
+ int nr_pages)
700955 {
701
- /*
702
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
703
- * counted as CACHE even if it's on ANON LRU.
704
- */
705
- if (PageAnon(page))
706
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
707
- else {
708
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
709
- if (PageSwapBacked(page))
710
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
711
- }
712
-
713
- if (compound) {
714
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
715
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
716
- }
717
-
718956 /* pagein of a big page is an event. So, ignore page size */
719957 if (nr_pages > 0)
720958 __count_memcg_events(memcg, PGPGIN, 1);
....@@ -723,35 +961,7 @@
723961 nr_pages = -nr_pages; /* for event */
724962 }
725963
726
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
727
-}
728
-
729
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
730
- int nid, unsigned int lru_mask)
731
-{
732
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
733
- unsigned long nr = 0;
734
- enum lru_list lru;
735
-
736
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
737
-
738
- for_each_lru(lru) {
739
- if (!(BIT(lru) & lru_mask))
740
- continue;
741
- nr += mem_cgroup_get_lru_size(lruvec, lru);
742
- }
743
- return nr;
744
-}
745
-
746
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
747
- unsigned int lru_mask)
748
-{
749
- unsigned long nr = 0;
750
- int nid;
751
-
752
- for_each_node_state(nid, N_MEMORY)
753
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
754
- return nr;
964
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
755965 }
756966
757967 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -759,8 +969,8 @@
759969 {
760970 unsigned long val, next;
761971
762
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
763
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
972
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
973
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
764974 /* from time_after() in jiffies.h */
765975 if ((long)(next - val) < 0) {
766976 switch (target) {
....@@ -770,13 +980,10 @@
770980 case MEM_CGROUP_TARGET_SOFTLIMIT:
771981 next = val + SOFTLIMIT_EVENTS_TARGET;
772982 break;
773
- case MEM_CGROUP_TARGET_NUMAINFO:
774
- next = val + NUMAINFO_EVENTS_TARGET;
775
- break;
776983 default:
777984 break;
778985 }
779
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
986
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
780987 return true;
781988 }
782989 return false;
....@@ -792,21 +999,12 @@
792999 if (unlikely(mem_cgroup_event_ratelimit(memcg,
7931000 MEM_CGROUP_TARGET_THRESH))) {
7941001 bool do_softlimit;
795
- bool do_numainfo __maybe_unused;
7961002
7971003 do_softlimit = mem_cgroup_event_ratelimit(memcg,
7981004 MEM_CGROUP_TARGET_SOFTLIMIT);
799
-#if MAX_NUMNODES > 1
800
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
801
- MEM_CGROUP_TARGET_NUMAINFO);
802
-#endif
8031005 mem_cgroup_threshold(memcg);
8041006 if (unlikely(do_softlimit))
8051007 mem_cgroup_update_tree(memcg, page);
806
-#if MAX_NUMNODES > 1
807
- if (unlikely(do_numainfo))
808
- atomic_inc(&memcg->numainfo_events);
809
-#endif
8101008 }
8111009 }
8121010
....@@ -874,27 +1072,60 @@
8741072 return NULL;
8751073
8761074 rcu_read_lock();
877
- if (!memcg || !css_tryget_online(&memcg->css))
1075
+	/* The page should not get uncharged and its memcg freed under us. */
1076
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8781077 memcg = root_mem_cgroup;
8791078 rcu_read_unlock();
8801079 return memcg;
8811080 }
8821081 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8831082
1083
+static __always_inline struct mem_cgroup *active_memcg(void)
1084
+{
1085
+ if (in_interrupt())
1086
+ return this_cpu_read(int_active_memcg);
1087
+ else
1088
+ return current->active_memcg;
1089
+}
1090
+
1091
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1092
+{
1093
+ struct mem_cgroup *memcg;
1094
+
1095
+ rcu_read_lock();
1096
+ memcg = active_memcg();
1097
+ /* remote memcg must hold a ref. */
1098
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1099
+ memcg = root_mem_cgroup;
1100
+ rcu_read_unlock();
1101
+
1102
+ return memcg;
1103
+}
1104
+
1105
+static __always_inline bool memcg_kmem_bypass(void)
1106
+{
1107
+ /* Allow remote memcg charging from any context. */
1108
+ if (unlikely(active_memcg()))
1109
+ return false;
1110
+
1111
+ /* Memcg to charge can't be determined. */
1112
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1113
+ return true;
1114
+
1115
+ return false;
1116
+}
1117
+
8841118 /**
885
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1119
+ * If active memcg is set, do not fall back to current->mm->memcg.
8861120 */
8871121 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8881122 {
889
- if (unlikely(current->active_memcg)) {
890
- struct mem_cgroup *memcg = root_mem_cgroup;
1123
+ if (memcg_kmem_bypass())
1124
+ return NULL;
8911125
892
- rcu_read_lock();
893
- if (css_tryget_online(&current->active_memcg->css))
894
- memcg = current->active_memcg;
895
- rcu_read_unlock();
896
- return memcg;
897
- }
1126
+ if (unlikely(active_memcg()))
1127
+ return get_active_memcg();
1128
+
8981129 return get_mem_cgroup_from_mm(current->mm);
8991130 }
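Taken together, the helpers above give get_mem_cgroup_from_current() a simple decision order. An active memcg always wins: in interrupt context that is the per-cpu int_active_memcg, otherwise current->active_memcg, and get_active_memcg() takes a css reference on it. With no active memcg set, charging is bypassed entirely (NULL is returned) in interrupt context, for PF_KTHREAD tasks, and for tasks without an mm, since no memcg can be determined there. Only then does the charge fall back to current->mm's memcg via get_mem_cgroup_from_mm().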
9001131
....@@ -911,15 +1142,15 @@
9111142 * invocations for reference counting, or use mem_cgroup_iter_break()
9121143 * to cancel a hierarchy walk before the round-trip is complete.
9131144 *
914
- * Reclaimers can specify a node and a priority level in @reclaim to
915
- * divide up the memcgs in the hierarchy among all concurrent
916
- * reclaimers operating on the same node and priority.
1145
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1146
+ * in the hierarchy among all concurrent reclaimers operating on the
1147
+ * same node.
9171148 */
9181149 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9191150 struct mem_cgroup *prev,
9201151 struct mem_cgroup_reclaim_cookie *reclaim)
9211152 {
922
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1153
+ struct mem_cgroup_reclaim_iter *iter;
9231154 struct cgroup_subsys_state *css = NULL;
9241155 struct mem_cgroup *memcg = NULL;
9251156 struct mem_cgroup *pos = NULL;
....@@ -945,7 +1176,7 @@
9451176 struct mem_cgroup_per_node *mz;
9461177
9471178 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
948
- iter = &mz->iter[reclaim->priority];
1179
+ iter = &mz->iter;
9491180
9501181 if (prev && reclaim->generation != iter->generation)
9511182 goto out_unlock;
....@@ -1045,15 +1276,11 @@
10451276 struct mem_cgroup_reclaim_iter *iter;
10461277 struct mem_cgroup_per_node *mz;
10471278 int nid;
1048
- int i;
10491279
10501280 for_each_node(nid) {
10511281 mz = mem_cgroup_nodeinfo(from, nid);
1052
- for (i = 0; i <= DEF_PRIORITY; i++) {
1053
- iter = &mz->iter[i];
1054
- cmpxchg(&iter->position,
1055
- dead_memcg, NULL);
1056
- }
1282
+ iter = &mz->iter;
1283
+ cmpxchg(&iter->position, dead_memcg, NULL);
10571284 }
10581285 }
10591286
....@@ -1103,7 +1330,7 @@
11031330 struct css_task_iter it;
11041331 struct task_struct *task;
11051332
1106
- css_task_iter_start(&iter->css, 0, &it);
1333
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11071334 while (!ret && (task = css_task_iter_next(&it)))
11081335 ret = fn(task, arg);
11091336 css_task_iter_end(&it);
....@@ -1120,9 +1347,8 @@
11201347 * @page: the page
11211348 * @pgdat: pgdat of the page
11221349 *
1123
- * This function is only safe when following the LRU page isolation
1124
- * and putback protocol: the LRU lock must be held, and the page must
1125
- * either be PageLRU() or the caller must have isolated/allocated it.
1350
+ * This function relies on page->mem_cgroup being stable - see the
1351
+ * access rules in commit_charge().
11261352 */
11271353 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11281354 {
....@@ -1131,7 +1357,7 @@
11311357 struct lruvec *lruvec;
11321358
11331359 if (mem_cgroup_disabled()) {
1134
- lruvec = &pgdat->lruvec;
1360
+ lruvec = &pgdat->__lruvec;
11351361 goto out;
11361362 }
11371363
....@@ -1155,6 +1381,38 @@
11551381 lruvec->pgdat = pgdat;
11561382 return lruvec;
11571383 }
1384
+
1385
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1386
+{
1387
+ struct lruvec *lruvec;
1388
+
1389
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1390
+
1391
+ return lruvec;
1392
+}
1393
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1394
+
1395
+void do_traversal_all_lruvec(void)
1396
+{
1397
+ pg_data_t *pgdat;
1398
+
1399
+ for_each_online_pgdat(pgdat) {
1400
+ struct mem_cgroup *memcg = NULL;
1401
+
1402
+ spin_lock_irq(&pgdat->lru_lock);
1403
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1404
+ do {
1405
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1406
+
1407
+ trace_android_vh_do_traversal_lruvec(lruvec);
1408
+
1409
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1410
+ } while (memcg);
1411
+
1412
+ spin_unlock_irq(&pgdat->lru_lock);
1413
+ }
1414
+}
1415
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11581416
11591417 /**
11601418 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1195,32 +1453,6 @@
11951453 *lru_size += nr_pages;
11961454 }
11971455
1198
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1199
-{
1200
- struct mem_cgroup *task_memcg;
1201
- struct task_struct *p;
1202
- bool ret;
1203
-
1204
- p = find_lock_task_mm(task);
1205
- if (p) {
1206
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1207
- task_unlock(p);
1208
- } else {
1209
- /*
1210
- * All threads may have already detached their mm's, but the oom
1211
- * killer still needs to detect if they have already been oom
1212
- * killed to prevent needlessly killing additional tasks.
1213
- */
1214
- rcu_read_lock();
1215
- task_memcg = mem_cgroup_from_task(task);
1216
- css_get(&task_memcg->css);
1217
- rcu_read_unlock();
1218
- }
1219
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1220
- css_put(&task_memcg->css);
1221
- return ret;
1222
-}
1223
-
12241456 /**
12251457 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
12261458 * @memcg: the memory cgroup
....@@ -1242,7 +1474,7 @@
12421474 if (do_memsw_account()) {
12431475 count = page_counter_read(&memcg->memsw);
12441476 limit = READ_ONCE(memcg->memsw.max);
1245
- if (count <= limit)
1477
+ if (count < limit)
12461478 margin = min(margin, limit - count);
12471479 else
12481480 margin = 0;
....@@ -1296,85 +1528,199 @@
12961528 return false;
12971529 }
12981530
1299
-static const unsigned int memcg1_stats[] = {
1300
- MEMCG_CACHE,
1301
- MEMCG_RSS,
1302
- MEMCG_RSS_HUGE,
1303
- NR_SHMEM,
1304
- NR_FILE_MAPPED,
1305
- NR_FILE_DIRTY,
1306
- NR_WRITEBACK,
1307
- MEMCG_SWAP,
1531
+struct memory_stat {
1532
+ const char *name;
1533
+ unsigned int ratio;
1534
+ unsigned int idx;
13081535 };
13091536
1310
-static const char *const memcg1_stat_names[] = {
1311
- "cache",
1312
- "rss",
1313
- "rss_huge",
1314
- "shmem",
1315
- "mapped_file",
1316
- "dirty",
1317
- "writeback",
1318
- "swap",
1537
+static struct memory_stat memory_stats[] = {
1538
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1539
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1540
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1541
+ { "percpu", 1, MEMCG_PERCPU_B },
1542
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1543
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1544
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1545
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1546
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1547
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1548
+ /*
1549
+ * The ratio will be initialized in memory_stats_init(). Because
1550
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1551
+ * constant(e.g. powerpc).
1552
+ */
1553
+ { "anon_thp", 0, NR_ANON_THPS },
1554
+#endif
1555
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1556
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1557
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1558
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1559
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1560
+
1561
+ /*
1562
+	 * Note: slab_reclaimable and slab_unreclaimable must be listed
1563
+	 * together, with slab_reclaimable first.
1564
+ */
1565
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1566
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1567
+
1568
+ /* The memory events */
1569
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1570
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1571
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1572
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1573
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1574
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1575
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13191576 };
1577
+
1578
+static int __init memory_stats_init(void)
1579
+{
1580
+ int i;
1581
+
1582
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1583
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1584
+ if (memory_stats[i].idx == NR_ANON_THPS)
1585
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1586
+#endif
1587
+ VM_BUG_ON(!memory_stats[i].ratio);
1588
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1589
+ }
1590
+
1591
+ return 0;
1592
+}
1593
+pure_initcall(memory_stats_init);
1594
+
1595
+static char *memory_stat_format(struct mem_cgroup *memcg)
1596
+{
1597
+ struct seq_buf s;
1598
+ int i;
1599
+
1600
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1601
+ if (!s.buffer)
1602
+ return NULL;
1603
+
1604
+ /*
1605
+ * Provide statistics on the state of the memory subsystem as
1606
+ * well as cumulative event counters that show past behavior.
1607
+ *
1608
+ * This list is ordered following a combination of these gradients:
1609
+ * 1) generic big picture -> specifics and details
1610
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1611
+ *
1612
+ * Current memory state:
1613
+ */
1614
+
1615
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1616
+ u64 size;
1617
+
1618
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1619
+ size *= memory_stats[i].ratio;
1620
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1621
+
1622
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1623
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1624
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1625
+ seq_buf_printf(&s, "slab %llu\n", size);
1626
+ }
1627
+ }
1628
+
1629
+ /* Accumulated memory events */
1630
+
1631
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1632
+ memcg_events(memcg, PGFAULT));
1633
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1634
+ memcg_events(memcg, PGMAJFAULT));
1635
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1636
+ memcg_events(memcg, PGREFILL));
1637
+ seq_buf_printf(&s, "pgscan %lu\n",
1638
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1639
+ memcg_events(memcg, PGSCAN_DIRECT));
1640
+ seq_buf_printf(&s, "pgsteal %lu\n",
1641
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1642
+ memcg_events(memcg, PGSTEAL_DIRECT));
1643
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1644
+ memcg_events(memcg, PGACTIVATE));
1645
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1646
+ memcg_events(memcg, PGDEACTIVATE));
1647
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1648
+ memcg_events(memcg, PGLAZYFREE));
1649
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1650
+ memcg_events(memcg, PGLAZYFREED));
1651
+
1652
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1653
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1654
+ memcg_events(memcg, THP_FAULT_ALLOC));
1655
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1656
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1657
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1658
+
1659
+ /* The above should easily fit into one page */
1660
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1661
+
1662
+ return s.buffer;
1663
+}
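The name/ratio/idx table drives the loop above: every counter is converted to bytes before it is printed, whatever unit it is maintained in (pages, KB for kernel stacks, or bytes for slab and percpu). A small userspace C sketch of that idea, with made-up values and page_size standing in for PAGE_SIZE:

#include <stdio.h>

struct memory_stat {
	const char *name;
	unsigned int ratio;	/* multiplier that yields bytes */
	unsigned long value;	/* stands in for memcg_page_state(memcg, idx) */
};

int main(void)
{
	const unsigned int page_size = 4096;	/* assumed 4 KiB pages */
	const struct memory_stat stats[] = {
		{ "anon",	  page_size, 12000 },	/* counted in pages */
		{ "kernel_stack", 1024,	     256   },	/* counted in KB */
		{ "percpu",	  1,	     81920 },	/* already in bytes */
	};

	for (size_t i = 0; i < sizeof(stats) / sizeof(stats[0]); i++)
		printf("%s %llu\n", stats[i].name,
		       (unsigned long long)stats[i].value * stats[i].ratio);
	return 0;
}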
13201664
13211665 #define K(x) ((x) << (PAGE_SHIFT-10))
13221666 /**
1323
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1667
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1668
+ * memory controller.
13241669 * @memcg: The memory cgroup that went over limit
13251670 * @p: Task that is going to be killed
13261671 *
13271672 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13281673 * enabled
13291674 */
1330
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1675
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13311676 {
1332
- struct mem_cgroup *iter;
1333
- unsigned int i;
1334
-
13351677 rcu_read_lock();
13361678
1679
+ if (memcg) {
1680
+ pr_cont(",oom_memcg=");
1681
+ pr_cont_cgroup_path(memcg->css.cgroup);
1682
+ } else
1683
+ pr_cont(",global_oom");
13371684 if (p) {
1338
- pr_info("Task in ");
1685
+ pr_cont(",task_memcg=");
13391686 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1340
- pr_cont(" killed as a result of limit of ");
1341
- } else {
1342
- pr_info("Memory limit reached of cgroup ");
13431687 }
1344
-
1345
- pr_cont_cgroup_path(memcg->css.cgroup);
1346
- pr_cont("\n");
1347
-
13481688 rcu_read_unlock();
1689
+}
1690
+
1691
+/**
1692
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1693
+ * memory controller.
1694
+ * @memcg: The memory cgroup that went over limit
1695
+ */
1696
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1697
+{
1698
+ char *buf;
13491699
13501700 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13511701 K((u64)page_counter_read(&memcg->memory)),
1352
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1353
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1354
- K((u64)page_counter_read(&memcg->memsw)),
1355
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1356
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->kmem)),
1358
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1359
-
1360
- for_each_mem_cgroup_tree(iter, memcg) {
1361
- pr_info("Memory cgroup stats for ");
1362
- pr_cont_cgroup_path(iter->css.cgroup);
1363
- pr_cont(":");
1364
-
1365
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1366
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1367
- continue;
1368
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1369
- K(memcg_page_state(iter, memcg1_stats[i])));
1370
- }
1371
-
1372
- for (i = 0; i < NR_LRU_LISTS; i++)
1373
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1374
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1375
-
1376
- pr_cont("\n");
1702
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1703
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1704
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->swap)),
1706
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1707
+ else {
1708
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1709
+ K((u64)page_counter_read(&memcg->memsw)),
1710
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1711
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1712
+ K((u64)page_counter_read(&memcg->kmem)),
1713
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13771714 }
1715
+
1716
+ pr_info("Memory cgroup stats for ");
1717
+ pr_cont_cgroup_path(memcg->css.cgroup);
1718
+ pr_cont(":");
1719
+ buf = memory_stat_format(memcg);
1720
+ if (!buf)
1721
+ return;
1722
+ pr_info("%s", buf);
1723
+ kfree(buf);
13781724 }
13791725
13801726 /*
....@@ -1382,19 +1728,26 @@
13821728 */
13831729 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13841730 {
1385
- unsigned long max;
1731
+ unsigned long max = READ_ONCE(memcg->memory.max);
13861732
1387
- max = memcg->memory.max;
1388
- if (mem_cgroup_swappiness(memcg)) {
1389
- unsigned long memsw_max;
1390
- unsigned long swap_max;
1733
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1734
+ if (mem_cgroup_swappiness(memcg))
1735
+ max += min(READ_ONCE(memcg->swap.max),
1736
+ (unsigned long)total_swap_pages);
1737
+ } else { /* v1 */
1738
+ if (mem_cgroup_swappiness(memcg)) {
1739
+ /* Calculate swap excess capacity from memsw limit */
1740
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13911741
1392
- memsw_max = memcg->memsw.max;
1393
- swap_max = memcg->swap.max;
1394
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1395
- max = min(max + swap_max, memsw_max);
1742
+ max += min(swap, (unsigned long)total_swap_pages);
1743
+ }
13961744 }
13971745 return max;
1746
+}
1747
+
1748
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1749
+{
1750
+ return page_counter_read(&memcg->memory);
13981751 }
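A worked example of the rewritten limit calculation, with hypothetical limits and assuming enough physical swap that total_swap_pages is not the limiting term (swap only counts when mem_cgroup_swappiness() is non-zero):

  - cgroup v2: memory.max = 1 GiB, swap.max = 512 MiB
      max = 1 GiB + min(512 MiB, total swap) = 1.5 GiB
  - cgroup v1: memory.max = 1 GiB, memsw.max = 1.25 GiB
      swap headroom = memsw.max - memory.max = 256 MiB
      max = 1 GiB + min(256 MiB, total swap) = 1.25 GiB

The v1 result equals the memsw limit itself, which is what the old min(max + swap_max, memsw_max) expression produced; the difference is that the swap headroom is now derived from the memsw limit, which is what actually constrains v1, rather than from swap.max.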
13991752
14001753 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1407,112 +1760,24 @@
14071760 .gfp_mask = gfp_mask,
14081761 .order = order,
14091762 };
1410
- bool ret;
1763
+ bool ret = true;
14111764
14121765 if (mutex_lock_killable(&oom_lock))
14131766 return true;
1767
+
1768
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1769
+ goto unlock;
1770
+
14141771 /*
14151772 * A few threads which were not waiting at mutex_lock_killable() can
14161773 * fail to bail out. Therefore, check again after holding oom_lock.
14171774 */
1418
- ret = should_force_charge() || out_of_memory(&oc);
1775
+ ret = task_is_dying() || out_of_memory(&oc);
1776
+
1777
+unlock:
14191778 mutex_unlock(&oom_lock);
14201779 return ret;
14211780 }
1422
-
1423
-#if MAX_NUMNODES > 1
1424
-
1425
-/**
1426
- * test_mem_cgroup_node_reclaimable
1427
- * @memcg: the target memcg
1428
- * @nid: the node ID to be checked.
1429
- * @noswap : specify true here if the user wants flle only information.
1430
- *
1431
- * This function returns whether the specified memcg contains any
1432
- * reclaimable pages on a node. Returns true if there are any reclaimable
1433
- * pages in the node.
1434
- */
1435
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1436
- int nid, bool noswap)
1437
-{
1438
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1439
- return true;
1440
- if (noswap || !total_swap_pages)
1441
- return false;
1442
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1443
- return true;
1444
- return false;
1445
-
1446
-}
1447
-
1448
-/*
1449
- * Always updating the nodemask is not very good - even if we have an empty
1450
- * list or the wrong list here, we can start from some node and traverse all
1451
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1452
- *
1453
- */
1454
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1455
-{
1456
- int nid;
1457
- /*
1458
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1459
- * pagein/pageout changes since the last update.
1460
- */
1461
- if (!atomic_read(&memcg->numainfo_events))
1462
- return;
1463
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1464
- return;
1465
-
1466
- /* make a nodemask where this memcg uses memory from */
1467
- memcg->scan_nodes = node_states[N_MEMORY];
1468
-
1469
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1470
-
1471
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1472
- node_clear(nid, memcg->scan_nodes);
1473
- }
1474
-
1475
- atomic_set(&memcg->numainfo_events, 0);
1476
- atomic_set(&memcg->numainfo_updating, 0);
1477
-}
1478
-
1479
-/*
1480
- * Selecting a node where we start reclaim from. Because what we need is just
1481
- * reducing usage counter, start from anywhere is O,K. Considering
1482
- * memory reclaim from current node, there are pros. and cons.
1483
- *
1484
- * Freeing memory from current node means freeing memory from a node which
1485
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1486
- * hit limits, it will see a contention on a node. But freeing from remote
1487
- * node means more costs for memory reclaim because of memory latency.
1488
- *
1489
- * Now, we use round-robin. Better algorithm is welcomed.
1490
- */
1491
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1492
-{
1493
- int node;
1494
-
1495
- mem_cgroup_may_update_nodemask(memcg);
1496
- node = memcg->last_scanned_node;
1497
-
1498
- node = next_node_in(node, memcg->scan_nodes);
1499
- /*
1500
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1501
- * last time it really checked all the LRUs due to rate limiting.
1502
- * Fallback to the current node in that case for simplicity.
1503
- */
1504
- if (unlikely(node == MAX_NUMNODES))
1505
- node = numa_node_id();
1506
-
1507
- memcg->last_scanned_node = node;
1508
- return node;
1509
-}
1510
-#else
1511
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1512
-{
1513
- return 0;
1514
-}
1515
-#endif
15161781
15171782 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15181783 pg_data_t *pgdat,
....@@ -1526,7 +1791,6 @@
15261791 unsigned long nr_scanned;
15271792 struct mem_cgroup_reclaim_cookie reclaim = {
15281793 .pgdat = pgdat,
1529
- .priority = 0,
15301794 };
15311795
15321796 excess = soft_limit_excess(root_memcg);
....@@ -1621,7 +1885,7 @@
16211885 struct mem_cgroup *iter;
16221886
16231887 spin_lock(&memcg_oom_lock);
1624
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1888
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16251889 for_each_mem_cgroup_tree(iter, memcg)
16261890 iter->oom_lock = false;
16271891 spin_unlock(&memcg_oom_lock);
....@@ -1642,8 +1906,8 @@
16421906 struct mem_cgroup *iter;
16431907
16441908 /*
1645
- * When a new child is created while the hierarchy is under oom,
1646
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1909
+	 * Be careful about under_oom underflows because a child memcg
1910
+ * could have been added after mem_cgroup_mark_under_oom.
16471911 */
16481912 spin_lock(&memcg_oom_lock);
16491913 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1703,6 +1967,8 @@
17031967
17041968 if (order > PAGE_ALLOC_COSTLY_ORDER)
17051969 return OOM_SKIPPED;
1970
+
1971
+ memcg_memory_event(memcg, MEMCG_OOM);
17061972
17071973 /*
17081974 * We are in the middle of the charge context here, so we
....@@ -1851,6 +2117,14 @@
18512117 goto out;
18522118
18532119 /*
2120
+ * If the victim task has been asynchronously moved to a different
2121
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2122
+ * In this case it's better to ignore memory.group.oom.
2123
+ */
2124
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2125
+ goto out;
2126
+
2127
+ /*
18542128 * Traverse the memory cgroup hierarchy from the victim task's
18552129 * cgroup up to the OOMing cgroup (or root) to find the
18562130 * highest-level memory cgroup with oom.group set.
....@@ -1891,6 +2165,7 @@
18912165 */
18922166 struct mem_cgroup *lock_page_memcg(struct page *page)
18932167 {
2168
+ struct page *head = compound_head(page); /* rmap on tail pages */
18942169 struct mem_cgroup *memcg;
18952170 unsigned long flags;
18962171
....@@ -1910,7 +2185,7 @@
19102185 if (mem_cgroup_disabled())
19112186 return NULL;
19122187 again:
1913
- memcg = page->mem_cgroup;
2188
+ memcg = head->mem_cgroup;
19142189 if (unlikely(!memcg))
19152190 return NULL;
19162191
....@@ -1918,7 +2193,7 @@
19182193 return memcg;
19192194
19202195 spin_lock_irqsave(&memcg->move_lock, flags);
1921
- if (memcg != page->mem_cgroup) {
2196
+ if (memcg != head->mem_cgroup) {
19222197 spin_unlock_irqrestore(&memcg->move_lock, flags);
19232198 goto again;
19242199 }
....@@ -1961,19 +2236,44 @@
19612236 */
19622237 void unlock_page_memcg(struct page *page)
19632238 {
1964
- __unlock_page_memcg(page->mem_cgroup);
2239
+ struct page *head = compound_head(page);
2240
+
2241
+ __unlock_page_memcg(head->mem_cgroup);
19652242 }
19662243 EXPORT_SYMBOL(unlock_page_memcg);
19672244
19682245 struct memcg_stock_pcp {
2246
+ local_lock_t lock;
19692247 struct mem_cgroup *cached; /* this never be root cgroup */
19702248 unsigned int nr_pages;
2249
+
2250
+#ifdef CONFIG_MEMCG_KMEM
2251
+ struct obj_cgroup *cached_objcg;
2252
+ unsigned int nr_bytes;
2253
+#endif
2254
+
19712255 struct work_struct work;
19722256 unsigned long flags;
19732257 #define FLUSHING_CACHED_CHARGE 0
19742258 };
19752259 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19762260 static DEFINE_MUTEX(percpu_charge_mutex);
2261
+
2262
+#ifdef CONFIG_MEMCG_KMEM
2263
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2264
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2265
+ struct mem_cgroup *root_memcg);
2266
+
2267
+#else
2268
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2269
+{
2270
+}
2271
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2272
+ struct mem_cgroup *root_memcg)
2273
+{
2274
+ return false;
2275
+}
2276
+#endif
19772277
19782278 /**
19792279 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -1995,7 +2295,7 @@
19952295 if (nr_pages > MEMCG_CHARGE_BATCH)
19962296 return ret;
19972297
1998
- local_irq_save(flags);
2298
+ local_lock_irqsave(&memcg_stock.lock, flags);
19992299
20002300 stock = this_cpu_ptr(&memcg_stock);
20012301 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
....@@ -2003,7 +2303,7 @@
20032303 ret = true;
20042304 }
20052305
2006
- local_irq_restore(flags);
2306
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20072307
20082308 return ret;
20092309 }
....@@ -2015,13 +2315,17 @@
20152315 {
20162316 struct mem_cgroup *old = stock->cached;
20172317
2318
+ if (!old)
2319
+ return;
2320
+
20182321 if (stock->nr_pages) {
20192322 page_counter_uncharge(&old->memory, stock->nr_pages);
20202323 if (do_memsw_account())
20212324 page_counter_uncharge(&old->memsw, stock->nr_pages);
2022
- css_put_many(&old->css, stock->nr_pages);
20232325 stock->nr_pages = 0;
20242326 }
2327
+
2328
+ css_put(&old->css);
20252329 stock->cached = NULL;
20262330 }
20272331
....@@ -2034,13 +2338,14 @@
20342338 * The only protection from memory hotplug vs. drain_stock races is
20352339 * that we always operate on local CPU stock here with IRQ disabled
20362340 */
2037
- local_irq_save(flags);
2341
+ local_lock_irqsave(&memcg_stock.lock, flags);
20382342
20392343 stock = this_cpu_ptr(&memcg_stock);
2344
+ drain_obj_stock(stock);
20402345 drain_stock(stock);
20412346 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20422347
2043
- local_irq_restore(flags);
2348
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20442349 }
20452350
20462351 /*
....@@ -2052,11 +2357,12 @@
20522357 struct memcg_stock_pcp *stock;
20532358 unsigned long flags;
20542359
2055
- local_irq_save(flags);
2360
+ local_lock_irqsave(&memcg_stock.lock, flags);
20562361
20572362 stock = this_cpu_ptr(&memcg_stock);
20582363 if (stock->cached != memcg) { /* reset if necessary */
20592364 drain_stock(stock);
2365
+ css_get(&memcg->css);
20602366 stock->cached = memcg;
20612367 }
20622368 stock->nr_pages += nr_pages;
....@@ -2064,7 +2370,7 @@
20642370 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
20652371 drain_stock(stock);
20662372
2067
- local_irq_restore(flags);
2373
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20682374 }
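The conversion in this and the surrounding hunks (and the event_lock added near the top of the file) follows one pattern: the per-cpu data gains a local_lock_t member, and the local_irq_save()/local_irq_restore() pairs that protected it become local_lock_irqsave()/local_unlock_irqrestore() on that member. On !PREEMPT_RT builds this still compiles down to disabling interrupts; on PREEMPT_RT it becomes a per-cpu sleeping lock, which is the point of the change. A kernel-style sketch of the pattern (the struct and function names below are made up for illustration, not part of this patch):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcp_cache {			/* hypothetical per-cpu cache */
	local_lock_t lock;
	unsigned long nr;
};

static DEFINE_PER_CPU(struct pcp_cache, pcp_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void pcp_cache_add(unsigned long n)
{
	unsigned long flags;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(&pcp_cache.lock, flags);
	this_cpu_add(pcp_cache.nr, n);
	/* was: local_irq_restore(flags); */
	local_unlock_irqrestore(&pcp_cache.lock, flags);
}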
20692375
20702376 /*
....@@ -2084,34 +2390,37 @@
20842390 * as well as workers from this path always operate on the local
20852391 * per-cpu data. CPU up doesn't touch memcg_stock at all.
20862392 */
2087
- curcpu = get_cpu();
2393
+ curcpu = get_cpu_light();
20882394 for_each_online_cpu(cpu) {
20892395 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20902396 struct mem_cgroup *memcg;
2397
+ bool flush = false;
20912398
2399
+ rcu_read_lock();
20922400 memcg = stock->cached;
2093
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2094
- continue;
2095
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2096
- css_put(&memcg->css);
2097
- continue;
2098
- }
2099
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2401
+ if (memcg && stock->nr_pages &&
2402
+ mem_cgroup_is_descendant(memcg, root_memcg))
2403
+ flush = true;
2404
+ if (obj_stock_flush_required(stock, root_memcg))
2405
+ flush = true;
2406
+ rcu_read_unlock();
2407
+
2408
+ if (flush &&
2409
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21002410 if (cpu == curcpu)
21012411 drain_local_stock(&stock->work);
21022412 else
21032413 schedule_work_on(cpu, &stock->work);
21042414 }
2105
- css_put(&memcg->css);
21062415 }
2107
- put_cpu();
2416
+ put_cpu_light();
21082417 mutex_unlock(&percpu_charge_mutex);
21092418 }
21102419
21112420 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21122421 {
21132422 struct memcg_stock_pcp *stock;
2114
- struct mem_cgroup *memcg;
2423
+ struct mem_cgroup *memcg, *mi;
21152424
21162425 stock = &per_cpu(memcg_stock, cpu);
21172426 drain_stock(stock);
....@@ -2123,9 +2432,10 @@
21232432 int nid;
21242433 long x;
21252434
2126
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2435
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
21272436 if (x)
2128
- atomic_long_add(x, &memcg->stat[i]);
2437
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2438
+ atomic_long_add(x, &memcg->vmstats[i]);
21292439
21302440 if (i >= NR_VM_NODE_STAT_ITEMS)
21312441 continue;
....@@ -2136,32 +2446,48 @@
21362446 pn = mem_cgroup_nodeinfo(memcg, nid);
21372447 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21382448 if (x)
2139
- atomic_long_add(x, &pn->lruvec_stat[i]);
2449
+ do {
2450
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2451
+ } while ((pn = parent_nodeinfo(pn, nid)));
21402452 }
21412453 }
21422454
21432455 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21442456 long x;
21452457
2146
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2458
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21472459 if (x)
2148
- atomic_long_add(x, &memcg->events[i]);
2460
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2461
+ atomic_long_add(x, &memcg->vmevents[i]);
21492462 }
21502463 }
21512464
21522465 return 0;
21532466 }
21542467
2155
-static void reclaim_high(struct mem_cgroup *memcg,
2156
- unsigned int nr_pages,
2157
- gfp_t gfp_mask)
2468
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2469
+ unsigned int nr_pages,
2470
+ gfp_t gfp_mask)
21582471 {
2472
+ unsigned long nr_reclaimed = 0;
2473
+
21592474 do {
2160
- if (page_counter_read(&memcg->memory) <= memcg->high)
2475
+ unsigned long pflags;
2476
+
2477
+ if (page_counter_read(&memcg->memory) <=
2478
+ READ_ONCE(memcg->memory.high))
21612479 continue;
2480
+
21622481 memcg_memory_event(memcg, MEMCG_HIGH);
2163
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2164
- } while ((memcg = parent_mem_cgroup(memcg)));
2482
+
2483
+ psi_memstall_enter(&pflags);
2484
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2485
+ gfp_mask, true);
2486
+ psi_memstall_leave(&pflags);
2487
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2488
+ !mem_cgroup_is_root(memcg));
2489
+
2490
+ return nr_reclaimed;
21652491 }
21662492
21672493 static void high_work_func(struct work_struct *work)
....@@ -2173,35 +2499,238 @@
21732499 }
21742500
21752501 /*
2502
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2503
+ * enough to still cause a significant slowdown in most cases, while still
2504
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2505
+ */
2506
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2507
+
2508
+/*
2509
+ * When calculating the delay, we use these either side of the exponentiation to
2510
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2511
+ * below).
2512
+ *
2513
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2514
+ * overage ratio to a delay.
2515
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2516
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2517
+ * to produce a reasonable delay curve.
2518
+ *
2519
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2520
+ * reasonable delay curve compared to precision-adjusted overage, not
2521
+ * penalising heavily at first, but still making sure that growth beyond the
2522
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2523
+ * example, with a high of 100 megabytes:
2524
+ *
2525
+ * +-------+------------------------+
2526
+ * | usage | time to allocate in ms |
2527
+ * +-------+------------------------+
2528
+ * | 100M | 0 |
2529
+ * | 101M | 6 |
2530
+ * | 102M | 25 |
2531
+ * | 103M | 57 |
2532
+ * | 104M | 102 |
2533
+ * | 105M | 159 |
2534
+ * | 106M | 230 |
2535
+ * | 107M | 313 |
2536
+ * | 108M | 409 |
2537
+ * | 109M | 518 |
2538
+ * | 110M | 639 |
2539
+ * | 111M | 774 |
2540
+ * | 112M | 921 |
2541
+ * | 113M | 1081 |
2542
+ * | 114M | 1254 |
2543
+ * | 115M | 1439 |
2544
+ * | 116M | 1638 |
2545
+ * | 117M | 1849 |
2546
+ * | 118M | 2000 |
2547
+ * | 119M | 2000 |
2548
+ * | 120M | 2000 |
2549
+ * +-------+------------------------+
2550
+ */
2551
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2552
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2553
+
2554
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2555
+{
2556
+ u64 overage;
2557
+
2558
+ if (usage <= high)
2559
+ return 0;
2560
+
2561
+ /*
2562
+ * Prevent division by 0 in overage calculation by acting as if
2563
+ * it was a threshold of 1 page
2564
+ */
2565
+ high = max(high, 1UL);
2566
+
2567
+ overage = usage - high;
2568
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2569
+ return div64_u64(overage, high);
2570
+}
2571
+
2572
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2573
+{
2574
+ u64 overage, max_overage = 0;
2575
+
2576
+ do {
2577
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2578
+ READ_ONCE(memcg->memory.high));
2579
+ max_overage = max(overage, max_overage);
2580
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2581
+ !mem_cgroup_is_root(memcg));
2582
+
2583
+ return max_overage;
2584
+}
2585
+
2586
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2587
+{
2588
+ u64 overage, max_overage = 0;
2589
+
2590
+ do {
2591
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2592
+ READ_ONCE(memcg->swap.high));
2593
+ if (overage)
2594
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2595
+ max_overage = max(overage, max_overage);
2596
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2597
+ !mem_cgroup_is_root(memcg));
2598
+
2599
+ return max_overage;
2600
+}
2601
+
2602
+/*
2603
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2604
+ * is exceeding its memory.high by checking both it and its ancestors.
2605
+ */
2606
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2607
+ unsigned int nr_pages,
2608
+ u64 max_overage)
2609
+{
2610
+ unsigned long penalty_jiffies;
2611
+
2612
+ if (!max_overage)
2613
+ return 0;
2614
+
2615
+ /*
2616
+ * We use overage compared to memory.high to calculate the number of
2617
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2618
+ * fairly lenient on small overages, and increasingly harsh when the
2619
+ * memcg in question makes it clear that it has no intention of stopping
2620
+ * its crazy behaviour, so we exponentially increase the delay based on
2621
+ * overage amount.
2622
+ */
2623
+ penalty_jiffies = max_overage * max_overage * HZ;
2624
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2625
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2626
+
2627
+ /*
2628
+ * Factor in the task's own contribution to the overage, such that four
2629
+ * N-sized allocations are throttled approximately the same as one
2630
+ * 4N-sized allocation.
2631
+ *
2632
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2633
+	 * larger the current charge batch is than that.
2634
+ */
2635
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2636
+}
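As a cross-check of the table above, the 110M row worked through (assuming 4 KiB pages, HZ = 1000, and nr_pages equal to MEMCG_CHARGE_BATCH so the final scaling factor is 1):

  usage = 110 MiB = 28160 pages, high = 100 MiB = 25600 pages
  overage         = ((28160 - 25600) << 20) / 25600        ~= 104857
  penalty_jiffies = (104857 * 104857 * 1000) >> 20 >> 14   ~= 639

which is roughly 639 ms of throttling per allocation batch, matching the "| 110M | 639 |" entry. The 118M-120M rows flatten at 2000 ms because mem_cgroup_handle_over_high() clamps the result to MEMCG_MAX_HIGH_DELAY_JIFFIES.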
2637
+
2638
+/*
21762639 * Scheduled by try_charge() to be executed from the userland return path
21772640 * and reclaims memory over the high limit.
21782641 */
21792642 void mem_cgroup_handle_over_high(void)
21802643 {
2644
+ unsigned long penalty_jiffies;
2645
+ unsigned long pflags;
2646
+ unsigned long nr_reclaimed;
21812647 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2648
+ int nr_retries = MAX_RECLAIM_RETRIES;
21822649 struct mem_cgroup *memcg;
2650
+ bool in_retry = false;
21832651
21842652 if (likely(!nr_pages))
21852653 return;
21862654
21872655 memcg = get_mem_cgroup_from_mm(current->mm);
2188
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2189
- css_put(&memcg->css);
21902656 current->memcg_nr_pages_over_high = 0;
2657
+
2658
+retry_reclaim:
2659
+ /*
2660
+ * The allocating task should reclaim at least the batch size, but for
2661
+ * subsequent retries we only want to do what's necessary to prevent oom
2662
+ * or breaching resource isolation.
2663
+ *
2664
+ * This is distinct from memory.max or page allocator behaviour because
2665
+ * memory.high is currently batched, whereas memory.max and the page
2666
+ * allocator run every time an allocation is made.
2667
+ */
2668
+ nr_reclaimed = reclaim_high(memcg,
2669
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2670
+ GFP_KERNEL);
2671
+
2672
+ /*
2673
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2674
+ * allocators proactively to slow down excessive growth.
2675
+ */
2676
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2677
+ mem_find_max_overage(memcg));
2678
+
2679
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2680
+ swap_find_max_overage(memcg));
2681
+
2682
+ /*
2683
+ * Clamp the max delay per usermode return so as to still keep the
2684
+ * application moving forwards and also permit diagnostics, albeit
2685
+ * extremely slowly.
2686
+ */
2687
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2688
+
2689
+ /*
2690
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2691
+ * that it's not even worth doing, in an attempt to be nice to those who
2692
+ * go only a small amount over their memory.high value and maybe haven't
2693
+ * been aggressively reclaimed enough yet.
2694
+ */
2695
+ if (penalty_jiffies <= HZ / 100)
2696
+ goto out;
2697
+
2698
+ /*
2699
+ * If reclaim is making forward progress but we're still over
2700
+ * memory.high, we want to encourage that rather than doing allocator
2701
+ * throttling.
2702
+ */
2703
+ if (nr_reclaimed || nr_retries--) {
2704
+ in_retry = true;
2705
+ goto retry_reclaim;
2706
+ }
2707
+
2708
+ /*
2709
+ * If we exit early, we're guaranteed to die (since
2710
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2711
+ * need to account for any ill-begotten jiffies to pay them off later.
2712
+ */
2713
+ psi_memstall_enter(&pflags);
2714
+ schedule_timeout_killable(penalty_jiffies);
2715
+ psi_memstall_leave(&pflags);
2716
+
2717
+out:
2718
+ css_put(&memcg->css);
21912719 }
21922720
21932721 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21942722 unsigned int nr_pages)
21952723 {
21962724 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2197
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2725
+ int nr_retries = MAX_RECLAIM_RETRIES;
21982726 struct mem_cgroup *mem_over_limit;
21992727 struct page_counter *counter;
2728
+ enum oom_status oom_status;
22002729 unsigned long nr_reclaimed;
2730
+ bool passed_oom = false;
22012731 bool may_swap = true;
22022732 bool drained = false;
2203
- bool oomed = false;
2204
- enum oom_status oom_status;
2733
+ unsigned long pflags;
22052734
22062735 if (mem_cgroup_is_root(memcg))
22072736 return 0;
....@@ -2236,15 +2765,6 @@
22362765 goto force;
22372766
22382767 /*
2239
- * Unlike in global OOM situations, memcg is not in a physical
2240
- * memory shortage. Allow dying and OOM-killed tasks to
2241
- * bypass the last charges so that they can exit quickly and
2242
- * free their memory.
2243
- */
2244
- if (unlikely(should_force_charge()))
2245
- goto force;
2246
-
2247
- /*
22482768 * Prevent unbounded recursion when reclaim operations need to
22492769 * allocate memory. This might exceed the limits temporarily,
22502770 * but we prefer facilitating memory reclaim and getting back
....@@ -2261,8 +2781,10 @@
22612781
22622782 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22632783
2784
+ psi_memstall_enter(&pflags);
22642785 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22652786 gfp_mask, may_swap);
2787
+ psi_memstall_leave(&pflags);
22662788
22672789 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22682790 goto retry;
....@@ -2296,16 +2818,15 @@
22962818 if (nr_retries--)
22972819 goto retry;
22982820
2299
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2821
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23002822 goto nomem;
23012823
23022824 if (gfp_mask & __GFP_NOFAIL)
23032825 goto force;
23042826
2305
- if (fatal_signal_pending(current))
2306
- goto force;
2307
-
2308
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2827
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2828
+ if (passed_oom && task_is_dying())
2829
+ goto nomem;
23092830
23102831 /*
23112832 * keep retrying as long as the memcg oom killer is able to make
....@@ -2314,15 +2835,10 @@
23142835 */
23152836 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23162837 get_order(nr_pages * PAGE_SIZE));
2317
- switch (oom_status) {
2318
- case OOM_SUCCESS:
2319
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2320
- oomed = true;
2838
+ if (oom_status == OOM_SUCCESS) {
2839
+ passed_oom = true;
2840
+ nr_retries = MAX_RECLAIM_RETRIES;
23212841 goto retry;
2322
- case OOM_FAILED:
2323
- goto force;
2324
- default:
2325
- goto nomem;
23262842 }
23272843 nomem:
23282844 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2336,12 +2852,10 @@
23362852 page_counter_charge(&memcg->memory, nr_pages);
23372853 if (do_memsw_account())
23382854 page_counter_charge(&memcg->memsw, nr_pages);
2339
- css_get_many(&memcg->css, nr_pages);
23402855
23412856 return 0;
23422857
23432858 done_restock:
2344
- css_get_many(&memcg->css, batch);
23452859 if (batch > nr_pages)
23462860 refill_stock(memcg, batch - nr_pages);
23472861
....@@ -2355,12 +2869,32 @@
23552869 * reclaim, the cost of mismatch is negligible.
23562870 */
23572871 do {
2358
- if (page_counter_read(&memcg->memory) > memcg->high) {
2359
- /* Don't bother a random interrupted task */
2360
- if (in_interrupt()) {
2872
+ bool mem_high, swap_high;
2873
+
2874
+ mem_high = page_counter_read(&memcg->memory) >
2875
+ READ_ONCE(memcg->memory.high);
2876
+ swap_high = page_counter_read(&memcg->swap) >
2877
+ READ_ONCE(memcg->swap.high);
2878
+
2879
+ /* Don't bother a random interrupted task */
2880
+ if (in_interrupt()) {
2881
+ if (mem_high) {
23612882 schedule_work(&memcg->high_work);
23622883 break;
23632884 }
2885
+ continue;
2886
+ }
2887
+
2888
+ if (mem_high || swap_high) {
2889
+ /*
2890
+ * The allocating tasks in this cgroup will need to do
2891
+ * reclaim or be throttled to prevent further growth
2892
+ * of the memory or swap footprints.
2893
+ *
2894
+ * Target some best-effort fairness between the tasks,
2895
+ * and distribute reclaim work and delay penalties
2896
+ * based on how much each task is actually allocating.
2897
+ */
23642898 current->memcg_nr_pages_over_high += batch;
23652899 set_notify_resume(current);
23662900 break;
....@@ -2370,6 +2904,7 @@
23702904 return 0;
23712905 }
23722906
2907
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23732908 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23742909 {
23752910 if (mem_cgroup_is_root(memcg))
....@@ -2378,76 +2913,124 @@
23782913 page_counter_uncharge(&memcg->memory, nr_pages);
23792914 if (do_memsw_account())
23802915 page_counter_uncharge(&memcg->memsw, nr_pages);
2381
-
2382
- css_put_many(&memcg->css, nr_pages);
23832916 }
2917
+#endif
23842918
2385
-static void lock_page_lru(struct page *page, int *isolated)
2919
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23862920 {
2387
- struct zone *zone = page_zone(page);
2388
-
2389
- spin_lock_irq(zone_lru_lock(zone));
2390
- if (PageLRU(page)) {
2391
- struct lruvec *lruvec;
2392
-
2393
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2394
- ClearPageLRU(page);
2395
- del_page_from_lru_list(page, lruvec, page_lru(page));
2396
- *isolated = 1;
2397
- } else
2398
- *isolated = 0;
2399
-}
2400
-
2401
-static void unlock_page_lru(struct page *page, int isolated)
2402
-{
2403
- struct zone *zone = page_zone(page);
2404
-
2405
- if (isolated) {
2406
- struct lruvec *lruvec;
2407
-
2408
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2409
- VM_BUG_ON_PAGE(PageLRU(page), page);
2410
- SetPageLRU(page);
2411
- add_page_to_lru_list(page, lruvec, page_lru(page));
2412
- }
2413
- spin_unlock_irq(zone_lru_lock(zone));
2414
-}
2415
-
2416
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2417
- bool lrucare)
2418
-{
2419
- int isolated;
2420
-
24212921 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2422
-
24232922 /*
2424
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2425
- * may already be on some other mem_cgroup's LRU. Take care of it.
2426
- */
2427
- if (lrucare)
2428
- lock_page_lru(page, &isolated);
2429
-
2430
- /*
2431
- * Nobody should be changing or seriously looking at
2432
- * page->mem_cgroup at this point:
2923
+ * Any of the following ensures page->mem_cgroup stability:
24332924 *
2434
- * - the page is uncharged
2435
- *
2436
- * - the page is off-LRU
2437
- *
2438
- * - an anonymous fault has exclusive page access, except for
2439
- * a locked page table
2440
- *
2441
- * - a page cache insertion, a swapin fault, or a migration
2442
- * have the page locked
2925
+ * - the page lock
2926
+ * - LRU isolation
2927
+ * - lock_page_memcg()
2928
+ * - exclusive reference
24432929 */
24442930 page->mem_cgroup = memcg;
2445
-
2446
- if (lrucare)
2447
- unlock_page_lru(page, isolated);
24482931 }
24492932
24502933 #ifdef CONFIG_MEMCG_KMEM
2934
+/*
2935
+ * The allocated objcg pointers array is not accounted directly.
2936
+ * Moreover, it should not come from a DMA buffer and is not readily
2937
+ * reclaimable. So those GFP bits should be masked off.
2938
+ */
2939
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2940
+
2941
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2942
+ gfp_t gfp)
2943
+{
2944
+ unsigned int objects = objs_per_slab_page(s, page);
2945
+ void *vec;
2946
+
2947
+ gfp &= ~OBJCGS_CLEAR_MASK;
2948
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2949
+ page_to_nid(page));
2950
+ if (!vec)
2951
+ return -ENOMEM;
2952
+
2953
+ if (cmpxchg(&page->obj_cgroups, NULL,
2954
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2955
+ kfree(vec);
2956
+ else
2957
+ kmemleak_not_leak(vec);
2958
+
2959
+ return 0;
2960
+}
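The cmpxchg() above stores the vector in the word that otherwise holds page->mem_cgroup, with the low pointer bit (always clear for an aligned allocation) set as a type tag; mem_cgroup_from_obj() below uses that bit to tell a plain memcg pointer from an objcg vector. A minimal user-space sketch of the tagging idea, with placeholder types rather than the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct vec   { int dummy; };   /* stands in for the obj_cgroup * array     */
struct owner { int dummy; };   /* stands in for a plain struct mem_cgroup  */

int main(void)
{
    struct vec *v = malloc(sizeof(*v));
    struct owner *o = malloc(sizeof(*o));

    /* allocator results are at least word aligned, so bit 0 is free */
    uintptr_t tagged = (uintptr_t)v | 0x1UL;   /* "this is a vector"  */
    uintptr_t plain  = (uintptr_t)o;           /* "this is the owner" */

    printf("tagged has vector bit: %d\n", (int)(tagged & 0x1UL));
    printf("plain  has vector bit: %d\n", (int)(plain & 0x1UL));

    /* readers strip the tag before dereferencing */
    struct vec *back = (struct vec *)(tagged & ~0x1UL);
    printf("round trip ok: %d\n", back == v);

    free(v);
    free(o);
    return 0;
}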
2961
+
2962
+/*
2963
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2964
+ *
2965
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2966
+ * cgroup_mutex, etc.
2967
+ */
2968
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2969
+{
2970
+ struct page *page;
2971
+
2972
+ if (mem_cgroup_disabled())
2973
+ return NULL;
2974
+
2975
+ page = virt_to_head_page(p);
2976
+
2977
+ /*
2978
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2979
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2980
+ * bit of the pointer is set.
2981
+ * The page->mem_cgroup pointer can be asynchronously changed
2982
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2983
+ * from a valid memcg pointer to objcg vector or back.
2984
+ */
2985
+ if (!page->mem_cgroup)
2986
+ return NULL;
2987
+
2988
+ /*
2989
+ * Slab objects are accounted individually, not per-page.
2990
+ * Memcg membership data for each individual object is saved in
2991
+ * the page->obj_cgroups.
2992
+ */
2993
+ if (page_has_obj_cgroups(page)) {
2994
+ struct obj_cgroup *objcg;
2995
+ unsigned int off;
2996
+
2997
+ off = obj_to_index(page->slab_cache, page, p);
2998
+ objcg = page_obj_cgroups(page)[off];
2999
+ if (objcg)
3000
+ return obj_cgroup_memcg(objcg);
3001
+
3002
+ return NULL;
3003
+ }
3004
+
3005
+ /* All other pages use page->mem_cgroup */
3006
+ return page->mem_cgroup;
3007
+}
3008
+
3009
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3010
+{
3011
+ struct obj_cgroup *objcg = NULL;
3012
+ struct mem_cgroup *memcg;
3013
+
3014
+ if (memcg_kmem_bypass())
3015
+ return NULL;
3016
+
3017
+ rcu_read_lock();
3018
+ if (unlikely(active_memcg()))
3019
+ memcg = active_memcg();
3020
+ else
3021
+ memcg = mem_cgroup_from_task(current);
3022
+
3023
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3024
+ objcg = rcu_dereference(memcg->objcg);
3025
+ if (objcg && obj_cgroup_tryget(objcg))
3026
+ break;
3027
+ objcg = NULL;
3028
+ }
3029
+ rcu_read_unlock();
3030
+
3031
+ return objcg;
3032
+}
3033
+
24513034 static int memcg_alloc_cache_id(void)
24523035 {
24533036 int id, size;
....@@ -2473,9 +3056,7 @@
24733056 else if (size > MEMCG_CACHES_MAX_SIZE)
24743057 size = MEMCG_CACHES_MAX_SIZE;
24753058
2476
- err = memcg_update_all_caches(size);
2477
- if (!err)
2478
- err = memcg_update_all_list_lrus(size);
3059
+ err = memcg_update_all_list_lrus(size);
24793060 if (!err)
24803061 memcg_nr_cache_ids = size;
24813062
....@@ -2493,152 +3074,17 @@
24933074 ida_simple_remove(&memcg_cache_ida, id);
24943075 }
24953076
2496
-struct memcg_kmem_cache_create_work {
2497
- struct mem_cgroup *memcg;
2498
- struct kmem_cache *cachep;
2499
- struct work_struct work;
2500
-};
2501
-
2502
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2503
-{
2504
- struct memcg_kmem_cache_create_work *cw =
2505
- container_of(w, struct memcg_kmem_cache_create_work, work);
2506
- struct mem_cgroup *memcg = cw->memcg;
2507
- struct kmem_cache *cachep = cw->cachep;
2508
-
2509
- memcg_create_kmem_cache(memcg, cachep);
2510
-
2511
- css_put(&memcg->css);
2512
- kfree(cw);
2513
-}
2514
-
2515
-/*
2516
- * Enqueue the creation of a per-memcg kmem_cache.
2517
- */
2518
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2519
- struct kmem_cache *cachep)
2520
-{
2521
- struct memcg_kmem_cache_create_work *cw;
2522
-
2523
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2524
- if (!cw)
2525
- return;
2526
-
2527
- css_get(&memcg->css);
2528
-
2529
- cw->memcg = memcg;
2530
- cw->cachep = cachep;
2531
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2532
-
2533
- queue_work(memcg_kmem_cache_wq, &cw->work);
2534
-}
2535
-
2536
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2537
- struct kmem_cache *cachep)
2538
-{
2539
- /*
2540
- * We need to stop accounting when we kmalloc, because if the
2541
- * corresponding kmalloc cache is not yet created, the first allocation
2542
- * in __memcg_schedule_kmem_cache_create will recurse.
2543
- *
2544
- * However, it is better to enclose the whole function. Depending on
2545
- * the debugging options enabled, INIT_WORK(), for instance, can
2546
- * trigger an allocation. This too, will make us recurse. Because at
2547
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2548
- * the safest choice is to do it like this, wrapping the whole function.
2549
- */
2550
- current->memcg_kmem_skip_account = 1;
2551
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2552
- current->memcg_kmem_skip_account = 0;
2553
-}
2554
-
2555
-static inline bool memcg_kmem_bypass(void)
2556
-{
2557
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2558
- return true;
2559
- return false;
2560
-}
2561
-
25623077 /**
2563
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2564
- * @cachep: the original global kmem cache
2565
- *
2566
- * Return the kmem_cache we're supposed to use for a slab allocation.
2567
- * We try to use the current memcg's version of the cache.
2568
- *
2569
- * If the cache does not exist yet, if we are the first user of it, we
2570
- * create it asynchronously in a workqueue and let the current allocation
2571
- * go through with the original cache.
2572
- *
2573
- * This function takes a reference to the cache it returns to assure it
2574
- * won't get destroyed while we are working with it. Once the caller is
2575
- * done with it, memcg_kmem_put_cache() must be called to release the
2576
- * reference.
2577
- */
2578
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2579
-{
2580
- struct mem_cgroup *memcg;
2581
- struct kmem_cache *memcg_cachep;
2582
- int kmemcg_id;
2583
-
2584
- VM_BUG_ON(!is_root_cache(cachep));
2585
-
2586
- if (memcg_kmem_bypass())
2587
- return cachep;
2588
-
2589
- if (current->memcg_kmem_skip_account)
2590
- return cachep;
2591
-
2592
- memcg = get_mem_cgroup_from_current();
2593
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2594
- if (kmemcg_id < 0)
2595
- goto out;
2596
-
2597
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2598
- if (likely(memcg_cachep))
2599
- return memcg_cachep;
2600
-
2601
- /*
2602
- * If we are in a safe context (can wait, and not in interrupt
2603
- * context), we could be be predictable and return right away.
2604
- * This would guarantee that the allocation being performed
2605
- * already belongs in the new cache.
2606
- *
2607
- * However, there are some clashes that can arrive from locking.
2608
- * For instance, because we acquire the slab_mutex while doing
2609
- * memcg_create_kmem_cache, this means no further allocation
2610
- * could happen with the slab_mutex held. So it's better to
2611
- * defer everything.
2612
- */
2613
- memcg_schedule_kmem_cache_create(memcg, cachep);
2614
-out:
2615
- css_put(&memcg->css);
2616
- return cachep;
2617
-}
2618
-
2619
-/**
2620
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2621
- * @cachep: the cache returned by memcg_kmem_get_cache
2622
- */
2623
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2624
-{
2625
- if (!is_root_cache(cachep))
2626
- css_put(&cachep->memcg_params.memcg->css);
2627
-}
2628
-
2629
-/**
2630
- * memcg_kmem_charge_memcg: charge a kmem page
2631
- * @page: page to charge
2632
- * @gfp: reclaim mode
2633
- * @order: allocation order
3078
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26343079 * @memcg: memory cgroup to charge
3080
+ * @gfp: reclaim mode
3081
+ * @nr_pages: number of pages to charge
26353082 *
26363083 * Returns 0 on success, an error code on failure.
26373084 */
2638
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2639
- struct mem_cgroup *memcg)
3085
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3086
+ unsigned int nr_pages)
26403087 {
2641
- unsigned int nr_pages = 1 << order;
26423088 struct page_counter *counter;
26433089 int ret;
26443090
....@@ -2661,43 +3107,54 @@
26613107 cancel_charge(memcg, nr_pages);
26623108 return -ENOMEM;
26633109 }
2664
-
2665
- page->mem_cgroup = memcg;
2666
-
26673110 return 0;
26683111 }
26693112
26703113 /**
2671
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3114
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3115
+ * @memcg: memcg to uncharge
3116
+ * @nr_pages: number of pages to uncharge
3117
+ */
3118
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3119
+{
3120
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3121
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3122
+
3123
+ refill_stock(memcg, nr_pages);
3124
+}
3125
+
3126
+/**
3127
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26723128 * @page: page to charge
26733129 * @gfp: reclaim mode
26743130 * @order: allocation order
26753131 *
26763132 * Returns 0 on success, an error code on failure.
26773133 */
2678
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3134
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26793135 {
26803136 struct mem_cgroup *memcg;
26813137 int ret = 0;
26823138
2683
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2684
- return 0;
2685
-
26863139 memcg = get_mem_cgroup_from_current();
2687
- if (!mem_cgroup_is_root(memcg)) {
2688
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2689
- if (!ret)
3140
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3141
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3142
+ if (!ret) {
3143
+ page->mem_cgroup = memcg;
26903144 __SetPageKmemcg(page);
3145
+ return 0;
3146
+ }
3147
+ css_put(&memcg->css);
26913148 }
2692
- css_put(&memcg->css);
26933149 return ret;
26943150 }
3151
+
26953152 /**
2696
- * memcg_kmem_uncharge: uncharge a kmem page
3153
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
26973154 * @page: page to uncharge
26983155 * @order: allocation order
26993156 */
2700
-void memcg_kmem_uncharge(struct page *page, int order)
3157
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27013158 {
27023159 struct mem_cgroup *memcg = page->mem_cgroup;
27033160 unsigned int nr_pages = 1 << order;
....@@ -2706,43 +3163,179 @@
27063163 return;
27073164
27083165 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2709
-
2710
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2711
- page_counter_uncharge(&memcg->kmem, nr_pages);
2712
-
2713
- page_counter_uncharge(&memcg->memory, nr_pages);
2714
- if (do_memsw_account())
2715
- page_counter_uncharge(&memcg->memsw, nr_pages);
2716
-
3166
+ __memcg_kmem_uncharge(memcg, nr_pages);
27173167 page->mem_cgroup = NULL;
3168
+ css_put(&memcg->css);
27183169
27193170 /* slab pages do not have PageKmemcg flag set */
27203171 if (PageKmemcg(page))
27213172 __ClearPageKmemcg(page);
2722
-
2723
- css_put_many(&memcg->css, nr_pages);
27243173 }
2725
-#endif /* CONFIG_MEMCG_KMEM */
27263174
2727
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2728
-
2729
-/*
2730
- * Because tail pages are not marked as "used", set it. We're under
2731
- * zone_lru_lock and migration entries setup in all page mappings.
2732
- */
2733
-void mem_cgroup_split_huge_fixup(struct page *head)
3175
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27343176 {
2735
- int i;
3177
+ struct memcg_stock_pcp *stock;
3178
+ unsigned long flags;
3179
+ bool ret = false;
27363180
2737
- if (mem_cgroup_disabled())
3181
+ local_lock_irqsave(&memcg_stock.lock, flags);
3182
+
3183
+ stock = this_cpu_ptr(&memcg_stock);
3184
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3185
+ stock->nr_bytes -= nr_bytes;
3186
+ ret = true;
3187
+ }
3188
+
3189
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3190
+
3191
+ return ret;
3192
+}
3193
+
3194
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3195
+{
3196
+ struct obj_cgroup *old = stock->cached_objcg;
3197
+
3198
+ if (!old)
27383199 return;
27393200
2740
- for (i = 1; i < HPAGE_PMD_NR; i++)
2741
- head[i].mem_cgroup = head->mem_cgroup;
3201
+ if (stock->nr_bytes) {
3202
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3203
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27423204
2743
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3205
+ if (nr_pages) {
3206
+ struct mem_cgroup *memcg;
3207
+
3208
+ rcu_read_lock();
3209
+retry:
3210
+ memcg = obj_cgroup_memcg(old);
3211
+ if (unlikely(!css_tryget(&memcg->css)))
3212
+ goto retry;
3213
+ rcu_read_unlock();
3214
+
3215
+ __memcg_kmem_uncharge(memcg, nr_pages);
3216
+ css_put(&memcg->css);
3217
+ }
3218
+
3219
+ /*
3220
+ * The leftover is flushed to the centralized per-memcg value.
3221
+ * On the next attempt to refill obj stock it will be moved
3222
+ * to a per-cpu stock (probably, on an other CPU), see
3223
+ * refill_obj_stock().
3224
+ *
3225
+ * How often it's flushed is a trade-off between the memory
3226
+ * limit enforcement accuracy and potential CPU contention,
3227
+ * so it might be changed in the future.
3228
+ */
3229
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3230
+ stock->nr_bytes = 0;
3231
+ }
3232
+
3233
+ obj_cgroup_put(old);
3234
+ stock->cached_objcg = NULL;
27443235 }
2745
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3236
+
3237
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3238
+ struct mem_cgroup *root_memcg)
3239
+{
3240
+ struct mem_cgroup *memcg;
3241
+
3242
+ if (stock->cached_objcg) {
3243
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3244
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3245
+ return true;
3246
+ }
3247
+
3248
+ return false;
3249
+}
3250
+
3251
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3252
+{
3253
+ struct memcg_stock_pcp *stock;
3254
+ unsigned long flags;
3255
+
3256
+ local_lock_irqsave(&memcg_stock.lock, flags);
3257
+
3258
+ stock = this_cpu_ptr(&memcg_stock);
3259
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3260
+ drain_obj_stock(stock);
3261
+ obj_cgroup_get(objcg);
3262
+ stock->cached_objcg = objcg;
3263
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3264
+ }
3265
+ stock->nr_bytes += nr_bytes;
3266
+
3267
+ if (stock->nr_bytes > PAGE_SIZE)
3268
+ drain_obj_stock(stock);
3269
+
3270
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3271
+}
3272
+
3273
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3274
+{
3275
+ struct mem_cgroup *memcg;
3276
+ unsigned int nr_pages, nr_bytes;
3277
+ int ret;
3278
+
3279
+ if (consume_obj_stock(objcg, size))
3280
+ return 0;
3281
+
3282
+ /*
3283
+ * In theory, memcg->nr_charged_bytes can have enough
3284
+ * pre-charged bytes to satisfy the allocation. However,
3285
+ * flushing memcg->nr_charged_bytes requires two atomic
3286
+ * operations, and memcg->nr_charged_bytes can't be big,
3287
+ * so it's better to ignore it and try grab some new pages.
3288
+ * memcg->nr_charged_bytes will be flushed in
3289
+ * refill_obj_stock(), called from this function or
3290
+ * independently later.
3291
+ */
3292
+ rcu_read_lock();
3293
+retry:
3294
+ memcg = obj_cgroup_memcg(objcg);
3295
+ if (unlikely(!css_tryget(&memcg->css)))
3296
+ goto retry;
3297
+ rcu_read_unlock();
3298
+
3299
+ nr_pages = size >> PAGE_SHIFT;
3300
+ nr_bytes = size & (PAGE_SIZE - 1);
3301
+
3302
+ if (nr_bytes)
3303
+ nr_pages += 1;
3304
+
3305
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3306
+ if (!ret && nr_bytes)
3307
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3308
+
3309
+ css_put(&memcg->css);
3310
+ return ret;
3311
+}
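obj_cgroup_charge() splits the request into whole pages, which are charged through __memcg_kmem_charge(), and a sub-page remainder; the unused tail of the last page goes back into the per-cpu stock so later small objects can be charged without touching the page counters. The split is plain masking, as in this standalone sketch (a 4 KiB PAGE_SIZE is assumed for illustration):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12                  /* 4 KiB pages, illustrative */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    size_t size = 700;                 /* e.g. one 700-byte slab object */
    size_t nr_pages = size >> PAGE_SHIFT;
    size_t nr_bytes = size & (PAGE_SIZE - 1);
    size_t refill = 0;

    if (nr_bytes) {
        nr_pages += 1;                 /* the charge is rounded up to pages  */
        refill = PAGE_SIZE - nr_bytes; /* leftover is kept in the obj stock  */
    }

    printf("size=%zu -> charge %zu page(s), stock keeps %zu byte(s)\n",
           size, nr_pages, refill);
    return 0;
}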
3312
+
3313
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3314
+{
3315
+ refill_obj_stock(objcg, size);
3316
+}
3317
+
3318
+#endif /* CONFIG_MEMCG_KMEM */
3319
+
3320
+/*
3321
+ * Because head->mem_cgroup is not set on tails, set it now.
3322
+ */
3323
+void split_page_memcg(struct page *head, unsigned int nr)
3324
+{
3325
+ struct mem_cgroup *memcg = head->mem_cgroup;
3326
+ int kmemcg = PageKmemcg(head);
3327
+ int i;
3328
+
3329
+ if (mem_cgroup_disabled() || !memcg)
3330
+ return;
3331
+
3332
+ for (i = 1; i < nr; i++) {
3333
+ head[i].mem_cgroup = memcg;
3334
+ if (kmemcg)
3335
+ __SetPageKmemcg(head + i);
3336
+ }
3337
+ css_get_many(&memcg->css, nr - 1);
3338
+}
27463339
27473340 #ifdef CONFIG_MEMCG_SWAP
27483341 /**
....@@ -2804,7 +3397,7 @@
28043397 * Make sure that the new limit (memsw or memory limit) doesn't
28053398 * break our basic invariant rule memory.max <= memsw.max.
28063399 */
2807
- limits_invariant = memsw ? max >= memcg->memory.max :
3400
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28083401 max <= memcg->memsw.max;
28093402 if (!limits_invariant) {
28103403 mutex_unlock(&memcg_max_mutex);
....@@ -2925,7 +3518,7 @@
29253518 * Test whether @memcg has children, dead or alive. Note that this
29263519 * function doesn't care whether @memcg has use_hierarchy enabled and
29273520 * returns %true if there are child csses according to the cgroup
2928
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3521
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29293522 */
29303523 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29313524 {
....@@ -2944,7 +3537,7 @@
29443537 */
29453538 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29463539 {
2947
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3540
+ int nr_retries = MAX_RECLAIM_RETRIES;
29483541
29493542 /* we call try-to-free pages for make this cgroup empty */
29503543 lru_add_drain_all();
....@@ -3018,50 +3611,15 @@
30183611 return retval;
30193612 }
30203613
3021
-struct accumulated_stats {
3022
- unsigned long stat[MEMCG_NR_STAT];
3023
- unsigned long events[NR_VM_EVENT_ITEMS];
3024
- unsigned long lru_pages[NR_LRU_LISTS];
3025
- const unsigned int *stats_array;
3026
- const unsigned int *events_array;
3027
- int stats_size;
3028
- int events_size;
3029
-};
3030
-
3031
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3032
- struct accumulated_stats *acc)
3033
-{
3034
- struct mem_cgroup *mi;
3035
- int i;
3036
-
3037
- for_each_mem_cgroup_tree(mi, memcg) {
3038
- for (i = 0; i < acc->stats_size; i++)
3039
- acc->stat[i] += memcg_page_state(mi,
3040
- acc->stats_array ? acc->stats_array[i] : i);
3041
-
3042
- for (i = 0; i < acc->events_size; i++)
3043
- acc->events[i] += memcg_sum_events(mi,
3044
- acc->events_array ? acc->events_array[i] : i);
3045
-
3046
- for (i = 0; i < NR_LRU_LISTS; i++)
3047
- acc->lru_pages[i] +=
3048
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3049
- }
3050
-}
3051
-
30523614 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30533615 {
3054
- unsigned long val = 0;
3616
+ unsigned long val;
30553617
30563618 if (mem_cgroup_is_root(memcg)) {
3057
- struct mem_cgroup *iter;
3058
-
3059
- for_each_mem_cgroup_tree(iter, memcg) {
3060
- val += memcg_page_state(iter, MEMCG_CACHE);
3061
- val += memcg_page_state(iter, MEMCG_RSS);
3062
- if (swap)
3063
- val += memcg_page_state(iter, MEMCG_SWAP);
3064
- }
3619
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3620
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3621
+ if (swap)
3622
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30653623 } else {
30663624 if (!swap)
30673625 val = page_counter_read(&memcg->memory);
....@@ -3122,9 +3680,61 @@
31223680 }
31233681 }
31243682
3683
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3684
+{
3685
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3686
+ struct mem_cgroup *mi;
3687
+ int node, cpu, i;
3688
+
3689
+ for_each_online_cpu(cpu)
3690
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3691
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3692
+
3693
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3694
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3695
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3696
+
3697
+ for_each_node(node) {
3698
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3699
+ struct mem_cgroup_per_node *pi;
3700
+
3701
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3702
+ stat[i] = 0;
3703
+
3704
+ for_each_online_cpu(cpu)
3705
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3706
+ stat[i] += per_cpu(
3707
+ pn->lruvec_stat_cpu->count[i], cpu);
3708
+
3709
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3710
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3711
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3712
+ }
3713
+}
3714
+
3715
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3716
+{
3717
+ unsigned long events[NR_VM_EVENT_ITEMS];
3718
+ struct mem_cgroup *mi;
3719
+ int cpu, i;
3720
+
3721
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3722
+ events[i] = 0;
3723
+
3724
+ for_each_online_cpu(cpu)
3725
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3726
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3727
+ cpu);
3728
+
3729
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3730
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3731
+ atomic_long_add(events[i], &mi->vmevents[i]);
3732
+}
3733
+
31253734 #ifdef CONFIG_MEMCG_KMEM
31263735 static int memcg_online_kmem(struct mem_cgroup *memcg)
31273736 {
3737
+ struct obj_cgroup *objcg;
31283738 int memcg_id;
31293739
31303740 if (cgroup_memory_nokmem)
....@@ -3137,7 +3747,16 @@
31373747 if (memcg_id < 0)
31383748 return memcg_id;
31393749
3140
- static_branch_inc(&memcg_kmem_enabled_key);
3750
+ objcg = obj_cgroup_alloc();
3751
+ if (!objcg) {
3752
+ memcg_free_cache_id(memcg_id);
3753
+ return -ENOMEM;
3754
+ }
3755
+ objcg->memcg = memcg;
3756
+ rcu_assign_pointer(memcg->objcg, objcg);
3757
+
3758
+ static_branch_enable(&memcg_kmem_enabled_key);
3759
+
31413760 /*
31423761 * A memory cgroup is considered kmem-online as soon as it gets
31433762 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3146,7 +3765,6 @@
31463765 */
31473766 memcg->kmemcg_id = memcg_id;
31483767 memcg->kmem_state = KMEM_ONLINE;
3149
- INIT_LIST_HEAD(&memcg->kmem_caches);
31503768
31513769 return 0;
31523770 }
....@@ -3159,22 +3777,17 @@
31593777
31603778 if (memcg->kmem_state != KMEM_ONLINE)
31613779 return;
3162
- /*
3163
- * Clear the online state before clearing memcg_caches array
3164
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3165
- * guarantees that no cache will be created for this cgroup
3166
- * after we are done (see memcg_create_kmem_cache()).
3167
- */
3780
+
31683781 memcg->kmem_state = KMEM_ALLOCATED;
3169
-
3170
- memcg_deactivate_kmem_caches(memcg);
3171
-
3172
- kmemcg_id = memcg->kmemcg_id;
3173
- BUG_ON(kmemcg_id < 0);
31743782
31753783 parent = parent_mem_cgroup(memcg);
31763784 if (!parent)
31773785 parent = root_mem_cgroup;
3786
+
3787
+ memcg_reparent_objcgs(memcg, parent);
3788
+
3789
+ kmemcg_id = memcg->kmemcg_id;
3790
+ BUG_ON(kmemcg_id < 0);
31783791
31793792 /*
31803793 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3204,12 +3817,6 @@
32043817 /* css_alloc() failed, offlining didn't happen */
32053818 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32063819 memcg_offline_kmem(memcg);
3207
-
3208
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3209
- memcg_destroy_kmem_caches(memcg);
3210
- static_branch_dec(&memcg_kmem_enabled_key);
3211
- WARN_ON(page_counter_read(&memcg->kmem));
3212
- }
32133820 }
32143821 #else
32153822 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3300,6 +3907,9 @@
33003907 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33013908 break;
33023909 case _KMEM:
3910
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3911
+ "Please report your usecase to linux-mm@kvack.org if you "
3912
+ "depend on this functionality.\n");
33033913 ret = memcg_update_kmem_max(memcg, nr_pages);
33043914 break;
33053915 case _TCP:
....@@ -3385,6 +3995,49 @@
33853995 #endif
33863996
33873997 #ifdef CONFIG_NUMA
3998
+
3999
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
4000
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
4001
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
4002
+
4003
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4004
+ int nid, unsigned int lru_mask, bool tree)
4005
+{
4006
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4007
+ unsigned long nr = 0;
4008
+ enum lru_list lru;
4009
+
4010
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4011
+
4012
+ for_each_lru(lru) {
4013
+ if (!(BIT(lru) & lru_mask))
4014
+ continue;
4015
+ if (tree)
4016
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4017
+ else
4018
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4019
+ }
4020
+ return nr;
4021
+}
4022
+
4023
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4024
+ unsigned int lru_mask,
4025
+ bool tree)
4026
+{
4027
+ unsigned long nr = 0;
4028
+ enum lru_list lru;
4029
+
4030
+ for_each_lru(lru) {
4031
+ if (!(BIT(lru) & lru_mask))
4032
+ continue;
4033
+ if (tree)
4034
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4035
+ else
4036
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4037
+ }
4038
+ return nr;
4039
+}
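The lru_mask arguments are bitmaps over the LRU lists, and LRU_ALL_FILE, LRU_ALL_ANON and LRU_ALL simply select which per-list counters are summed. A minimal sketch of the mask-driven summation, with invented counts:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };

#define BIT(n)       (1U << (n))
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL      ((1U << NR_LRU_LISTS) - 1)

static unsigned long sum_lists(const unsigned long *counts, unsigned int mask)
{
    unsigned long nr = 0;
    int lru;

    for (lru = 0; lru < NR_LRU_LISTS; lru++)
        if (BIT(lru) & mask)           /* only the selected lists */
            nr += counts[lru];
    return nr;
}

int main(void)
{
    /* made-up page counts per LRU list */
    unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

    printf("file pages: %lu\n", sum_lists(counts, LRU_ALL_FILE));
    printf("all pages:  %lu\n", sum_lists(counts, LRU_ALL));
    return 0;
}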
4040
+
33884041 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33894042 {
33904043 struct numa_stat {
....@@ -3400,40 +4053,60 @@
34004053 };
34014054 const struct numa_stat *stat;
34024055 int nid;
3403
- unsigned long nr;
3404
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4056
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34054057
34064058 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3407
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3408
- seq_printf(m, "%s=%lu", stat->name, nr);
3409
- for_each_node_state(nid, N_MEMORY) {
3410
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3411
- stat->lru_mask);
3412
- seq_printf(m, " N%d=%lu", nid, nr);
3413
- }
4059
+ seq_printf(m, "%s=%lu", stat->name,
4060
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4061
+ false));
4062
+ for_each_node_state(nid, N_MEMORY)
4063
+ seq_printf(m, " N%d=%lu", nid,
4064
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4065
+ stat->lru_mask, false));
34144066 seq_putc(m, '\n');
34154067 }
34164068
34174069 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3418
- struct mem_cgroup *iter;
34194070
3420
- nr = 0;
3421
- for_each_mem_cgroup_tree(iter, memcg)
3422
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3423
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3424
- for_each_node_state(nid, N_MEMORY) {
3425
- nr = 0;
3426
- for_each_mem_cgroup_tree(iter, memcg)
3427
- nr += mem_cgroup_node_nr_lru_pages(
3428
- iter, nid, stat->lru_mask);
3429
- seq_printf(m, " N%d=%lu", nid, nr);
3430
- }
4071
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4072
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4073
+ true));
4074
+ for_each_node_state(nid, N_MEMORY)
4075
+ seq_printf(m, " N%d=%lu", nid,
4076
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4077
+ stat->lru_mask, true));
34314078 seq_putc(m, '\n');
34324079 }
34334080
34344081 return 0;
34354082 }
34364083 #endif /* CONFIG_NUMA */
4084
+
4085
+static const unsigned int memcg1_stats[] = {
4086
+ NR_FILE_PAGES,
4087
+ NR_ANON_MAPPED,
4088
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4089
+ NR_ANON_THPS,
4090
+#endif
4091
+ NR_SHMEM,
4092
+ NR_FILE_MAPPED,
4093
+ NR_FILE_DIRTY,
4094
+ NR_WRITEBACK,
4095
+ MEMCG_SWAP,
4096
+};
4097
+
4098
+static const char *const memcg1_stat_names[] = {
4099
+ "cache",
4100
+ "rss",
4101
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4102
+ "rss_huge",
4103
+#endif
4104
+ "shmem",
4105
+ "mapped_file",
4106
+ "dirty",
4107
+ "writeback",
4108
+ "swap",
4109
+};
34374110
34384111 /* Universal VM events cgroup1 shows, original sort order */
34394112 static const unsigned int memcg1_events[] = {
....@@ -3443,45 +4116,42 @@
34434116 PGMAJFAULT,
34444117 };
34454118
3446
-static const char *const memcg1_event_names[] = {
3447
- "pgpgin",
3448
- "pgpgout",
3449
- "pgfault",
3450
- "pgmajfault",
3451
-};
3452
-
34534119 static int memcg_stat_show(struct seq_file *m, void *v)
34544120 {
3455
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4121
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34564122 unsigned long memory, memsw;
34574123 struct mem_cgroup *mi;
34584124 unsigned int i;
3459
- struct accumulated_stats acc;
34604125
34614126 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3462
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34634127
34644128 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4129
+ unsigned long nr;
4130
+
34654131 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34664132 continue;
3467
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3468
- memcg_page_state(memcg, memcg1_stats[i]) *
3469
- PAGE_SIZE);
4133
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4134
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4135
+ if (memcg1_stats[i] == NR_ANON_THPS)
4136
+ nr *= HPAGE_PMD_NR;
4137
+#endif
4138
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34704139 }
34714140
34724141 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3473
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3474
- memcg_sum_events(memcg, memcg1_events[i]));
4142
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4143
+ memcg_events_local(memcg, memcg1_events[i]));
34754144
34764145 for (i = 0; i < NR_LRU_LISTS; i++)
3477
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3478
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4146
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4147
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4148
+ PAGE_SIZE);
34794149
34804150 /* Hierarchical information */
34814151 memory = memsw = PAGE_COUNTER_MAX;
34824152 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3483
- memory = min(memory, mi->memory.max);
3484
- memsw = min(memsw, mi->memsw.max);
4153
+ memory = min(memory, READ_ONCE(mi->memory.max));
4154
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34854155 }
34864156 seq_printf(m, "hierarchical_memory_limit %llu\n",
34874157 (u64)memory * PAGE_SIZE);
....@@ -3489,49 +4159,45 @@
34894159 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34904160 (u64)memsw * PAGE_SIZE);
34914161
3492
- memset(&acc, 0, sizeof(acc));
3493
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3494
- acc.stats_array = memcg1_stats;
3495
- acc.events_size = ARRAY_SIZE(memcg1_events);
3496
- acc.events_array = memcg1_events;
3497
- accumulate_memcg_tree(memcg, &acc);
3498
-
34994162 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4163
+ unsigned long nr;
4164
+
35004165 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35014166 continue;
4167
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4168
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4169
+ if (memcg1_stats[i] == NR_ANON_THPS)
4170
+ nr *= HPAGE_PMD_NR;
4171
+#endif
35024172 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3503
- (u64)acc.stat[i] * PAGE_SIZE);
4173
+ (u64)nr * PAGE_SIZE);
35044174 }
35054175
35064176 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3507
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3508
- (u64)acc.events[i]);
4177
+ seq_printf(m, "total_%s %llu\n",
4178
+ vm_event_name(memcg1_events[i]),
4179
+ (u64)memcg_events(memcg, memcg1_events[i]));
35094180
35104181 for (i = 0; i < NR_LRU_LISTS; i++)
3511
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3512
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4182
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4183
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4184
+ PAGE_SIZE);
35134185
35144186 #ifdef CONFIG_DEBUG_VM
35154187 {
35164188 pg_data_t *pgdat;
35174189 struct mem_cgroup_per_node *mz;
3518
- struct zone_reclaim_stat *rstat;
3519
- unsigned long recent_rotated[2] = {0, 0};
3520
- unsigned long recent_scanned[2] = {0, 0};
4190
+ unsigned long anon_cost = 0;
4191
+ unsigned long file_cost = 0;
35214192
35224193 for_each_online_pgdat(pgdat) {
35234194 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3524
- rstat = &mz->lruvec.reclaim_stat;
35254195
3526
- recent_rotated[0] += rstat->recent_rotated[0];
3527
- recent_rotated[1] += rstat->recent_rotated[1];
3528
- recent_scanned[0] += rstat->recent_scanned[0];
3529
- recent_scanned[1] += rstat->recent_scanned[1];
4196
+ anon_cost += mz->lruvec.anon_cost;
4197
+ file_cost += mz->lruvec.file_cost;
35304198 }
3531
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3532
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3533
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3534
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4199
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4200
+ seq_printf(m, "file_cost %lu\n", file_cost);
35354201 }
35364202 #endif
35374203
....@@ -3690,8 +4356,7 @@
36904356 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36914357
36924358 /* Allocate memory for new array of thresholds */
3693
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3694
- GFP_KERNEL);
4359
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36954360 if (!new) {
36964361 ret = -ENOMEM;
36974362 goto unlock;
....@@ -3699,17 +4364,16 @@
36994364 new->size = size;
37004365
37014366 /* Copy thresholds (if any) to new array */
3702
- if (thresholds->primary) {
3703
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3704
- sizeof(struct mem_cgroup_threshold));
3705
- }
4367
+ if (thresholds->primary)
4368
+ memcpy(new->entries, thresholds->primary->entries,
4369
+ flex_array_size(new, entries, size - 1));
37064370
37074371 /* Add new threshold */
37084372 new->entries[size - 1].eventfd = eventfd;
37094373 new->entries[size - 1].threshold = threshold;
37104374
37114375 /* Sort thresholds. Registering of new threshold isn't time-critical */
3712
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4376
+ sort(new->entries, size, sizeof(*new->entries),
37134377 compare_thresholds, NULL);
37144378
37154379 /* Find current threshold */
....@@ -3891,7 +4555,7 @@
38914555
38924556 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38934557 {
3894
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4558
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38954559
38964560 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
38974561 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3917,6 +4581,8 @@
39174581 }
39184582
39194583 #ifdef CONFIG_CGROUP_WRITEBACK
4584
+
4585
+#include <trace/events/writeback.h>
39204586
39214587 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39224588 {
....@@ -3949,11 +4615,11 @@
39494615 */
39504616 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39514617 {
3952
- long x = atomic_long_read(&memcg->stat[idx]);
4618
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39534619 int cpu;
39544620
39554621 for_each_online_cpu(cpu)
3956
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4622
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39574623 if (x < 0)
39584624 x = 0;
39594625 return x;
....@@ -3986,18 +4652,142 @@
39864652
39874653 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39884654
3989
- /* this should eventually include NR_UNSTABLE_NFS */
39904655 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3991
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3992
- (1 << LRU_ACTIVE_FILE));
4656
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4657
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39934658 *pheadroom = PAGE_COUNTER_MAX;
39944659
39954660 while ((parent = parent_mem_cgroup(memcg))) {
3996
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4661
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4662
+ READ_ONCE(memcg->memory.high));
39974663 unsigned long used = page_counter_read(&memcg->memory);
39984664
39994665 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40004666 memcg = parent;
4667
+ }
4668
+}
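The headroom loop takes, at each level from this cgroup up to (but not including) the root, the gap between the effective ceiling (the smaller of memory.max and memory.high) and current usage, clamped at zero, and keeps the tightest value for the writeback code. A small sketch of that clamped-minimum walk, with invented numbers:

#include <stdio.h>

struct level {
    unsigned long max, high, used;     /* page counts at one ancestor */
};

int main(void)
{
    /* the cgroup first, then its ancestors towards the root */
    struct level chain[] = {
        { .max = 1000, .high =  800, .used =  750 },
        { .max = 4000, .high = 3900, .used = 3880 },
    };
    unsigned long headroom = (unsigned long)-1; /* PAGE_COUNTER_MAX stand-in */
    unsigned int i;

    for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
        unsigned long ceiling = chain[i].max < chain[i].high ?
                                chain[i].max : chain[i].high;
        unsigned long used = chain[i].used;
        /* ceiling - min(ceiling, used) clamps negative gaps to zero */
        unsigned long room = ceiling - (ceiling < used ? ceiling : used);

        headroom = room < headroom ? room : headroom;
    }
    printf("headroom = %lu pages\n", headroom);
    return 0;
}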
4669
+
4670
+/*
4671
+ * Foreign dirty flushing
4672
+ *
4673
+ * There's an inherent mismatch between memcg and writeback. The former
4674
+ * tracks ownership per-page while the latter per-inode. This was a
4675
+ * deliberate design decision because honoring per-page ownership in the
4676
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4677
+ * and deemed unnecessary given that write-sharing an inode across
4678
+ * different cgroups isn't a common use-case.
4679
+ *
4680
+ * Combined with inode majority-writer ownership switching, this works well
4681
+ * enough in most cases but there are some pathological cases. For
4682
+ * example, let's say there are two cgroups A and B which keep writing to
4683
+ * different but confined parts of the same inode. B owns the inode and
4684
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4685
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4686
+ * triggering background writeback. A will be slowed down without a way to
4687
+ * make writeback of the dirty pages happen.
4688
+ *
4689
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4690
+ * severely throttled after making some progress after each
4691
+ * dirty_expire_interval while the underlying IO device is almost
4692
+ * completely idle.
4693
+ *
4694
+ * Solving this problem completely requires matching the ownership tracking
4695
+ * granularities between memcg and writeback in either direction. However,
4696
+ * the more egregious behaviors can be avoided by simply remembering the
4697
+ * most recent foreign dirtying events and initiating remote flushes on
4698
+ * them when local writeback isn't enough to keep the memory clean enough.
4699
+ *
4700
+ * The following two functions implement such mechanism. When a foreign
4701
+ * page - a page whose memcg and writeback ownerships don't match - is
4702
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4703
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4704
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4705
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4706
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4707
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4708
+ * limited to MEMCG_CGWB_FRN_CNT.
4709
+ *
4710
+ * The mechanism only remembers IDs and doesn't hold any object references.
4711
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4712
+ * records are lockless and racy.
4713
+ */
4714
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4715
+ struct bdi_writeback *wb)
4716
+{
4717
+ struct mem_cgroup *memcg = page->mem_cgroup;
4718
+ struct memcg_cgwb_frn *frn;
4719
+ u64 now = get_jiffies_64();
4720
+ u64 oldest_at = now;
4721
+ int oldest = -1;
4722
+ int i;
4723
+
4724
+ trace_track_foreign_dirty(page, wb);
4725
+
4726
+ /*
4727
+ * Pick the slot to use. If there is already a slot for @wb, keep
4728
+ * using it. If not replace the oldest one which isn't being
4729
+ * written out.
4730
+ */
4731
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4732
+ frn = &memcg->cgwb_frn[i];
4733
+ if (frn->bdi_id == wb->bdi->id &&
4734
+ frn->memcg_id == wb->memcg_css->id)
4735
+ break;
4736
+ if (time_before64(frn->at, oldest_at) &&
4737
+ atomic_read(&frn->done.cnt) == 1) {
4738
+ oldest = i;
4739
+ oldest_at = frn->at;
4740
+ }
4741
+ }
4742
+
4743
+ if (i < MEMCG_CGWB_FRN_CNT) {
4744
+ /*
4745
+ * Re-using an existing one. Update timestamp lazily to
4746
+ * avoid making the cacheline hot. We want them to be
4747
+ * reasonably up-to-date and significantly shorter than
4748
+ * dirty_expire_interval as that's what expires the record.
4749
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4750
+ */
4751
+ unsigned long update_intv =
4752
+ min_t(unsigned long, HZ,
4753
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4754
+
4755
+ if (time_before64(frn->at, now - update_intv))
4756
+ frn->at = now;
4757
+ } else if (oldest >= 0) {
4758
+ /* replace the oldest free one */
4759
+ frn = &memcg->cgwb_frn[oldest];
4760
+ frn->bdi_id = wb->bdi->id;
4761
+ frn->memcg_id = wb->memcg_css->id;
4762
+ frn->at = now;
4763
+ }
4764
+}
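The refresh interval referred to in the comment above is derived from dirty_expire_interval, which is kept in centiseconds (hence the multiplication by 10 to get milliseconds). A quick sketch of the arithmetic, assuming HZ=1000 and the common 30-second expiry setting:

#include <stdio.h>

#define HZ 1000U                        /* illustrative config value */

/* user-space stand-in for msecs_to_jiffies() at HZ=1000 */
static unsigned long msecs_to_jiffies(unsigned long ms)
{
    return ms * HZ / 1000;
}

int main(void)
{
    unsigned long dirty_expire_interval = 3000;    /* centisecs, i.e. 30 s */

    /* shorter of 1 s and dirty_expire_interval / 8, as in the comment */
    unsigned long update_intv = msecs_to_jiffies(dirty_expire_interval * 10) / 8;

    if (update_intv > HZ)
        update_intv = HZ;

    printf("record refreshed at most every %lu jiffies (%lu ms)\n",
           update_intv, update_intv * 1000 / HZ);
    printf("record expires after %lu jiffies\n",
           msecs_to_jiffies(dirty_expire_interval * 10));
    return 0;
}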
4765
+
4766
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4767
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4768
+{
4769
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4770
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4771
+ u64 now = jiffies_64;
4772
+ int i;
4773
+
4774
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4775
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4776
+
4777
+ /*
4778
+ * If the record is older than dirty_expire_interval,
4779
+ * writeback on it has already started. No need to kick it
4780
+ * off again. Also, don't start a new one if there's
4781
+ * already one in flight.
4782
+ */
4783
+ if (time_after64(frn->at, now - intv) &&
4784
+ atomic_read(&frn->done.cnt) == 1) {
4785
+ frn->at = 0;
4786
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4787
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4788
+ WB_REASON_FOREIGN_FLUSH,
4789
+ &frn->done);
4790
+ }
40014791 }
40024792 }
40034793
....@@ -4120,6 +4910,7 @@
41204910 unsigned int efd, cfd;
41214911 struct fd efile;
41224912 struct fd cfile;
4913
+ struct dentry *cdentry;
41234914 const char *name;
41244915 char *endp;
41254916 int ret;
....@@ -4171,6 +4962,16 @@
41714962 goto out_put_cfile;
41724963
41734964 /*
4965
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4966
+ * file can't be renamed, it's safe to access its name afterwards.
4967
+ */
4968
+ cdentry = cfile.file->f_path.dentry;
4969
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4970
+ ret = -EINVAL;
4971
+ goto out_put_cfile;
4972
+ }
4973
+
4974
+ /*
41744975 * Determine the event callbacks and set them in @event. This used
41754976 * to be done via struct cftype but cgroup core no longer knows
41764977 * about these events. The following is crude but the whole thing
....@@ -4178,7 +4979,7 @@
41784979 *
41794980 * DO NOT ADD NEW FILES.
41804981 */
4181
- name = cfile.file->f_path.dentry->d_name.name;
4982
+ name = cdentry->d_name.name;
41824983
41834984 if (!strcmp(name, "memory.usage_in_bytes")) {
41844985 event->register_event = mem_cgroup_usage_register_event;
....@@ -4202,7 +5003,7 @@
42025003 * automatically removed on cgroup destruction but the removal is
42035004 * asynchronous, so take an extra ref on @css.
42045005 */
4205
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5006
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42065007 &memory_cgrp_subsys);
42075008 ret = -EINVAL;
42085009 if (IS_ERR(cfile_css))
....@@ -4337,12 +5138,10 @@
43375138 .write = mem_cgroup_reset,
43385139 .read_u64 = mem_cgroup_read_u64,
43395140 },
4340
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5141
+#if defined(CONFIG_MEMCG_KMEM) && \
5142
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43415143 {
43425144 .name = "kmem.slabinfo",
4343
- .seq_start = memcg_slab_start,
4344
- .seq_next = memcg_slab_next,
4345
- .seq_stop = memcg_slab_stop,
43465145 .seq_show = memcg_slab_show,
43475146 },
43485147 #endif
....@@ -4380,7 +5179,7 @@
43805179 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43815180 * memory-controlled cgroups to 64k.
43825181 *
4383
- * However, there usually are many references to the oflline CSS after
5182
+ * However, there usually are many references to the offline CSS after
43845183 * the cgroup has been destroyed, such as page cache or reclaimable
43855184 * slab objects, that don't need to hang on to the ID. We want to keep
43865185 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4401,31 +5200,26 @@
44015200 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44025201 {
44035202 if (memcg->id.id > 0) {
5203
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44045204 idr_remove(&mem_cgroup_idr, memcg->id.id);
44055205 memcg->id.id = 0;
44065206 }
44075207 }
44085208
4409
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5209
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5210
+ unsigned int n)
44105211 {
4411
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4412
- atomic_add(n, &memcg->id.ref);
5212
+ refcount_add(n, &memcg->id.ref);
44135213 }
44145214
44155215 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44165216 {
4417
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4418
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5217
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44195218 mem_cgroup_id_remove(memcg);
44205219
44215220 /* Memcg ID pins CSS */
44225221 css_put(&memcg->css);
44235222 }
4424
-}
4425
-
4426
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4427
-{
4428
- mem_cgroup_id_get_many(memcg, 1);
44295223 }
44305224
44315225 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4444,6 +5238,7 @@
44445238 WARN_ON_ONCE(!rcu_read_lock_held());
44455239 return idr_find(&mem_cgroup_idr, id);
44465240 }
5241
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44475242
44485243 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44495244 {
....@@ -4463,8 +5258,17 @@
44635258 if (!pn)
44645259 return 1;
44655260
4466
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5261
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5262
+ GFP_KERNEL_ACCOUNT);
5263
+ if (!pn->lruvec_stat_local) {
5264
+ kfree(pn);
5265
+ return 1;
5266
+ }
5267
+
5268
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5269
+ GFP_KERNEL_ACCOUNT);
44675270 if (!pn->lruvec_stat_cpu) {
5271
+ free_percpu(pn->lruvec_stat_local);
44685272 kfree(pn);
44695273 return 1;
44705274 }
....@@ -4486,6 +5290,7 @@
44865290 return;
44875291
44885292 free_percpu(pn->lruvec_stat_cpu);
5293
+ free_percpu(pn->lruvec_stat_local);
44895294 kfree(pn);
44905295 }
44915296
....@@ -4493,39 +5298,57 @@
44935298 {
44945299 int node;
44955300
5301
+ trace_android_vh_mem_cgroup_free(memcg);
44965302 for_each_node(node)
44975303 free_mem_cgroup_per_node_info(memcg, node);
4498
- free_percpu(memcg->stat_cpu);
5304
+ free_percpu(memcg->vmstats_percpu);
5305
+ free_percpu(memcg->vmstats_local);
44995306 kfree(memcg);
45005307 }
45015308
45025309 static void mem_cgroup_free(struct mem_cgroup *memcg)
45035310 {
45045311 memcg_wb_domain_exit(memcg);
5312
+ /*
5313
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5314
+ * on parent's and all ancestor levels.
5315
+ */
5316
+ memcg_flush_percpu_vmstats(memcg);
5317
+ memcg_flush_percpu_vmevents(memcg);
45055318 __mem_cgroup_free(memcg);
45065319 }
45075320
45085321 static struct mem_cgroup *mem_cgroup_alloc(void)
45095322 {
45105323 struct mem_cgroup *memcg;
4511
- size_t size;
5324
+ unsigned int size;
45125325 int node;
5326
+ int __maybe_unused i;
5327
+ long error = -ENOMEM;
45135328
45145329 size = sizeof(struct mem_cgroup);
45155330 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45165331
45175332 memcg = kzalloc(size, GFP_KERNEL);
45185333 if (!memcg)
4519
- return NULL;
5334
+ return ERR_PTR(error);
45205335
45215336 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45225337 1, MEM_CGROUP_ID_MAX,
45235338 GFP_KERNEL);
4524
- if (memcg->id.id < 0)
5339
+ if (memcg->id.id < 0) {
5340
+ error = memcg->id.id;
5341
+ goto fail;
5342
+ }
5343
+
5344
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5345
+ GFP_KERNEL_ACCOUNT);
5346
+ if (!memcg->vmstats_local)
45255347 goto fail;
45265348
4527
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4528
- if (!memcg->stat_cpu)
5349
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5350
+ GFP_KERNEL_ACCOUNT);
5351
+ if (!memcg->vmstats_percpu)
45295352 goto fail;
45305353
45315354 for_each_node(node)
....@@ -4536,7 +5359,6 @@
45365359 goto fail;
45375360
45385361 INIT_WORK(&memcg->high_work, high_work_func);
4539
- memcg->last_scanned_node = MAX_NUMNODES;
45405362 INIT_LIST_HEAD(&memcg->oom_notify);
45415363 mutex_init(&memcg->thresholds_lock);
45425364 spin_lock_init(&memcg->move_lock);
....@@ -4546,48 +5368,64 @@
45465368 memcg->socket_pressure = jiffies;
45475369 #ifdef CONFIG_MEMCG_KMEM
45485370 memcg->kmemcg_id = -1;
5371
+ INIT_LIST_HEAD(&memcg->objcg_list);
45495372 #endif
45505373 #ifdef CONFIG_CGROUP_WRITEBACK
45515374 INIT_LIST_HEAD(&memcg->cgwb_list);
5375
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5376
+ memcg->cgwb_frn[i].done =
5377
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5378
+#endif
5379
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5380
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5381
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5382
+ memcg->deferred_split_queue.split_queue_len = 0;
45525383 #endif
45535384 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5385
+ trace_android_vh_mem_cgroup_alloc(memcg);
45545386 return memcg;
45555387 fail:
45565388 mem_cgroup_id_remove(memcg);
45575389 __mem_cgroup_free(memcg);
4558
- return NULL;
5390
+ return ERR_PTR(error);
45595391 }
45605392
45615393 static struct cgroup_subsys_state * __ref
45625394 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45635395 {
45645396 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4565
- struct mem_cgroup *memcg;
5397
+ struct mem_cgroup *memcg, *old_memcg;
45665398 long error = -ENOMEM;
45675399
5400
+ old_memcg = set_active_memcg(parent);
45685401 memcg = mem_cgroup_alloc();
4569
- if (!memcg)
4570
- return ERR_PTR(error);
5402
+ set_active_memcg(old_memcg);
5403
+ if (IS_ERR(memcg))
5404
+ return ERR_CAST(memcg);
45715405
4572
- memcg->high = PAGE_COUNTER_MAX;
5406
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45735407 memcg->soft_limit = PAGE_COUNTER_MAX;
5408
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45745409 if (parent) {
45755410 memcg->swappiness = mem_cgroup_swappiness(parent);
45765411 memcg->oom_kill_disable = parent->oom_kill_disable;
45775412 }
4578
- if (parent && parent->use_hierarchy) {
5413
+ if (!parent) {
5414
+ page_counter_init(&memcg->memory, NULL);
5415
+ page_counter_init(&memcg->swap, NULL);
5416
+ page_counter_init(&memcg->kmem, NULL);
5417
+ page_counter_init(&memcg->tcpmem, NULL);
5418
+ } else if (parent->use_hierarchy) {
45795419 memcg->use_hierarchy = true;
45805420 page_counter_init(&memcg->memory, &parent->memory);
45815421 page_counter_init(&memcg->swap, &parent->swap);
4582
- page_counter_init(&memcg->memsw, &parent->memsw);
45835422 page_counter_init(&memcg->kmem, &parent->kmem);
45845423 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45855424 } else {
4586
- page_counter_init(&memcg->memory, NULL);
4587
- page_counter_init(&memcg->swap, NULL);
4588
- page_counter_init(&memcg->memsw, NULL);
4589
- page_counter_init(&memcg->kmem, NULL);
4590
- page_counter_init(&memcg->tcpmem, NULL);
5425
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5426
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5427
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5428
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45915429 /*
45925430 	 * Deeper hierarchy with use_hierarchy == false doesn't make
45935431 * much sense so let cgroup subsystem know about this
....@@ -4614,7 +5452,7 @@
46145452 fail:
46155453 mem_cgroup_id_remove(memcg);
46165454 mem_cgroup_free(memcg);
4617
- return ERR_PTR(-ENOMEM);
5455
+ return ERR_PTR(error);
46185456 }
46195457
46205458 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4632,8 +5470,9 @@
46325470 }
46335471
46345472 /* Online state pins memcg ID, memcg ID pins CSS */
4635
- atomic_set(&memcg->id.ref, 1);
5473
+ refcount_set(&memcg->id.ref, 1);
46365474 css_get(css);
5475
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46375476 return 0;
46385477 }
46395478
....@@ -4642,6 +5481,7 @@
46425481 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46435482 struct mem_cgroup_event *event, *tmp;
46445483
5484
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46455485 /*
46465486 * Unregister events and notify userspace.
46475487 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4660,6 +5500,8 @@
46605500 memcg_offline_kmem(memcg);
46615501 wb_memcg_offline(memcg);
46625502
5503
+ drain_all_stock(memcg);
5504
+
46635505 mem_cgroup_id_put(memcg);
46645506 }
46655507
....@@ -4673,7 +5515,12 @@
46735515 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46745516 {
46755517 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5518
+ int __maybe_unused i;
46765519
5520
+#ifdef CONFIG_CGROUP_WRITEBACK
5521
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5522
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5523
+#endif
46775524 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46785525 static_branch_dec(&memcg_sockets_enabled_key);
46795526
....@@ -4707,13 +5554,13 @@
47075554
47085555 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47095556 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4710
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47115557 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47125558 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47135559 page_counter_set_min(&memcg->memory, 0);
47145560 page_counter_set_low(&memcg->memory, 0);
4715
- memcg->high = PAGE_COUNTER_MAX;
5561
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47165562 memcg->soft_limit = PAGE_COUNTER_MAX;
5563
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47175564 memcg_wb_domain_size_changed(memcg);
47185565 }
47195566
....@@ -4756,7 +5603,7 @@
47565603 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47575604 unsigned long addr, pte_t ptent)
47585605 {
4759
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5606
+ struct page *page = vm_normal_page(vma, addr, ptent);
47605607
47615608 if (!page || !page_mapped(page))
47625609 return NULL;
....@@ -4807,8 +5654,7 @@
48075654 * we call find_get_page() with swapper_space directly.
48085655 */
48095656 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4810
- if (do_memsw_account())
4811
- entry->val = ent.val;
5657
+ entry->val = ent.val;
48125658
48135659 return page;
48145660 }
....@@ -4823,36 +5669,15 @@
48235669 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48245670 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48255671 {
4826
- struct page *page = NULL;
4827
- struct address_space *mapping;
4828
- pgoff_t pgoff;
4829
-
48305672 if (!vma->vm_file) /* anonymous vma */
48315673 return NULL;
48325674 if (!(mc.flags & MOVE_FILE))
48335675 return NULL;
48345676
4835
- mapping = vma->vm_file->f_mapping;
4836
- pgoff = linear_page_index(vma, addr);
4837
-
48385677 /* page is moved even if it's not RSS of this task(page-faulted). */
4839
-#ifdef CONFIG_SWAP
48405678 /* shmem/tmpfs may report page out on swap: account for that too. */
4841
- if (shmem_mapping(mapping)) {
4842
- page = find_get_entry(mapping, pgoff);
4843
- if (radix_tree_exceptional_entry(page)) {
4844
- swp_entry_t swp = radix_to_swp_entry(page);
4845
- if (do_memsw_account())
4846
- *entry = swp;
4847
- page = find_get_page(swap_address_space(swp),
4848
- swp_offset(swp));
4849
- }
4850
- } else
4851
- page = find_get_page(mapping, pgoff);
4852
-#else
4853
- page = find_get_page(mapping, pgoff);
4854
-#endif
4855
- return page;
5679
+ return find_get_incore_page(vma->vm_file->f_mapping,
5680
+ linear_page_index(vma, addr));
48565681 }
48575682
48585683 /**
....@@ -4872,10 +5697,10 @@
48725697 struct mem_cgroup *from,
48735698 struct mem_cgroup *to)
48745699 {
4875
- unsigned long flags;
4876
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5700
+ struct lruvec *from_vec, *to_vec;
5701
+ struct pglist_data *pgdat;
5702
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48775703 int ret;
4878
- bool anon;
48795704
48805705 VM_BUG_ON(from == to);
48815706 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4893,52 +5718,83 @@
48935718 if (page->mem_cgroup != from)
48945719 goto out_unlock;
48955720
4896
- anon = PageAnon(page);
5721
+ pgdat = page_pgdat(page);
5722
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5723
+ to_vec = mem_cgroup_lruvec(to, pgdat);
48975724
4898
- spin_lock_irqsave(&from->move_lock, flags);
5725
+ lock_page_memcg(page);
48995726
4900
- if (!anon && page_mapped(page)) {
4901
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4902
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4903
- }
5727
+ if (PageAnon(page)) {
5728
+ if (page_mapped(page)) {
5729
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5730
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5731
+ if (PageTransHuge(page)) {
5732
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5733
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5734
+ }
49045735
4905
- /*
4906
- * move_lock grabbed above and caller set from->moving_account, so
4907
- * mod_memcg_page_state will serialize updates to PageDirty.
4908
- * So mapping should be stable for dirty pages.
4909
- */
4910
- if (!anon && PageDirty(page)) {
4911
- struct address_space *mapping = page_mapping(page);
5736
+ }
5737
+ } else {
5738
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49125740
4913
- if (mapping_cap_account_dirty(mapping)) {
4914
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4915
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5741
+ if (PageSwapBacked(page)) {
5742
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5743
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5744
+ }
5745
+
5746
+ if (page_mapped(page)) {
5747
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5748
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5749
+ }
5750
+
5751
+ if (PageDirty(page)) {
5752
+ struct address_space *mapping = page_mapping(page);
5753
+
5754
+ if (mapping_can_writeback(mapping)) {
5755
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5756
+ -nr_pages);
5757
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5758
+ nr_pages);
5759
+ }
49165760 }
49175761 }
49185762
49195763 if (PageWriteback(page)) {
4920
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4921
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5764
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5765
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49225766 }
49235767
49245768 /*
5769
+ * All state has been migrated, let's switch to the new memcg.
5770
+ *
49255771 * It is safe to change page->mem_cgroup here because the page
4926
- * is referenced, charged, and isolated - we can't race with
4927
- * uncharging, charging, migration, or LRU putback.
5772
+ * is referenced, charged, isolated, and locked: we can't race
5773
+ * with (un)charging, migration, LRU putback, or anything else
5774
+ * that would rely on a stable page->mem_cgroup.
5775
+ *
5776
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5777
+ * to save space. As soon as we switch page->mem_cgroup to a
5778
+ * new memcg that isn't locked, the above state can change
5779
+ * concurrently again. Make sure we're truly done with it.
49285780 */
5781
+ smp_mb();
49295782
4930
- /* caller should have done css_get */
5783
+ css_get(&to->css);
5784
+ css_put(&from->css);
5785
+
49315786 page->mem_cgroup = to;
4932
- spin_unlock_irqrestore(&from->move_lock, flags);
5787
+
5788
+ __unlock_page_memcg(from);
49335789
49345790 ret = 0;
49355791
4936
- local_irq_disable();
4937
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5792
+ local_lock_irq(&event_lock.l);
5793
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49385794 memcg_check_events(to, page);
4939
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5795
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49405796 memcg_check_events(from, page);
4941
- local_irq_enable();
5797
+ local_unlock_irq(&event_lock.l);
49425798 out_unlock:
49435799 unlock_page(page);
49445800 out:
....@@ -4960,8 +5816,8 @@
49605816 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49615817 * target for charge migration. if @target is not NULL, the entry is stored
49625818 * in target->ent.
4963
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4964
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5819
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5820
+ * (so ZONE_DEVICE page and thus not on the lru).
49655821 	 * For now such a page is charged like a regular page would be, as for all
49665822 	 * intents and purposes it is just special memory taking the place of a
49675823 * regular page.
....@@ -4995,8 +5851,7 @@
49955851 */
49965852 if (page->mem_cgroup == mc.from) {
49975853 ret = MC_TARGET_PAGE;
4998
- if (is_device_private_page(page) ||
4999
- is_device_public_page(page))
5854
+ if (is_device_private_page(page))
50005855 ret = MC_TARGET_DEVICE;
50015856 if (target)
50025857 target->page = page;
....@@ -5067,8 +5922,8 @@
50675922 if (ptl) {
50685923 /*
50695924 	 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5070
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5071
- * MEMORY_DEVICE_PRIVATE but this might change.
5925
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5926
+ * this might change.
50725927 */
50735928 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50745929 mc.precharge += HPAGE_PMD_NR;
....@@ -5088,18 +5943,17 @@
50885943 return 0;
50895944 }
50905945
5946
+static const struct mm_walk_ops precharge_walk_ops = {
5947
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5948
+};
5949
+
50915950 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50925951 {
50935952 unsigned long precharge;
50945953
5095
- struct mm_walk mem_cgroup_count_precharge_walk = {
5096
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5097
- .mm = mm,
5098
- };
5099
- down_read(&mm->mmap_sem);
5100
- walk_page_range(0, mm->highest_vm_end,
5101
- &mem_cgroup_count_precharge_walk);
5102
- up_read(&mm->mmap_sem);
5954
+ mmap_read_lock(mm);
5955
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5956
+ mmap_read_unlock(mm);
51035957
51045958 precharge = mc.precharge;
51055959 mc.precharge = 0;
....@@ -5149,8 +6003,6 @@
51496003 */
51506004 if (!mem_cgroup_is_root(mc.to))
51516005 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5152
-
5153
- css_put_many(&mc.to->css, mc.moved_swap);
51546006
51556007 mc.moved_swap = 0;
51566008 }
....@@ -5312,7 +6164,7 @@
53126164 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53136165 case MC_TARGET_DEVICE:
53146166 device = true;
5315
- /* fall through */
6167
+ fallthrough;
53166168 case MC_TARGET_PAGE:
53176169 page = target.page;
53186170 /*
....@@ -5367,13 +6219,12 @@
53676219 return ret;
53686220 }
53696221
6222
+static const struct mm_walk_ops charge_walk_ops = {
6223
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6224
+};
6225
+
53706226 static void mem_cgroup_move_charge(void)
53716227 {
5372
- struct mm_walk mem_cgroup_move_charge_walk = {
5373
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5374
- .mm = mc.mm,
5375
- };
5376
-
53776228 lru_add_drain_all();
53786229 /*
53796230 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5383,9 +6234,9 @@
53836234 atomic_inc(&mc.from->moving_account);
53846235 synchronize_rcu();
53856236 retry:
5386
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6237
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53876238 /*
5388
- * Someone who are holding the mmap_sem might be waiting in
6239
+		 * Someone who is holding the mmap_lock might be waiting in
53896240 * waitq. So we cancel all extra charges, wake up all waiters,
53906241 * and retry. Because we cancel precharges, we might not be able
53916242 * to move enough charges, but moving charge is a best-effort
....@@ -5399,9 +6250,10 @@
53996250 * When we have consumed all precharges and failed in doing
54006251 * additional charge, the page walk just aborts.
54016252 */
5402
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6253
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6254
+ NULL);
54036255
5404
- up_read(&mc.mm->mmap_sem);
6256
+ mmap_read_unlock(mc.mm);
54056257 atomic_dec(&mc.from->moving_account);
54066258 }
54076259
....@@ -5443,6 +6295,16 @@
54436295 root_mem_cgroup->use_hierarchy = false;
54446296 }
54456297
6298
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6299
+{
6300
+ if (value == PAGE_COUNTER_MAX)
6301
+ seq_puts(m, "max\n");
6302
+ else
6303
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6304
+
6305
+ return 0;
6306
+}
6307
+
54466308 static u64 memory_current_read(struct cgroup_subsys_state *css,
54476309 struct cftype *cft)
54486310 {
....@@ -5453,15 +6315,8 @@
54536315
54546316 static int memory_min_show(struct seq_file *m, void *v)
54556317 {
5456
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5457
- unsigned long min = READ_ONCE(memcg->memory.min);
5458
-
5459
- if (min == PAGE_COUNTER_MAX)
5460
- seq_puts(m, "max\n");
5461
- else
5462
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5463
-
5464
- return 0;
6318
+ return seq_puts_memcg_tunable(m,
6319
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54656320 }
54666321
54676322 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5483,15 +6338,8 @@
54836338
54846339 static int memory_low_show(struct seq_file *m, void *v)
54856340 {
5486
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5487
- unsigned long low = READ_ONCE(memcg->memory.low);
5488
-
5489
- if (low == PAGE_COUNTER_MAX)
5490
- seq_puts(m, "max\n");
5491
- else
5492
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5493
-
5494
- return 0;
6341
+ return seq_puts_memcg_tunable(m,
6342
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54956343 }
54966344
54976345 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5513,22 +6361,16 @@
55136361
55146362 static int memory_high_show(struct seq_file *m, void *v)
55156363 {
5516
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5517
- unsigned long high = READ_ONCE(memcg->high);
5518
-
5519
- if (high == PAGE_COUNTER_MAX)
5520
- seq_puts(m, "max\n");
5521
- else
5522
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5523
-
5524
- return 0;
6364
+ return seq_puts_memcg_tunable(m,
6365
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55256366 }
55266367
55276368 static ssize_t memory_high_write(struct kernfs_open_file *of,
55286369 char *buf, size_t nbytes, loff_t off)
55296370 {
55306371 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5531
- unsigned long nr_pages;
6372
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6373
+ bool drained = false;
55326374 unsigned long high;
55336375 int err;
55346376
....@@ -5537,12 +6379,30 @@
55376379 if (err)
55386380 return err;
55396381
5540
- memcg->high = high;
6382
+ page_counter_set_high(&memcg->memory, high);
55416383
5542
- nr_pages = page_counter_read(&memcg->memory);
5543
- if (nr_pages > high)
5544
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5545
- GFP_KERNEL, true);
6384
+ for (;;) {
6385
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6386
+ unsigned long reclaimed;
6387
+
6388
+ if (nr_pages <= high)
6389
+ break;
6390
+
6391
+ if (signal_pending(current))
6392
+ break;
6393
+
6394
+ if (!drained) {
6395
+ drain_all_stock(memcg);
6396
+ drained = true;
6397
+ continue;
6398
+ }
6399
+
6400
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6401
+ GFP_KERNEL, true);
6402
+
6403
+ if (!reclaimed && !nr_retries--)
6404
+ break;
6405
+ }
55466406
55476407 memcg_wb_domain_size_changed(memcg);
55486408 return nbytes;
....@@ -5550,22 +6410,15 @@
55506410
55516411 static int memory_max_show(struct seq_file *m, void *v)
55526412 {
5553
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5554
- unsigned long max = READ_ONCE(memcg->memory.max);
5555
-
5556
- if (max == PAGE_COUNTER_MAX)
5557
- seq_puts(m, "max\n");
5558
- else
5559
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5560
-
5561
- return 0;
6413
+ return seq_puts_memcg_tunable(m,
6414
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55626415 }
55636416
55646417 static ssize_t memory_max_write(struct kernfs_open_file *of,
55656418 char *buf, size_t nbytes, loff_t off)
55666419 {
55676420 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5568
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6421
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55696422 bool drained = false;
55706423 unsigned long max;
55716424 int err;
....@@ -5583,10 +6436,8 @@
55836436 if (nr_pages <= max)
55846437 break;
55856438
5586
- if (signal_pending(current)) {
5587
- err = -EINTR;
6439
+ if (signal_pending(current))
55886440 break;
5589
- }
55906441
55916442 if (!drained) {
55926443 drain_all_stock(memcg);
....@@ -5610,104 +6461,77 @@
56106461 return nbytes;
56116462 }
56126463
6464
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6465
+{
6466
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6467
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6468
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6469
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6470
+ seq_printf(m, "oom_kill %lu\n",
6471
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6472
+}
6473
+
56136474 static int memory_events_show(struct seq_file *m, void *v)
56146475 {
5615
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6476
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56166477
5617
- seq_printf(m, "low %lu\n",
5618
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5619
- seq_printf(m, "high %lu\n",
5620
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5621
- seq_printf(m, "max %lu\n",
5622
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5623
- seq_printf(m, "oom %lu\n",
5624
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5625
- seq_printf(m, "oom_kill %lu\n",
5626
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6478
+ __memory_events_show(m, memcg->memory_events);
6479
+ return 0;
6480
+}
56276481
6482
+static int memory_events_local_show(struct seq_file *m, void *v)
6483
+{
6484
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6485
+
6486
+ __memory_events_show(m, memcg->memory_events_local);
56286487 return 0;
56296488 }
56306489
56316490 static int memory_stat_show(struct seq_file *m, void *v)
56326491 {
5633
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5634
- struct accumulated_stats acc;
5635
- int i;
6492
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6493
+ char *buf;
56366494
5637
- /*
5638
- * Provide statistics on the state of the memory subsystem as
5639
- * well as cumulative event counters that show past behavior.
5640
- *
5641
- * This list is ordered following a combination of these gradients:
5642
- * 1) generic big picture -> specifics and details
5643
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5644
- *
5645
- * Current memory state:
5646
- */
5647
-
5648
- memset(&acc, 0, sizeof(acc));
5649
- acc.stats_size = MEMCG_NR_STAT;
5650
- acc.events_size = NR_VM_EVENT_ITEMS;
5651
- accumulate_memcg_tree(memcg, &acc);
5652
-
5653
- seq_printf(m, "anon %llu\n",
5654
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5655
- seq_printf(m, "file %llu\n",
5656
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5657
- seq_printf(m, "kernel_stack %llu\n",
5658
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5659
- seq_printf(m, "slab %llu\n",
5660
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5661
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5662
- seq_printf(m, "sock %llu\n",
5663
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5664
-
5665
- seq_printf(m, "shmem %llu\n",
5666
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5667
- seq_printf(m, "file_mapped %llu\n",
5668
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5669
- seq_printf(m, "file_dirty %llu\n",
5670
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5671
- seq_printf(m, "file_writeback %llu\n",
5672
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5673
-
5674
- for (i = 0; i < NR_LRU_LISTS; i++)
5675
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5676
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5677
-
5678
- seq_printf(m, "slab_reclaimable %llu\n",
5679
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5680
- seq_printf(m, "slab_unreclaimable %llu\n",
5681
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5682
-
5683
- /* Accumulated memory events */
5684
-
5685
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5686
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5687
-
5688
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5689
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5690
- acc.events[PGSCAN_DIRECT]);
5691
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5692
- acc.events[PGSTEAL_DIRECT]);
5693
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5694
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5695
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5696
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5697
-
5698
- seq_printf(m, "workingset_refault %lu\n",
5699
- acc.stat[WORKINGSET_REFAULT]);
5700
- seq_printf(m, "workingset_activate %lu\n",
5701
- acc.stat[WORKINGSET_ACTIVATE]);
5702
- seq_printf(m, "workingset_nodereclaim %lu\n",
5703
- acc.stat[WORKINGSET_NODERECLAIM]);
5704
-
6495
+ buf = memory_stat_format(memcg);
6496
+ if (!buf)
6497
+ return -ENOMEM;
6498
+ seq_puts(m, buf);
6499
+ kfree(buf);
57056500 return 0;
57066501 }
57076502
6503
+#ifdef CONFIG_NUMA
6504
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6505
+{
6506
+ int i;
6507
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6508
+
6509
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6510
+ int nid;
6511
+
6512
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6513
+ continue;
6514
+
6515
+ seq_printf(m, "%s", memory_stats[i].name);
6516
+ for_each_node_state(nid, N_MEMORY) {
6517
+ u64 size;
6518
+ struct lruvec *lruvec;
6519
+
6520
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6521
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6522
+ size *= memory_stats[i].ratio;
6523
+ seq_printf(m, " N%d=%llu", nid, size);
6524
+ }
6525
+ seq_putc(m, '\n');
6526
+ }
6527
+
6528
+ return 0;
6529
+}
6530
+#endif
6531
+
57086532 static int memory_oom_group_show(struct seq_file *m, void *v)
57096533 {
5710
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6534
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57116535
57126536 seq_printf(m, "%d\n", memcg->oom_group);
57136537
....@@ -5773,10 +6597,21 @@
57736597 .seq_show = memory_events_show,
57746598 },
57756599 {
5776
- .name = "stat",
6600
+ .name = "events.local",
57776601 .flags = CFTYPE_NOT_ON_ROOT,
6602
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6603
+ .seq_show = memory_events_local_show,
6604
+ },
6605
+ {
6606
+ .name = "stat",
57786607 .seq_show = memory_stat_show,
57796608 },
6609
+#ifdef CONFIG_NUMA
6610
+ {
6611
+ .name = "numa_stat",
6612
+ .seq_show = memory_numa_stat_show,
6613
+ },
6614
+#endif
57806615 {
57816616 .name = "oom.group",
57826617 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5802,6 +6637,122 @@
58026637 .early_init = 0,
58036638 };
58046639
6640
+/*
6641
+ * This function calculates an individual cgroup's effective
6642
+ * protection which is derived from its own memory.min/low, its
6643
+ * parent's and siblings' settings, as well as the actual memory
6644
+ * distribution in the tree.
6645
+ *
6646
+ * The following rules apply to the effective protection values:
6647
+ *
6648
+ * 1. At the first level of reclaim, effective protection is equal to
6649
+ * the declared protection in memory.min and memory.low.
6650
+ *
6651
+ * 2. To enable safe delegation of the protection configuration, at
6652
+ * subsequent levels the effective protection is capped to the
6653
+ * parent's effective protection.
6654
+ *
6655
+ * 3. To make complex and dynamic subtrees easier to configure, the
6656
+ * user is allowed to overcommit the declared protection at a given
6657
+ * level. If that is the case, the parent's effective protection is
6658
+ * distributed to the children in proportion to how much protection
6659
+ * they have declared and how much of it they are utilizing.
6660
+ *
6661
+ * This makes distribution proportional, but also work-conserving:
6662
+ * if one cgroup claims much more protection than it uses memory,
6663
+ * the unused remainder is available to its siblings.
6664
+ *
6665
+ * 4. Conversely, when the declared protection is undercommitted at a
6666
+ * given level, the distribution of the larger parental protection
6667
+ * budget is NOT proportional. A cgroup's protection from a sibling
6668
+ * is capped to its own memory.min/low setting.
6669
+ *
6670
+ * 5. However, to allow protecting recursive subtrees from each other
6671
+ * without having to declare each individual cgroup's fixed share
6672
+ * of the ancestor's claim to protection, any unutilized -
6673
+ * "floating" - protection from up the tree is distributed in
6674
+ * proportion to each cgroup's *usage*. This makes the protection
6675
+ * neutral wrt sibling cgroups and lets them compete freely over
6676
+ * the shared parental protection budget, but it protects the
6677
+ * subtree as a whole from neighboring subtrees.
6678
+ *
6679
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6680
+ * against immediate siblings whereas 5. is about protecting against
6681
+ * neighboring subtrees.
6682
+ */
6683
+static unsigned long effective_protection(unsigned long usage,
6684
+ unsigned long parent_usage,
6685
+ unsigned long setting,
6686
+ unsigned long parent_effective,
6687
+ unsigned long siblings_protected)
6688
+{
6689
+ unsigned long protected;
6690
+ unsigned long ep;
6691
+
6692
+ protected = min(usage, setting);
6693
+ /*
6694
+ * If all cgroups at this level combined claim and use more
6695
+	 * protection than what the parent affords them, distribute
6696
+ * shares in proportion to utilization.
6697
+ *
6698
+ * We are using actual utilization rather than the statically
6699
+ * claimed protection in order to be work-conserving: claimed
6700
+ * but unused protection is available to siblings that would
6701
+ * otherwise get a smaller chunk than what they claimed.
6702
+ */
6703
+ if (siblings_protected > parent_effective)
6704
+ return protected * parent_effective / siblings_protected;
6705
+
6706
+ /*
6707
+ * Ok, utilized protection of all children is within what the
6708
+ * parent affords them, so we know whatever this child claims
6709
+ * and utilizes is effectively protected.
6710
+ *
6711
+ * If there is unprotected usage beyond this value, reclaim
6712
+ * will apply pressure in proportion to that amount.
6713
+ *
6714
+ * If there is unutilized protection, the cgroup will be fully
6715
+ * shielded from reclaim, but we do return a smaller value for
6716
+ * protection than what the group could enjoy in theory. This
6717
+ * is okay. With the overcommit distribution above, effective
6718
+ * protection is always dependent on how memory is actually
6719
+ * consumed among the siblings anyway.
6720
+ */
6721
+ ep = protected;
6722
+
6723
+ /*
6724
+ * If the children aren't claiming (all of) the protection
6725
+ * afforded to them by the parent, distribute the remainder in
6726
+ * proportion to the (unprotected) memory of each cgroup. That
6727
+ * way, cgroups that aren't explicitly prioritized wrt each
6728
+ * other compete freely over the allowance, but they are
6729
+ * collectively protected from neighboring trees.
6730
+ *
6731
+ * We're using unprotected memory for the weight so that if
6732
+ * some cgroups DO claim explicit protection, we don't protect
6733
+ * the same bytes twice.
6734
+ *
6735
+ * Check both usage and parent_usage against the respective
6736
+ * protected values. One should imply the other, but they
6737
+ * aren't read atomically - make sure the division is sane.
6738
+ */
6739
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6740
+ return ep;
6741
+ if (parent_effective > siblings_protected &&
6742
+ parent_usage > siblings_protected &&
6743
+ usage > protected) {
6744
+ unsigned long unclaimed;
6745
+
6746
+ unclaimed = parent_effective - siblings_protected;
6747
+ unclaimed *= usage - protected;
6748
+ unclaimed /= parent_usage - siblings_protected;
6749
+
6750
+ ep += unclaimed;
6751
+ }
6752
+
6753
+ return ep;
6754
+}
6755
+
58056756 /**
58066757 * mem_cgroup_protected - check if memory consumption is in the normal range
58076758 * @root: the top ancestor of the sub-tree being checked
....@@ -5809,259 +6760,125 @@
58096760 *
58106761 * WARNING: This function is not stateless! It can only be used as part
58116762 * of a top-down tree iteration, not for isolated queries.
5812
- *
5813
- * Returns one of the following:
5814
- * MEMCG_PROT_NONE: cgroup memory is not protected
5815
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5816
- * an unprotected supply of reclaimable memory from other cgroups.
5817
- * MEMCG_PROT_MIN: cgroup memory is protected
5818
- *
5819
- * @root is exclusive; it is never protected when looked at directly
5820
- *
5821
- * To provide a proper hierarchical behavior, effective memory.min/low values
5822
- * are used. Below is the description of how effective memory.low is calculated.
5823
- * Effective memory.min values is calculated in the same way.
5824
- *
5825
- * Effective memory.low is always equal or less than the original memory.low.
5826
- * If there is no memory.low overcommittment (which is always true for
5827
- * top-level memory cgroups), these two values are equal.
5828
- * Otherwise, it's a part of parent's effective memory.low,
5829
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5830
- * memory.low usages, where memory.low usage is the size of actually
5831
- * protected memory.
5832
- *
5833
- * low_usage
5834
- * elow = min( memory.low, parent->elow * ------------------ ),
5835
- * siblings_low_usage
5836
- *
5837
- * | memory.current, if memory.current < memory.low
5838
- * low_usage = |
5839
- | 0, otherwise.
5840
- *
5841
- *
5842
- * Such definition of the effective memory.low provides the expected
5843
- * hierarchical behavior: parent's memory.low value is limiting
5844
- * children, unprotected memory is reclaimed first and cgroups,
5845
- * which are not using their guarantee do not affect actual memory
5846
- * distribution.
5847
- *
5848
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5849
- *
5850
- * A A/memory.low = 2G, A/memory.current = 6G
5851
- * //\\
5852
- * BC DE B/memory.low = 3G B/memory.current = 2G
5853
- * C/memory.low = 1G C/memory.current = 2G
5854
- * D/memory.low = 0 D/memory.current = 2G
5855
- * E/memory.low = 10G E/memory.current = 0
5856
- *
5857
- * and the memory pressure is applied, the following memory distribution
5858
- * is expected (approximately):
5859
- *
5860
- * A/memory.current = 2G
5861
- *
5862
- * B/memory.current = 1.3G
5863
- * C/memory.current = 0.6G
5864
- * D/memory.current = 0
5865
- * E/memory.current = 0
5866
- *
5867
- * These calculations require constant tracking of the actual low usages
5868
- * (see propagate_protected_usage()), as well as recursive calculation of
5869
- * effective memory.low values. But as we do call mem_cgroup_protected()
5870
- * path for each memory cgroup top-down from the reclaim,
5871
- * it's possible to optimize this part, and save calculated elow
5872
- * for next usage. This part is intentionally racy, but it's ok,
5873
- * as memory.low is a best-effort mechanism.
58746763 */
5875
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5876
- struct mem_cgroup *memcg)
6764
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6765
+ struct mem_cgroup *memcg)
58776766 {
6767
+ unsigned long usage, parent_usage;
58786768 struct mem_cgroup *parent;
5879
- unsigned long emin, parent_emin;
5880
- unsigned long elow, parent_elow;
5881
- unsigned long usage;
58826769
58836770 if (mem_cgroup_disabled())
5884
- return MEMCG_PROT_NONE;
6771
+ return;
58856772
58866773 if (!root)
58876774 root = root_mem_cgroup;
6775
+
6776
+ /*
6777
+ * Effective values of the reclaim targets are ignored so they
6778
+ * can be stale. Have a look at mem_cgroup_protection for more
6779
+ * details.
6780
+ * TODO: calculation should be more robust so that we do not need
6781
+ * that special casing.
6782
+ */
58886783 if (memcg == root)
5889
- return MEMCG_PROT_NONE;
6784
+ return;
58906785
58916786 usage = page_counter_read(&memcg->memory);
58926787 if (!usage)
5893
- return MEMCG_PROT_NONE;
5894
-
5895
- emin = memcg->memory.min;
5896
- elow = memcg->memory.low;
6788
+ return;
58976789
58986790 parent = parent_mem_cgroup(memcg);
58996791 /* No parent means a non-hierarchical mode on v1 memcg */
59006792 if (!parent)
5901
- return MEMCG_PROT_NONE;
6793
+ return;
59026794
5903
- if (parent == root)
5904
- goto exit;
5905
-
5906
- parent_emin = READ_ONCE(parent->memory.emin);
5907
- emin = min(emin, parent_emin);
5908
- if (emin && parent_emin) {
5909
- unsigned long min_usage, siblings_min_usage;
5910
-
5911
- min_usage = min(usage, memcg->memory.min);
5912
- siblings_min_usage = atomic_long_read(
5913
- &parent->memory.children_min_usage);
5914
-
5915
- if (min_usage && siblings_min_usage)
5916
- emin = min(emin, parent_emin * min_usage /
5917
- siblings_min_usage);
6795
+ if (parent == root) {
6796
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6797
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6798
+ return;
59186799 }
59196800
5920
- parent_elow = READ_ONCE(parent->memory.elow);
5921
- elow = min(elow, parent_elow);
5922
- if (elow && parent_elow) {
5923
- unsigned long low_usage, siblings_low_usage;
6801
+ parent_usage = page_counter_read(&parent->memory);
59246802
5925
- low_usage = min(usage, memcg->memory.low);
5926
- siblings_low_usage = atomic_long_read(
5927
- &parent->memory.children_low_usage);
6803
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6804
+ READ_ONCE(memcg->memory.min),
6805
+ READ_ONCE(parent->memory.emin),
6806
+ atomic_long_read(&parent->memory.children_min_usage)));
59286807
5929
- if (low_usage && siblings_low_usage)
5930
- elow = min(elow, parent_elow * low_usage /
5931
- siblings_low_usage);
5932
- }
5933
-
5934
-exit:
5935
- memcg->memory.emin = emin;
5936
- memcg->memory.elow = elow;
5937
-
5938
- if (usage <= emin)
5939
- return MEMCG_PROT_MIN;
5940
- else if (usage <= elow)
5941
- return MEMCG_PROT_LOW;
5942
- else
5943
- return MEMCG_PROT_NONE;
6808
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6809
+ READ_ONCE(memcg->memory.low),
6810
+ READ_ONCE(parent->memory.elow),
6811
+ atomic_long_read(&parent->memory.children_low_usage)));
59446812 }
59456813
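A hedged sketch of the top-down call pattern the WARNING above refers to; the walk helper below is hypothetical, while mem_cgroup_iter() is the existing pre-order iterator in this file:

static void example_protection_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg = NULL;

	/*
	 * Pre-order walk: parents are visited before their children, so
	 * parent->memory.emin/elow are already filled in when the child
	 * is calculated below.
	 */
	while ((memcg = mem_cgroup_iter(root, memcg, NULL))) {
		mem_cgroup_calculate_protection(root, memcg);
		/* ... reclaim decisions based on memcg->memory.emin/elow ... */
	}
}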
59466814 /**
5947
- * mem_cgroup_try_charge - try charging a page
6815
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59486816 * @page: page to charge
59496817 * @mm: mm context of the victim
59506818 * @gfp_mask: reclaim mode
5951
- * @memcgp: charged memcg return
5952
- * @compound: charge the page as compound or small page
59536819 *
59546820 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59556821 * pages according to @gfp_mask if necessary.
59566822 *
5957
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5958
- * Otherwise, an error code is returned.
5959
- *
5960
- * After page->mapping has been set up, the caller must finalize the
5961
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5962
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6823
+ * Returns 0 on success. Otherwise, an error code is returned.
59636824 */
5964
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5965
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5966
- bool compound)
6825
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6826
+ gfp_t gfp_mask)
59676827 {
6828
+ unsigned int nr_pages = thp_nr_pages(page);
59686829 struct mem_cgroup *memcg = NULL;
5969
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59706830 int ret = 0;
59716831
5972
- if (mem_cgroup_disabled())
5973
- goto out;
5974
-
59756832 if (PageSwapCache(page)) {
6833
+ swp_entry_t ent = { .val = page_private(page), };
6834
+ unsigned short id;
6835
+
59766836 /*
59776837 * Every swap fault against a single page tries to charge the
59786838 * page, bail as early as possible. shmem_unuse() encounters
5979
- * already charged pages, too. The USED bit is protected by
5980
- * the page lock, which serializes swap cache removal, which
6839
+ * already charged pages, too. page->mem_cgroup is protected
6840
+ * by the page lock, which serializes swap cache removal, which
59816841 * in turn serializes uncharging.
59826842 */
59836843 VM_BUG_ON_PAGE(!PageLocked(page), page);
59846844 if (compound_head(page)->mem_cgroup)
59856845 goto out;
59866846
5987
- if (do_swap_account) {
5988
- swp_entry_t ent = { .val = page_private(page), };
5989
- unsigned short id = lookup_swap_cgroup_id(ent);
5990
-
5991
- rcu_read_lock();
5992
- memcg = mem_cgroup_from_id(id);
5993
- if (memcg && !css_tryget_online(&memcg->css))
5994
- memcg = NULL;
5995
- rcu_read_unlock();
5996
- }
6847
+ id = lookup_swap_cgroup_id(ent);
6848
+ rcu_read_lock();
6849
+ memcg = mem_cgroup_from_id(id);
6850
+ if (memcg && !css_tryget_online(&memcg->css))
6851
+ memcg = NULL;
6852
+ rcu_read_unlock();
59976853 }
59986854
59996855 if (!memcg)
60006856 memcg = get_mem_cgroup_from_mm(mm);
60016857
60026858 ret = try_charge(memcg, gfp_mask, nr_pages);
6859
+ if (ret)
6860
+ goto out_put;
60036861
6004
- css_put(&memcg->css);
6005
-out:
6006
- *memcgp = memcg;
6007
- return ret;
6008
-}
6862
+ css_get(&memcg->css);
6863
+ commit_charge(page, memcg);
60096864
6010
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6011
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6012
- bool compound)
6013
-{
6014
- struct mem_cgroup *memcg;
6015
- int ret;
6016
-
6017
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6018
- memcg = *memcgp;
6019
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6020
- return ret;
6021
-}
6022
-
6023
-/**
6024
- * mem_cgroup_commit_charge - commit a page charge
6025
- * @page: page to charge
6026
- * @memcg: memcg to charge the page to
6027
- * @lrucare: page might be on LRU already
6028
- * @compound: charge the page as compound or small page
6029
- *
6030
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6031
- * after page->mapping has been set up. This must happen atomically
6032
- * as part of the page instantiation, i.e. under the page table lock
6033
- * for anonymous pages, under the page lock for page and swap cache.
6034
- *
6035
- * In addition, the page must not be on the LRU during the commit, to
6036
- * prevent racing with task migration. If it might be, use @lrucare.
6037
- *
6038
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6039
- */
6040
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6041
- bool lrucare, bool compound)
6042
-{
6043
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6044
-
6045
- VM_BUG_ON_PAGE(!page->mapping, page);
6046
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6047
-
6048
- if (mem_cgroup_disabled())
6049
- return;
6050
- /*
6051
- * Swap faults will attempt to charge the same page multiple
6052
- * times. But reuse_swap_page() might have removed the page
6053
- * from swapcache already, so we can't check PageSwapCache().
6054
- */
6055
- if (!memcg)
6056
- return;
6057
-
6058
- commit_charge(page, memcg, lrucare);
6059
-
6060
- local_irq_disable();
6061
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6865
+ local_lock_irq(&event_lock.l);
6866
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60626867 memcg_check_events(memcg, page);
6063
- local_irq_enable();
6868
+ local_unlock_irq(&event_lock.l);
60646869
6870
+ /*
6871
+ * Cgroup1's unified memory+swap counter has been charged with the
6872
+ * new swapcache page, finish the transfer by uncharging the swap
6873
+ * slot. The swap slot would also get uncharged when it dies, but
6874
+ * it can stick around indefinitely and we'd count the page twice
6875
+ * the entire time.
6876
+ *
6877
+ * Cgroup2 has separate resource counters for memory and swap,
6878
+ * so this is a non-issue here. Memory and swap charge lifetimes
6879
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6880
+ * page to memory here, and uncharge swap when the slot is freed.
6881
+ */
60656882 if (do_memsw_account() && PageSwapCache(page)) {
60666883 swp_entry_t entry = { .val = page_private(page) };
60676884 /*
....@@ -6071,42 +6888,18 @@
60716888 */
60726889 mem_cgroup_uncharge_swap(entry, nr_pages);
60736890 }
6074
-}
60756891
6076
-/**
6077
- * mem_cgroup_cancel_charge - cancel a page charge
6078
- * @page: page to charge
6079
- * @memcg: memcg to charge the page to
6080
- * @compound: charge the page as compound or small page
6081
- *
6082
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6083
- */
6084
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6085
- bool compound)
6086
-{
6087
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6088
-
6089
- if (mem_cgroup_disabled())
6090
- return;
6091
- /*
6092
- * Swap faults will attempt to charge the same page multiple
6093
- * times. But reuse_swap_page() might have removed the page
6094
- * from swapcache already, so we can't check PageSwapCache().
6095
- */
6096
- if (!memcg)
6097
- return;
6098
-
6099
- cancel_charge(memcg, nr_pages);
6892
+out_put:
6893
+ css_put(&memcg->css);
6894
+out:
6895
+ return ret;
61006896 }
61016897
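A hedged sketch of the caller pattern this single-call API implies; the wrapper below is hypothetical, only __mem_cgroup_charge() comes from the code above:

static int example_instantiate_page(struct page *page, struct mm_struct *mm,
				    gfp_t gfp)
{
	int err;

	/* either fully charges the page (and sets page->mem_cgroup) or fails */
	err = __mem_cgroup_charge(page, mm, gfp);
	if (err)
		return err;	/* nothing to unwind, no separate commit/cancel step */

	/* ... add the page to the page cache or map it ... */
	return 0;
}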
61026898 struct uncharge_gather {
61036899 struct mem_cgroup *memcg;
6900
+ unsigned long nr_pages;
61046901 unsigned long pgpgout;
6105
- unsigned long nr_anon;
6106
- unsigned long nr_file;
61076902 unsigned long nr_kmem;
6108
- unsigned long nr_huge;
6109
- unsigned long nr_shmem;
61106903 struct page *dummy_page;
61116904 };
61126905
....@@ -6117,37 +6910,32 @@
61176910
61186911 static void uncharge_batch(const struct uncharge_gather *ug)
61196912 {
6120
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61216913 unsigned long flags;
61226914
61236915 if (!mem_cgroup_is_root(ug->memcg)) {
6124
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6916
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61256917 if (do_memsw_account())
6126
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6918
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61276919 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61286920 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61296921 memcg_oom_recover(ug->memcg);
61306922 }
61316923
6132
- local_irq_save(flags);
6133
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6134
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6135
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6136
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6924
+ local_lock_irqsave(&event_lock.l, flags);
61376925 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6138
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6926
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61396927 memcg_check_events(ug->memcg, ug->dummy_page);
6140
- local_irq_restore(flags);
6928
+ local_unlock_irqrestore(&event_lock.l, flags);
61416929
6142
- if (!mem_cgroup_is_root(ug->memcg))
6143
- css_put_many(&ug->memcg->css, nr_pages);
6930
+ /* drop reference from uncharge_page */
6931
+ css_put(&ug->memcg->css);
61446932 }
61456933
61466934 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61476935 {
6936
+ unsigned long nr_pages;
6937
+
61486938 VM_BUG_ON_PAGE(PageLRU(page), page);
6149
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6150
- !PageHWPoison(page) , page);
61516939
61526940 if (!page->mem_cgroup)
61536941 return;
....@@ -6164,30 +6952,24 @@
61646952 uncharge_gather_clear(ug);
61656953 }
61666954 ug->memcg = page->mem_cgroup;
6955
+
6956
+ /* pairs with css_put in uncharge_batch */
6957
+ css_get(&ug->memcg->css);
61676958 }
61686959
6169
- if (!PageKmemcg(page)) {
6170
- unsigned int nr_pages = 1;
6960
+ nr_pages = compound_nr(page);
6961
+ ug->nr_pages += nr_pages;
61716962
6172
- if (PageTransHuge(page)) {
6173
- nr_pages <<= compound_order(page);
6174
- ug->nr_huge += nr_pages;
6175
- }
6176
- if (PageAnon(page))
6177
- ug->nr_anon += nr_pages;
6178
- else {
6179
- ug->nr_file += nr_pages;
6180
- if (PageSwapBacked(page))
6181
- ug->nr_shmem += nr_pages;
6182
- }
6963
+ if (!PageKmemcg(page)) {
61836964 ug->pgpgout++;
61846965 } else {
6185
- ug->nr_kmem += 1 << compound_order(page);
6966
+ ug->nr_kmem += nr_pages;
61866967 __ClearPageKmemcg(page);
61876968 }
61886969
61896970 ug->dummy_page = page;
61906971 page->mem_cgroup = NULL;
6972
+ css_put(&ug->memcg->css);
61916973 }
61926974
61936975 static void uncharge_list(struct list_head *page_list)
....@@ -6216,18 +6998,14 @@
62166998 }
62176999
62187000 /**
6219
- * mem_cgroup_uncharge - uncharge a page
7001
+ * __mem_cgroup_uncharge - uncharge a page
62207002 * @page: page to uncharge
62217003 *
6222
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6223
- * mem_cgroup_commit_charge().
7004
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62247005 */
6225
-void mem_cgroup_uncharge(struct page *page)
7006
+void __mem_cgroup_uncharge(struct page *page)
62267007 {
62277008 struct uncharge_gather ug;
6228
-
6229
- if (mem_cgroup_disabled())
6230
- return;
62317009
62327010 /* Don't touch page->lru of any random page, pre-check: */
62337011 if (!page->mem_cgroup)
....@@ -6239,17 +7017,14 @@
62397017 }
62407018
62417019 /**
6242
- * mem_cgroup_uncharge_list - uncharge a list of page
7020
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
62437021 * @page_list: list of pages to uncharge
62447022 *
62457023 * Uncharge a list of pages previously charged with
6246
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7024
+ * __mem_cgroup_charge().
62477025 */
6248
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7026
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62497027 {
6250
- if (mem_cgroup_disabled())
6251
- return;
6252
-
62537028 if (!list_empty(page_list))
62547029 uncharge_list(page_list);
62557030 }
....@@ -6268,7 +7043,6 @@
62687043 {
62697044 struct mem_cgroup *memcg;
62707045 unsigned int nr_pages;
6271
- bool compound;
62727046 unsigned long flags;
62737047
62747048 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6290,20 +7064,19 @@
62907064 return;
62917065
62927066 /* Force-charge the new page. The old one will be freed soon */
6293
- compound = PageTransHuge(newpage);
6294
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7067
+ nr_pages = thp_nr_pages(newpage);
62957068
62967069 page_counter_charge(&memcg->memory, nr_pages);
62977070 if (do_memsw_account())
62987071 page_counter_charge(&memcg->memsw, nr_pages);
6299
- css_get_many(&memcg->css, nr_pages);
63007072
6301
- commit_charge(newpage, memcg, false);
7073
+ css_get(&memcg->css);
7074
+ commit_charge(newpage, memcg);
63027075
6303
- local_irq_save(flags);
6304
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7076
+ local_lock_irqsave(&event_lock.l, flags);
7077
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63057078 memcg_check_events(memcg, newpage);
6306
- local_irq_restore(flags);
7079
+ local_unlock_irqrestore(&event_lock.l, flags);
63077080 }
63087081
63097082 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6326,7 +7099,7 @@
63267099 goto out;
63277100 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63287101 goto out;
6329
- if (css_tryget_online(&memcg->css))
7102
+ if (css_tryget(&memcg->css))
63307103 sk->sk_memcg = memcg;
63317104 out:
63327105 rcu_read_unlock();
....@@ -6404,7 +7177,7 @@
64047177 if (!strcmp(token, "nokmem"))
64057178 cgroup_memory_nokmem = true;
64067179 }
6407
- return 0;
7180
+ return 1;
64087181 }
64097182 __setup("cgroup.memory=", cgroup_memory);
64107183
....@@ -6420,23 +7193,16 @@
64207193 {
64217194 int cpu, node;
64227195
6423
-#ifdef CONFIG_MEMCG_KMEM
6424
- /*
6425
- * Kmem cache creation is mostly done with the slab_mutex held,
6426
- * so use a workqueue with limited concurrency to avoid stalling
6427
- * all worker threads in case lots of cgroups are created and
6428
- * destroyed simultaneously.
6429
- */
6430
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6431
- BUG_ON(!memcg_kmem_cache_wq);
6432
-#endif
6433
-
64347196 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64357197 memcg_hotplug_cpu_dead);
64367198
6437
- for_each_possible_cpu(cpu)
6438
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6439
- drain_local_stock);
7199
+ for_each_possible_cpu(cpu) {
7200
+ struct memcg_stock_pcp *stock;
7201
+
7202
+ stock = per_cpu_ptr(&memcg_stock, cpu);
7203
+ INIT_WORK(&stock->work, drain_local_stock);
7204
+ local_lock_init(&stock->lock);
7205
+ }
64407206
64417207 for_each_node(node) {
64427208 struct mem_cgroup_tree_per_node *rtpn;
....@@ -6457,7 +7223,7 @@
64577223 #ifdef CONFIG_MEMCG_SWAP
64587224 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64597225 {
6460
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7226
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64617227 /*
64627228 		 * The root cgroup cannot be destroyed, so its refcount must
64637229 * always be >= 1.
....@@ -6485,11 +7251,15 @@
64857251 struct mem_cgroup *memcg, *swap_memcg;
64867252 unsigned int nr_entries;
64877253 unsigned short oldid;
7254
+ unsigned long flags;
64887255
64897256 VM_BUG_ON_PAGE(PageLRU(page), page);
64907257 VM_BUG_ON_PAGE(page_count(page), page);
64917258
6492
- if (!do_memsw_account())
7259
+ if (mem_cgroup_disabled())
7260
+ return;
7261
+
7262
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64937263 return;
64947264
64957265 memcg = page->mem_cgroup;
....@@ -6504,7 +7274,7 @@
65047274 * ancestor for the swap instead and transfer the memory+swap charge.
65057275 */
65067276 swap_memcg = mem_cgroup_id_get_online(memcg);
6507
- nr_entries = hpage_nr_pages(page);
7277
+ nr_entries = thp_nr_pages(page);
65087278 /* Get references for the tail pages, too */
65097279 if (nr_entries > 1)
65107280 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6518,7 +7288,7 @@
65187288 if (!mem_cgroup_is_root(memcg))
65197289 page_counter_uncharge(&memcg->memory, nr_entries);
65207290
6521
- if (memcg != swap_memcg) {
7291
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65227292 if (!mem_cgroup_is_root(swap_memcg))
65237293 page_counter_charge(&swap_memcg->memsw, nr_entries);
65247294 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6530,17 +7300,19 @@
65307300 * important here to have the interrupts disabled because it is the
65317301 * only synchronisation we have for updating the per-CPU variables.
65327302 */
7303
+ local_lock_irqsave(&event_lock.l, flags);
7304
+#ifndef CONFIG_PREEMPT_RT
65337305 VM_BUG_ON(!irqs_disabled());
6534
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6535
- -nr_entries);
7306
+#endif
7307
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65367308 memcg_check_events(memcg, page);
7309
+ local_unlock_irqrestore(&event_lock.l, flags);
65377310
6538
- if (!mem_cgroup_is_root(memcg))
6539
- css_put_many(&memcg->css, nr_entries);
7311
+ css_put(&memcg->css);
65407312 }
65417313
65427314 /**
6543
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7315
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65447316 * @page: page being added to swap
65457317 * @entry: swap entry to charge
65467318 *
....@@ -6548,14 +7320,14 @@
65487320 *
65497321 * Returns 0 on success, -ENOMEM on failure.
65507322 */
6551
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7323
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
65527324 {
6553
- unsigned int nr_pages = hpage_nr_pages(page);
7325
+ unsigned int nr_pages = thp_nr_pages(page);
65547326 struct page_counter *counter;
65557327 struct mem_cgroup *memcg;
65567328 unsigned short oldid;
65577329
6558
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7330
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
65597331 return 0;
65607332
65617333 memcg = page->mem_cgroup;
....@@ -6571,7 +7343,7 @@
65717343
65727344 memcg = mem_cgroup_id_get_online(memcg);
65737345
6574
- if (!mem_cgroup_is_root(memcg) &&
7346
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
65757347 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
65767348 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
65777349 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
....@@ -6590,23 +7362,20 @@
65907362 }
65917363
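/*
 * Illustrative sketch of the usual reason for the mem_cgroup_*_swap ->
 * __mem_cgroup_*_swap renames in this area: the cheap "is this even
 * enabled?" check moves into a static inline wrapper in a header, so
 * callers pay nothing when the memory controller is disabled. The wrapper
 * below is a sketch of that pattern, not the exact header change.
 */
static inline int mem_cgroup_try_charge_swap(struct page *page,
                                             swp_entry_t entry)
{
        if (mem_cgroup_disabled())
                return 0;
        return __mem_cgroup_try_charge_swap(page, entry);
}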
65927364 /**
6593
- * mem_cgroup_uncharge_swap - uncharge swap space
7365
+ * __mem_cgroup_uncharge_swap - uncharge swap space
65947366 * @entry: swap entry to uncharge
65957367 * @nr_pages: the amount of swap space to uncharge
65967368 */
6597
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7369
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
65987370 {
65997371 struct mem_cgroup *memcg;
66007372 unsigned short id;
6601
-
6602
- if (!do_swap_account)
6603
- return;
66047373
66057374 id = swap_cgroup_record(entry, 0, nr_pages);
66067375 rcu_read_lock();
66077376 memcg = mem_cgroup_from_id(id);
66087377 if (memcg) {
6609
- if (!mem_cgroup_is_root(memcg)) {
7378
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
66107379 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
66117380 page_counter_uncharge(&memcg->swap, nr_pages);
66127381 else
....@@ -6622,7 +7391,7 @@
66227391 {
66237392 long nr_swap_pages = get_nr_swap_pages();
66247393
6625
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7394
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66267395 return nr_swap_pages;
66277396 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
66287397 nr_swap_pages = min_t(long, nr_swap_pages,
....@@ -6639,36 +7408,33 @@
66397408
66407409 if (vm_swap_full())
66417410 return true;
6642
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7411
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66437412 return false;
66447413
66457414 memcg = page->mem_cgroup;
66467415 if (!memcg)
66477416 return false;
66487417
6649
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6650
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7418
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7419
+ unsigned long usage = page_counter_read(&memcg->swap);
7420
+
7421
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7422
+ usage * 2 >= READ_ONCE(memcg->swap.max))
66517423 return true;
7424
+ }
66527425
66537426 return false;
66547427 }
66557428
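/*
 * Illustrative sketch of the threshold logic added above: the per-cgroup
 * analogue of vm_swap_full(), swap is considered effectively full once
 * usage crosses half of either swap.high or swap.max. The helper name is
 * hypothetical.
 */
static bool demo_swap_nearly_full(unsigned long usage,
                                  unsigned long high, unsigned long max)
{
        /* usage * 2 >= limit  <=>  usage >= half the limit, rounded up */
        return usage * 2 >= high || usage * 2 >= max;
}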
6656
-/* for remember boot option*/
6657
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
6658
-static int really_do_swap_account __initdata = 1;
6659
-#else
6660
-static int really_do_swap_account __initdata;
6661
-#endif
6662
-
6663
-static int __init enable_swap_account(char *s)
7429
+static int __init setup_swap_account(char *s)
66647430 {
66657431 if (!strcmp(s, "1"))
6666
- really_do_swap_account = 1;
7432
+ cgroup_memory_noswap = 0;
66677433 else if (!strcmp(s, "0"))
6668
- really_do_swap_account = 0;
7434
+ cgroup_memory_noswap = 1;
66697435 return 1;
66707436 }
6671
-__setup("swapaccount=", enable_swap_account);
7437
+__setup("swapaccount=", setup_swap_account);
66727438
66737439 static u64 swap_current_read(struct cgroup_subsys_state *css,
66747440 struct cftype *cft)
....@@ -6678,17 +7444,33 @@
66787444 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
66797445 }
66807446
7447
+static int swap_high_show(struct seq_file *m, void *v)
7448
+{
7449
+ return seq_puts_memcg_tunable(m,
7450
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7451
+}
7452
+
7453
+static ssize_t swap_high_write(struct kernfs_open_file *of,
7454
+ char *buf, size_t nbytes, loff_t off)
7455
+{
7456
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7457
+ unsigned long high;
7458
+ int err;
7459
+
7460
+ buf = strstrip(buf);
7461
+ err = page_counter_memparse(buf, "max", &high);
7462
+ if (err)
7463
+ return err;
7464
+
7465
+ page_counter_set_high(&memcg->swap, high);
7466
+
7467
+ return nbytes;
7468
+}
7469
+
66817470 static int swap_max_show(struct seq_file *m, void *v)
66827471 {
6683
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6684
- unsigned long max = READ_ONCE(memcg->swap.max);
6685
-
6686
- if (max == PAGE_COUNTER_MAX)
6687
- seq_puts(m, "max\n");
6688
- else
6689
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6690
-
6691
- return 0;
7472
+ return seq_puts_memcg_tunable(m,
7473
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
66927474 }
66937475
66947476 static ssize_t swap_max_write(struct kernfs_open_file *of,
....@@ -6710,8 +7492,10 @@
67107492
67117493 static int swap_events_show(struct seq_file *m, void *v)
67127494 {
6713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
7495
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
67147496
7497
+ seq_printf(m, "high %lu\n",
7498
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
67157499 seq_printf(m, "max %lu\n",
67167500 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
67177501 seq_printf(m, "fail %lu\n",
....@@ -6725,6 +7509,12 @@
67257509 .name = "swap.current",
67267510 .flags = CFTYPE_NOT_ON_ROOT,
67277511 .read_u64 = swap_current_read,
7512
+ },
7513
+ {
7514
+ .name = "swap.high",
7515
+ .flags = CFTYPE_NOT_ON_ROOT,
7516
+ .seq_show = swap_high_show,
7517
+ .write = swap_high_write,
67287518 },
67297519 {
67307520 .name = "swap.max",
....@@ -6741,7 +7531,7 @@
67417531 { } /* terminate */
67427532 };
67437533
6744
-static struct cftype memsw_cgroup_files[] = {
7534
+static struct cftype memsw_files[] = {
67457535 {
67467536 .name = "memsw.usage_in_bytes",
67477537 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
....@@ -6768,17 +7558,27 @@
67687558 { }, /* terminate */
67697559 };
67707560
7561
+/*
7562
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7563
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
7564
+ * remains set to false even when memcg is disabled via the
7565
+ * "cgroup_disable=memory" boot parameter. This may result in a premature
7566
+ * oops inside the mem_cgroup_get_nr_swap_pages() function in corner cases.
7567
+ */
67717568 static int __init mem_cgroup_swap_init(void)
67727569 {
6773
- if (!mem_cgroup_disabled() && really_do_swap_account) {
6774
- do_swap_account = 1;
6775
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6776
- swap_files));
6777
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6778
- memsw_cgroup_files));
6779
- }
7570
+ /* No memory control -> no swap control */
7571
+ if (mem_cgroup_disabled())
7572
+ cgroup_memory_noswap = true;
7573
+
7574
+ if (cgroup_memory_noswap)
7575
+ return 0;
7576
+
7577
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7578
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7579
+
67807580 return 0;
67817581 }
6782
-subsys_initcall(mem_cgroup_swap_init);
7582
+core_initcall(mem_cgroup_swap_init);
67837583
67847584 #endif /* CONFIG_MEMCG_SWAP */
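/*
 * Illustrative sketch of the initcall ordering that the comment before
 * mem_cgroup_swap_init() relies on: core_initcall() (level 1) runs before
 * subsys_initcall() (level 4), so moving the registration earlier lets
 * cgroup_memory_noswap be settled before later init code can reach the
 * swap accounting helpers. Function names are hypothetical.
 */
#include <linux/init.h>

static int __init demo_core_setup(void)
{
        /* Runs at initcall level 1, before any subsys_initcall(). */
        return 0;
}
core_initcall(demo_core_setup);

static int __init demo_subsys_setup(void)
{
        /* Runs at level 4 and can rely on demo_core_setup() having run. */
        return 0;
}
subsys_initcall(demo_subsys_setup);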