2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
  * Lockless page tracking & accounting
  * Unified hierarchy configuration model
  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */

 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -65,6 +57,8 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -73,13 +67,16 @@
 #include <linux/uaccess.h>

 #include <trace/events/vmscan.h>
+#include <trace/hooks/mm.h>

 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);

 struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL_GPL(root_mem_cgroup);

-#define MEM_CGROUP_RECLAIM_RETRIES	5
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
@@ -89,28 +86,23 @@

 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
 #else
-#define do_swap_account		0
+#define cgroup_memory_noswap		1
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif

 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
-
-static const char *const mem_cgroup_lru_names[] = {
-	"inactive_anon",
-	"active_anon",
-	"inactive_file",
-	"active_file",
-	"unevictable",
-};

 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET	1024

 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -210,14 +202,6 @@
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

-enum charge_type {
-	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_ANON,
-	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
-	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
-	NR_CHARGE_TYPE,
-};
-
 /* for encoding cft->private value on file */
 enum res_type {
 	_MEM,
@@ -248,7 +232,7 @@
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))

-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -268,8 +252,100 @@
 }

 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+	struct mem_cgroup *memcg;
+	unsigned int nr_bytes;
+	unsigned int nr_pages;
+	unsigned long flags;
+
+	/*
+	 * At this point all allocated objects are freed, and
+	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
+	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+	 *
+	 * The following sequence can lead to it:
+	 * 1) CPU0: objcg == stock->cached_objcg
+	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+	 *          PAGE_SIZE bytes are charged
+	 * 3) CPU1: a process from another memcg is allocating something,
+	 *          the stock is flushed,
+	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+	 * 5) CPU0: we do release this object,
+	 *          92 bytes are added to stock->nr_bytes
+	 * 6) CPU0: stock is flushed,
+	 *          92 bytes are added to objcg->nr_charged_bytes
+	 *
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
+	 * This page will be uncharged in obj_cgroup_release().
+	 */
+	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+	nr_pages = nr_bytes >> PAGE_SHIFT;
+
+	spin_lock_irqsave(&objcg_lock, flags);
+	memcg = obj_cgroup_memcg(objcg);
+	if (nr_pages)
+		__memcg_kmem_uncharge(memcg, nr_pages);
+	list_del(&objcg->list);
+	mem_cgroup_put(memcg);
+	spin_unlock_irqrestore(&objcg_lock, flags);
+
+	percpu_ref_exit(ref);
+	kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+	struct obj_cgroup *objcg;
+	int ret;
+
+	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+	if (!objcg)
+		return NULL;
+
+	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+			      GFP_KERNEL);
+	if (ret) {
+		kfree(objcg);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&objcg->list);
+	return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+				  struct mem_cgroup *parent)
+{
+	struct obj_cgroup *objcg, *iter;
+
+	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+	spin_lock_irq(&objcg_lock);
+
+	/* Move active objcg to the parent's list */
+	xchg(&objcg->memcg, parent);
+	css_get(&parent->css);
+	list_add(&objcg->list, &parent->objcg_list);
+
+	/* Move already reparented objcgs to the parent's list */
+	list_for_each_entry(iter, &memcg->objcg_list, list) {
+		css_get(&parent->css);
+		xchg(&iter->memcg, parent);
+		css_put(&memcg->css);
+	}
+	list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+	spin_unlock_irq(&objcg_lock);
+
+	percpu_ref_kill(&objcg->refcnt);
+}
+
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  * this works better in sparse environments, where we have a lot of memcgs,
  * but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -312,14 +388,13 @@

 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that does
  * kmem_cache_alloc and the such to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
+#endif

 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -344,7 +419,7 @@
 	if (!old)
 		return 0;

-	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+	new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 	if (!new)
 		return -ENOMEM;

@@ -388,7 +463,7 @@
 	mutex_lock(&memcg_shrinker_map_mutex);
 	size = memcg_shrinker_map_size;
 	for_each_node(nid) {
-		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 		if (!map) {
 			memcg_free_shrinker_maps(memcg);
 			ret = -ENOMEM;
@@ -445,14 +520,6 @@
 	}
 }

-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -495,7 +562,17 @@
 	unsigned long ino = 0;

 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	memcg = page->mem_cgroup;
+
+	/*
+	 * The lowest bit set means that memcg isn't a valid
+	 * memcg pointer, but an obj_cgroups pointer.
+	 * In this case the page is shared and doesn't belong
+	 * to any specific memory cgroup.
+	 */
+	if ((unsigned long) memcg & 0x1UL)
+		memcg = NULL;
+
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -671,7 +748,7 @@
 	 */
 	__mem_cgroup_remove_exceeded(mz, mctz);
 	if (!soft_limit_excess(mz->memcg) ||
-	    !css_tryget_online(&mz->memcg->css))
+	    !css_tryget(&mz->memcg->css))
 		goto retry;
 done:
 	return mz;
@@ -688,33 +765,186 @@
 	return mz;
 }

-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	return atomic_long_read(&memcg->events[event]);
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (memcg_stat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *parent;
+
+	parent = parent_mem_cgroup(pn->memcg);
+	if (!parent)
+		return NULL;
+	return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			      int val)
+{
+	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup *memcg;
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	memcg = pn->memcg;
+
+	/* Update memcg */
+	__mod_memcg_state(memcg, idx, val);
+
+	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+	if (vmstat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		pg_data_t *pgdat = lruvec_pgdat(lruvec);
+		struct mem_cgroup_per_node *pi;
+
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+			atomic_long_add(x, &pi->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val)
+{
+	/* Update node */
+	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+	/* Update memcg and lruvec */
+	if (!mem_cgroup_disabled())
+		__mod_memcg_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL_GPL(__mod_lruvec_state);
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+
+	/*
+	 * Untracked pages have no memcg, no lruvec. Update only the
+	 * node. If we reparent the slab objects to the root memcg,
+	 * when we free the slab object, we need to update the per-memcg
+	 * vmstats to keep it correct for the root memcg.
+	 */
+	if (!memcg) {
+		__mod_node_page_state(pgdat, idx, val);
+	} else {
+		lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		__mod_lruvec_state(lruvec, idx, val);
+	}
+	rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+	if (memcg)
+		mod_memcg_state(memcg, idx, val);
+	rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmevents[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+	return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }

 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool compound, int nr_pages)
+					 int nr_pages)
 {
-	/*
-	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
-	 * counted as CACHE even if it's on ANON LRU.
-	 */
-	if (PageAnon(page))
-		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
-	else {
-		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
-		if (PageSwapBacked(page))
-			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
-	}
-
-	if (compound) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
-	}
-
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
....@@ -723,35 +953,7 @@
723953 nr_pages = -nr_pages; /* for event */
724954 }
725955
726
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
727
-}
728
-
729
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
730
- int nid, unsigned int lru_mask)
731
-{
732
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
733
- unsigned long nr = 0;
734
- enum lru_list lru;
735
-
736
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
737
-
738
- for_each_lru(lru) {
739
- if (!(BIT(lru) & lru_mask))
740
- continue;
741
- nr += mem_cgroup_get_lru_size(lruvec, lru);
742
- }
743
- return nr;
744
-}
745
-
746
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
747
- unsigned int lru_mask)
748
-{
749
- unsigned long nr = 0;
750
- int nid;
751
-
752
- for_each_node_state(nid, N_MEMORY)
753
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
754
- return nr;
956
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
755957 }
756958
757959 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -759,8 +961,8 @@
759961 {
760962 unsigned long val, next;
761963
762
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
763
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
964
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
965
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
764966 /* from time_after() in jiffies.h */
765967 if ((long)(next - val) < 0) {
766968 switch (target) {
....@@ -770,13 +972,10 @@
770972 case MEM_CGROUP_TARGET_SOFTLIMIT:
771973 next = val + SOFTLIMIT_EVENTS_TARGET;
772974 break;
773
- case MEM_CGROUP_TARGET_NUMAINFO:
774
- next = val + NUMAINFO_EVENTS_TARGET;
775
- break;
776975 default:
777976 break;
778977 }
779
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
978
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
780979 return true;
781980 }
782981 return false;
....@@ -792,21 +991,12 @@
792991 if (unlikely(mem_cgroup_event_ratelimit(memcg,
793992 MEM_CGROUP_TARGET_THRESH))) {
794993 bool do_softlimit;
795
- bool do_numainfo __maybe_unused;
796994
797995 do_softlimit = mem_cgroup_event_ratelimit(memcg,
798996 MEM_CGROUP_TARGET_SOFTLIMIT);
799
-#if MAX_NUMNODES > 1
800
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
801
- MEM_CGROUP_TARGET_NUMAINFO);
802
-#endif
803997 mem_cgroup_threshold(memcg);
804998 if (unlikely(do_softlimit))
805999 mem_cgroup_update_tree(memcg, page);
806
-#if MAX_NUMNODES > 1
807
- if (unlikely(do_numainfo))
808
- atomic_inc(&memcg->numainfo_events);
809
-#endif
8101000 }
8111001 }
8121002
....@@ -874,27 +1064,60 @@
8741064 return NULL;
8751065
8761066 rcu_read_lock();
877
- if (!memcg || !css_tryget_online(&memcg->css))
1067
+ /* Page should not get uncharged and freed memcg under us. */
1068
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8781069 memcg = root_mem_cgroup;
8791070 rcu_read_unlock();
8801071 return memcg;
8811072 }
8821073 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8831074
1075
+static __always_inline struct mem_cgroup *active_memcg(void)
1076
+{
1077
+ if (in_interrupt())
1078
+ return this_cpu_read(int_active_memcg);
1079
+ else
1080
+ return current->active_memcg;
1081
+}
1082
+
1083
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1084
+{
1085
+ struct mem_cgroup *memcg;
1086
+
1087
+ rcu_read_lock();
1088
+ memcg = active_memcg();
1089
+ /* remote memcg must hold a ref. */
1090
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1091
+ memcg = root_mem_cgroup;
1092
+ rcu_read_unlock();
1093
+
1094
+ return memcg;
1095
+}
1096
+
1097
+static __always_inline bool memcg_kmem_bypass(void)
1098
+{
1099
+ /* Allow remote memcg charging from any context. */
1100
+ if (unlikely(active_memcg()))
1101
+ return false;
1102
+
1103
+ /* Memcg to charge can't be determined. */
1104
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1105
+ return true;
1106
+
1107
+ return false;
1108
+}
1109
+
8841110 /**
885
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1111
+ * If active memcg is set, do not fallback to current->mm->memcg.
8861112 */
8871113 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8881114 {
889
- if (unlikely(current->active_memcg)) {
890
- struct mem_cgroup *memcg = root_mem_cgroup;
1115
+ if (memcg_kmem_bypass())
1116
+ return NULL;
8911117
892
- rcu_read_lock();
893
- if (css_tryget_online(&current->active_memcg->css))
894
- memcg = current->active_memcg;
895
- rcu_read_unlock();
896
- return memcg;
897
- }
1118
+ if (unlikely(active_memcg()))
1119
+ return get_active_memcg();
1120
+
8981121 return get_mem_cgroup_from_mm(current->mm);
8991122 }
9001123
....@@ -911,15 +1134,15 @@
9111134 * invocations for reference counting, or use mem_cgroup_iter_break()
9121135 * to cancel a hierarchy walk before the round-trip is complete.
9131136 *
914
- * Reclaimers can specify a node and a priority level in @reclaim to
915
- * divide up the memcgs in the hierarchy among all concurrent
916
- * reclaimers operating on the same node and priority.
1137
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1138
+ * in the hierarchy among all concurrent reclaimers operating on the
1139
+ * same node.
9171140 */
9181141 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9191142 struct mem_cgroup *prev,
9201143 struct mem_cgroup_reclaim_cookie *reclaim)
9211144 {
922
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1145
+ struct mem_cgroup_reclaim_iter *iter;
9231146 struct cgroup_subsys_state *css = NULL;
9241147 struct mem_cgroup *memcg = NULL;
9251148 struct mem_cgroup *pos = NULL;
....@@ -945,7 +1168,7 @@
9451168 struct mem_cgroup_per_node *mz;
9461169
9471170 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
948
- iter = &mz->iter[reclaim->priority];
1171
+ iter = &mz->iter;
9491172
9501173 if (prev && reclaim->generation != iter->generation)
9511174 goto out_unlock;
....@@ -1045,15 +1268,11 @@
10451268 struct mem_cgroup_reclaim_iter *iter;
10461269 struct mem_cgroup_per_node *mz;
10471270 int nid;
1048
- int i;
10491271
10501272 for_each_node(nid) {
10511273 mz = mem_cgroup_nodeinfo(from, nid);
1052
- for (i = 0; i <= DEF_PRIORITY; i++) {
1053
- iter = &mz->iter[i];
1054
- cmpxchg(&iter->position,
1055
- dead_memcg, NULL);
1056
- }
1274
+ iter = &mz->iter;
1275
+ cmpxchg(&iter->position, dead_memcg, NULL);
10571276 }
10581277 }
10591278
....@@ -1103,7 +1322,7 @@
11031322 struct css_task_iter it;
11041323 struct task_struct *task;
11051324
1106
- css_task_iter_start(&iter->css, 0, &it);
1325
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11071326 while (!ret && (task = css_task_iter_next(&it)))
11081327 ret = fn(task, arg);
11091328 css_task_iter_end(&it);
....@@ -1120,9 +1339,8 @@
11201339 * @page: the page
11211340 * @pgdat: pgdat of the page
11221341 *
1123
- * This function is only safe when following the LRU page isolation
1124
- * and putback protocol: the LRU lock must be held, and the page must
1125
- * either be PageLRU() or the caller must have isolated/allocated it.
1342
+ * This function relies on page->mem_cgroup being stable - see the
1343
+ * access rules in commit_charge().
11261344 */
11271345 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11281346 {
....@@ -1131,7 +1349,7 @@
11311349 struct lruvec *lruvec;
11321350
11331351 if (mem_cgroup_disabled()) {
1134
- lruvec = &pgdat->lruvec;
1352
+ lruvec = &pgdat->__lruvec;
11351353 goto out;
11361354 }
11371355
....@@ -1155,6 +1373,38 @@
11551373 lruvec->pgdat = pgdat;
11561374 return lruvec;
11571375 }
1376
+
1377
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1378
+{
1379
+ struct lruvec *lruvec;
1380
+
1381
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1382
+
1383
+ return lruvec;
1384
+}
1385
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1386
+
1387
+void do_traversal_all_lruvec(void)
1388
+{
1389
+ pg_data_t *pgdat;
1390
+
1391
+ for_each_online_pgdat(pgdat) {
1392
+ struct mem_cgroup *memcg = NULL;
1393
+
1394
+ spin_lock_irq(&pgdat->lru_lock);
1395
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1396
+ do {
1397
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1398
+
1399
+ trace_android_vh_do_traversal_lruvec(lruvec);
1400
+
1401
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1402
+ } while (memcg);
1403
+
1404
+ spin_unlock_irq(&pgdat->lru_lock);
1405
+ }
1406
+}
1407
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11581408
11591409 /**
11601410 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1194,32 +1444,7 @@
11941444 if (nr_pages > 0)
11951445 *lru_size += nr_pages;
11961446 }
1197
-
1198
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1199
-{
1200
- struct mem_cgroup *task_memcg;
1201
- struct task_struct *p;
1202
- bool ret;
1203
-
1204
- p = find_lock_task_mm(task);
1205
- if (p) {
1206
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1207
- task_unlock(p);
1208
- } else {
1209
- /*
1210
- * All threads may have already detached their mm's, but the oom
1211
- * killer still needs to detect if they have already been oom
1212
- * killed to prevent needlessly killing additional tasks.
1213
- */
1214
- rcu_read_lock();
1215
- task_memcg = mem_cgroup_from_task(task);
1216
- css_get(&task_memcg->css);
1217
- rcu_read_unlock();
1218
- }
1219
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1220
- css_put(&task_memcg->css);
1221
- return ret;
1222
-}
1447
+EXPORT_SYMBOL_GPL(mem_cgroup_update_lru_size);
12231448
12241449 /**
12251450 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
....@@ -1242,7 +1467,7 @@
12421467 if (do_memsw_account()) {
12431468 count = page_counter_read(&memcg->memsw);
12441469 limit = READ_ONCE(memcg->memsw.max);
1245
- if (count <= limit)
1470
+ if (count < limit)
12461471 margin = min(margin, limit - count);
12471472 else
12481473 margin = 0;
....@@ -1296,85 +1521,199 @@
12961521 return false;
12971522 }
12981523
1299
-static const unsigned int memcg1_stats[] = {
1300
- MEMCG_CACHE,
1301
- MEMCG_RSS,
1302
- MEMCG_RSS_HUGE,
1303
- NR_SHMEM,
1304
- NR_FILE_MAPPED,
1305
- NR_FILE_DIRTY,
1306
- NR_WRITEBACK,
1307
- MEMCG_SWAP,
1524
+struct memory_stat {
1525
+ const char *name;
1526
+ unsigned int ratio;
1527
+ unsigned int idx;
13081528 };
13091529
1310
-static const char *const memcg1_stat_names[] = {
1311
- "cache",
1312
- "rss",
1313
- "rss_huge",
1314
- "shmem",
1315
- "mapped_file",
1316
- "dirty",
1317
- "writeback",
1318
- "swap",
1530
+static struct memory_stat memory_stats[] = {
1531
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1532
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1533
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1534
+ { "percpu", 1, MEMCG_PERCPU_B },
1535
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1536
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1537
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1538
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1539
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1540
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1541
+ /*
1542
+ * The ratio will be initialized in memory_stats_init(). Because
1543
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1544
+	 * constant (e.g. powerpc).
1545
+ */
1546
+ { "anon_thp", 0, NR_ANON_THPS },
1547
+#endif
1548
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1549
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1550
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1551
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1552
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1553
+
1554
+ /*
1555
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1556
+ * together and slab_reclaimable must be in front.
1557
+ */
1558
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1559
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1560
+
1561
+ /* The memory events */
1562
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1563
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1564
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1565
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1566
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1567
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1568
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13191569 };
1570
+
1571
+static int __init memory_stats_init(void)
1572
+{
1573
+ int i;
1574
+
1575
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1576
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1577
+ if (memory_stats[i].idx == NR_ANON_THPS)
1578
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1579
+#endif
1580
+ VM_BUG_ON(!memory_stats[i].ratio);
1581
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1582
+ }
1583
+
1584
+ return 0;
1585
+}
1586
+pure_initcall(memory_stats_init);
1587
+
1588
+static char *memory_stat_format(struct mem_cgroup *memcg)
1589
+{
1590
+ struct seq_buf s;
1591
+ int i;
1592
+
1593
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1594
+ if (!s.buffer)
1595
+ return NULL;
1596
+
1597
+ /*
1598
+ * Provide statistics on the state of the memory subsystem as
1599
+ * well as cumulative event counters that show past behavior.
1600
+ *
1601
+ * This list is ordered following a combination of these gradients:
1602
+ * 1) generic big picture -> specifics and details
1603
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1604
+ *
1605
+ * Current memory state:
1606
+ */
1607
+
1608
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1609
+ u64 size;
1610
+
1611
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1612
+ size *= memory_stats[i].ratio;
1613
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1614
+
1615
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1616
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1617
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1618
+ seq_buf_printf(&s, "slab %llu\n", size);
1619
+ }
1620
+ }
1621
+
1622
+ /* Accumulated memory events */
1623
+
1624
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1625
+ memcg_events(memcg, PGFAULT));
1626
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1627
+ memcg_events(memcg, PGMAJFAULT));
1628
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1629
+ memcg_events(memcg, PGREFILL));
1630
+ seq_buf_printf(&s, "pgscan %lu\n",
1631
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1632
+ memcg_events(memcg, PGSCAN_DIRECT));
1633
+ seq_buf_printf(&s, "pgsteal %lu\n",
1634
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1635
+ memcg_events(memcg, PGSTEAL_DIRECT));
1636
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1637
+ memcg_events(memcg, PGACTIVATE));
1638
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1639
+ memcg_events(memcg, PGDEACTIVATE));
1640
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1641
+ memcg_events(memcg, PGLAZYFREE));
1642
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1643
+ memcg_events(memcg, PGLAZYFREED));
1644
+
1645
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1646
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1647
+ memcg_events(memcg, THP_FAULT_ALLOC));
1648
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1649
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1650
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1651
+
1652
+ /* The above should easily fit into one page */
1653
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1654
+
1655
+ return s.buffer;
1656
+}
13201657
13211658 #define K(x) ((x) << (PAGE_SHIFT-10))
13221659 /**
1323
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1660
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1661
+ * memory controller.
13241662 * @memcg: The memory cgroup that went over limit
13251663 * @p: Task that is going to be killed
13261664 *
13271665 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13281666 * enabled
13291667 */
1330
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1668
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13311669 {
1332
- struct mem_cgroup *iter;
1333
- unsigned int i;
1334
-
13351670 rcu_read_lock();
13361671
1672
+ if (memcg) {
1673
+ pr_cont(",oom_memcg=");
1674
+ pr_cont_cgroup_path(memcg->css.cgroup);
1675
+ } else
1676
+ pr_cont(",global_oom");
13371677 if (p) {
1338
- pr_info("Task in ");
1678
+ pr_cont(",task_memcg=");
13391679 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1340
- pr_cont(" killed as a result of limit of ");
1341
- } else {
1342
- pr_info("Memory limit reached of cgroup ");
13431680 }
1344
-
1345
- pr_cont_cgroup_path(memcg->css.cgroup);
1346
- pr_cont("\n");
1347
-
13481681 rcu_read_unlock();
1682
+}
1683
+
1684
+/**
1685
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1686
+ * memory controller.
1687
+ * @memcg: The memory cgroup that went over limit
1688
+ */
1689
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1690
+{
1691
+ char *buf;
13491692
13501693 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13511694 K((u64)page_counter_read(&memcg->memory)),
1352
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1353
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1354
- K((u64)page_counter_read(&memcg->memsw)),
1355
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1356
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->kmem)),
1358
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1359
-
1360
- for_each_mem_cgroup_tree(iter, memcg) {
1361
- pr_info("Memory cgroup stats for ");
1362
- pr_cont_cgroup_path(iter->css.cgroup);
1363
- pr_cont(":");
1364
-
1365
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1366
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1367
- continue;
1368
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1369
- K(memcg_page_state(iter, memcg1_stats[i])));
1370
- }
1371
-
1372
- for (i = 0; i < NR_LRU_LISTS; i++)
1373
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1374
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1375
-
1376
- pr_cont("\n");
1695
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1696
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1697
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1698
+ K((u64)page_counter_read(&memcg->swap)),
1699
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1700
+ else {
1701
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1702
+ K((u64)page_counter_read(&memcg->memsw)),
1703
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1704
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->kmem)),
1706
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13771707 }
1708
+
1709
+ pr_info("Memory cgroup stats for ");
1710
+ pr_cont_cgroup_path(memcg->css.cgroup);
1711
+ pr_cont(":");
1712
+ buf = memory_stat_format(memcg);
1713
+ if (!buf)
1714
+ return;
1715
+ pr_info("%s", buf);
1716
+ kfree(buf);
13781717 }
13791718
13801719 /*
....@@ -1382,19 +1721,26 @@
13821721 */
13831722 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13841723 {
1385
- unsigned long max;
1724
+ unsigned long max = READ_ONCE(memcg->memory.max);
13861725
1387
- max = memcg->memory.max;
1388
- if (mem_cgroup_swappiness(memcg)) {
1389
- unsigned long memsw_max;
1390
- unsigned long swap_max;
1726
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1727
+ if (mem_cgroup_swappiness(memcg))
1728
+ max += min(READ_ONCE(memcg->swap.max),
1729
+ (unsigned long)total_swap_pages);
1730
+ } else { /* v1 */
1731
+ if (mem_cgroup_swappiness(memcg)) {
1732
+ /* Calculate swap excess capacity from memsw limit */
1733
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13911734
1392
- memsw_max = memcg->memsw.max;
1393
- swap_max = memcg->swap.max;
1394
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1395
- max = min(max + swap_max, memsw_max);
1735
+ max += min(swap, (unsigned long)total_swap_pages);
1736
+ }
13961737 }
13971738 return max;
1739
+}
1740
+
1741
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1742
+{
1743
+ return page_counter_read(&memcg->memory);
13981744 }
13991745
14001746 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1407,112 +1753,24 @@
14071753 .gfp_mask = gfp_mask,
14081754 .order = order,
14091755 };
1410
- bool ret;
1756
+ bool ret = true;
14111757
14121758 if (mutex_lock_killable(&oom_lock))
14131759 return true;
1760
+
1761
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1762
+ goto unlock;
1763
+
14141764 /*
14151765 * A few threads which were not waiting at mutex_lock_killable() can
14161766 * fail to bail out. Therefore, check again after holding oom_lock.
14171767 */
1418
- ret = should_force_charge() || out_of_memory(&oc);
1768
+ ret = task_is_dying() || out_of_memory(&oc);
1769
+
1770
+unlock:
14191771 mutex_unlock(&oom_lock);
14201772 return ret;
14211773 }
1422
-
1423
-#if MAX_NUMNODES > 1
1424
-
1425
-/**
1426
- * test_mem_cgroup_node_reclaimable
1427
- * @memcg: the target memcg
1428
- * @nid: the node ID to be checked.
1429
- * @noswap : specify true here if the user wants flle only information.
1430
- *
1431
- * This function returns whether the specified memcg contains any
1432
- * reclaimable pages on a node. Returns true if there are any reclaimable
1433
- * pages in the node.
1434
- */
1435
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1436
- int nid, bool noswap)
1437
-{
1438
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1439
- return true;
1440
- if (noswap || !total_swap_pages)
1441
- return false;
1442
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1443
- return true;
1444
- return false;
1445
-
1446
-}
1447
-
1448
-/*
1449
- * Always updating the nodemask is not very good - even if we have an empty
1450
- * list or the wrong list here, we can start from some node and traverse all
1451
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1452
- *
1453
- */
1454
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1455
-{
1456
- int nid;
1457
- /*
1458
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1459
- * pagein/pageout changes since the last update.
1460
- */
1461
- if (!atomic_read(&memcg->numainfo_events))
1462
- return;
1463
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1464
- return;
1465
-
1466
- /* make a nodemask where this memcg uses memory from */
1467
- memcg->scan_nodes = node_states[N_MEMORY];
1468
-
1469
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1470
-
1471
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1472
- node_clear(nid, memcg->scan_nodes);
1473
- }
1474
-
1475
- atomic_set(&memcg->numainfo_events, 0);
1476
- atomic_set(&memcg->numainfo_updating, 0);
1477
-}
1478
-
1479
-/*
1480
- * Selecting a node where we start reclaim from. Because what we need is just
1481
- * reducing usage counter, start from anywhere is O,K. Considering
1482
- * memory reclaim from current node, there are pros. and cons.
1483
- *
1484
- * Freeing memory from current node means freeing memory from a node which
1485
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1486
- * hit limits, it will see a contention on a node. But freeing from remote
1487
- * node means more costs for memory reclaim because of memory latency.
1488
- *
1489
- * Now, we use round-robin. Better algorithm is welcomed.
1490
- */
1491
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1492
-{
1493
- int node;
1494
-
1495
- mem_cgroup_may_update_nodemask(memcg);
1496
- node = memcg->last_scanned_node;
1497
-
1498
- node = next_node_in(node, memcg->scan_nodes);
1499
- /*
1500
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1501
- * last time it really checked all the LRUs due to rate limiting.
1502
- * Fallback to the current node in that case for simplicity.
1503
- */
1504
- if (unlikely(node == MAX_NUMNODES))
1505
- node = numa_node_id();
1506
-
1507
- memcg->last_scanned_node = node;
1508
- return node;
1509
-}
1510
-#else
1511
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1512
-{
1513
- return 0;
1514
-}
1515
-#endif
15161774
15171775 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15181776 pg_data_t *pgdat,
....@@ -1526,7 +1784,6 @@
15261784 unsigned long nr_scanned;
15271785 struct mem_cgroup_reclaim_cookie reclaim = {
15281786 .pgdat = pgdat,
1529
- .priority = 0,
15301787 };
15311788
15321789 excess = soft_limit_excess(root_memcg);
....@@ -1621,7 +1878,7 @@
16211878 struct mem_cgroup *iter;
16221879
16231880 spin_lock(&memcg_oom_lock);
1624
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1881
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16251882 for_each_mem_cgroup_tree(iter, memcg)
16261883 iter->oom_lock = false;
16271884 spin_unlock(&memcg_oom_lock);
....@@ -1642,8 +1899,8 @@
16421899 struct mem_cgroup *iter;
16431900
16441901 /*
1645
- * When a new child is created while the hierarchy is under oom,
1646
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1902
+	 * Be careful about under_oom underflows because a child memcg
1903
+ * could have been added after mem_cgroup_mark_under_oom.
16471904 */
16481905 spin_lock(&memcg_oom_lock);
16491906 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1703,6 +1960,8 @@
17031960
17041961 if (order > PAGE_ALLOC_COSTLY_ORDER)
17051962 return OOM_SKIPPED;
1963
+
1964
+ memcg_memory_event(memcg, MEMCG_OOM);
17061965
17071966 /*
17081967 * We are in the middle of the charge context here, so we
....@@ -1851,6 +2110,14 @@
18512110 goto out;
18522111
18532112 /*
2113
+ * If the victim task has been asynchronously moved to a different
2114
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2115
+ * In this case it's better to ignore memory.group.oom.
2116
+ */
2117
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2118
+ goto out;
2119
+
2120
+ /*
18542121 * Traverse the memory cgroup hierarchy from the victim task's
18552122 * cgroup up to the OOMing cgroup (or root) to find the
18562123 * highest-level memory cgroup with oom.group set.
....@@ -1891,6 +2158,7 @@
18912158 */
18922159 struct mem_cgroup *lock_page_memcg(struct page *page)
18932160 {
2161
+ struct page *head = compound_head(page); /* rmap on tail pages */
18942162 struct mem_cgroup *memcg;
18952163 unsigned long flags;
18962164
....@@ -1910,7 +2178,7 @@
19102178 if (mem_cgroup_disabled())
19112179 return NULL;
19122180 again:
1913
- memcg = page->mem_cgroup;
2181
+ memcg = head->mem_cgroup;
19142182 if (unlikely(!memcg))
19152183 return NULL;
19162184
....@@ -1918,7 +2186,7 @@
19182186 return memcg;
19192187
19202188 spin_lock_irqsave(&memcg->move_lock, flags);
1921
- if (memcg != page->mem_cgroup) {
2189
+ if (memcg != head->mem_cgroup) {
19222190 spin_unlock_irqrestore(&memcg->move_lock, flags);
19232191 goto again;
19242192 }
....@@ -1961,19 +2229,43 @@
19612229 */
19622230 void unlock_page_memcg(struct page *page)
19632231 {
1964
- __unlock_page_memcg(page->mem_cgroup);
2232
+ struct page *head = compound_head(page);
2233
+
2234
+ __unlock_page_memcg(head->mem_cgroup);
19652235 }
19662236 EXPORT_SYMBOL(unlock_page_memcg);
19672237
19682238 struct memcg_stock_pcp {
19692239 struct mem_cgroup *cached; /* this never be root cgroup */
19702240 unsigned int nr_pages;
2241
+
2242
+#ifdef CONFIG_MEMCG_KMEM
2243
+ struct obj_cgroup *cached_objcg;
2244
+ unsigned int nr_bytes;
2245
+#endif
2246
+
19712247 struct work_struct work;
19722248 unsigned long flags;
19732249 #define FLUSHING_CACHED_CHARGE 0
19742250 };
19752251 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19762252 static DEFINE_MUTEX(percpu_charge_mutex);
2253
+
2254
+#ifdef CONFIG_MEMCG_KMEM
2255
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2256
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2257
+ struct mem_cgroup *root_memcg);
2258
+
2259
+#else
2260
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2261
+{
2262
+}
2263
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2264
+ struct mem_cgroup *root_memcg)
2265
+{
2266
+ return false;
2267
+}
2268
+#endif
19772269
19782270 /**
19792271 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -2015,13 +2307,17 @@
20152307 {
20162308 struct mem_cgroup *old = stock->cached;
20172309
2310
+ if (!old)
2311
+ return;
2312
+
20182313 if (stock->nr_pages) {
20192314 page_counter_uncharge(&old->memory, stock->nr_pages);
20202315 if (do_memsw_account())
20212316 page_counter_uncharge(&old->memsw, stock->nr_pages);
2022
- css_put_many(&old->css, stock->nr_pages);
20232317 stock->nr_pages = 0;
20242318 }
2319
+
2320
+ css_put(&old->css);
20252321 stock->cached = NULL;
20262322 }
20272323
....@@ -2037,6 +2333,7 @@
20372333 local_irq_save(flags);
20382334
20392335 stock = this_cpu_ptr(&memcg_stock);
2336
+ drain_obj_stock(stock);
20402337 drain_stock(stock);
20412338 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20422339
....@@ -2057,6 +2354,7 @@
20572354 stock = this_cpu_ptr(&memcg_stock);
20582355 if (stock->cached != memcg) { /* reset if necessary */
20592356 drain_stock(stock);
2357
+ css_get(&memcg->css);
20602358 stock->cached = memcg;
20612359 }
20622360 stock->nr_pages += nr_pages;
....@@ -2088,21 +2386,24 @@
20882386 for_each_online_cpu(cpu) {
20892387 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20902388 struct mem_cgroup *memcg;
2389
+ bool flush = false;
20912390
2391
+ rcu_read_lock();
20922392 memcg = stock->cached;
2093
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2094
- continue;
2095
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2096
- css_put(&memcg->css);
2097
- continue;
2098
- }
2099
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2393
+ if (memcg && stock->nr_pages &&
2394
+ mem_cgroup_is_descendant(memcg, root_memcg))
2395
+ flush = true;
2396
+ if (obj_stock_flush_required(stock, root_memcg))
2397
+ flush = true;
2398
+ rcu_read_unlock();
2399
+
2400
+ if (flush &&
2401
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21002402 if (cpu == curcpu)
21012403 drain_local_stock(&stock->work);
21022404 else
21032405 schedule_work_on(cpu, &stock->work);
21042406 }
2105
- css_put(&memcg->css);
21062407 }
21072408 put_cpu();
21082409 mutex_unlock(&percpu_charge_mutex);
....@@ -2111,7 +2412,7 @@
21112412 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21122413 {
21132414 struct memcg_stock_pcp *stock;
2114
- struct mem_cgroup *memcg;
2415
+ struct mem_cgroup *memcg, *mi;
21152416
21162417 stock = &per_cpu(memcg_stock, cpu);
21172418 drain_stock(stock);
....@@ -2123,9 +2424,10 @@
21232424 int nid;
21242425 long x;
21252426
2126
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2427
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21272428 if (x)
2128
- atomic_long_add(x, &memcg->stat[i]);
2429
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2430
+ atomic_long_add(x, &memcg->vmstats[i]);
21292431
21302432 if (i >= NR_VM_NODE_STAT_ITEMS)
21312433 continue;
....@@ -2136,32 +2438,48 @@
21362438 pn = mem_cgroup_nodeinfo(memcg, nid);
21372439 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21382440 if (x)
2139
- atomic_long_add(x, &pn->lruvec_stat[i]);
2441
+ do {
2442
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2443
+ } while ((pn = parent_nodeinfo(pn, nid)));
21402444 }
21412445 }
21422446
21432447 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21442448 long x;
21452449
2146
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2450
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21472451 if (x)
2148
- atomic_long_add(x, &memcg->events[i]);
2452
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2453
+ atomic_long_add(x, &memcg->vmevents[i]);
21492454 }
21502455 }
21512456
21522457 return 0;
21532458 }
21542459
2155
-static void reclaim_high(struct mem_cgroup *memcg,
2156
- unsigned int nr_pages,
2157
- gfp_t gfp_mask)
2460
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2461
+ unsigned int nr_pages,
2462
+ gfp_t gfp_mask)
21582463 {
2464
+ unsigned long nr_reclaimed = 0;
2465
+
21592466 do {
2160
- if (page_counter_read(&memcg->memory) <= memcg->high)
2467
+ unsigned long pflags;
2468
+
2469
+ if (page_counter_read(&memcg->memory) <=
2470
+ READ_ONCE(memcg->memory.high))
21612471 continue;
2472
+
21622473 memcg_memory_event(memcg, MEMCG_HIGH);
2163
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2164
- } while ((memcg = parent_mem_cgroup(memcg)));
2474
+
2475
+ psi_memstall_enter(&pflags);
2476
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2477
+ gfp_mask, true);
2478
+ psi_memstall_leave(&pflags);
2479
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2480
+ !mem_cgroup_is_root(memcg));
2481
+
2482
+ return nr_reclaimed;
21652483 }
21662484
21672485 static void high_work_func(struct work_struct *work)
....@@ -2173,35 +2491,238 @@
21732491 }
21742492
21752493 /*
2494
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2495
+ * enough to still cause a significant slowdown in most cases, while still
2496
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2497
+ */
2498
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2499
+
2500
+/*
2501
+ * When calculating the delay, we use these either side of the exponentiation to
2502
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2503
+ * below.
2504
+ *
2505
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2506
+ * overage ratio to a delay.
2507
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2508
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2509
+ * to produce a reasonable delay curve.
2510
+ *
2511
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2512
+ * reasonable delay curve compared to precision-adjusted overage, not
2513
+ * penalising heavily at first, but still making sure that growth beyond the
2514
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2515
+ * example, with a high of 100 megabytes:
2516
+ *
2517
+ * +-------+------------------------+
2518
+ * | usage | time to allocate in ms |
2519
+ * +-------+------------------------+
2520
+ * | 100M | 0 |
2521
+ * | 101M | 6 |
2522
+ * | 102M | 25 |
2523
+ * | 103M | 57 |
2524
+ * | 104M | 102 |
2525
+ * | 105M | 159 |
2526
+ * | 106M | 230 |
2527
+ * | 107M | 313 |
2528
+ * | 108M | 409 |
2529
+ * | 109M | 518 |
2530
+ * | 110M | 639 |
2531
+ * | 111M | 774 |
2532
+ * | 112M | 921 |
2533
+ * | 113M | 1081 |
2534
+ * | 114M | 1254 |
2535
+ * | 115M | 1439 |
2536
+ * | 116M | 1638 |
2537
+ * | 117M | 1849 |
2538
+ * | 118M | 2000 |
2539
+ * | 119M | 2000 |
2540
+ * | 120M | 2000 |
2541
+ * +-------+------------------------+
2542
+ */
2543
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2544
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2545
+
2546
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2547
+{
2548
+ u64 overage;
2549
+
2550
+ if (usage <= high)
2551
+ return 0;
2552
+
2553
+ /*
2554
+ * Prevent division by 0 in overage calculation by acting as if
2555
+ * it was a threshold of 1 page
2556
+ */
2557
+ high = max(high, 1UL);
2558
+
2559
+ overage = usage - high;
2560
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2561
+ return div64_u64(overage, high);
2562
+}
2563
+
2564
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2565
+{
2566
+ u64 overage, max_overage = 0;
2567
+
2568
+ do {
2569
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2570
+ READ_ONCE(memcg->memory.high));
2571
+ max_overage = max(overage, max_overage);
2572
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2573
+ !mem_cgroup_is_root(memcg));
2574
+
2575
+ return max_overage;
2576
+}
2577
+
2578
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2579
+{
2580
+ u64 overage, max_overage = 0;
2581
+
2582
+ do {
2583
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2584
+ READ_ONCE(memcg->swap.high));
2585
+ if (overage)
2586
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2587
+ max_overage = max(overage, max_overage);
2588
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2589
+ !mem_cgroup_is_root(memcg));
2590
+
2591
+ return max_overage;
2592
+}
2593
+
2594
+/*
2595
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2596
+ * is exceeding its memory.high by checking both it and its ancestors.
2597
+ */
2598
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2599
+ unsigned int nr_pages,
2600
+ u64 max_overage)
2601
+{
2602
+ unsigned long penalty_jiffies;
2603
+
2604
+ if (!max_overage)
2605
+ return 0;
2606
+
2607
+ /*
2608
+ * We use overage compared to memory.high to calculate the number of
2609
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2610
+ * fairly lenient on small overages, and increasingly harsh when the
2611
+ * memcg in question makes it clear that it has no intention of stopping
2612
+ * its crazy behaviour, so we exponentially increase the delay based on
2613
+ * overage amount.
2614
+ */
2615
+ penalty_jiffies = max_overage * max_overage * HZ;
2616
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2617
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2618
+
2619
+ /*
2620
+ * Factor in the task's own contribution to the overage, such that four
2621
+ * N-sized allocations are throttled approximately the same as one
2622
+ * 4N-sized allocation.
2623
+ *
2624
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2625
+ * larger the current charge patch is than that.
2626
+ */
2627
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2628
+}
2629
+
2630
+/*
21762631 * Scheduled by try_charge() to be executed from the userland return path
21772632 * and reclaims memory over the high limit.
21782633 */
21792634 void mem_cgroup_handle_over_high(void)
21802635 {
2636
+ unsigned long penalty_jiffies;
2637
+ unsigned long pflags;
2638
+ unsigned long nr_reclaimed;
21812639 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2640
+ int nr_retries = MAX_RECLAIM_RETRIES;
21822641 struct mem_cgroup *memcg;
2642
+ bool in_retry = false;
21832643
21842644 if (likely(!nr_pages))
21852645 return;
21862646
21872647 memcg = get_mem_cgroup_from_mm(current->mm);
2188
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2189
- css_put(&memcg->css);
21902648 current->memcg_nr_pages_over_high = 0;
2649
+
2650
+retry_reclaim:
2651
+ /*
2652
+ * The allocating task should reclaim at least the batch size, but for
2653
+ * subsequent retries we only want to do what's necessary to prevent oom
2654
+ * or breaching resource isolation.
2655
+ *
2656
+ * This is distinct from memory.max or page allocator behaviour because
2657
+ * memory.high is currently batched, whereas memory.max and the page
2658
+ * allocator run every time an allocation is made.
2659
+ */
2660
+ nr_reclaimed = reclaim_high(memcg,
2661
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2662
+ GFP_KERNEL);
2663
+
2664
+ /*
2665
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2666
+ * allocators proactively to slow down excessive growth.
2667
+ */
2668
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2669
+ mem_find_max_overage(memcg));
2670
+
2671
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2672
+ swap_find_max_overage(memcg));
2673
+
2674
+ /*
2675
+ * Clamp the max delay per usermode return so as to still keep the
2676
+ * application moving forwards and also permit diagnostics, albeit
2677
+ * extremely slowly.
2678
+ */
2679
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2680
+
2681
+ /*
2682
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2683
+ * that it's not even worth doing, in an attempt to be nice to those who
2684
+ * go only a small amount over their memory.high value and maybe haven't
2685
+ * been aggressively reclaimed enough yet.
2686
+ */
2687
+ if (penalty_jiffies <= HZ / 100)
2688
+ goto out;
2689
+
2690
+ /*
2691
+ * If reclaim is making forward progress but we're still over
2692
+ * memory.high, we want to encourage that rather than doing allocator
2693
+ * throttling.
2694
+ */
2695
+ if (nr_reclaimed || nr_retries--) {
2696
+ in_retry = true;
2697
+ goto retry_reclaim;
2698
+ }
2699
+
2700
+ /*
2701
+ * If we exit early, we're guaranteed to die (since
2702
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2703
+ * need to account for any ill-begotten jiffies to pay them off later.
2704
+ */
2705
+ psi_memstall_enter(&pflags);
2706
+ schedule_timeout_killable(penalty_jiffies);
2707
+ psi_memstall_leave(&pflags);
2708
+
2709
+out:
2710
+ css_put(&memcg->css);
21912711 }
21922712
21932713 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21942714 unsigned int nr_pages)
21952715 {
21962716 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2197
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2717
+ int nr_retries = MAX_RECLAIM_RETRIES;
21982718 struct mem_cgroup *mem_over_limit;
21992719 struct page_counter *counter;
2720
+ enum oom_status oom_status;
22002721 unsigned long nr_reclaimed;
2722
+ bool passed_oom = false;
22012723 bool may_swap = true;
22022724 bool drained = false;
2203
- bool oomed = false;
2204
- enum oom_status oom_status;
2725
+ unsigned long pflags;
22052726
22062727 if (mem_cgroup_is_root(memcg))
22072728 return 0;
....@@ -2236,15 +2757,6 @@
22362757 goto force;
22372758
22382759 /*
2239
- * Unlike in global OOM situations, memcg is not in a physical
2240
- * memory shortage. Allow dying and OOM-killed tasks to
2241
- * bypass the last charges so that they can exit quickly and
2242
- * free their memory.
2243
- */
2244
- if (unlikely(should_force_charge()))
2245
- goto force;
2246
-
2247
- /*
22482760 * Prevent unbounded recursion when reclaim operations need to
22492761 * allocate memory. This might exceed the limits temporarily,
22502762 * but we prefer facilitating memory reclaim and getting back
....@@ -2261,8 +2773,10 @@
22612773
22622774 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22632775
2776
+ psi_memstall_enter(&pflags);
22642777 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22652778 gfp_mask, may_swap);
2779
+ psi_memstall_leave(&pflags);
22662780
22672781 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22682782 goto retry;
....@@ -2296,16 +2810,15 @@
22962810 if (nr_retries--)
22972811 goto retry;
22982812
2299
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2813
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23002814 goto nomem;
23012815
23022816 if (gfp_mask & __GFP_NOFAIL)
23032817 goto force;
23042818
2305
- if (fatal_signal_pending(current))
2306
- goto force;
2307
-
2308
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2819
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2820
+ if (passed_oom && task_is_dying())
2821
+ goto nomem;
23092822
23102823 /*
23112824 * keep retrying as long as the memcg oom killer is able to make
....@@ -2314,15 +2827,10 @@
23142827 */
23152828 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23162829 get_order(nr_pages * PAGE_SIZE));
2317
- switch (oom_status) {
2318
- case OOM_SUCCESS:
2319
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2320
- oomed = true;
2830
+ if (oom_status == OOM_SUCCESS) {
2831
+ passed_oom = true;
2832
+ nr_retries = MAX_RECLAIM_RETRIES;
23212833 goto retry;
2322
- case OOM_FAILED:
2323
- goto force;
2324
- default:
2325
- goto nomem;
23262834 }
23272835 nomem:
23282836 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2336,12 +2844,10 @@
23362844 page_counter_charge(&memcg->memory, nr_pages);
23372845 if (do_memsw_account())
23382846 page_counter_charge(&memcg->memsw, nr_pages);
2339
- css_get_many(&memcg->css, nr_pages);
23402847
23412848 return 0;
23422849
23432850 done_restock:
2344
- css_get_many(&memcg->css, batch);
23452851 if (batch > nr_pages)
23462852 refill_stock(memcg, batch - nr_pages);
23472853
....@@ -2355,12 +2861,32 @@
23552861 * reclaim, the cost of mismatch is negligible.
23562862 */
23572863 do {
2358
- if (page_counter_read(&memcg->memory) > memcg->high) {
2359
- /* Don't bother a random interrupted task */
2360
- if (in_interrupt()) {
2864
+ bool mem_high, swap_high;
2865
+
2866
+ mem_high = page_counter_read(&memcg->memory) >
2867
+ READ_ONCE(memcg->memory.high);
2868
+ swap_high = page_counter_read(&memcg->swap) >
2869
+ READ_ONCE(memcg->swap.high);
2870
+
2871
+ /* Don't bother a random interrupted task */
2872
+ if (in_interrupt()) {
2873
+ if (mem_high) {
23612874 schedule_work(&memcg->high_work);
23622875 break;
23632876 }
2877
+ continue;
2878
+ }
2879
+
2880
+ if (mem_high || swap_high) {
2881
+ /*
2882
+ * The allocating tasks in this cgroup will need to do
2883
+ * reclaim or be throttled to prevent further growth
2884
+ * of the memory or swap footprints.
2885
+ *
2886
+ * Target some best-effort fairness between the tasks,
2887
+ * and distribute reclaim work and delay penalties
2888
+ * based on how much each task is actually allocating.
2889
+ */
23642890 current->memcg_nr_pages_over_high += batch;
23652891 set_notify_resume(current);
23662892 break;
....@@ -2370,6 +2896,7 @@
23702896 return 0;
23712897 }
23722898
2899
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23732900 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23742901 {
23752902 if (mem_cgroup_is_root(memcg))
....@@ -2378,76 +2905,124 @@
23782905 page_counter_uncharge(&memcg->memory, nr_pages);
23792906 if (do_memsw_account())
23802907 page_counter_uncharge(&memcg->memsw, nr_pages);
2381
-
2382
- css_put_many(&memcg->css, nr_pages);
23832908 }
2909
+#endif
23842910
2385
-static void lock_page_lru(struct page *page, int *isolated)
2911
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23862912 {
2387
- struct zone *zone = page_zone(page);
2388
-
2389
- spin_lock_irq(zone_lru_lock(zone));
2390
- if (PageLRU(page)) {
2391
- struct lruvec *lruvec;
2392
-
2393
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2394
- ClearPageLRU(page);
2395
- del_page_from_lru_list(page, lruvec, page_lru(page));
2396
- *isolated = 1;
2397
- } else
2398
- *isolated = 0;
2399
-}
2400
-
2401
-static void unlock_page_lru(struct page *page, int isolated)
2402
-{
2403
- struct zone *zone = page_zone(page);
2404
-
2405
- if (isolated) {
2406
- struct lruvec *lruvec;
2407
-
2408
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2409
- VM_BUG_ON_PAGE(PageLRU(page), page);
2410
- SetPageLRU(page);
2411
- add_page_to_lru_list(page, lruvec, page_lru(page));
2412
- }
2413
- spin_unlock_irq(zone_lru_lock(zone));
2414
-}
2415
-
2416
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2417
- bool lrucare)
2418
-{
2419
- int isolated;
2420
-
24212913 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2422
-
24232914 /*
2424
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2425
- * may already be on some other mem_cgroup's LRU. Take care of it.
2426
- */
2427
- if (lrucare)
2428
- lock_page_lru(page, &isolated);
2429
-
2430
- /*
2431
- * Nobody should be changing or seriously looking at
2432
- * page->mem_cgroup at this point:
2915
+ * Any of the following ensures page->mem_cgroup stability:
24332916 *
2434
- * - the page is uncharged
2435
- *
2436
- * - the page is off-LRU
2437
- *
2438
- * - an anonymous fault has exclusive page access, except for
2439
- * a locked page table
2440
- *
2441
- * - a page cache insertion, a swapin fault, or a migration
2442
- * have the page locked
2917
+ * - the page lock
2918
+ * - LRU isolation
2919
+ * - lock_page_memcg()
2920
+ * - exclusive reference
24432921 */
24442922 page->mem_cgroup = memcg;
2445
-
2446
- if (lrucare)
2447
- unlock_page_lru(page, isolated);
24482923 }
24492924
24502925 #ifdef CONFIG_MEMCG_KMEM
2926
+/*
2927
+ * The allocated objcg pointers array is not accounted directly.
2928
+ * Moreover, it should not come from a DMA buffer and is not readily
2929
+ * reclaimable. So those GFP bits should be masked off.
2930
+ */
2931
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2932
+
2933
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2934
+ gfp_t gfp)
2935
+{
2936
+ unsigned int objects = objs_per_slab_page(s, page);
2937
+ void *vec;
2938
+
2939
+ gfp &= ~OBJCGS_CLEAR_MASK;
2940
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2941
+ page_to_nid(page));
2942
+ if (!vec)
2943
+ return -ENOMEM;
2944
+
2945
+ if (cmpxchg(&page->obj_cgroups, NULL,
2946
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2947
+ kfree(vec);
2948
+ else
2949
+ kmemleak_not_leak(vec);
2950
+
2951
+ return 0;
2952
+}
2953
+
2954
+/*
2955
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2956
+ *
2957
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2958
+ * cgroup_mutex, etc.
2959
+ */
2960
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2961
+{
2962
+ struct page *page;
2963
+
2964
+ if (mem_cgroup_disabled())
2965
+ return NULL;
2966
+
2967
+ page = virt_to_head_page(p);
2968
+
2969
+ /*
2970
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2971
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2972
+ * bit of the pointer is set.
2973
+ * The page->mem_cgroup pointer can be asynchronously changed
2974
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2975
+ * from a valid memcg pointer to objcg vector or back.
2976
+ */
2977
+ if (!page->mem_cgroup)
2978
+ return NULL;
2979
+
2980
+ /*
2981
+ * Slab objects are accounted individually, not per-page.
2982
+ * Memcg membership data for each individual object is saved in
2983
+ * the page->obj_cgroups.
2984
+ */
2985
+ if (page_has_obj_cgroups(page)) {
2986
+ struct obj_cgroup *objcg;
2987
+ unsigned int off;
2988
+
2989
+ off = obj_to_index(page->slab_cache, page, p);
2990
+ objcg = page_obj_cgroups(page)[off];
2991
+ if (objcg)
2992
+ return obj_cgroup_memcg(objcg);
2993
+
2994
+ return NULL;
2995
+ }
2996
+
2997
+ /* All other pages use page->mem_cgroup */
2998
+ return page->mem_cgroup;
2999
+}
3000
+
3001
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3002
+{
3003
+ struct obj_cgroup *objcg = NULL;
3004
+ struct mem_cgroup *memcg;
3005
+
3006
+ if (memcg_kmem_bypass())
3007
+ return NULL;
3008
+
3009
+ rcu_read_lock();
3010
+ if (unlikely(active_memcg()))
3011
+ memcg = active_memcg();
3012
+ else
3013
+ memcg = mem_cgroup_from_task(current);
3014
+
3015
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3016
+ objcg = rcu_dereference(memcg->objcg);
3017
+ if (objcg && obj_cgroup_tryget(objcg))
3018
+ break;
3019
+ objcg = NULL;
3020
+ }
3021
+ rcu_read_unlock();
3022
+
3023
+ return objcg;
3024
+}
3025
+
24513026 static int memcg_alloc_cache_id(void)
24523027 {
24533028 int id, size;
....@@ -2473,9 +3048,7 @@
24733048 else if (size > MEMCG_CACHES_MAX_SIZE)
24743049 size = MEMCG_CACHES_MAX_SIZE;
24753050
2476
- err = memcg_update_all_caches(size);
2477
- if (!err)
2478
- err = memcg_update_all_list_lrus(size);
3051
+ err = memcg_update_all_list_lrus(size);
24793052 if (!err)
24803053 memcg_nr_cache_ids = size;
24813054
....@@ -2493,152 +3066,17 @@
24933066 ida_simple_remove(&memcg_cache_ida, id);
24943067 }
24953068
2496
-struct memcg_kmem_cache_create_work {
2497
- struct mem_cgroup *memcg;
2498
- struct kmem_cache *cachep;
2499
- struct work_struct work;
2500
-};
2501
-
2502
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2503
-{
2504
- struct memcg_kmem_cache_create_work *cw =
2505
- container_of(w, struct memcg_kmem_cache_create_work, work);
2506
- struct mem_cgroup *memcg = cw->memcg;
2507
- struct kmem_cache *cachep = cw->cachep;
2508
-
2509
- memcg_create_kmem_cache(memcg, cachep);
2510
-
2511
- css_put(&memcg->css);
2512
- kfree(cw);
2513
-}
2514
-
2515
-/*
2516
- * Enqueue the creation of a per-memcg kmem_cache.
2517
- */
2518
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2519
- struct kmem_cache *cachep)
2520
-{
2521
- struct memcg_kmem_cache_create_work *cw;
2522
-
2523
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2524
- if (!cw)
2525
- return;
2526
-
2527
- css_get(&memcg->css);
2528
-
2529
- cw->memcg = memcg;
2530
- cw->cachep = cachep;
2531
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2532
-
2533
- queue_work(memcg_kmem_cache_wq, &cw->work);
2534
-}
2535
-
2536
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2537
- struct kmem_cache *cachep)
2538
-{
2539
- /*
2540
- * We need to stop accounting when we kmalloc, because if the
2541
- * corresponding kmalloc cache is not yet created, the first allocation
2542
- * in __memcg_schedule_kmem_cache_create will recurse.
2543
- *
2544
- * However, it is better to enclose the whole function. Depending on
2545
- * the debugging options enabled, INIT_WORK(), for instance, can
2546
- * trigger an allocation. This too, will make us recurse. Because at
2547
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2548
- * the safest choice is to do it like this, wrapping the whole function.
2549
- */
2550
- current->memcg_kmem_skip_account = 1;
2551
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2552
- current->memcg_kmem_skip_account = 0;
2553
-}
2554
-
2555
-static inline bool memcg_kmem_bypass(void)
2556
-{
2557
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2558
- return true;
2559
- return false;
2560
-}
2561
-
25623069 /**
2563
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2564
- * @cachep: the original global kmem cache
2565
- *
2566
- * Return the kmem_cache we're supposed to use for a slab allocation.
2567
- * We try to use the current memcg's version of the cache.
2568
- *
2569
- * If the cache does not exist yet, if we are the first user of it, we
2570
- * create it asynchronously in a workqueue and let the current allocation
2571
- * go through with the original cache.
2572
- *
2573
- * This function takes a reference to the cache it returns to assure it
2574
- * won't get destroyed while we are working with it. Once the caller is
2575
- * done with it, memcg_kmem_put_cache() must be called to release the
2576
- * reference.
2577
- */
2578
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2579
-{
2580
- struct mem_cgroup *memcg;
2581
- struct kmem_cache *memcg_cachep;
2582
- int kmemcg_id;
2583
-
2584
- VM_BUG_ON(!is_root_cache(cachep));
2585
-
2586
- if (memcg_kmem_bypass())
2587
- return cachep;
2588
-
2589
- if (current->memcg_kmem_skip_account)
2590
- return cachep;
2591
-
2592
- memcg = get_mem_cgroup_from_current();
2593
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2594
- if (kmemcg_id < 0)
2595
- goto out;
2596
-
2597
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2598
- if (likely(memcg_cachep))
2599
- return memcg_cachep;
2600
-
2601
- /*
2602
- * If we are in a safe context (can wait, and not in interrupt
2603
- * context), we could be be predictable and return right away.
2604
- * This would guarantee that the allocation being performed
2605
- * already belongs in the new cache.
2606
- *
2607
- * However, there are some clashes that can arrive from locking.
2608
- * For instance, because we acquire the slab_mutex while doing
2609
- * memcg_create_kmem_cache, this means no further allocation
2610
- * could happen with the slab_mutex held. So it's better to
2611
- * defer everything.
2612
- */
2613
- memcg_schedule_kmem_cache_create(memcg, cachep);
2614
-out:
2615
- css_put(&memcg->css);
2616
- return cachep;
2617
-}
2618
-
2619
-/**
2620
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2621
- * @cachep: the cache returned by memcg_kmem_get_cache
2622
- */
2623
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2624
-{
2625
- if (!is_root_cache(cachep))
2626
- css_put(&cachep->memcg_params.memcg->css);
2627
-}
2628
-
2629
-/**
2630
- * memcg_kmem_charge_memcg: charge a kmem page
2631
- * @page: page to charge
2632
- * @gfp: reclaim mode
2633
- * @order: allocation order
3070
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26343071 * @memcg: memory cgroup to charge
3072
+ * @gfp: reclaim mode
3073
+ * @nr_pages: number of pages to charge
26353074 *
26363075 * Returns 0 on success, an error code on failure.
26373076 */
2638
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2639
- struct mem_cgroup *memcg)
3077
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3078
+ unsigned int nr_pages)
26403079 {
2641
- unsigned int nr_pages = 1 << order;
26423080 struct page_counter *counter;
26433081 int ret;
26443082
....@@ -2661,43 +3099,54 @@
26613099 cancel_charge(memcg, nr_pages);
26623100 return -ENOMEM;
26633101 }
2664
-
2665
- page->mem_cgroup = memcg;
2666
-
26673102 return 0;
26683103 }
26693104
26703105 /**
2671
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3106
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3107
+ * @memcg: memcg to uncharge
3108
+ * @nr_pages: number of pages to uncharge
3109
+ */
3110
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3111
+{
3112
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3113
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3114
+
3115
+ refill_stock(memcg, nr_pages);
3116
+}
3117
+
3118
+/**
3119
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26723120 * @page: page to charge
26733121 * @gfp: reclaim mode
26743122 * @order: allocation order
26753123 *
26763124 * Returns 0 on success, an error code on failure.
26773125 */
2678
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3126
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26793127 {
26803128 struct mem_cgroup *memcg;
26813129 int ret = 0;
26823130
2683
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2684
- return 0;
2685
-
26863131 memcg = get_mem_cgroup_from_current();
2687
- if (!mem_cgroup_is_root(memcg)) {
2688
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2689
- if (!ret)
3132
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3133
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3134
+ if (!ret) {
3135
+ page->mem_cgroup = memcg;
26903136 __SetPageKmemcg(page);
3137
+ return 0;
3138
+ }
3139
+ css_put(&memcg->css);
26913140 }
2692
- css_put(&memcg->css);
26933141 return ret;
26943142 }
3143
+
26953144 /**
2696
- * memcg_kmem_uncharge: uncharge a kmem page
3145
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
26973146 * @page: page to uncharge
26983147 * @order: allocation order
26993148 */
2700
-void memcg_kmem_uncharge(struct page *page, int order)
3149
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27013150 {
27023151 struct mem_cgroup *memcg = page->mem_cgroup;
27033152 unsigned int nr_pages = 1 << order;
....@@ -2706,43 +3155,179 @@
27063155 return;
27073156
27083157 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2709
-
2710
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2711
- page_counter_uncharge(&memcg->kmem, nr_pages);
2712
-
2713
- page_counter_uncharge(&memcg->memory, nr_pages);
2714
- if (do_memsw_account())
2715
- page_counter_uncharge(&memcg->memsw, nr_pages);
2716
-
3158
+ __memcg_kmem_uncharge(memcg, nr_pages);
27173159 page->mem_cgroup = NULL;
3160
+ css_put(&memcg->css);
27183161
27193162 /* slab pages do not have PageKmemcg flag set */
27203163 if (PageKmemcg(page))
27213164 __ClearPageKmemcg(page);
2722
-
2723
- css_put_many(&memcg->css, nr_pages);
27243165 }
2725
-#endif /* CONFIG_MEMCG_KMEM */
27263166
2727
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2728
-
2729
-/*
2730
- * Because tail pages are not marked as "used", set it. We're under
2731
- * zone_lru_lock and migration entries setup in all page mappings.
2732
- */
2733
-void mem_cgroup_split_huge_fixup(struct page *head)
3167
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27343168 {
2735
- int i;
3169
+ struct memcg_stock_pcp *stock;
3170
+ unsigned long flags;
3171
+ bool ret = false;
27363172
2737
- if (mem_cgroup_disabled())
3173
+ local_irq_save(flags);
3174
+
3175
+ stock = this_cpu_ptr(&memcg_stock);
3176
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3177
+ stock->nr_bytes -= nr_bytes;
3178
+ ret = true;
3179
+ }
3180
+
3181
+ local_irq_restore(flags);
3182
+
3183
+ return ret;
3184
+}
3185
+
3186
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3187
+{
3188
+ struct obj_cgroup *old = stock->cached_objcg;
3189
+
3190
+ if (!old)
27383191 return;
27393192
2740
- for (i = 1; i < HPAGE_PMD_NR; i++)
2741
- head[i].mem_cgroup = head->mem_cgroup;
3193
+ if (stock->nr_bytes) {
3194
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3195
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27423196
2743
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3197
+ if (nr_pages) {
3198
+ struct mem_cgroup *memcg;
3199
+
3200
+ rcu_read_lock();
3201
+retry:
3202
+ memcg = obj_cgroup_memcg(old);
3203
+ if (unlikely(!css_tryget(&memcg->css)))
3204
+ goto retry;
3205
+ rcu_read_unlock();
3206
+
3207
+ __memcg_kmem_uncharge(memcg, nr_pages);
3208
+ css_put(&memcg->css);
3209
+ }
3210
+
3211
+ /*
3212
+ * The leftover is flushed to the centralized per-memcg value.
3213
+ * On the next attempt to refill obj stock it will be moved
3214
+	 * to a per-cpu stock (probably, on another CPU), see
3215
+ * refill_obj_stock().
3216
+ *
3217
+ * How often it's flushed is a trade-off between the memory
3218
+ * limit enforcement accuracy and potential CPU contention,
3219
+ * so it might be changed in the future.
3220
+ */
3221
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3222
+ stock->nr_bytes = 0;
3223
+ }
3224
+
3225
+ obj_cgroup_put(old);
3226
+ stock->cached_objcg = NULL;
27443227 }
2745
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3228
+
3229
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3230
+ struct mem_cgroup *root_memcg)
3231
+{
3232
+ struct mem_cgroup *memcg;
3233
+
3234
+ if (stock->cached_objcg) {
3235
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3236
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3237
+ return true;
3238
+ }
3239
+
3240
+ return false;
3241
+}
3242
+
3243
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3244
+{
3245
+ struct memcg_stock_pcp *stock;
3246
+ unsigned long flags;
3247
+
3248
+ local_irq_save(flags);
3249
+
3250
+ stock = this_cpu_ptr(&memcg_stock);
3251
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3252
+ drain_obj_stock(stock);
3253
+ obj_cgroup_get(objcg);
3254
+ stock->cached_objcg = objcg;
3255
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3256
+ }
3257
+ stock->nr_bytes += nr_bytes;
3258
+
3259
+ if (stock->nr_bytes > PAGE_SIZE)
3260
+ drain_obj_stock(stock);
3261
+
3262
+ local_irq_restore(flags);
3263
+}
3264
+
3265
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3266
+{
3267
+ struct mem_cgroup *memcg;
3268
+ unsigned int nr_pages, nr_bytes;
3269
+ int ret;
3270
+
3271
+ if (consume_obj_stock(objcg, size))
3272
+ return 0;
3273
+
3274
+ /*
3275
+ * In theory, memcg->nr_charged_bytes can have enough
3276
+ * pre-charged bytes to satisfy the allocation. However,
3277
+ * flushing memcg->nr_charged_bytes requires two atomic
3278
+ * operations, and memcg->nr_charged_bytes can't be big,
3279
+	 * so it's better to ignore it and try to grab some new pages.
3280
+ * memcg->nr_charged_bytes will be flushed in
3281
+ * refill_obj_stock(), called from this function or
3282
+ * independently later.
3283
+ */
3284
+ rcu_read_lock();
3285
+retry:
3286
+ memcg = obj_cgroup_memcg(objcg);
3287
+ if (unlikely(!css_tryget(&memcg->css)))
3288
+ goto retry;
3289
+ rcu_read_unlock();
3290
+
3291
+ nr_pages = size >> PAGE_SHIFT;
3292
+ nr_bytes = size & (PAGE_SIZE - 1);
3293
+
3294
+ if (nr_bytes)
3295
+ nr_pages += 1;
3296
+
3297
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3298
+ if (!ret && nr_bytes)
3299
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3300
+
3301
+ css_put(&memcg->css);
3302
+ return ret;
3303
+}
3304
+
3305
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3306
+{
3307
+ refill_obj_stock(objcg, size);
3308
+}
3309
+
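The byte-granular charging above rounds each request up to whole pages for the page counters and parks the unused remainder in the per-cpu object stock, so later sub-page charges can bypass the counters entirely. A single-threaded sketch of that bookkeeping follows; locking, stock flushing and error handling are omitted and the names are illustrative.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static unsigned long stock_bytes;	/* bytes pre-charged to the cached objcg */
static unsigned long pages_charged;	/* what would hit the page counters */

static void charge(unsigned long size)
{
	unsigned long nr_pages, nr_bytes;

	if (stock_bytes >= size) {		/* consume_obj_stock() fast path */
		stock_bytes -= size;
		return;
	}
	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);
	if (nr_bytes)
		nr_pages++;			/* round the charge up to whole pages */
	pages_charged += nr_pages;
	if (nr_bytes)				/* keep the remainder for later sub-page charges */
		stock_bytes += PAGE_SIZE - nr_bytes;
}

int main(void)
{
	charge(700);	/* charges 1 page, leaves 3396 bytes in the stock */
	charge(700);	/* served entirely from the stock */
	printf("pages=%lu stock=%lu\n", pages_charged, stock_bytes);	/* pages=1 stock=2696 */
	return 0;
}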
3310
+#endif /* CONFIG_MEMCG_KMEM */
3311
+
3312
+/*
3313
+ * Because head->mem_cgroup is not set on tails, set it now.
3314
+ */
3315
+void split_page_memcg(struct page *head, unsigned int nr)
3316
+{
3317
+ struct mem_cgroup *memcg = head->mem_cgroup;
3318
+ int kmemcg = PageKmemcg(head);
3319
+ int i;
3320
+
3321
+ if (mem_cgroup_disabled() || !memcg)
3322
+ return;
3323
+
3324
+ for (i = 1; i < nr; i++) {
3325
+ head[i].mem_cgroup = memcg;
3326
+ if (kmemcg)
3327
+ __SetPageKmemcg(head + i);
3328
+ }
3329
+ css_get_many(&memcg->css, nr - 1);
3330
+}
27463331
27473332 #ifdef CONFIG_MEMCG_SWAP
27483333 /**
....@@ -2804,7 +3389,7 @@
28043389 * Make sure that the new limit (memsw or memory limit) doesn't
28053390 * break our basic invariant rule memory.max <= memsw.max.
28063391 */
2807
- limits_invariant = memsw ? max >= memcg->memory.max :
3392
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28083393 max <= memcg->memsw.max;
28093394 if (!limits_invariant) {
28103395 mutex_unlock(&memcg_max_mutex);
....@@ -2925,7 +3510,7 @@
29253510 * Test whether @memcg has children, dead or alive. Note that this
29263511 * function doesn't care whether @memcg has use_hierarchy enabled and
29273512 * returns %true if there are child csses according to the cgroup
2928
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3513
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29293514 */
29303515 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29313516 {
....@@ -2944,7 +3529,7 @@
29443529 */
29453530 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29463531 {
2947
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3532
+ int nr_retries = MAX_RECLAIM_RETRIES;
29483533
29493534 /* we call try-to-free pages for make this cgroup empty */
29503535 lru_add_drain_all();
....@@ -3018,50 +3603,15 @@
30183603 return retval;
30193604 }
30203605
3021
-struct accumulated_stats {
3022
- unsigned long stat[MEMCG_NR_STAT];
3023
- unsigned long events[NR_VM_EVENT_ITEMS];
3024
- unsigned long lru_pages[NR_LRU_LISTS];
3025
- const unsigned int *stats_array;
3026
- const unsigned int *events_array;
3027
- int stats_size;
3028
- int events_size;
3029
-};
3030
-
3031
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3032
- struct accumulated_stats *acc)
3033
-{
3034
- struct mem_cgroup *mi;
3035
- int i;
3036
-
3037
- for_each_mem_cgroup_tree(mi, memcg) {
3038
- for (i = 0; i < acc->stats_size; i++)
3039
- acc->stat[i] += memcg_page_state(mi,
3040
- acc->stats_array ? acc->stats_array[i] : i);
3041
-
3042
- for (i = 0; i < acc->events_size; i++)
3043
- acc->events[i] += memcg_sum_events(mi,
3044
- acc->events_array ? acc->events_array[i] : i);
3045
-
3046
- for (i = 0; i < NR_LRU_LISTS; i++)
3047
- acc->lru_pages[i] +=
3048
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3049
- }
3050
-}
3051
-
30523606 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30533607 {
3054
- unsigned long val = 0;
3608
+ unsigned long val;
30553609
30563610 if (mem_cgroup_is_root(memcg)) {
3057
- struct mem_cgroup *iter;
3058
-
3059
- for_each_mem_cgroup_tree(iter, memcg) {
3060
- val += memcg_page_state(iter, MEMCG_CACHE);
3061
- val += memcg_page_state(iter, MEMCG_RSS);
3062
- if (swap)
3063
- val += memcg_page_state(iter, MEMCG_SWAP);
3064
- }
3611
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3612
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3613
+ if (swap)
3614
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30653615 } else {
30663616 if (!swap)
30673617 val = page_counter_read(&memcg->memory);
....@@ -3122,9 +3672,61 @@
31223672 }
31233673 }
31243674
3675
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3676
+{
3677
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3678
+ struct mem_cgroup *mi;
3679
+ int node, cpu, i;
3680
+
3681
+ for_each_online_cpu(cpu)
3682
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3683
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3684
+
3685
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3686
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3687
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3688
+
3689
+ for_each_node(node) {
3690
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3691
+ struct mem_cgroup_per_node *pi;
3692
+
3693
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3694
+ stat[i] = 0;
3695
+
3696
+ for_each_online_cpu(cpu)
3697
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3698
+ stat[i] += per_cpu(
3699
+ pn->lruvec_stat_cpu->count[i], cpu);
3700
+
3701
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3702
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3703
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3704
+ }
3705
+}
3706
+
3707
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3708
+{
3709
+ unsigned long events[NR_VM_EVENT_ITEMS];
3710
+ struct mem_cgroup *mi;
3711
+ int cpu, i;
3712
+
3713
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3714
+ events[i] = 0;
3715
+
3716
+ for_each_online_cpu(cpu)
3717
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3718
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3719
+ cpu);
3720
+
3721
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3722
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3723
+ atomic_long_add(events[i], &mi->vmevents[i]);
3724
+}
3725
+
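The two flush helpers above fold a dying memcg's per-cpu deltas into its own totals and into those of every ancestor, so nothing is lost once the per-cpu structures are freed. A toy model of that upward propagation, using made-up structures and no atomics:

#include <stdio.h>

#define NR_CPUS		4
#define NR_EVENTS	3

struct toy_memcg {
	struct toy_memcg *parent;
	long percpu[NR_CPUS][NR_EVENTS];	/* per-cpu deltas not yet folded in */
	long vmevents[NR_EVENTS];		/* hierarchical totals */
};

/* Sum the per-cpu deltas once, then add them at every level up the tree. */
static void flush_percpu_vmevents(struct toy_memcg *memcg)
{
	long events[NR_EVENTS] = {0};
	struct toy_memcg *mi;
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_EVENTS; i++)
			events[i] += memcg->percpu[cpu][i];

	for (mi = memcg; mi; mi = mi->parent)
		for (i = 0; i < NR_EVENTS; i++)
			mi->vmevents[i] += events[i];
}

int main(void)
{
	struct toy_memcg root = {0}, child = { .parent = &root };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	flush_percpu_vmevents(&child);
	printf("child=%ld root=%ld\n", child.vmevents[0], root.vmevents[0]);	/* 12 12 */
	return 0;
}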
31253726 #ifdef CONFIG_MEMCG_KMEM
31263727 static int memcg_online_kmem(struct mem_cgroup *memcg)
31273728 {
3729
+ struct obj_cgroup *objcg;
31283730 int memcg_id;
31293731
31303732 if (cgroup_memory_nokmem)
....@@ -3137,7 +3739,16 @@
31373739 if (memcg_id < 0)
31383740 return memcg_id;
31393741
3140
- static_branch_inc(&memcg_kmem_enabled_key);
3742
+ objcg = obj_cgroup_alloc();
3743
+ if (!objcg) {
3744
+ memcg_free_cache_id(memcg_id);
3745
+ return -ENOMEM;
3746
+ }
3747
+ objcg->memcg = memcg;
3748
+ rcu_assign_pointer(memcg->objcg, objcg);
3749
+
3750
+ static_branch_enable(&memcg_kmem_enabled_key);
3751
+
31413752 /*
31423753 * A memory cgroup is considered kmem-online as soon as it gets
31433754 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3146,7 +3757,6 @@
31463757 */
31473758 memcg->kmemcg_id = memcg_id;
31483759 memcg->kmem_state = KMEM_ONLINE;
3149
- INIT_LIST_HEAD(&memcg->kmem_caches);
31503760
31513761 return 0;
31523762 }
....@@ -3159,22 +3769,17 @@
31593769
31603770 if (memcg->kmem_state != KMEM_ONLINE)
31613771 return;
3162
- /*
3163
- * Clear the online state before clearing memcg_caches array
3164
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3165
- * guarantees that no cache will be created for this cgroup
3166
- * after we are done (see memcg_create_kmem_cache()).
3167
- */
3772
+
31683773 memcg->kmem_state = KMEM_ALLOCATED;
3169
-
3170
- memcg_deactivate_kmem_caches(memcg);
3171
-
3172
- kmemcg_id = memcg->kmemcg_id;
3173
- BUG_ON(kmemcg_id < 0);
31743774
31753775 parent = parent_mem_cgroup(memcg);
31763776 if (!parent)
31773777 parent = root_mem_cgroup;
3778
+
3779
+ memcg_reparent_objcgs(memcg, parent);
3780
+
3781
+ kmemcg_id = memcg->kmemcg_id;
3782
+ BUG_ON(kmemcg_id < 0);
31783783
31793784 /*
31803785 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3204,12 +3809,6 @@
32043809 /* css_alloc() failed, offlining didn't happen */
32053810 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32063811 memcg_offline_kmem(memcg);
3207
-
3208
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3209
- memcg_destroy_kmem_caches(memcg);
3210
- static_branch_dec(&memcg_kmem_enabled_key);
3211
- WARN_ON(page_counter_read(&memcg->kmem));
3212
- }
32133812 }
32143813 #else
32153814 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3300,6 +3899,9 @@
33003899 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33013900 break;
33023901 case _KMEM:
3902
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3903
+ "Please report your usecase to linux-mm@kvack.org if you "
3904
+ "depend on this functionality.\n");
33033905 ret = memcg_update_kmem_max(memcg, nr_pages);
33043906 break;
33053907 case _TCP:
....@@ -3364,6 +3966,10 @@
33643966 {
33653967 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
33663968
3969
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3970
+ "Please report your usecase to linux-mm@kvack.org if you "
3971
+ "depend on this functionality.\n");
3972
+
33673973 if (val & ~MOVE_MASK)
33683974 return -EINVAL;
33693975
....@@ -3385,6 +3991,49 @@
33853991 #endif
33863992
33873993 #ifdef CONFIG_NUMA
3994
+
3995
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3996
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3997
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3998
+
3999
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4000
+ int nid, unsigned int lru_mask, bool tree)
4001
+{
4002
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4003
+ unsigned long nr = 0;
4004
+ enum lru_list lru;
4005
+
4006
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4007
+
4008
+ for_each_lru(lru) {
4009
+ if (!(BIT(lru) & lru_mask))
4010
+ continue;
4011
+ if (tree)
4012
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4013
+ else
4014
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4015
+ }
4016
+ return nr;
4017
+}
4018
+
4019
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4020
+ unsigned int lru_mask,
4021
+ bool tree)
4022
+{
4023
+ unsigned long nr = 0;
4024
+ enum lru_list lru;
4025
+
4026
+ for_each_lru(lru) {
4027
+ if (!(BIT(lru) & lru_mask))
4028
+ continue;
4029
+ if (tree)
4030
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4031
+ else
4032
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4033
+ }
4034
+ return nr;
4035
+}
4036
+
33884037 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33894038 {
33904039 struct numa_stat {
....@@ -3400,40 +4049,60 @@
34004049 };
34014050 const struct numa_stat *stat;
34024051 int nid;
3403
- unsigned long nr;
3404
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4052
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34054053
34064054 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3407
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3408
- seq_printf(m, "%s=%lu", stat->name, nr);
3409
- for_each_node_state(nid, N_MEMORY) {
3410
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3411
- stat->lru_mask);
3412
- seq_printf(m, " N%d=%lu", nid, nr);
3413
- }
4055
+ seq_printf(m, "%s=%lu", stat->name,
4056
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4057
+ false));
4058
+ for_each_node_state(nid, N_MEMORY)
4059
+ seq_printf(m, " N%d=%lu", nid,
4060
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4061
+ stat->lru_mask, false));
34144062 seq_putc(m, '\n');
34154063 }
34164064
34174065 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3418
- struct mem_cgroup *iter;
34194066
3420
- nr = 0;
3421
- for_each_mem_cgroup_tree(iter, memcg)
3422
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3423
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3424
- for_each_node_state(nid, N_MEMORY) {
3425
- nr = 0;
3426
- for_each_mem_cgroup_tree(iter, memcg)
3427
- nr += mem_cgroup_node_nr_lru_pages(
3428
- iter, nid, stat->lru_mask);
3429
- seq_printf(m, " N%d=%lu", nid, nr);
3430
- }
4067
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4068
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4069
+ true));
4070
+ for_each_node_state(nid, N_MEMORY)
4071
+ seq_printf(m, " N%d=%lu", nid,
4072
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4073
+ stat->lru_mask, true));
34314074 seq_putc(m, '\n');
34324075 }
34334076
34344077 return 0;
34354078 }
34364079 #endif /* CONFIG_NUMA */
4080
+
4081
+static const unsigned int memcg1_stats[] = {
4082
+ NR_FILE_PAGES,
4083
+ NR_ANON_MAPPED,
4084
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4085
+ NR_ANON_THPS,
4086
+#endif
4087
+ NR_SHMEM,
4088
+ NR_FILE_MAPPED,
4089
+ NR_FILE_DIRTY,
4090
+ NR_WRITEBACK,
4091
+ MEMCG_SWAP,
4092
+};
4093
+
4094
+static const char *const memcg1_stat_names[] = {
4095
+ "cache",
4096
+ "rss",
4097
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4098
+ "rss_huge",
4099
+#endif
4100
+ "shmem",
4101
+ "mapped_file",
4102
+ "dirty",
4103
+ "writeback",
4104
+ "swap",
4105
+};
34374106
34384107 /* Universal VM events cgroup1 shows, original sort order */
34394108 static const unsigned int memcg1_events[] = {
....@@ -3443,45 +4112,42 @@
34434112 PGMAJFAULT,
34444113 };
34454114
3446
-static const char *const memcg1_event_names[] = {
3447
- "pgpgin",
3448
- "pgpgout",
3449
- "pgfault",
3450
- "pgmajfault",
3451
-};
3452
-
34534115 static int memcg_stat_show(struct seq_file *m, void *v)
34544116 {
3455
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4117
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34564118 unsigned long memory, memsw;
34574119 struct mem_cgroup *mi;
34584120 unsigned int i;
3459
- struct accumulated_stats acc;
34604121
34614122 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3462
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34634123
34644124 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4125
+ unsigned long nr;
4126
+
34654127 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34664128 continue;
3467
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3468
- memcg_page_state(memcg, memcg1_stats[i]) *
3469
- PAGE_SIZE);
4129
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4130
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4131
+ if (memcg1_stats[i] == NR_ANON_THPS)
4132
+ nr *= HPAGE_PMD_NR;
4133
+#endif
4134
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34704135 }
34714136
34724137 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3473
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3474
- memcg_sum_events(memcg, memcg1_events[i]));
4138
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4139
+ memcg_events_local(memcg, memcg1_events[i]));
34754140
34764141 for (i = 0; i < NR_LRU_LISTS; i++)
3477
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3478
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4142
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4143
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4144
+ PAGE_SIZE);
34794145
34804146 /* Hierarchical information */
34814147 memory = memsw = PAGE_COUNTER_MAX;
34824148 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3483
- memory = min(memory, mi->memory.max);
3484
- memsw = min(memsw, mi->memsw.max);
4149
+ memory = min(memory, READ_ONCE(mi->memory.max));
4150
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34854151 }
34864152 seq_printf(m, "hierarchical_memory_limit %llu\n",
34874153 (u64)memory * PAGE_SIZE);
....@@ -3489,49 +4155,45 @@
34894155 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34904156 (u64)memsw * PAGE_SIZE);
34914157
3492
- memset(&acc, 0, sizeof(acc));
3493
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3494
- acc.stats_array = memcg1_stats;
3495
- acc.events_size = ARRAY_SIZE(memcg1_events);
3496
- acc.events_array = memcg1_events;
3497
- accumulate_memcg_tree(memcg, &acc);
3498
-
34994158 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4159
+ unsigned long nr;
4160
+
35004161 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35014162 continue;
4163
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4164
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4165
+ if (memcg1_stats[i] == NR_ANON_THPS)
4166
+ nr *= HPAGE_PMD_NR;
4167
+#endif
35024168 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3503
- (u64)acc.stat[i] * PAGE_SIZE);
4169
+ (u64)nr * PAGE_SIZE);
35044170 }
35054171
35064172 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3507
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3508
- (u64)acc.events[i]);
4173
+ seq_printf(m, "total_%s %llu\n",
4174
+ vm_event_name(memcg1_events[i]),
4175
+ (u64)memcg_events(memcg, memcg1_events[i]));
35094176
35104177 for (i = 0; i < NR_LRU_LISTS; i++)
3511
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3512
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4178
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4179
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4180
+ PAGE_SIZE);
35134181
35144182 #ifdef CONFIG_DEBUG_VM
35154183 {
35164184 pg_data_t *pgdat;
35174185 struct mem_cgroup_per_node *mz;
3518
- struct zone_reclaim_stat *rstat;
3519
- unsigned long recent_rotated[2] = {0, 0};
3520
- unsigned long recent_scanned[2] = {0, 0};
4186
+ unsigned long anon_cost = 0;
4187
+ unsigned long file_cost = 0;
35214188
35224189 for_each_online_pgdat(pgdat) {
35234190 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3524
- rstat = &mz->lruvec.reclaim_stat;
35254191
3526
- recent_rotated[0] += rstat->recent_rotated[0];
3527
- recent_rotated[1] += rstat->recent_rotated[1];
3528
- recent_scanned[0] += rstat->recent_scanned[0];
3529
- recent_scanned[1] += rstat->recent_scanned[1];
4192
+ anon_cost += mz->lruvec.anon_cost;
4193
+ file_cost += mz->lruvec.file_cost;
35304194 }
3531
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3532
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3533
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3534
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4195
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4196
+ seq_printf(m, "file_cost %lu\n", file_cost);
35354197 }
35364198 #endif
35374199
....@@ -3551,7 +4213,7 @@
35514213 {
35524214 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
35534215
3554
- if (val > 100)
4216
+ if (val > 200)
35554217 return -EINVAL;
35564218
35574219 if (css->parent)
....@@ -3690,8 +4352,7 @@
36904352 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36914353
36924354 /* Allocate memory for new array of thresholds */
3693
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3694
- GFP_KERNEL);
4355
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36954356 if (!new) {
36964357 ret = -ENOMEM;
36974358 goto unlock;
....@@ -3699,17 +4360,16 @@
36994360 new->size = size;
37004361
37014362 /* Copy thresholds (if any) to new array */
3702
- if (thresholds->primary) {
3703
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3704
- sizeof(struct mem_cgroup_threshold));
3705
- }
4363
+ if (thresholds->primary)
4364
+ memcpy(new->entries, thresholds->primary->entries,
4365
+ flex_array_size(new, entries, size - 1));
37064366
37074367 /* Add new threshold */
37084368 new->entries[size - 1].eventfd = eventfd;
37094369 new->entries[size - 1].threshold = threshold;
37104370
37114371 /* Sort thresholds. Registering of new threshold isn't time-critical */
3712
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4372
+ sort(new->entries, size, sizeof(*new->entries),
37134373 compare_thresholds, NULL);
37144374
37154375 /* Find current threshold */
....@@ -3891,7 +4551,7 @@
38914551
38924552 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38934553 {
3894
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4554
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38954555
38964556 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
38974557 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3917,6 +4577,8 @@
39174577 }
39184578
39194579 #ifdef CONFIG_CGROUP_WRITEBACK
4580
+
4581
+#include <trace/events/writeback.h>
39204582
39214583 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39224584 {
....@@ -3949,11 +4611,11 @@
39494611 */
39504612 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39514613 {
3952
- long x = atomic_long_read(&memcg->stat[idx]);
4614
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39534615 int cpu;
39544616
39554617 for_each_online_cpu(cpu)
3956
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4618
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39574619 if (x < 0)
39584620 x = 0;
39594621 return x;
....@@ -3986,18 +4648,142 @@
39864648
39874649 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39884650
3989
- /* this should eventually include NR_UNSTABLE_NFS */
39904651 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3991
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3992
- (1 << LRU_ACTIVE_FILE));
4652
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4653
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39934654 *pheadroom = PAGE_COUNTER_MAX;
39944655
39954656 while ((parent = parent_mem_cgroup(memcg))) {
3996
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4657
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4658
+ READ_ONCE(memcg->memory.high));
39974659 unsigned long used = page_counter_read(&memcg->memory);
39984660
39994661 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40004662 memcg = parent;
4663
+ }
4664
+}
4665
+
4666
+/*
4667
+ * Foreign dirty flushing
4668
+ *
4669
+ * There's an inherent mismatch between memcg and writeback. The former
4670
+ * trackes ownership per-page while the latter per-inode. This was a
4671
+ * deliberate design decision because honoring per-page ownership in the
4672
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4673
+ * and deemed unnecessary given that write-sharing an inode across
4674
+ * different cgroups isn't a common use-case.
4675
+ *
4676
+ * Combined with inode majority-writer ownership switching, this works well
4677
+ * enough in most cases but there are some pathological cases. For
4678
+ * example, let's say there are two cgroups A and B which keep writing to
4679
+ * different but confined parts of the same inode. B owns the inode and
4680
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4681
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4682
+ * triggering background writeback. A will be slowed down without a way to
4683
+ * make writeback of the dirty pages happen.
4684
+ *
4685
+ * Conditions like the above can lead to a cgroup getting repatedly and
4686
+ * severely throttled after making some progress after each
4687
+ * dirty_expire_interval while the underyling IO device is almost
4688
+ * completely idle.
4689
+ *
4690
+ * Solving this problem completely requires matching the ownership tracking
4691
+ * granularities between memcg and writeback in either direction. However,
4692
+ * the more egregious behaviors can be avoided by simply remembering the
4693
+ * most recent foreign dirtying events and initiating remote flushes on
4694
+ * them when local writeback isn't enough to keep the memory clean enough.
4695
+ *
4696
+ * The following two functions implement such mechanism. When a foreign
4697
+ * page - a page whose memcg and writeback ownerships don't match - is
4698
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4699
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4700
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4701
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4702
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4703
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4704
+ * limited to MEMCG_CGWB_FRN_CNT.
4705
+ *
4706
+ * The mechanism only remembers IDs and doesn't hold any object references.
4707
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4708
+ * records are lockless and racy.
4709
+ */
4710
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4711
+ struct bdi_writeback *wb)
4712
+{
4713
+ struct mem_cgroup *memcg = page->mem_cgroup;
4714
+ struct memcg_cgwb_frn *frn;
4715
+ u64 now = get_jiffies_64();
4716
+ u64 oldest_at = now;
4717
+ int oldest = -1;
4718
+ int i;
4719
+
4720
+ trace_track_foreign_dirty(page, wb);
4721
+
4722
+ /*
4723
+ * Pick the slot to use. If there is already a slot for @wb, keep
4724
+ * using it. If not replace the oldest one which isn't being
4725
+ * written out.
4726
+ */
4727
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4728
+ frn = &memcg->cgwb_frn[i];
4729
+ if (frn->bdi_id == wb->bdi->id &&
4730
+ frn->memcg_id == wb->memcg_css->id)
4731
+ break;
4732
+ if (time_before64(frn->at, oldest_at) &&
4733
+ atomic_read(&frn->done.cnt) == 1) {
4734
+ oldest = i;
4735
+ oldest_at = frn->at;
4736
+ }
4737
+ }
4738
+
4739
+ if (i < MEMCG_CGWB_FRN_CNT) {
4740
+ /*
4741
+ * Re-using an existing one. Update timestamp lazily to
4742
+ * avoid making the cacheline hot. We want them to be
4743
+ * reasonably up-to-date and significantly shorter than
4744
+ * dirty_expire_interval as that's what expires the record.
4745
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4746
+ */
4747
+ unsigned long update_intv =
4748
+ min_t(unsigned long, HZ,
4749
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4750
+
4751
+ if (time_before64(frn->at, now - update_intv))
4752
+ frn->at = now;
4753
+ } else if (oldest >= 0) {
4754
+ /* replace the oldest free one */
4755
+ frn = &memcg->cgwb_frn[oldest];
4756
+ frn->bdi_id = wb->bdi->id;
4757
+ frn->memcg_id = wb->memcg_css->id;
4758
+ frn->at = now;
4759
+ }
4760
+}
4761
+
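At common defaults the lazy refresh above works out to roughly once a second: dirty_expire_interval is kept in centiseconds, so the usual 30-second default gives msecs_to_jiffies(30000) / 8, about 3.75 s, and the min_t() picks HZ instead. A small sketch of that arithmetic; HZ and the default interval are assumptions here.

#include <stdio.h>

#define HZ	250UL	/* assumed; any common CONFIG_HZ gives the same 1 s result */

static unsigned long msecs_to_jiffies_model(unsigned long ms)
{
	return ms * HZ / 1000;	/* close enough for this illustration */
}

int main(void)
{
	unsigned long dirty_expire_interval = 3000;	/* centisecs, i.e. the usual 30 s */
	unsigned long update_intv;

	/* mirrors the min_t() above: shorter of 1 s and dirty_expire_interval / 8 */
	update_intv = msecs_to_jiffies_model(dirty_expire_interval * 10) / 8;
	if (update_intv > HZ)
		update_intv = HZ;

	printf("refresh at most every %lu jiffies (%lu ms)\n",
	       update_intv, update_intv * 1000 / HZ);	/* 250 jiffies, 1000 ms */
	return 0;
}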
4762
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4763
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4764
+{
4765
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4766
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4767
+ u64 now = jiffies_64;
4768
+ int i;
4769
+
4770
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4771
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4772
+
4773
+ /*
4774
+ * If the record is older than dirty_expire_interval,
4775
+ * writeback on it has already started. No need to kick it
4776
+ * off again. Also, don't start a new one if there's
4777
+ * already one in flight.
4778
+ */
4779
+ if (time_after64(frn->at, now - intv) &&
4780
+ atomic_read(&frn->done.cnt) == 1) {
4781
+ frn->at = 0;
4782
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4783
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4784
+ WB_REASON_FOREIGN_FLUSH,
4785
+ &frn->done);
4786
+ }
40014787 }
40024788 }
40034789
....@@ -4120,6 +4906,7 @@
41204906 unsigned int efd, cfd;
41214907 struct fd efile;
41224908 struct fd cfile;
4909
+ struct dentry *cdentry;
41234910 const char *name;
41244911 char *endp;
41254912 int ret;
....@@ -4171,6 +4958,16 @@
41714958 goto out_put_cfile;
41724959
41734960 /*
4961
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4962
+ * file can't be renamed, it's safe to access its name afterwards.
4963
+ */
4964
+ cdentry = cfile.file->f_path.dentry;
4965
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4966
+ ret = -EINVAL;
4967
+ goto out_put_cfile;
4968
+ }
4969
+
4970
+ /*
41744971 * Determine the event callbacks and set them in @event. This used
41754972 * to be done via struct cftype but cgroup core no longer knows
41764973 * about these events. The following is crude but the whole thing
....@@ -4178,7 +4975,7 @@
41784975 *
41794976 * DO NOT ADD NEW FILES.
41804977 */
4181
- name = cfile.file->f_path.dentry->d_name.name;
4978
+ name = cdentry->d_name.name;
41824979
41834980 if (!strcmp(name, "memory.usage_in_bytes")) {
41844981 event->register_event = mem_cgroup_usage_register_event;
....@@ -4202,7 +4999,7 @@
42024999 * automatically removed on cgroup destruction but the removal is
42035000 * asynchronous, so take an extra ref on @css.
42045001 */
4205
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5002
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42065003 &memory_cgrp_subsys);
42075004 ret = -EINVAL;
42085005 if (IS_ERR(cfile_css))
....@@ -4337,12 +5134,10 @@
43375134 .write = mem_cgroup_reset,
43385135 .read_u64 = mem_cgroup_read_u64,
43395136 },
4340
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5137
+#if defined(CONFIG_MEMCG_KMEM) && \
5138
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43415139 {
43425140 .name = "kmem.slabinfo",
4343
- .seq_start = memcg_slab_start,
4344
- .seq_next = memcg_slab_next,
4345
- .seq_stop = memcg_slab_stop,
43465141 .seq_show = memcg_slab_show,
43475142 },
43485143 #endif
....@@ -4380,7 +5175,7 @@
43805175 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43815176 * memory-controlled cgroups to 64k.
43825177 *
4383
- * However, there usually are many references to the oflline CSS after
5178
+ * However, there usually are many references to the offline CSS after
43845179 * the cgroup has been destroyed, such as page cache or reclaimable
43855180 * slab objects, that don't need to hang on to the ID. We want to keep
43865181 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4401,31 +5196,26 @@
44015196 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44025197 {
44035198 if (memcg->id.id > 0) {
5199
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44045200 idr_remove(&mem_cgroup_idr, memcg->id.id);
44055201 memcg->id.id = 0;
44065202 }
44075203 }
44085204
4409
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5205
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5206
+ unsigned int n)
44105207 {
4411
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4412
- atomic_add(n, &memcg->id.ref);
5208
+ refcount_add(n, &memcg->id.ref);
44135209 }
44145210
44155211 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44165212 {
4417
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4418
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5213
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44195214 mem_cgroup_id_remove(memcg);
44205215
44215216 /* Memcg ID pins CSS */
44225217 css_put(&memcg->css);
44235218 }
4424
-}
4425
-
4426
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4427
-{
4428
- mem_cgroup_id_get_many(memcg, 1);
44295219 }
44305220
44315221 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4444,6 +5234,7 @@
44445234 WARN_ON_ONCE(!rcu_read_lock_held());
44455235 return idr_find(&mem_cgroup_idr, id);
44465236 }
5237
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44475238
44485239 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44495240 {
....@@ -4463,8 +5254,17 @@
44635254 if (!pn)
44645255 return 1;
44655256
4466
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5257
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5258
+ GFP_KERNEL_ACCOUNT);
5259
+ if (!pn->lruvec_stat_local) {
5260
+ kfree(pn);
5261
+ return 1;
5262
+ }
5263
+
5264
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5265
+ GFP_KERNEL_ACCOUNT);
44675266 if (!pn->lruvec_stat_cpu) {
5267
+ free_percpu(pn->lruvec_stat_local);
44685268 kfree(pn);
44695269 return 1;
44705270 }
....@@ -4486,6 +5286,7 @@
44865286 return;
44875287
44885288 free_percpu(pn->lruvec_stat_cpu);
5289
+ free_percpu(pn->lruvec_stat_local);
44895290 kfree(pn);
44905291 }
44915292
....@@ -4493,39 +5294,57 @@
44935294 {
44945295 int node;
44955296
5297
+ trace_android_vh_mem_cgroup_free(memcg);
44965298 for_each_node(node)
44975299 free_mem_cgroup_per_node_info(memcg, node);
4498
- free_percpu(memcg->stat_cpu);
5300
+ free_percpu(memcg->vmstats_percpu);
5301
+ free_percpu(memcg->vmstats_local);
44995302 kfree(memcg);
45005303 }
45015304
45025305 static void mem_cgroup_free(struct mem_cgroup *memcg)
45035306 {
45045307 memcg_wb_domain_exit(memcg);
5308
+ /*
5309
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5310
+ * on parent's and all ancestor levels.
5311
+ */
5312
+ memcg_flush_percpu_vmstats(memcg);
5313
+ memcg_flush_percpu_vmevents(memcg);
45055314 __mem_cgroup_free(memcg);
45065315 }
45075316
45085317 static struct mem_cgroup *mem_cgroup_alloc(void)
45095318 {
45105319 struct mem_cgroup *memcg;
4511
- size_t size;
5320
+ unsigned int size;
45125321 int node;
5322
+ int __maybe_unused i;
5323
+ long error = -ENOMEM;
45135324
45145325 size = sizeof(struct mem_cgroup);
45155326 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45165327
45175328 memcg = kzalloc(size, GFP_KERNEL);
45185329 if (!memcg)
4519
- return NULL;
5330
+ return ERR_PTR(error);
45205331
45215332 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45225333 1, MEM_CGROUP_ID_MAX,
45235334 GFP_KERNEL);
4524
- if (memcg->id.id < 0)
5335
+ if (memcg->id.id < 0) {
5336
+ error = memcg->id.id;
5337
+ goto fail;
5338
+ }
5339
+
5340
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5341
+ GFP_KERNEL_ACCOUNT);
5342
+ if (!memcg->vmstats_local)
45255343 goto fail;
45265344
4527
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4528
- if (!memcg->stat_cpu)
5345
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5346
+ GFP_KERNEL_ACCOUNT);
5347
+ if (!memcg->vmstats_percpu)
45295348 goto fail;
45305349
45315350 for_each_node(node)
....@@ -4536,7 +5355,6 @@
45365355 goto fail;
45375356
45385357 INIT_WORK(&memcg->high_work, high_work_func);
4539
- memcg->last_scanned_node = MAX_NUMNODES;
45405358 INIT_LIST_HEAD(&memcg->oom_notify);
45415359 mutex_init(&memcg->thresholds_lock);
45425360 spin_lock_init(&memcg->move_lock);
....@@ -4546,48 +5364,64 @@
45465364 memcg->socket_pressure = jiffies;
45475365 #ifdef CONFIG_MEMCG_KMEM
45485366 memcg->kmemcg_id = -1;
5367
+ INIT_LIST_HEAD(&memcg->objcg_list);
45495368 #endif
45505369 #ifdef CONFIG_CGROUP_WRITEBACK
45515370 INIT_LIST_HEAD(&memcg->cgwb_list);
5371
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5372
+ memcg->cgwb_frn[i].done =
5373
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5374
+#endif
5375
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5376
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5377
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5378
+ memcg->deferred_split_queue.split_queue_len = 0;
45525379 #endif
45535380 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5381
+ trace_android_vh_mem_cgroup_alloc(memcg);
45545382 return memcg;
45555383 fail:
45565384 mem_cgroup_id_remove(memcg);
45575385 __mem_cgroup_free(memcg);
4558
- return NULL;
5386
+ return ERR_PTR(error);
45595387 }
45605388
45615389 static struct cgroup_subsys_state * __ref
45625390 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45635391 {
45645392 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4565
- struct mem_cgroup *memcg;
5393
+ struct mem_cgroup *memcg, *old_memcg;
45665394 long error = -ENOMEM;
45675395
5396
+ old_memcg = set_active_memcg(parent);
45685397 memcg = mem_cgroup_alloc();
4569
- if (!memcg)
4570
- return ERR_PTR(error);
5398
+ set_active_memcg(old_memcg);
5399
+ if (IS_ERR(memcg))
5400
+ return ERR_CAST(memcg);
45715401
4572
- memcg->high = PAGE_COUNTER_MAX;
5402
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45735403 memcg->soft_limit = PAGE_COUNTER_MAX;
5404
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45745405 if (parent) {
45755406 memcg->swappiness = mem_cgroup_swappiness(parent);
45765407 memcg->oom_kill_disable = parent->oom_kill_disable;
45775408 }
4578
- if (parent && parent->use_hierarchy) {
5409
+ if (!parent) {
5410
+ page_counter_init(&memcg->memory, NULL);
5411
+ page_counter_init(&memcg->swap, NULL);
5412
+ page_counter_init(&memcg->kmem, NULL);
5413
+ page_counter_init(&memcg->tcpmem, NULL);
5414
+ } else if (parent->use_hierarchy) {
45795415 memcg->use_hierarchy = true;
45805416 page_counter_init(&memcg->memory, &parent->memory);
45815417 page_counter_init(&memcg->swap, &parent->swap);
4582
- page_counter_init(&memcg->memsw, &parent->memsw);
45835418 page_counter_init(&memcg->kmem, &parent->kmem);
45845419 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45855420 } else {
4586
- page_counter_init(&memcg->memory, NULL);
4587
- page_counter_init(&memcg->swap, NULL);
4588
- page_counter_init(&memcg->memsw, NULL);
4589
- page_counter_init(&memcg->kmem, NULL);
4590
- page_counter_init(&memcg->tcpmem, NULL);
5421
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5422
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5423
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5424
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45915425 /*
45925426 	 * Deeper hierarchy with use_hierarchy == false doesn't make
45935427 	 * much sense, so let the cgroup subsystem know about this
....@@ -4614,7 +5448,7 @@
46145448 fail:
46155449 mem_cgroup_id_remove(memcg);
46165450 mem_cgroup_free(memcg);
4617
- return ERR_PTR(-ENOMEM);
5451
+ return ERR_PTR(error);
46185452 }
46195453
46205454 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4632,8 +5466,9 @@
46325466 }
46335467
46345468 /* Online state pins memcg ID, memcg ID pins CSS */
4635
- atomic_set(&memcg->id.ref, 1);
5469
+ refcount_set(&memcg->id.ref, 1);
46365470 css_get(css);
5471
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46375472 return 0;
46385473 }
46395474
....@@ -4642,6 +5477,7 @@
46425477 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46435478 struct mem_cgroup_event *event, *tmp;
46445479
5480
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46455481 /*
46465482 * Unregister events and notify userspace.
46475483 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4660,6 +5496,8 @@
46605496 memcg_offline_kmem(memcg);
46615497 wb_memcg_offline(memcg);
46625498
5499
+ drain_all_stock(memcg);
5500
+
46635501 mem_cgroup_id_put(memcg);
46645502 }
46655503
....@@ -4673,7 +5511,12 @@
46735511 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46745512 {
46755513 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5514
+ int __maybe_unused i;
46765515
5516
+#ifdef CONFIG_CGROUP_WRITEBACK
5517
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5518
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5519
+#endif
46775520 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46785521 static_branch_dec(&memcg_sockets_enabled_key);
46795522
....@@ -4707,13 +5550,13 @@
47075550
47085551 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47095552 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4710
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47115553 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47125554 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47135555 page_counter_set_min(&memcg->memory, 0);
47145556 page_counter_set_low(&memcg->memory, 0);
4715
- memcg->high = PAGE_COUNTER_MAX;
5557
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47165558 memcg->soft_limit = PAGE_COUNTER_MAX;
5559
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47175560 memcg_wb_domain_size_changed(memcg);
47185561 }
47195562
....@@ -4756,7 +5599,7 @@
47565599 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47575600 unsigned long addr, pte_t ptent)
47585601 {
4759
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5602
+ struct page *page = vm_normal_page(vma, addr, ptent);
47605603
47615604 if (!page || !page_mapped(page))
47625605 return NULL;
....@@ -4807,8 +5650,7 @@
48075650 * we call find_get_page() with swapper_space directly.
48085651 */
48095652 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4810
- if (do_memsw_account())
4811
- entry->val = ent.val;
5653
+ entry->val = ent.val;
48125654
48135655 return page;
48145656 }
....@@ -4823,36 +5665,15 @@
48235665 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48245666 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48255667 {
4826
- struct page *page = NULL;
4827
- struct address_space *mapping;
4828
- pgoff_t pgoff;
4829
-
48305668 if (!vma->vm_file) /* anonymous vma */
48315669 return NULL;
48325670 if (!(mc.flags & MOVE_FILE))
48335671 return NULL;
48345672
4835
- mapping = vma->vm_file->f_mapping;
4836
- pgoff = linear_page_index(vma, addr);
4837
-
48385673 /* page is moved even if it's not RSS of this task(page-faulted). */
4839
-#ifdef CONFIG_SWAP
48405674 /* shmem/tmpfs may report page out on swap: account for that too. */
4841
- if (shmem_mapping(mapping)) {
4842
- page = find_get_entry(mapping, pgoff);
4843
- if (radix_tree_exceptional_entry(page)) {
4844
- swp_entry_t swp = radix_to_swp_entry(page);
4845
- if (do_memsw_account())
4846
- *entry = swp;
4847
- page = find_get_page(swap_address_space(swp),
4848
- swp_offset(swp));
4849
- }
4850
- } else
4851
- page = find_get_page(mapping, pgoff);
4852
-#else
4853
- page = find_get_page(mapping, pgoff);
4854
-#endif
4855
- return page;
5675
+ return find_get_incore_page(vma->vm_file->f_mapping,
5676
+ linear_page_index(vma, addr));
48565677 }
48575678
48585679 /**
....@@ -4872,10 +5693,10 @@
48725693 struct mem_cgroup *from,
48735694 struct mem_cgroup *to)
48745695 {
4875
- unsigned long flags;
4876
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5696
+ struct lruvec *from_vec, *to_vec;
5697
+ struct pglist_data *pgdat;
5698
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48775699 int ret;
4878
- bool anon;
48795700
48805701 VM_BUG_ON(from == to);
48815702 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4893,50 +5714,81 @@
48935714 if (page->mem_cgroup != from)
48945715 goto out_unlock;
48955716
4896
- anon = PageAnon(page);
5717
+ pgdat = page_pgdat(page);
5718
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5719
+ to_vec = mem_cgroup_lruvec(to, pgdat);
48975720
4898
- spin_lock_irqsave(&from->move_lock, flags);
5721
+ lock_page_memcg(page);
48995722
4900
- if (!anon && page_mapped(page)) {
4901
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4902
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4903
- }
5723
+ if (PageAnon(page)) {
5724
+ if (page_mapped(page)) {
5725
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5726
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5727
+ if (PageTransHuge(page)) {
5728
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5729
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5730
+ }
49045731
4905
- /*
4906
- * move_lock grabbed above and caller set from->moving_account, so
4907
- * mod_memcg_page_state will serialize updates to PageDirty.
4908
- * So mapping should be stable for dirty pages.
4909
- */
4910
- if (!anon && PageDirty(page)) {
4911
- struct address_space *mapping = page_mapping(page);
5732
+ }
5733
+ } else {
5734
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5735
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49125736
4913
- if (mapping_cap_account_dirty(mapping)) {
4914
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4915
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5737
+ if (PageSwapBacked(page)) {
5738
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5740
+ }
5741
+
5742
+ if (page_mapped(page)) {
5743
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5744
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5745
+ }
5746
+
5747
+ if (PageDirty(page)) {
5748
+ struct address_space *mapping = page_mapping(page);
5749
+
5750
+ if (mapping_can_writeback(mapping)) {
5751
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5752
+ -nr_pages);
5753
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5754
+ nr_pages);
5755
+ }
49165756 }
49175757 }
49185758
49195759 if (PageWriteback(page)) {
4920
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4921
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5760
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5761
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49225762 }
49235763
49245764 /*
5765
+ * All state has been migrated, let's switch to the new memcg.
5766
+ *
49255767 * It is safe to change page->mem_cgroup here because the page
4926
- * is referenced, charged, and isolated - we can't race with
4927
- * uncharging, charging, migration, or LRU putback.
5768
+ * is referenced, charged, isolated, and locked: we can't race
5769
+ * with (un)charging, migration, LRU putback, or anything else
5770
+ * that would rely on a stable page->mem_cgroup.
5771
+ *
5772
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5773
+ * to save space. As soon as we switch page->mem_cgroup to a
5774
+ * new memcg that isn't locked, the above state can change
5775
+ * concurrently again. Make sure we're truly done with it.
49285776 */
5777
+ smp_mb();
49295778
4930
- /* caller should have done css_get */
5779
+ css_get(&to->css);
5780
+ css_put(&from->css);
5781
+
49315782 page->mem_cgroup = to;
4932
- spin_unlock_irqrestore(&from->move_lock, flags);
5783
+
5784
+ __unlock_page_memcg(from);
49335785
49345786 ret = 0;
49355787
49365788 local_irq_disable();
4937
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5789
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49385790 memcg_check_events(to, page);
4939
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5791
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49405792 memcg_check_events(from, page);
49415793 local_irq_enable();
49425794 out_unlock:
....@@ -4960,8 +5812,8 @@
49605812 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49615813 * target for charge migration. if @target is not NULL, the entry is stored
49625814 * in target->ent.
4963
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4964
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5815
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5816
+ * (so ZONE_DEVICE page and thus not on the lru).
49655817 	 * For now such a page is charged like a regular page would be, as for all
49665818 	 * intents and purposes it is just special memory taking the place of a
49675819 * regular page.
....@@ -4995,8 +5847,7 @@
49955847 */
49965848 if (page->mem_cgroup == mc.from) {
49975849 ret = MC_TARGET_PAGE;
4998
- if (is_device_private_page(page) ||
4999
- is_device_public_page(page))
5850
+ if (is_device_private_page(page))
50005851 ret = MC_TARGET_DEVICE;
50015852 if (target)
50025853 target->page = page;
....@@ -5067,8 +5918,8 @@
50675918 if (ptl) {
50685919 /*
50695920 		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5070
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5071
- * MEMORY_DEVICE_PRIVATE but this might change.
5921
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5922
+ * this might change.
50725923 */
50735924 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50745925 mc.precharge += HPAGE_PMD_NR;
....@@ -5088,18 +5939,17 @@
50885939 return 0;
50895940 }
50905941
5942
+static const struct mm_walk_ops precharge_walk_ops = {
5943
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5944
+};
5945
+
50915946 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50925947 {
50935948 unsigned long precharge;
50945949
5095
- struct mm_walk mem_cgroup_count_precharge_walk = {
5096
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5097
- .mm = mm,
5098
- };
5099
- down_read(&mm->mmap_sem);
5100
- walk_page_range(0, mm->highest_vm_end,
5101
- &mem_cgroup_count_precharge_walk);
5102
- up_read(&mm->mmap_sem);
5950
+ mmap_read_lock(mm);
5951
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5952
+ mmap_read_unlock(mm);
51035953
51045954 precharge = mc.precharge;
51055955 mc.precharge = 0;
....@@ -5149,8 +5999,6 @@
51495999 */
51506000 if (!mem_cgroup_is_root(mc.to))
51516001 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5152
-
5153
- css_put_many(&mc.to->css, mc.moved_swap);
51546002
51556003 mc.moved_swap = 0;
51566004 }
....@@ -5312,7 +6160,7 @@
53126160 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53136161 case MC_TARGET_DEVICE:
53146162 device = true;
5315
- /* fall through */
6163
+ fallthrough;
53166164 case MC_TARGET_PAGE:
53176165 page = target.page;
53186166 /*
....@@ -5367,13 +6215,12 @@
53676215 return ret;
53686216 }
53696217
6218
+static const struct mm_walk_ops charge_walk_ops = {
6219
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6220
+};
6221
+
53706222 static void mem_cgroup_move_charge(void)
53716223 {
5372
- struct mm_walk mem_cgroup_move_charge_walk = {
5373
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5374
- .mm = mc.mm,
5375
- };
5376
-
53776224 lru_add_drain_all();
53786225 /*
53796226 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5383,9 +6230,9 @@
53836230 atomic_inc(&mc.from->moving_account);
53846231 synchronize_rcu();
53856232 retry:
5386
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6233
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53876234 /*
5388
- * Someone who are holding the mmap_sem might be waiting in
6235
+		 * Someone who is holding the mmap_lock might be waiting in
53896236 * waitq. So we cancel all extra charges, wake up all waiters,
53906237 * and retry. Because we cancel precharges, we might not be able
53916238 * to move enough charges, but moving charge is a best-effort
....@@ -5399,9 +6246,10 @@
53996246 * When we have consumed all precharges and failed in doing
54006247 * additional charge, the page walk just aborts.
54016248 */
5402
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6249
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6250
+ NULL);
54036251
5404
- up_read(&mc.mm->mmap_sem);
6252
+ mmap_read_unlock(mc.mm);
54056253 atomic_dec(&mc.from->moving_account);
54066254 }
54076255
....@@ -5443,6 +6291,16 @@
54436291 root_mem_cgroup->use_hierarchy = false;
54446292 }
54456293
6294
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6295
+{
6296
+ if (value == PAGE_COUNTER_MAX)
6297
+ seq_puts(m, "max\n");
6298
+ else
6299
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6300
+
6301
+ return 0;
6302
+}
6303
+
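For reference, the new seq_puts_memcg_tunable() helper reports page counts in bytes: assuming the usual 4K page size, a tunable of 262144 pages is shown as 262144 * 4096 = 1073741824 in the interface file, while PAGE_COUNTER_MAX is rendered as the literal string "max". The memory.min/low/high/max show handlers below all reduce to this one helper.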
54466304 static u64 memory_current_read(struct cgroup_subsys_state *css,
54476305 struct cftype *cft)
54486306 {
....@@ -5453,15 +6311,8 @@
54536311
54546312 static int memory_min_show(struct seq_file *m, void *v)
54556313 {
5456
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5457
- unsigned long min = READ_ONCE(memcg->memory.min);
5458
-
5459
- if (min == PAGE_COUNTER_MAX)
5460
- seq_puts(m, "max\n");
5461
- else
5462
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5463
-
5464
- return 0;
6314
+ return seq_puts_memcg_tunable(m,
6315
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54656316 }
54666317
54676318 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5483,15 +6334,8 @@
54836334
54846335 static int memory_low_show(struct seq_file *m, void *v)
54856336 {
5486
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5487
- unsigned long low = READ_ONCE(memcg->memory.low);
5488
-
5489
- if (low == PAGE_COUNTER_MAX)
5490
- seq_puts(m, "max\n");
5491
- else
5492
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5493
-
5494
- return 0;
6337
+ return seq_puts_memcg_tunable(m,
6338
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54956339 }
54966340
54976341 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5513,22 +6357,16 @@
55136357
55146358 static int memory_high_show(struct seq_file *m, void *v)
55156359 {
5516
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5517
- unsigned long high = READ_ONCE(memcg->high);
5518
-
5519
- if (high == PAGE_COUNTER_MAX)
5520
- seq_puts(m, "max\n");
5521
- else
5522
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5523
-
5524
- return 0;
6360
+ return seq_puts_memcg_tunable(m,
6361
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55256362 }
55266363
55276364 static ssize_t memory_high_write(struct kernfs_open_file *of,
55286365 char *buf, size_t nbytes, loff_t off)
55296366 {
55306367 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5531
- unsigned long nr_pages;
6368
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6369
+ bool drained = false;
55326370 unsigned long high;
55336371 int err;
55346372
....@@ -5537,12 +6375,30 @@
55376375 if (err)
55386376 return err;
55396377
5540
- memcg->high = high;
6378
+ page_counter_set_high(&memcg->memory, high);
55416379
5542
- nr_pages = page_counter_read(&memcg->memory);
5543
- if (nr_pages > high)
5544
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5545
- GFP_KERNEL, true);
6380
+ for (;;) {
6381
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6382
+ unsigned long reclaimed;
6383
+
6384
+ if (nr_pages <= high)
6385
+ break;
6386
+
6387
+ if (signal_pending(current))
6388
+ break;
6389
+
6390
+ if (!drained) {
6391
+ drain_all_stock(memcg);
6392
+ drained = true;
6393
+ continue;
6394
+ }
6395
+
6396
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6397
+ GFP_KERNEL, true);
6398
+
6399
+ if (!reclaimed && !nr_retries--)
6400
+ break;
6401
+ }
55466402
55476403 memcg_wb_domain_size_changed(memcg);
55486404 return nbytes;
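For context, this handler backs the cgroup v2 memory.high interface file. A minimal userspace sketch of exercising it (the cgroup path here is hypothetical):

#include <stdio.h>

int main(void)
{
	/* hypothetical cgroup path; memory.high accepts a byte count or "max" */
	FILE *f = fopen("/sys/fs/cgroup/example/memory.high", "w");

	if (!f) {
		perror("memory.high");
		return 1;
	}
	/*
	 * Lower the throttle limit to 512M; the handler above then reclaims
	 * until usage fits under the new high, draining per-cpu stock once
	 * and giving up after MAX_RECLAIM_RETRIES failed rounds or a
	 * pending signal.
	 */
	fprintf(f, "%llu\n", 512ULL << 20);
	fclose(f);
	return 0;
}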
....@@ -5550,22 +6406,15 @@
55506406
55516407 static int memory_max_show(struct seq_file *m, void *v)
55526408 {
5553
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5554
- unsigned long max = READ_ONCE(memcg->memory.max);
5555
-
5556
- if (max == PAGE_COUNTER_MAX)
5557
- seq_puts(m, "max\n");
5558
- else
5559
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5560
-
5561
- return 0;
6409
+ return seq_puts_memcg_tunable(m,
6410
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55626411 }
55636412
55646413 static ssize_t memory_max_write(struct kernfs_open_file *of,
55656414 char *buf, size_t nbytes, loff_t off)
55666415 {
55676416 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5568
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6417
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55696418 bool drained = false;
55706419 unsigned long max;
55716420 int err;
....@@ -5583,10 +6432,8 @@
55836432 if (nr_pages <= max)
55846433 break;
55856434
5586
- if (signal_pending(current)) {
5587
- err = -EINTR;
6435
+ if (signal_pending(current))
55886436 break;
5589
- }
55906437
55916438 if (!drained) {
55926439 drain_all_stock(memcg);
....@@ -5610,104 +6457,77 @@
56106457 return nbytes;
56116458 }
56126459
6460
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6461
+{
6462
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6463
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6464
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6465
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6466
+ seq_printf(m, "oom_kill %lu\n",
6467
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6468
+}
6469
+
56136470 static int memory_events_show(struct seq_file *m, void *v)
56146471 {
5615
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6472
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56166473
5617
- seq_printf(m, "low %lu\n",
5618
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5619
- seq_printf(m, "high %lu\n",
5620
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5621
- seq_printf(m, "max %lu\n",
5622
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5623
- seq_printf(m, "oom %lu\n",
5624
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5625
- seq_printf(m, "oom_kill %lu\n",
5626
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6474
+ __memory_events_show(m, memcg->memory_events);
6475
+ return 0;
6476
+}
56276477
6478
+static int memory_events_local_show(struct seq_file *m, void *v)
6479
+{
6480
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6481
+
6482
+ __memory_events_show(m, memcg->memory_events_local);
56286483 return 0;
56296484 }
56306485
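With the shared __memory_events_show() helper, memory.events and the new memory.events.local both render as the same five counters, one per line; illustrative contents:

low 0
high 742
max 13
oom 0
oom_kill 1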
56316486 static int memory_stat_show(struct seq_file *m, void *v)
56326487 {
5633
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5634
- struct accumulated_stats acc;
5635
- int i;
6488
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6489
+ char *buf;
56366490
5637
- /*
5638
- * Provide statistics on the state of the memory subsystem as
5639
- * well as cumulative event counters that show past behavior.
5640
- *
5641
- * This list is ordered following a combination of these gradients:
5642
- * 1) generic big picture -> specifics and details
5643
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5644
- *
5645
- * Current memory state:
5646
- */
5647
-
5648
- memset(&acc, 0, sizeof(acc));
5649
- acc.stats_size = MEMCG_NR_STAT;
5650
- acc.events_size = NR_VM_EVENT_ITEMS;
5651
- accumulate_memcg_tree(memcg, &acc);
5652
-
5653
- seq_printf(m, "anon %llu\n",
5654
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5655
- seq_printf(m, "file %llu\n",
5656
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5657
- seq_printf(m, "kernel_stack %llu\n",
5658
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5659
- seq_printf(m, "slab %llu\n",
5660
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5661
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5662
- seq_printf(m, "sock %llu\n",
5663
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5664
-
5665
- seq_printf(m, "shmem %llu\n",
5666
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5667
- seq_printf(m, "file_mapped %llu\n",
5668
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5669
- seq_printf(m, "file_dirty %llu\n",
5670
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5671
- seq_printf(m, "file_writeback %llu\n",
5672
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5673
-
5674
- for (i = 0; i < NR_LRU_LISTS; i++)
5675
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5676
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5677
-
5678
- seq_printf(m, "slab_reclaimable %llu\n",
5679
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5680
- seq_printf(m, "slab_unreclaimable %llu\n",
5681
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5682
-
5683
- /* Accumulated memory events */
5684
-
5685
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5686
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5687
-
5688
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5689
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5690
- acc.events[PGSCAN_DIRECT]);
5691
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5692
- acc.events[PGSTEAL_DIRECT]);
5693
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5694
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5695
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5696
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5697
-
5698
- seq_printf(m, "workingset_refault %lu\n",
5699
- acc.stat[WORKINGSET_REFAULT]);
5700
- seq_printf(m, "workingset_activate %lu\n",
5701
- acc.stat[WORKINGSET_ACTIVATE]);
5702
- seq_printf(m, "workingset_nodereclaim %lu\n",
5703
- acc.stat[WORKINGSET_NODERECLAIM]);
5704
-
6491
+ buf = memory_stat_format(memcg);
6492
+ if (!buf)
6493
+ return -ENOMEM;
6494
+ seq_puts(m, buf);
6495
+ kfree(buf);
57056496 return 0;
57066497 }
57076498
6499
+#ifdef CONFIG_NUMA
6500
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6501
+{
6502
+ int i;
6503
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6504
+
6505
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6506
+ int nid;
6507
+
6508
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6509
+ continue;
6510
+
6511
+ seq_printf(m, "%s", memory_stats[i].name);
6512
+ for_each_node_state(nid, N_MEMORY) {
6513
+ u64 size;
6514
+ struct lruvec *lruvec;
6515
+
6516
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6517
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6518
+ size *= memory_stats[i].ratio;
6519
+ seq_printf(m, " N%d=%llu", nid, size);
6520
+ }
6521
+ seq_putc(m, '\n');
6522
+ }
6523
+
6524
+ return 0;
6525
+}
6526
+#endif
6527
+
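On NUMA kernels the new memory.numa_stat file emits one line per memory_stats[] entry (defined elsewhere in this file) followed by a per-node breakdown in bytes. Illustrative output on a two-node machine, with assumed entry names:

anon N0=1572864 N1=0
file N0=6291456 N1=1048576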
57086528 static int memory_oom_group_show(struct seq_file *m, void *v)
57096529 {
5710
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6530
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57116531
57126532 seq_printf(m, "%d\n", memcg->oom_group);
57136533
....@@ -5773,10 +6593,21 @@
57736593 .seq_show = memory_events_show,
57746594 },
57756595 {
5776
- .name = "stat",
6596
+ .name = "events.local",
57776597 .flags = CFTYPE_NOT_ON_ROOT,
6598
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6599
+ .seq_show = memory_events_local_show,
6600
+ },
6601
+ {
6602
+ .name = "stat",
57786603 .seq_show = memory_stat_show,
57796604 },
6605
+#ifdef CONFIG_NUMA
6606
+ {
6607
+ .name = "numa_stat",
6608
+ .seq_show = memory_numa_stat_show,
6609
+ },
6610
+#endif
57806611 {
57816612 .name = "oom.group",
57826613 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5802,6 +6633,122 @@
58026633 .early_init = 0,
58036634 };
58046635
6636
+/*
6637
+ * This function calculates an individual cgroup's effective
6638
+ * protection which is derived from its own memory.min/low, its
6639
+ * parent's and siblings' settings, as well as the actual memory
6640
+ * distribution in the tree.
6641
+ *
6642
+ * The following rules apply to the effective protection values:
6643
+ *
6644
+ * 1. At the first level of reclaim, effective protection is equal to
6645
+ * the declared protection in memory.min and memory.low.
6646
+ *
6647
+ * 2. To enable safe delegation of the protection configuration, at
6648
+ * subsequent levels the effective protection is capped to the
6649
+ * parent's effective protection.
6650
+ *
6651
+ * 3. To make complex and dynamic subtrees easier to configure, the
6652
+ * user is allowed to overcommit the declared protection at a given
6653
+ * level. If that is the case, the parent's effective protection is
6654
+ * distributed to the children in proportion to how much protection
6655
+ * they have declared and how much of it they are utilizing.
6656
+ *
6657
+ * This makes distribution proportional, but also work-conserving:
6658
+ * if one cgroup claims much more protection than it uses memory,
6659
+ * the unused remainder is available to its siblings.
6660
+ *
6661
+ * 4. Conversely, when the declared protection is undercommitted at a
6662
+ * given level, the distribution of the larger parental protection
6663
+ * budget is NOT proportional. A cgroup's protection from a sibling
6664
+ * is capped to its own memory.min/low setting.
6665
+ *
6666
+ * 5. However, to allow protecting recursive subtrees from each other
6667
+ * without having to declare each individual cgroup's fixed share
6668
+ * of the ancestor's claim to protection, any unutilized -
6669
+ * "floating" - protection from up the tree is distributed in
6670
+ * proportion to each cgroup's *usage*. This makes the protection
6671
+ * neutral wrt sibling cgroups and lets them compete freely over
6672
+ * the shared parental protection budget, but it protects the
6673
+ * subtree as a whole from neighboring subtrees.
6674
+ *
6675
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6676
+ * against immediate siblings whereas 5. is about protecting against
6677
+ * neighboring subtrees.
6678
+ */
6679
+static unsigned long effective_protection(unsigned long usage,
6680
+ unsigned long parent_usage,
6681
+ unsigned long setting,
6682
+ unsigned long parent_effective,
6683
+ unsigned long siblings_protected)
6684
+{
6685
+ unsigned long protected;
6686
+ unsigned long ep;
6687
+
6688
+ protected = min(usage, setting);
6689
+ /*
6690
+ * If all cgroups at this level combined claim and use more
6691
+	 * protection than what the parent affords them, distribute
6692
+ * shares in proportion to utilization.
6693
+ *
6694
+ * We are using actual utilization rather than the statically
6695
+ * claimed protection in order to be work-conserving: claimed
6696
+ * but unused protection is available to siblings that would
6697
+ * otherwise get a smaller chunk than what they claimed.
6698
+ */
6699
+ if (siblings_protected > parent_effective)
6700
+ return protected * parent_effective / siblings_protected;
6701
+
6702
+ /*
6703
+ * Ok, utilized protection of all children is within what the
6704
+ * parent affords them, so we know whatever this child claims
6705
+ * and utilizes is effectively protected.
6706
+ *
6707
+ * If there is unprotected usage beyond this value, reclaim
6708
+ * will apply pressure in proportion to that amount.
6709
+ *
6710
+ * If there is unutilized protection, the cgroup will be fully
6711
+ * shielded from reclaim, but we do return a smaller value for
6712
+ * protection than what the group could enjoy in theory. This
6713
+ * is okay. With the overcommit distribution above, effective
6714
+ * protection is always dependent on how memory is actually
6715
+ * consumed among the siblings anyway.
6716
+ */
6717
+ ep = protected;
6718
+
6719
+ /*
6720
+ * If the children aren't claiming (all of) the protection
6721
+ * afforded to them by the parent, distribute the remainder in
6722
+ * proportion to the (unprotected) memory of each cgroup. That
6723
+ * way, cgroups that aren't explicitly prioritized wrt each
6724
+ * other compete freely over the allowance, but they are
6725
+ * collectively protected from neighboring trees.
6726
+ *
6727
+ * We're using unprotected memory for the weight so that if
6728
+ * some cgroups DO claim explicit protection, we don't protect
6729
+ * the same bytes twice.
6730
+ *
6731
+ * Check both usage and parent_usage against the respective
6732
+ * protected values. One should imply the other, but they
6733
+ * aren't read atomically - make sure the division is sane.
6734
+ */
6735
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6736
+ return ep;
6737
+ if (parent_effective > siblings_protected &&
6738
+ parent_usage > siblings_protected &&
6739
+ usage > protected) {
6740
+ unsigned long unclaimed;
6741
+
6742
+ unclaimed = parent_effective - siblings_protected;
6743
+ unclaimed *= usage - protected;
6744
+ unclaimed /= parent_usage - siblings_protected;
6745
+
6746
+ ep += unclaimed;
6747
+ }
6748
+
6749
+ return ep;
6750
+}
6751
+
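A minimal userspace sketch (illustrative numbers only, not kernel code) of the overcommit rule implemented above: when the children's combined claims exceed what the parent affords, each claim is scaled by parent_effective / siblings_protected.

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers, all in megabytes */
	unsigned long parent_effective   = 2048; /* parent affords 2G       */
	unsigned long siblings_protected = 4096; /* children claim 4G total */
	unsigned long setting            = 3072; /* this child's memory.low */
	unsigned long usage              = 2048; /* this child's usage      */

	unsigned long protected = usage < setting ? usage : setting;
	unsigned long ep = protected;

	/* overcommitted: scale the claim by parent_effective/siblings_protected */
	if (siblings_protected > parent_effective)
		ep = protected * parent_effective / siblings_protected;

	/* 2048 * 2048 / 4096 = 1024, i.e. 1G of effective protection */
	printf("effective protection: %luM\n", ep);
	return 0;
}

In the undercommitted case the claim itself is returned, optionally topped up with a proportional share of the parent's unclaimed budget when CGRP_ROOT_MEMORY_RECURSIVE_PROT is set, as the remainder of the function shows.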
58056752 /**
58066753 * mem_cgroup_protected - check if memory consumption is in the normal range
58076754 * @root: the top ancestor of the sub-tree being checked
....@@ -5809,259 +6756,125 @@
58096756 *
58106757 * WARNING: This function is not stateless! It can only be used as part
58116758 * of a top-down tree iteration, not for isolated queries.
5812
- *
5813
- * Returns one of the following:
5814
- * MEMCG_PROT_NONE: cgroup memory is not protected
5815
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5816
- * an unprotected supply of reclaimable memory from other cgroups.
5817
- * MEMCG_PROT_MIN: cgroup memory is protected
5818
- *
5819
- * @root is exclusive; it is never protected when looked at directly
5820
- *
5821
- * To provide a proper hierarchical behavior, effective memory.min/low values
5822
- * are used. Below is the description of how effective memory.low is calculated.
5823
- * Effective memory.min values is calculated in the same way.
5824
- *
5825
- * Effective memory.low is always equal or less than the original memory.low.
5826
- * If there is no memory.low overcommittment (which is always true for
5827
- * top-level memory cgroups), these two values are equal.
5828
- * Otherwise, it's a part of parent's effective memory.low,
5829
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5830
- * memory.low usages, where memory.low usage is the size of actually
5831
- * protected memory.
5832
- *
5833
- * low_usage
5834
- * elow = min( memory.low, parent->elow * ------------------ ),
5835
- * siblings_low_usage
5836
- *
5837
- * | memory.current, if memory.current < memory.low
5838
- * low_usage = |
5839
- | 0, otherwise.
5840
- *
5841
- *
5842
- * Such definition of the effective memory.low provides the expected
5843
- * hierarchical behavior: parent's memory.low value is limiting
5844
- * children, unprotected memory is reclaimed first and cgroups,
5845
- * which are not using their guarantee do not affect actual memory
5846
- * distribution.
5847
- *
5848
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5849
- *
5850
- * A A/memory.low = 2G, A/memory.current = 6G
5851
- * //\\
5852
- * BC DE B/memory.low = 3G B/memory.current = 2G
5853
- * C/memory.low = 1G C/memory.current = 2G
5854
- * D/memory.low = 0 D/memory.current = 2G
5855
- * E/memory.low = 10G E/memory.current = 0
5856
- *
5857
- * and the memory pressure is applied, the following memory distribution
5858
- * is expected (approximately):
5859
- *
5860
- * A/memory.current = 2G
5861
- *
5862
- * B/memory.current = 1.3G
5863
- * C/memory.current = 0.6G
5864
- * D/memory.current = 0
5865
- * E/memory.current = 0
5866
- *
5867
- * These calculations require constant tracking of the actual low usages
5868
- * (see propagate_protected_usage()), as well as recursive calculation of
5869
- * effective memory.low values. But as we do call mem_cgroup_protected()
5870
- * path for each memory cgroup top-down from the reclaim,
5871
- * it's possible to optimize this part, and save calculated elow
5872
- * for next usage. This part is intentionally racy, but it's ok,
5873
- * as memory.low is a best-effort mechanism.
58746759 */
5875
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5876
- struct mem_cgroup *memcg)
6760
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6761
+ struct mem_cgroup *memcg)
58776762 {
6763
+ unsigned long usage, parent_usage;
58786764 struct mem_cgroup *parent;
5879
- unsigned long emin, parent_emin;
5880
- unsigned long elow, parent_elow;
5881
- unsigned long usage;
58826765
58836766 if (mem_cgroup_disabled())
5884
- return MEMCG_PROT_NONE;
6767
+ return;
58856768
58866769 if (!root)
58876770 root = root_mem_cgroup;
6771
+
6772
+ /*
6773
+ * Effective values of the reclaim targets are ignored so they
6774
+ * can be stale. Have a look at mem_cgroup_protection for more
6775
+ * details.
6776
+ * TODO: calculation should be more robust so that we do not need
6777
+ * that special casing.
6778
+ */
58886779 if (memcg == root)
5889
- return MEMCG_PROT_NONE;
6780
+ return;
58906781
58916782 usage = page_counter_read(&memcg->memory);
58926783 if (!usage)
5893
- return MEMCG_PROT_NONE;
5894
-
5895
- emin = memcg->memory.min;
5896
- elow = memcg->memory.low;
6784
+ return;
58976785
58986786 parent = parent_mem_cgroup(memcg);
58996787 /* No parent means a non-hierarchical mode on v1 memcg */
59006788 if (!parent)
5901
- return MEMCG_PROT_NONE;
6789
+ return;
59026790
5903
- if (parent == root)
5904
- goto exit;
5905
-
5906
- parent_emin = READ_ONCE(parent->memory.emin);
5907
- emin = min(emin, parent_emin);
5908
- if (emin && parent_emin) {
5909
- unsigned long min_usage, siblings_min_usage;
5910
-
5911
- min_usage = min(usage, memcg->memory.min);
5912
- siblings_min_usage = atomic_long_read(
5913
- &parent->memory.children_min_usage);
5914
-
5915
- if (min_usage && siblings_min_usage)
5916
- emin = min(emin, parent_emin * min_usage /
5917
- siblings_min_usage);
6791
+ if (parent == root) {
6792
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6793
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6794
+ return;
59186795 }
59196796
5920
- parent_elow = READ_ONCE(parent->memory.elow);
5921
- elow = min(elow, parent_elow);
5922
- if (elow && parent_elow) {
5923
- unsigned long low_usage, siblings_low_usage;
6797
+ parent_usage = page_counter_read(&parent->memory);
59246798
5925
- low_usage = min(usage, memcg->memory.low);
5926
- siblings_low_usage = atomic_long_read(
5927
- &parent->memory.children_low_usage);
6799
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6800
+ READ_ONCE(memcg->memory.min),
6801
+ READ_ONCE(parent->memory.emin),
6802
+ atomic_long_read(&parent->memory.children_min_usage)));
59286803
5929
- if (low_usage && siblings_low_usage)
5930
- elow = min(elow, parent_elow * low_usage /
5931
- siblings_low_usage);
5932
- }
5933
-
5934
-exit:
5935
- memcg->memory.emin = emin;
5936
- memcg->memory.elow = elow;
5937
-
5938
- if (usage <= emin)
5939
- return MEMCG_PROT_MIN;
5940
- else if (usage <= elow)
5941
- return MEMCG_PROT_LOW;
5942
- else
5943
- return MEMCG_PROT_NONE;
6804
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6805
+ READ_ONCE(memcg->memory.low),
6806
+ READ_ONCE(parent->memory.elow),
6807
+ atomic_long_read(&parent->memory.children_low_usage)));
59446808 }
59456809
59466810 /**
5947
- * mem_cgroup_try_charge - try charging a page
6811
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59486812 * @page: page to charge
59496813 * @mm: mm context of the victim
59506814 * @gfp_mask: reclaim mode
5951
- * @memcgp: charged memcg return
5952
- * @compound: charge the page as compound or small page
59536815 *
59546816 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59556817 * pages according to @gfp_mask if necessary.
59566818 *
5957
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5958
- * Otherwise, an error code is returned.
5959
- *
5960
- * After page->mapping has been set up, the caller must finalize the
5961
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5962
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6819
+ * Returns 0 on success. Otherwise, an error code is returned.
59636820 */
5964
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5965
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5966
- bool compound)
6821
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6822
+ gfp_t gfp_mask)
59676823 {
6824
+ unsigned int nr_pages = thp_nr_pages(page);
59686825 struct mem_cgroup *memcg = NULL;
5969
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59706826 int ret = 0;
59716827
5972
- if (mem_cgroup_disabled())
5973
- goto out;
5974
-
59756828 if (PageSwapCache(page)) {
6829
+ swp_entry_t ent = { .val = page_private(page), };
6830
+ unsigned short id;
6831
+
59766832 /*
59776833 * Every swap fault against a single page tries to charge the
59786834 * page, bail as early as possible. shmem_unuse() encounters
5979
- * already charged pages, too. The USED bit is protected by
5980
- * the page lock, which serializes swap cache removal, which
6835
+ * already charged pages, too. page->mem_cgroup is protected
6836
+ * by the page lock, which serializes swap cache removal, which
59816837 * in turn serializes uncharging.
59826838 */
59836839 VM_BUG_ON_PAGE(!PageLocked(page), page);
59846840 if (compound_head(page)->mem_cgroup)
59856841 goto out;
59866842
5987
- if (do_swap_account) {
5988
- swp_entry_t ent = { .val = page_private(page), };
5989
- unsigned short id = lookup_swap_cgroup_id(ent);
5990
-
5991
- rcu_read_lock();
5992
- memcg = mem_cgroup_from_id(id);
5993
- if (memcg && !css_tryget_online(&memcg->css))
5994
- memcg = NULL;
5995
- rcu_read_unlock();
5996
- }
6843
+ id = lookup_swap_cgroup_id(ent);
6844
+ rcu_read_lock();
6845
+ memcg = mem_cgroup_from_id(id);
6846
+ if (memcg && !css_tryget_online(&memcg->css))
6847
+ memcg = NULL;
6848
+ rcu_read_unlock();
59976849 }
59986850
59996851 if (!memcg)
60006852 memcg = get_mem_cgroup_from_mm(mm);
60016853
60026854 ret = try_charge(memcg, gfp_mask, nr_pages);
6855
+ if (ret)
6856
+ goto out_put;
60036857
6004
- css_put(&memcg->css);
6005
-out:
6006
- *memcgp = memcg;
6007
- return ret;
6008
-}
6009
-
6010
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6011
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6012
- bool compound)
6013
-{
6014
- struct mem_cgroup *memcg;
6015
- int ret;
6016
-
6017
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6018
- memcg = *memcgp;
6019
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6020
- return ret;
6021
-}
6022
-
6023
-/**
6024
- * mem_cgroup_commit_charge - commit a page charge
6025
- * @page: page to charge
6026
- * @memcg: memcg to charge the page to
6027
- * @lrucare: page might be on LRU already
6028
- * @compound: charge the page as compound or small page
6029
- *
6030
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6031
- * after page->mapping has been set up. This must happen atomically
6032
- * as part of the page instantiation, i.e. under the page table lock
6033
- * for anonymous pages, under the page lock for page and swap cache.
6034
- *
6035
- * In addition, the page must not be on the LRU during the commit, to
6036
- * prevent racing with task migration. If it might be, use @lrucare.
6037
- *
6038
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6039
- */
6040
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6041
- bool lrucare, bool compound)
6042
-{
6043
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6044
-
6045
- VM_BUG_ON_PAGE(!page->mapping, page);
6046
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6047
-
6048
- if (mem_cgroup_disabled())
6049
- return;
6050
- /*
6051
- * Swap faults will attempt to charge the same page multiple
6052
- * times. But reuse_swap_page() might have removed the page
6053
- * from swapcache already, so we can't check PageSwapCache().
6054
- */
6055
- if (!memcg)
6056
- return;
6057
-
6058
- commit_charge(page, memcg, lrucare);
6858
+ css_get(&memcg->css);
6859
+ commit_charge(page, memcg);
60596860
60606861 local_irq_disable();
6061
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6862
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60626863 memcg_check_events(memcg, page);
60636864 local_irq_enable();
60646865
6866
+ /*
6867
+ * Cgroup1's unified memory+swap counter has been charged with the
6868
+ * new swapcache page, finish the transfer by uncharging the swap
6869
+ * slot. The swap slot would also get uncharged when it dies, but
6870
+ * it can stick around indefinitely and we'd count the page twice
6871
+ * the entire time.
6872
+ *
6873
+ * Cgroup2 has separate resource counters for memory and swap,
6874
+ * so this is a non-issue here. Memory and swap charge lifetimes
6875
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6876
+ * page to memory here, and uncharge swap when the slot is freed.
6877
+ */
60656878 if (do_memsw_account() && PageSwapCache(page)) {
60666879 swp_entry_t entry = { .val = page_private(page) };
60676880 /*
....@@ -6071,42 +6884,18 @@
60716884 */
60726885 mem_cgroup_uncharge_swap(entry, nr_pages);
60736886 }
6074
-}
60756887
6076
-/**
6077
- * mem_cgroup_cancel_charge - cancel a page charge
6078
- * @page: page to charge
6079
- * @memcg: memcg to charge the page to
6080
- * @compound: charge the page as compound or small page
6081
- *
6082
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6083
- */
6084
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6085
- bool compound)
6086
-{
6087
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6088
-
6089
- if (mem_cgroup_disabled())
6090
- return;
6091
- /*
6092
- * Swap faults will attempt to charge the same page multiple
6093
- * times. But reuse_swap_page() might have removed the page
6094
- * from swapcache already, so we can't check PageSwapCache().
6095
- */
6096
- if (!memcg)
6097
- return;
6098
-
6099
- cancel_charge(memcg, nr_pages);
6888
+out_put:
6889
+ css_put(&memcg->css);
6890
+out:
6891
+ return ret;
61006892 }
61016893
61026894 struct uncharge_gather {
61036895 struct mem_cgroup *memcg;
6896
+ unsigned long nr_pages;
61046897 unsigned long pgpgout;
6105
- unsigned long nr_anon;
6106
- unsigned long nr_file;
61076898 unsigned long nr_kmem;
6108
- unsigned long nr_huge;
6109
- unsigned long nr_shmem;
61106899 struct page *dummy_page;
61116900 };
61126901
....@@ -6117,37 +6906,32 @@
61176906
61186907 static void uncharge_batch(const struct uncharge_gather *ug)
61196908 {
6120
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61216909 unsigned long flags;
61226910
61236911 if (!mem_cgroup_is_root(ug->memcg)) {
6124
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6912
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61256913 if (do_memsw_account())
6126
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6914
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61276915 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61286916 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61296917 memcg_oom_recover(ug->memcg);
61306918 }
61316919
61326920 local_irq_save(flags);
6133
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6134
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6135
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6136
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
61376921 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6138
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6922
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61396923 memcg_check_events(ug->memcg, ug->dummy_page);
61406924 local_irq_restore(flags);
61416925
6142
- if (!mem_cgroup_is_root(ug->memcg))
6143
- css_put_many(&ug->memcg->css, nr_pages);
6926
+ /* drop reference from uncharge_page */
6927
+ css_put(&ug->memcg->css);
61446928 }
61456929
61466930 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61476931 {
6932
+ unsigned long nr_pages;
6933
+
61486934 VM_BUG_ON_PAGE(PageLRU(page), page);
6149
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6150
- !PageHWPoison(page) , page);
61516935
61526936 if (!page->mem_cgroup)
61536937 return;
....@@ -6164,30 +6948,24 @@
61646948 uncharge_gather_clear(ug);
61656949 }
61666950 ug->memcg = page->mem_cgroup;
6951
+
6952
+ /* pairs with css_put in uncharge_batch */
6953
+ css_get(&ug->memcg->css);
61676954 }
61686955
6169
- if (!PageKmemcg(page)) {
6170
- unsigned int nr_pages = 1;
6956
+ nr_pages = compound_nr(page);
6957
+ ug->nr_pages += nr_pages;
61716958
6172
- if (PageTransHuge(page)) {
6173
- nr_pages <<= compound_order(page);
6174
- ug->nr_huge += nr_pages;
6175
- }
6176
- if (PageAnon(page))
6177
- ug->nr_anon += nr_pages;
6178
- else {
6179
- ug->nr_file += nr_pages;
6180
- if (PageSwapBacked(page))
6181
- ug->nr_shmem += nr_pages;
6182
- }
6959
+ if (!PageKmemcg(page)) {
61836960 ug->pgpgout++;
61846961 } else {
6185
- ug->nr_kmem += 1 << compound_order(page);
6962
+ ug->nr_kmem += nr_pages;
61866963 __ClearPageKmemcg(page);
61876964 }
61886965
61896966 ug->dummy_page = page;
61906967 page->mem_cgroup = NULL;
6968
+ css_put(&ug->memcg->css);
61916969 }
61926970
61936971 static void uncharge_list(struct list_head *page_list)
....@@ -6216,18 +6994,14 @@
62166994 }
62176995
62186996 /**
6219
- * mem_cgroup_uncharge - uncharge a page
6997
+ * __mem_cgroup_uncharge - uncharge a page
62206998 * @page: page to uncharge
62216999 *
6222
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6223
- * mem_cgroup_commit_charge().
7000
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62247001 */
6225
-void mem_cgroup_uncharge(struct page *page)
7002
+void __mem_cgroup_uncharge(struct page *page)
62267003 {
62277004 struct uncharge_gather ug;
6228
-
6229
- if (mem_cgroup_disabled())
6230
- return;
62317005
62327006 /* Don't touch page->lru of any random page, pre-check: */
62337007 if (!page->mem_cgroup)
....@@ -6239,17 +7013,14 @@
62397013 }
62407014
62417015 /**
6242
- * mem_cgroup_uncharge_list - uncharge a list of page
7016
+ * __mem_cgroup_uncharge_list - uncharge a list of page
62437017 * @page_list: list of pages to uncharge
62447018 *
62457019 * Uncharge a list of pages previously charged with
6246
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7020
+ * __mem_cgroup_charge().
62477021 */
6248
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7022
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62497023 {
6250
- if (mem_cgroup_disabled())
6251
- return;
6252
-
62537024 if (!list_empty(page_list))
62547025 uncharge_list(page_list);
62557026 }
....@@ -6268,7 +7039,6 @@
62687039 {
62697040 struct mem_cgroup *memcg;
62707041 unsigned int nr_pages;
6271
- bool compound;
62727042 unsigned long flags;
62737043
62747044 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6290,18 +7060,17 @@
62907060 return;
62917061
62927062 /* Force-charge the new page. The old one will be freed soon */
6293
- compound = PageTransHuge(newpage);
6294
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7063
+ nr_pages = thp_nr_pages(newpage);
62957064
62967065 page_counter_charge(&memcg->memory, nr_pages);
62977066 if (do_memsw_account())
62987067 page_counter_charge(&memcg->memsw, nr_pages);
6299
- css_get_many(&memcg->css, nr_pages);
63007068
6301
- commit_charge(newpage, memcg, false);
7069
+ css_get(&memcg->css);
7070
+ commit_charge(newpage, memcg);
63027071
63037072 local_irq_save(flags);
6304
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7073
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63057074 memcg_check_events(memcg, newpage);
63067075 local_irq_restore(flags);
63077076 }
....@@ -6326,7 +7095,7 @@
63267095 goto out;
63277096 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63287097 goto out;
6329
- if (css_tryget_online(&memcg->css))
7098
+ if (css_tryget(&memcg->css))
63307099 sk->sk_memcg = memcg;
63317100 out:
63327101 rcu_read_unlock();
....@@ -6404,7 +7173,7 @@
64047173 if (!strcmp(token, "nokmem"))
64057174 cgroup_memory_nokmem = true;
64067175 }
6407
- return 0;
7176
+ return 1;
64087177 }
64097178 __setup("cgroup.memory=", cgroup_memory);
64107179
....@@ -6419,17 +7188,6 @@
64197188 static int __init mem_cgroup_init(void)
64207189 {
64217190 int cpu, node;
6422
-
6423
-#ifdef CONFIG_MEMCG_KMEM
6424
- /*
6425
- * Kmem cache creation is mostly done with the slab_mutex held,
6426
- * so use a workqueue with limited concurrency to avoid stalling
6427
- * all worker threads in case lots of cgroups are created and
6428
- * destroyed simultaneously.
6429
- */
6430
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6431
- BUG_ON(!memcg_kmem_cache_wq);
6432
-#endif
64337191
64347192 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64357193 memcg_hotplug_cpu_dead);
....@@ -6457,7 +7215,7 @@
64577215 #ifdef CONFIG_MEMCG_SWAP
64587216 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64597217 {
6460
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7218
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64617219 /*
64627220 		 * The root cgroup cannot be destroyed, so its refcount must
64637221 * always be >= 1.
....@@ -6489,7 +7247,10 @@
64897247 VM_BUG_ON_PAGE(PageLRU(page), page);
64907248 VM_BUG_ON_PAGE(page_count(page), page);
64917249
6492
- if (!do_memsw_account())
7250
+ if (mem_cgroup_disabled())
7251
+ return;
7252
+
7253
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64937254 return;
64947255
64957256 memcg = page->mem_cgroup;
....@@ -6504,7 +7265,7 @@
65047265 * ancestor for the swap instead and transfer the memory+swap charge.
65057266 */
65067267 swap_memcg = mem_cgroup_id_get_online(memcg);
6507
- nr_entries = hpage_nr_pages(page);
7268
+ nr_entries = thp_nr_pages(page);
65087269 /* Get references for the tail pages, too */
65097270 if (nr_entries > 1)
65107271 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6518,7 +7279,7 @@
65187279 if (!mem_cgroup_is_root(memcg))
65197280 page_counter_uncharge(&memcg->memory, nr_entries);
65207281
6521
- if (memcg != swap_memcg) {
7282
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65227283 if (!mem_cgroup_is_root(swap_memcg))
65237284 page_counter_charge(&swap_memcg->memsw, nr_entries);
65247285 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6531,16 +7292,14 @@
65317292 * only synchronisation we have for updating the per-CPU variables.
65327293 */
65337294 VM_BUG_ON(!irqs_disabled());
6534
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6535
- -nr_entries);
7295
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65367296 memcg_check_events(memcg, page);
65377297
6538
- if (!mem_cgroup_is_root(memcg))
6539
- css_put_many(&memcg->css, nr_entries);
7298
+ css_put(&memcg->css);
65407299 }
65417300
65427301 /**
6543
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7302
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65447303 * @page: page being added to swap
65457304 * @entry: swap entry to charge
65467305 *
@@ -6548,14 +7307,14 @@
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-	unsigned int nr_pages = hpage_nr_pages(page);
+	unsigned int nr_pages = thp_nr_pages(page);
 	struct page_counter *counter;
 	struct mem_cgroup *memcg;
 	unsigned short oldid;
 
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;
 
 	memcg = page->mem_cgroup;
@@ -6571,7 +7330,7 @@
 
 	memcg = mem_cgroup_id_get_online(memcg);
 
-	if (!mem_cgroup_is_root(memcg) &&
+	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -6590,23 +7349,20 @@
 }
 
 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
-
-	if (!do_swap_account)
-		return;
 
 	id = swap_cgroup_record(entry, 0, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg)) {
+		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 				page_counter_uncharge(&memcg->swap, nr_pages);
 			else
@@ -6622,7 +7378,7 @@
 {
 	long nr_swap_pages = get_nr_swap_pages();
 
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return nr_swap_pages;
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
 		nr_swap_pages = min_t(long, nr_swap_pages,
@@ -6639,36 +7395,33 @@
 
 	if (vm_swap_full())
 		return true;
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;
 
 	memcg = page->mem_cgroup;
 	if (!memcg)
 		return false;
 
-	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+		unsigned long usage = page_counter_read(&memcg->swap);
+
+		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+		    usage * 2 >= READ_ONCE(memcg->swap.max))
 			return true;
+	}
 
 	return false;
 }
 
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
 {
 	if (!strcmp(s, "1"))
-		really_do_swap_account = 1;
+		cgroup_memory_noswap = 0;
 	else if (!strcmp(s, "0"))
-		really_do_swap_account = 0;
+		cgroup_memory_noswap = 1;
 	return 1;
 }
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
 
 static u64 swap_current_read(struct cgroup_subsys_state *css,
 			     struct cftype *cft)
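Two behavioural points in the hunk above are easy to miss: mem_cgroup_swap_full() now treats a cgroup's swap as effectively full once usage reaches half of the tighter of swap.high and swap.max (both read with READ_ONCE()), and the renamed setup_swap_account() handler flips cgroup_memory_noswap directly instead of going through really_do_swap_account. A standalone sketch of the half-full arithmetic follows; the function and values are illustrative only, not kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the check above: "full" once usage * 2 reaches high or max. */
static bool swap_looks_full(unsigned long usage,
			    unsigned long high, unsigned long max)
{
	return usage * 2 >= high || usage * 2 >= max;
}

int main(void)
{
	/* swap.max of 131072 pages (512 MiB with 4 KiB pages), swap.high unset */
	printf("%d\n", swap_looks_full(65536, ~0UL, 131072));	/* 1: half of max reached */
	printf("%d\n", swap_looks_full(65535, ~0UL, 131072));	/* 0: still below half */
	return 0;
}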
@@ -6678,17 +7431,33 @@
 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }
 
+static int swap_high_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+			       char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long high;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &high);
+	if (err)
+		return err;
+
+	page_counter_set_high(&memcg->swap, high);
+
+	return nbytes;
+}
+
 static int swap_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->swap.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
 }
 
 static ssize_t swap_max_write(struct kernfs_open_file *of,
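In swap_high_write() above, page_counter_memparse() accepts either the literal "max" or a byte count (with the usual K/M/G suffixes), converts it to pages, and page_counter_set_high() records the new high watermark; returning nbytes reports a successful kernfs write. A simplified standalone sketch of that parse step, with no suffix handling and illustrative names only:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EX_PAGE_SIZE 4096UL	/* stand-in for PAGE_SIZE */

/* "max" means no limit; anything else is a byte count rounded down to pages. */
static int parse_high(const char *buf, unsigned long *nr_pages)
{
	char *end;
	unsigned long long bytes;

	if (!strcmp(buf, "max")) {
		*nr_pages = ULONG_MAX;	/* the kernel uses PAGE_COUNTER_MAX here */
		return 0;
	}
	bytes = strtoull(buf, &end, 10);
	if (*end != '\0')
		return -1;
	*nr_pages = bytes / EX_PAGE_SIZE;
	return 0;
}

int main(void)
{
	unsigned long high;

	if (!parse_high("1073741824", &high))	/* 1 GiB */
		printf("swap.high = %lu pages\n", high);	/* 262144 */
	return 0;
}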
@@ -6710,8 +7479,10 @@
 
 static int swap_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+	seq_printf(m, "high %lu\n",
+		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
 	seq_printf(m, "max %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
 	seq_printf(m, "fail %lu\n",
@@ -6725,6 +7496,12 @@
 		.name = "swap.current",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_current_read,
+	},
+	{
+		.name = "swap.high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_high_show,
+		.write = swap_high_write,
 	},
 	{
 		.name = "swap.max",
@@ -6741,7 +7518,7 @@
 	{ }	/* terminate */
 };
 
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -6768,17 +7545,27 @@
 	{ },	/* terminate */
 };
 
+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
+ * remains set to false even when memcg is disabled via "cgroup_disable=memory"
+ * boot parameter. This may result in premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() function in corner cases.
+ */
 static int __init mem_cgroup_swap_init(void)
 {
-	if (!mem_cgroup_disabled() && really_do_swap_account) {
-		do_swap_account = 1;
-		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
-					       swap_files));
-		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-						  memsw_cgroup_files));
-	}
+	/* No memory control -> no swap control */
+	if (mem_cgroup_disabled())
+		cgroup_memory_noswap = true;
+
+	if (cgroup_memory_noswap)
+		return 0;
+
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
 	return 0;
 }
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_MEMCG_SWAP */
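For reference, and not part of this diff: initcall levels run in ascending order, so moving the registration from subsys_initcall() to core_initcall() makes mem_cgroup_swap_init() run several boot stages earlier, which is what lets it force cgroup_memory_noswap on before anything can reach mem_cgroup_get_nr_swap_pages(), as the comment added above the function explains. The relevant level definitions from include/linux/init.h:

/* A lower initcall level runs earlier during boot. */
#define core_initcall(fn)		__define_initcall(fn, 1)
#define postcore_initcall(fn)		__define_initcall(fn, 2)
#define arch_initcall(fn)		__define_initcall(fn, 3)
#define subsys_initcall(fn)		__define_initcall(fn, 4)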