2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
--- a/kernel/mm/memcontrol.c
+++ b/kernel/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
  * Lockless page tracking & accounting
  * Unified hierarchy configuration model
  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -65,22 +57,26 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
 #include "slab.h"
-#include <linux/locallock.h>
 
 #include <linux/uaccess.h>
 
 #include <trace/events/vmscan.h>
+#include <trace/hooks/mm.h>
 
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL_GPL(root_mem_cgroup);
 
-#define MEM_CGROUP_RECLAIM_RETRIES	5
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
@@ -90,30 +86,23 @@
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
 #else
-#define do_swap_account		0
+#define cgroup_memory_noswap		1
 #endif
 
-static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
 
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
-
-static const char *const mem_cgroup_lru_names[] = {
-	"inactive_anon",
-	"active_anon",
-	"inactive_file",
-	"active_file",
-	"unevictable",
-};
 
 #define THRESHOLDS_EVENTS_TARGET	128
 #define SOFTLIMIT_EVENTS_TARGET		1024
-#define NUMAINFO_EVENTS_TARGET		1024
 
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -213,14 +202,6 @@
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
-enum charge_type {
-	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_ANON,
-	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
-	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
-	NR_CHARGE_TYPE,
-};
-
 /* for encoding cft->private value on file */
 enum res_type {
 	_MEM,
@@ -251,7 +232,7 @@
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -271,8 +252,100 @@
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+	struct mem_cgroup *memcg;
+	unsigned int nr_bytes;
+	unsigned int nr_pages;
+	unsigned long flags;
+
+	/*
+	 * At this point all allocated objects are freed, and
+	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
+	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+	 *
+	 * The following sequence can lead to it:
+	 * 1) CPU0: objcg == stock->cached_objcg
+	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+	 *          PAGE_SIZE bytes are charged
+	 * 3) CPU1: a process from another memcg is allocating something,
+	 *          the stock is flushed,
+	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+	 * 4) CPU0: we release this object,
+	 *          92 bytes are added to stock->nr_bytes
+	 * 5) CPU0: stock is flushed,
+	 *          92 bytes are added to objcg->nr_charged_bytes
+	 *
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
+	 * This page will be uncharged in obj_cgroup_release().
+	 */
+	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+	nr_pages = nr_bytes >> PAGE_SHIFT;
+
+	spin_lock_irqsave(&objcg_lock, flags);
+	memcg = obj_cgroup_memcg(objcg);
+	if (nr_pages)
+		__memcg_kmem_uncharge(memcg, nr_pages);
+	list_del(&objcg->list);
+	mem_cgroup_put(memcg);
+	spin_unlock_irqrestore(&objcg_lock, flags);
+
+	percpu_ref_exit(ref);
+	kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+	struct obj_cgroup *objcg;
+	int ret;
+
+	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+	if (!objcg)
+		return NULL;
+
+	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+			      GFP_KERNEL);
+	if (ret) {
+		kfree(objcg);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&objcg->list);
+	return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+				  struct mem_cgroup *parent)
+{
+	struct obj_cgroup *objcg, *iter;
+
+	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+	spin_lock_irq(&objcg_lock);
+
+	/* Move active objcg to the parent's list */
+	xchg(&objcg->memcg, parent);
+	css_get(&parent->css);
+	list_add(&objcg->list, &parent->objcg_list);
+
+	/* Move already reparented objcgs to the parent's list */
+	list_for_each_entry(iter, &memcg->objcg_list, list) {
+		css_get(&parent->css);
+		xchg(&iter->memcg, parent);
+		css_put(&memcg->css);
+	}
+	list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+	spin_unlock_irq(&objcg_lock);
+
+	percpu_ref_kill(&objcg->refcnt);
+}
+
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
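
For readers following the race described in the obj_cgroup_release() comment above, here is a minimal user-space sketch of the byte-to-page remainder bookkeeping it ends up relying on. The names (charge_small, release_small, nr_charged_bytes as a plain counter) are invented for illustration and are not kernel API; the point is only that the leftover credit plus the released bytes always add up to whole pages.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE  4096u
    #define PAGE_SHIFT 12

    /* Hypothetical stand-in for objcg->nr_charged_bytes. */
    static unsigned int nr_charged_bytes;

    /* A 92-byte allocation charges a whole page; the unused credit is kept. */
    static void charge_small(unsigned int bytes)
    {
            nr_charged_bytes += PAGE_SIZE - bytes;  /* stock flushed on another CPU */
    }

    /* Releasing the object eventually returns its bytes to the same counter. */
    static void release_small(unsigned int bytes)
    {
            nr_charged_bytes += bytes;              /* local stock flushed later */
    }

    int main(void)
    {
            charge_small(92);
            release_small(92);
            /* The remainder is exactly what the WARN_ON_ONCE() above tolerates:
             * a multiple of PAGE_SIZE, ready to be uncharged as whole pages. */
            assert((nr_charged_bytes & (PAGE_SIZE - 1)) == 0);
            printf("%u page(s) to uncharge\n", nr_charged_bytes >> PAGE_SHIFT);
            return 0;
    }
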
@@ -315,14 +388,13 @@
 
 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that does
  * kmem_cache_alloc and the such to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -347,7 +419,7 @@
 	if (!old)
 		return 0;
 
-	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+	new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 	if (!new)
 		return -ENOMEM;
 
@@ -391,7 +463,7 @@
 	mutex_lock(&memcg_shrinker_map_mutex);
 	size = memcg_shrinker_map_size;
 	for_each_node(nid) {
-		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 		if (!map) {
 			memcg_free_shrinker_maps(memcg);
 			ret = -ENOMEM;
@@ -448,14 +520,6 @@
 	}
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -498,7 +562,17 @@
 	unsigned long ino = 0;
 
 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	memcg = page->mem_cgroup;
+
+	/*
+	 * The lowest bit set means that memcg isn't a valid
+	 * memcg pointer, but an obj_cgroups pointer.
+	 * In this case the page is shared and doesn't belong
+	 * to any specific memory cgroup.
+	 */
+	if ((unsigned long) memcg & 0x1UL)
+		memcg = NULL;
+
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -674,7 +748,7 @@
 		 */
 		__mem_cgroup_remove_exceeded(mz, mctz);
 		if (!soft_limit_excess(mz->memcg) ||
-		    !css_tryget_online(&mz->memcg->css))
+		    !css_tryget(&mz->memcg->css))
 			goto retry;
 done:
 	return mz;
@@ -691,33 +765,186 @@
 	return mz;
 }
 
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	return atomic_long_read(&memcg->events[event]);
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (memcg_stat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *parent;
+
+	parent = parent_mem_cgroup(pn->memcg);
+	if (!parent)
+		return NULL;
+	return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			      int val)
+{
+	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup *memcg;
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	memcg = pn->memcg;
+
+	/* Update memcg */
+	__mod_memcg_state(memcg, idx, val);
+
+	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+	if (vmstat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		pg_data_t *pgdat = lruvec_pgdat(lruvec);
+		struct mem_cgroup_per_node *pi;
+
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+			atomic_long_add(x, &pi->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val)
+{
+	/* Update node */
+	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+	/* Update memcg and lruvec */
+	if (!mem_cgroup_disabled())
+		__mod_memcg_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL_GPL(__mod_lruvec_state);
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+
+	/*
+	 * Untracked pages have no memcg, no lruvec. Update only the
+	 * node. If we reparent the slab objects to the root memcg,
+	 * when we free the slab object, we need to update the per-memcg
+	 * vmstats to keep it correct for the root memcg.
+	 */
+	if (!memcg) {
+		__mod_node_page_state(pgdat, idx, val);
+	} else {
+		lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		__mod_lruvec_state(lruvec, idx, val);
+	}
+	rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+	if (memcg)
+		mod_memcg_state(memcg, idx, val);
+	rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmevents[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+	return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool compound, int nr_pages)
+					 int nr_pages)
 {
-	/*
-	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
-	 * counted as CACHE even if it's on ANON LRU.
-	 */
-	if (PageAnon(page))
-		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
-	else {
-		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
-		if (PageSwapBacked(page))
-			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
-	}
-
-	if (compound) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
-	}
-
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -726,35 +953,7 @@
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-					   int nid, unsigned int lru_mask)
-{
-	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-	unsigned long nr = 0;
-	enum lru_list lru;
-
-	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
-	for_each_lru(lru) {
-		if (!(BIT(lru) & lru_mask))
-			continue;
-		nr += mem_cgroup_get_lru_size(lruvec, lru);
-	}
-	return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
-					     unsigned int lru_mask)
-{
-	unsigned long nr = 0;
-	int nid;
-
-	for_each_node_state(nid, N_MEMORY)
-		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
-	return nr;
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
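
The __mod_memcg_state() and __count_memcg_events() helpers added in the hunks above share one pattern: deltas accumulate in a per-CPU counter and are only folded into the shared hierarchical atomics once they exceed a batch threshold. A rough single-translation-unit sketch of that idea follows; the names (struct group, mod_state, CHARGE_BATCH) are invented stand-ins, and the real code uses this_cpu ops plus parent_mem_cgroup() rather than plain structs.

    #include <stdatomic.h>
    #include <stdlib.h>

    #define CHARGE_BATCH 64                 /* stand-in for MEMCG_CHARGE_BATCH */

    struct group {
            struct group *parent;
            _Atomic long vmstat;            /* hierarchical counter */
    };

    /* One of these would exist per CPU in the kernel. */
    struct pcpu_stat {
            long pending;
    };

    void mod_state(struct group *g, struct pcpu_stat *pcp, long val)
    {
            long x = pcp->pending + val;

            if (labs(x) > CHARGE_BATCH) {
                    /* Flush the batch up the hierarchy, like the for (mi...) loop. */
                    for (struct group *mi = g; mi; mi = mi->parent)
                            atomic_fetch_add(&mi->vmstat, x);
                    x = 0;
            }
            pcp->pending = x;               /* keep the remainder CPU-local */
    }

The design trade-off is the same one the kernel comment hints at: readers of the hierarchical counters may lag by up to one batch per CPU, in exchange for far fewer atomic operations on hot paths.
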
@@ -762,8 +961,8 @@
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
-	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -773,13 +972,10 @@
 		case MEM_CGROUP_TARGET_SOFTLIMIT:
 			next = val + SOFTLIMIT_EVENTS_TARGET;
 			break;
-		case MEM_CGROUP_TARGET_NUMAINFO:
-			next = val + NUMAINFO_EVENTS_TARGET;
-			break;
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat_cpu->targets[target], next);
+		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -795,21 +991,12 @@
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
 		bool do_softlimit;
-		bool do_numainfo __maybe_unused;
 
 		do_softlimit = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
-		do_numainfo = mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_NUMAINFO);
-#endif
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
 			mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
-		if (unlikely(do_numainfo))
-			atomic_inc(&memcg->numainfo_events);
-#endif
 	}
 }
 
@@ -877,27 +1064,60 @@
 		return NULL;
 
 	rcu_read_lock();
-	if (!memcg || !css_tryget_online(&memcg->css))
+	/* Page should not get uncharged and freed memcg under us. */
+	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
 		memcg = root_mem_cgroup;
 	rcu_read_unlock();
 	return memcg;
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_page);
 
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+	if (in_interrupt())
+		return this_cpu_read(int_active_memcg);
+	else
+		return current->active_memcg;
+}
+
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = active_memcg();
+	/* remote memcg must hold a ref. */
+	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+		memcg = root_mem_cgroup;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+	/* Allow remote memcg charging from any context. */
+	if (unlikely(active_memcg()))
+		return false;
+
+	/* Memcg to charge can't be determined. */
+	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+		return true;
+
+	return false;
+}
+
 /**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
+ * If active memcg is set, do not fall back to current->mm->memcg.
  */
 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 {
-	if (unlikely(current->active_memcg)) {
-		struct mem_cgroup *memcg = root_mem_cgroup;
+	if (memcg_kmem_bypass())
+		return NULL;
 
-		rcu_read_lock();
-		if (css_tryget_online(&current->active_memcg->css))
-			memcg = current->active_memcg;
-		rcu_read_unlock();
-		return memcg;
-	}
+	if (unlikely(active_memcg()))
+		return get_active_memcg();
+
 	return get_mem_cgroup_from_mm(current->mm);
 }
 
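
The bypass logic introduced above reduces to a small decision tree: a remote (active) memcg set by the caller always wins, otherwise charging is skipped for contexts that have no mm to charge against, and only then does the caller's mm decide. A condensed restatement of that flow, with invented parameter names and no reference counting, purely as a reading aid rather than a drop-in replacement for the kernel helpers:

    #include <stdbool.h>
    #include <stddef.h>

    struct mem_cgroup;      /* opaque in this sketch */

    struct mem_cgroup *choose_charge_target(bool in_irq, bool has_mm, bool kthread,
                                            struct mem_cgroup *remote,
                                            struct mem_cgroup *mm_memcg)
    {
            if (remote)                       /* explicit remote charging always wins */
                    return remote;
            if (in_irq || !has_mm || kthread)
                    return NULL;              /* bypass: no memcg can be determined */
            return mm_memcg;                  /* default: charge the caller's mm */
    }
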
@@ -914,15 +1134,15 @@
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *iter;
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
@@ -948,7 +1168,7 @@
 		struct mem_cgroup_per_node *mz;
 
 		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
-		iter = &mz->iter[reclaim->priority];
+		iter = &mz->iter;
 
 		if (prev && reclaim->generation != iter->generation)
 			goto out_unlock;
@@ -1048,15 +1268,11 @@
 	struct mem_cgroup_reclaim_iter *iter;
 	struct mem_cgroup_per_node *mz;
 	int nid;
-	int i;
 
 	for_each_node(nid) {
 		mz = mem_cgroup_nodeinfo(from, nid);
-		for (i = 0; i <= DEF_PRIORITY; i++) {
-			iter = &mz->iter[i];
-			cmpxchg(&iter->position,
-				dead_memcg, NULL);
-		}
+		iter = &mz->iter;
+		cmpxchg(&iter->position, dead_memcg, NULL);
 	}
 }
 
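
The cmpxchg() in the hunk above only clears iter->position while it still points at the dying memcg, so a concurrent walker that has already advanced the iterator is left untouched. A tiny illustration of that conditional-clear pattern using C11 atomics; these are user-space stand-ins, not the kernel cmpxchg macro:

    #include <stdatomic.h>
    #include <stddef.h>

    struct memcg;                                    /* opaque for the sketch */

    void invalidate_position(_Atomic(struct memcg *) *position, struct memcg *dead)
    {
            struct memcg *expected = dead;

            /* Same spirit as cmpxchg(&iter->position, dead_memcg, NULL):
             * the store only happens if nobody moved the iterator meanwhile. */
            atomic_compare_exchange_strong(position, &expected, (struct memcg *)NULL);
    }
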
@@ -1106,7 +1322,7 @@
 		struct css_task_iter it;
 		struct task_struct *task;
 
-		css_task_iter_start(&iter->css, 0, &it);
+		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 		while (!ret && (task = css_task_iter_next(&it)))
 			ret = fn(task, arg);
 		css_task_iter_end(&it);
@@ -1123,9 +1339,8 @@
  * @page: the page
  * @pgdat: pgdat of the page
  *
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
  */
 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 {
@@ -1134,7 +1349,7 @@
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
-		lruvec = &pgdat->lruvec;
+		lruvec = &pgdat->__lruvec;
 		goto out;
 	}
 
@@ -1158,6 +1373,38 @@
 		lruvec->pgdat = pgdat;
 	return lruvec;
 }
+
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
+{
+	struct lruvec *lruvec;
+
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+	return lruvec;
+}
+EXPORT_SYMBOL_GPL(page_to_lruvec);
+
+void do_traversal_all_lruvec(void)
+{
+	pg_data_t *pgdat;
+
+	for_each_online_pgdat(pgdat) {
+		struct mem_cgroup *memcg = NULL;
+
+		spin_lock_irq(&pgdat->lru_lock);
+		memcg = mem_cgroup_iter(NULL, NULL, NULL);
+		do {
+			struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+			trace_android_vh_do_traversal_lruvec(lruvec);
+
+			memcg = mem_cgroup_iter(NULL, memcg, NULL);
+		} while (memcg);
+
+		spin_unlock_irq(&pgdat->lru_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
 
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
@@ -1197,32 +1444,7 @@
 	if (nr_pages > 0)
 		*lru_size += nr_pages;
 }
-
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_lru_size);
 
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
@@ -1245,7 +1467,7 @@
 	if (do_memsw_account()) {
 		count = page_counter_read(&memcg->memsw);
 		limit = READ_ONCE(memcg->memsw.max);
-		if (count <= limit)
+		if (count < limit)
 			margin = min(margin, limit - count);
 		else
 			margin = 0;
....@@ -1299,85 +1521,199 @@
12991521 return false;
13001522 }
13011523
1302
-static const unsigned int memcg1_stats[] = {
1303
- MEMCG_CACHE,
1304
- MEMCG_RSS,
1305
- MEMCG_RSS_HUGE,
1306
- NR_SHMEM,
1307
- NR_FILE_MAPPED,
1308
- NR_FILE_DIRTY,
1309
- NR_WRITEBACK,
1310
- MEMCG_SWAP,
1524
+struct memory_stat {
1525
+ const char *name;
1526
+ unsigned int ratio;
1527
+ unsigned int idx;
13111528 };
13121529
1313
-static const char *const memcg1_stat_names[] = {
1314
- "cache",
1315
- "rss",
1316
- "rss_huge",
1317
- "shmem",
1318
- "mapped_file",
1319
- "dirty",
1320
- "writeback",
1321
- "swap",
1530
+static struct memory_stat memory_stats[] = {
1531
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1532
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1533
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1534
+ { "percpu", 1, MEMCG_PERCPU_B },
1535
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1536
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1537
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1538
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1539
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1540
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1541
+ /*
1542
+ * The ratio will be initialized in memory_stats_init(). Because
1543
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1544
+ * constant(e.g. powerpc).
1545
+ */
1546
+ { "anon_thp", 0, NR_ANON_THPS },
1547
+#endif
1548
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1549
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1550
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1551
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1552
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1553
+
1554
+ /*
1555
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1556
+ * together and slab_reclaimable must be in front.
1557
+ */
1558
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1559
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1560
+
1561
+ /* The memory events */
1562
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1563
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1564
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1565
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1566
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1567
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1568
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13221569 };
1570
+
1571
+static int __init memory_stats_init(void)
1572
+{
1573
+ int i;
1574
+
1575
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1576
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1577
+ if (memory_stats[i].idx == NR_ANON_THPS)
1578
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1579
+#endif
1580
+ VM_BUG_ON(!memory_stats[i].ratio);
1581
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1582
+ }
1583
+
1584
+ return 0;
1585
+}
1586
+pure_initcall(memory_stats_init);
1587
+
1588
+static char *memory_stat_format(struct mem_cgroup *memcg)
1589
+{
1590
+ struct seq_buf s;
1591
+ int i;
1592
+
1593
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1594
+ if (!s.buffer)
1595
+ return NULL;
1596
+
1597
+ /*
1598
+ * Provide statistics on the state of the memory subsystem as
1599
+ * well as cumulative event counters that show past behavior.
1600
+ *
1601
+ * This list is ordered following a combination of these gradients:
1602
+ * 1) generic big picture -> specifics and details
1603
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1604
+ *
1605
+ * Current memory state:
1606
+ */
1607
+
1608
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1609
+ u64 size;
1610
+
1611
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1612
+ size *= memory_stats[i].ratio;
1613
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1614
+
1615
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1616
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1617
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1618
+ seq_buf_printf(&s, "slab %llu\n", size);
1619
+ }
1620
+ }
1621
+
1622
+ /* Accumulated memory events */
1623
+
1624
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1625
+ memcg_events(memcg, PGFAULT));
1626
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1627
+ memcg_events(memcg, PGMAJFAULT));
1628
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1629
+ memcg_events(memcg, PGREFILL));
1630
+ seq_buf_printf(&s, "pgscan %lu\n",
1631
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1632
+ memcg_events(memcg, PGSCAN_DIRECT));
1633
+ seq_buf_printf(&s, "pgsteal %lu\n",
1634
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1635
+ memcg_events(memcg, PGSTEAL_DIRECT));
1636
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1637
+ memcg_events(memcg, PGACTIVATE));
1638
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1639
+ memcg_events(memcg, PGDEACTIVATE));
1640
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1641
+ memcg_events(memcg, PGLAZYFREE));
1642
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1643
+ memcg_events(memcg, PGLAZYFREED));
1644
+
1645
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1646
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1647
+ memcg_events(memcg, THP_FAULT_ALLOC));
1648
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1649
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1650
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1651
+
1652
+ /* The above should easily fit into one page */
1653
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1654
+
1655
+ return s.buffer;
1656
+}
13231657
13241658 #define K(x) ((x) << (PAGE_SHIFT-10))
13251659 /**
1326
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1660
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1661
+ * memory controller.
13271662 * @memcg: The memory cgroup that went over limit
13281663 * @p: Task that is going to be killed
13291664 *
13301665 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13311666 * enabled
13321667 */
1333
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1668
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13341669 {
1335
- struct mem_cgroup *iter;
1336
- unsigned int i;
1337
-
13381670 rcu_read_lock();
13391671
1672
+ if (memcg) {
1673
+ pr_cont(",oom_memcg=");
1674
+ pr_cont_cgroup_path(memcg->css.cgroup);
1675
+ } else
1676
+ pr_cont(",global_oom");
13401677 if (p) {
1341
- pr_info("Task in ");
1678
+ pr_cont(",task_memcg=");
13421679 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1343
- pr_cont(" killed as a result of limit of ");
1344
- } else {
1345
- pr_info("Memory limit reached of cgroup ");
13461680 }
1347
-
1348
- pr_cont_cgroup_path(memcg->css.cgroup);
1349
- pr_cont("\n");
1350
-
13511681 rcu_read_unlock();
1682
+}
1683
+
1684
+/**
1685
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1686
+ * memory controller.
1687
+ * @memcg: The memory cgroup that went over limit
1688
+ */
1689
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1690
+{
1691
+ char *buf;
13521692
13531693 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13541694 K((u64)page_counter_read(&memcg->memory)),
1355
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1356
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->memsw)),
1358
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1359
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1360
- K((u64)page_counter_read(&memcg->kmem)),
1361
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1362
-
1363
- for_each_mem_cgroup_tree(iter, memcg) {
1364
- pr_info("Memory cgroup stats for ");
1365
- pr_cont_cgroup_path(iter->css.cgroup);
1366
- pr_cont(":");
1367
-
1368
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1369
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1370
- continue;
1371
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1372
- K(memcg_page_state(iter, memcg1_stats[i])));
1373
- }
1374
-
1375
- for (i = 0; i < NR_LRU_LISTS; i++)
1376
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1377
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1378
-
1379
- pr_cont("\n");
1695
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1696
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1697
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1698
+ K((u64)page_counter_read(&memcg->swap)),
1699
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1700
+ else {
1701
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1702
+ K((u64)page_counter_read(&memcg->memsw)),
1703
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1704
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->kmem)),
1706
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13801707 }
1708
+
1709
+ pr_info("Memory cgroup stats for ");
1710
+ pr_cont_cgroup_path(memcg->css.cgroup);
1711
+ pr_cont(":");
1712
+ buf = memory_stat_format(memcg);
1713
+ if (!buf)
1714
+ return;
1715
+ pr_info("%s", buf);
1716
+ kfree(buf);
13811717 }
13821718
13831719 /*
....@@ -1385,19 +1721,26 @@
13851721 */
13861722 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13871723 {
1388
- unsigned long max;
1724
+ unsigned long max = READ_ONCE(memcg->memory.max);
13891725
1390
- max = memcg->memory.max;
1391
- if (mem_cgroup_swappiness(memcg)) {
1392
- unsigned long memsw_max;
1393
- unsigned long swap_max;
1726
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1727
+ if (mem_cgroup_swappiness(memcg))
1728
+ max += min(READ_ONCE(memcg->swap.max),
1729
+ (unsigned long)total_swap_pages);
1730
+ } else { /* v1 */
1731
+ if (mem_cgroup_swappiness(memcg)) {
1732
+ /* Calculate swap excess capacity from memsw limit */
1733
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13941734
1395
- memsw_max = memcg->memsw.max;
1396
- swap_max = memcg->swap.max;
1397
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1398
- max = min(max + swap_max, memsw_max);
1735
+ max += min(swap, (unsigned long)total_swap_pages);
1736
+ }
13991737 }
14001738 return max;
1739
+}
1740
+
1741
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1742
+{
1743
+ return page_counter_read(&memcg->memory);
14011744 }
14021745
14031746 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
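
The reworked mem_cgroup_get_max() above differs between hierarchies: on cgroup v2 the swap limit is simply added on top of memory.max, while on v1 only the headroom of memsw.max above memory.max counts as swap. A worked example under assumed limits of memory.max = 1000 pages and swap.max / memsw.max = 1500 pages, with ample physical swap:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            unsigned long max = 1000, swap_max = 1500, memsw_max = 1500;
            unsigned long total_swap_pages = 1UL << 20;

            /* v2: memory and swap are independent limits. */
            unsigned long v2 = max + MIN(swap_max, total_swap_pages);        /* 2500 */

            /* v1: memsw already includes memory, so only the excess is swap. */
            unsigned long v1 = max + MIN(memsw_max - max, total_swap_pages); /* 1500 */

            printf("v2=%lu v1=%lu\n", v2, v1);
            return 0;
    }
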
....@@ -1410,112 +1753,24 @@
14101753 .gfp_mask = gfp_mask,
14111754 .order = order,
14121755 };
1413
- bool ret;
1756
+ bool ret = true;
14141757
14151758 if (mutex_lock_killable(&oom_lock))
14161759 return true;
1760
+
1761
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1762
+ goto unlock;
1763
+
14171764 /*
14181765 * A few threads which were not waiting at mutex_lock_killable() can
14191766 * fail to bail out. Therefore, check again after holding oom_lock.
14201767 */
1421
- ret = should_force_charge() || out_of_memory(&oc);
1768
+ ret = task_is_dying() || out_of_memory(&oc);
1769
+
1770
+unlock:
14221771 mutex_unlock(&oom_lock);
14231772 return ret;
14241773 }
1425
-
1426
-#if MAX_NUMNODES > 1
1427
-
1428
-/**
1429
- * test_mem_cgroup_node_reclaimable
1430
- * @memcg: the target memcg
1431
- * @nid: the node ID to be checked.
1432
- * @noswap : specify true here if the user wants flle only information.
1433
- *
1434
- * This function returns whether the specified memcg contains any
1435
- * reclaimable pages on a node. Returns true if there are any reclaimable
1436
- * pages in the node.
1437
- */
1438
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1439
- int nid, bool noswap)
1440
-{
1441
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1442
- return true;
1443
- if (noswap || !total_swap_pages)
1444
- return false;
1445
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1446
- return true;
1447
- return false;
1448
-
1449
-}
1450
-
1451
-/*
1452
- * Always updating the nodemask is not very good - even if we have an empty
1453
- * list or the wrong list here, we can start from some node and traverse all
1454
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1455
- *
1456
- */
1457
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1458
-{
1459
- int nid;
1460
- /*
1461
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1462
- * pagein/pageout changes since the last update.
1463
- */
1464
- if (!atomic_read(&memcg->numainfo_events))
1465
- return;
1466
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1467
- return;
1468
-
1469
- /* make a nodemask where this memcg uses memory from */
1470
- memcg->scan_nodes = node_states[N_MEMORY];
1471
-
1472
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1473
-
1474
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1475
- node_clear(nid, memcg->scan_nodes);
1476
- }
1477
-
1478
- atomic_set(&memcg->numainfo_events, 0);
1479
- atomic_set(&memcg->numainfo_updating, 0);
1480
-}
1481
-
1482
-/*
1483
- * Selecting a node where we start reclaim from. Because what we need is just
1484
- * reducing usage counter, start from anywhere is O,K. Considering
1485
- * memory reclaim from current node, there are pros. and cons.
1486
- *
1487
- * Freeing memory from current node means freeing memory from a node which
1488
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1489
- * hit limits, it will see a contention on a node. But freeing from remote
1490
- * node means more costs for memory reclaim because of memory latency.
1491
- *
1492
- * Now, we use round-robin. Better algorithm is welcomed.
1493
- */
1494
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1495
-{
1496
- int node;
1497
-
1498
- mem_cgroup_may_update_nodemask(memcg);
1499
- node = memcg->last_scanned_node;
1500
-
1501
- node = next_node_in(node, memcg->scan_nodes);
1502
- /*
1503
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1504
- * last time it really checked all the LRUs due to rate limiting.
1505
- * Fallback to the current node in that case for simplicity.
1506
- */
1507
- if (unlikely(node == MAX_NUMNODES))
1508
- node = numa_node_id();
1509
-
1510
- memcg->last_scanned_node = node;
1511
- return node;
1512
-}
1513
-#else
1514
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1515
-{
1516
- return 0;
1517
-}
1518
-#endif
15191774
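
The new early exit in mem_cgroup_out_of_memory() near the top of the hunk above is a classic re-check under the lock: several tasks may race into the OOM path, so after winning oom_lock the charge margin is re-evaluated before the expensive kill is attempted. A schematic restatement using pthreads and stubbed-out helpers (charge_margin() and kill_something() are invented placeholders, not kernel functions):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Illustrative stubs standing in for mem_cgroup_margin() and out_of_memory(). */
    static long charge_margin(void) { return 512; }
    static bool kill_something(void) { return true; }

    static bool out_of_memory_synchronized(long nr_pages)
    {
            bool ret = true;

            pthread_mutex_lock(&oom_lock);
            /* Someone else may have freed memory while we waited for the lock;
             * if the allocation now fits, there is nothing left to kill for. */
            if (charge_margin() >= nr_pages)
                    goto unlock;
            ret = kill_something();
    unlock:
            pthread_mutex_unlock(&oom_lock);
            return ret;
    }

    int main(void)
    {
            printf("%d\n", out_of_memory_synchronized(4));
            return 0;
    }
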
15201775 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15211776 pg_data_t *pgdat,
....@@ -1529,7 +1784,6 @@
15291784 unsigned long nr_scanned;
15301785 struct mem_cgroup_reclaim_cookie reclaim = {
15311786 .pgdat = pgdat,
1532
- .priority = 0,
15331787 };
15341788
15351789 excess = soft_limit_excess(root_memcg);
....@@ -1624,7 +1878,7 @@
16241878 struct mem_cgroup *iter;
16251879
16261880 spin_lock(&memcg_oom_lock);
1627
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1881
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16281882 for_each_mem_cgroup_tree(iter, memcg)
16291883 iter->oom_lock = false;
16301884 spin_unlock(&memcg_oom_lock);
@@ -1645,8 +1899,8 @@
 	struct mem_cgroup *iter;
 
 	/*
-	 * When a new child is created while the hierarchy is under oom,
-	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+	 * Be careful about under_oom underflows because a child memcg
+	 * could have been added after mem_cgroup_mark_under_oom.
 	 */
 	spin_lock(&memcg_oom_lock);
 	for_each_mem_cgroup_tree(iter, memcg)
....@@ -1706,6 +1960,8 @@
17061960
17071961 if (order > PAGE_ALLOC_COSTLY_ORDER)
17081962 return OOM_SKIPPED;
1963
+
1964
+ memcg_memory_event(memcg, MEMCG_OOM);
17091965
17101966 /*
17111967 * We are in the middle of the charge context here, so we
....@@ -1854,6 +2110,14 @@
18542110 goto out;
18552111
18562112 /*
2113
+ * If the victim task has been asynchronously moved to a different
2114
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2115
+ * In this case it's better to ignore memory.group.oom.
2116
+ */
2117
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2118
+ goto out;
2119
+
2120
+ /*
18572121 * Traverse the memory cgroup hierarchy from the victim task's
18582122 * cgroup up to the OOMing cgroup (or root) to find the
18592123 * highest-level memory cgroup with oom.group set.
....@@ -1894,6 +2158,7 @@
18942158 */
18952159 struct mem_cgroup *lock_page_memcg(struct page *page)
18962160 {
2161
+ struct page *head = compound_head(page); /* rmap on tail pages */
18972162 struct mem_cgroup *memcg;
18982163 unsigned long flags;
18992164
....@@ -1913,7 +2178,7 @@
19132178 if (mem_cgroup_disabled())
19142179 return NULL;
19152180 again:
1916
- memcg = page->mem_cgroup;
2181
+ memcg = head->mem_cgroup;
19172182 if (unlikely(!memcg))
19182183 return NULL;
19192184
....@@ -1921,7 +2186,7 @@
19212186 return memcg;
19222187
19232188 spin_lock_irqsave(&memcg->move_lock, flags);
1924
- if (memcg != page->mem_cgroup) {
2189
+ if (memcg != head->mem_cgroup) {
19252190 spin_unlock_irqrestore(&memcg->move_lock, flags);
19262191 goto again;
19272192 }
....@@ -1964,19 +2229,43 @@
19642229 */
19652230 void unlock_page_memcg(struct page *page)
19662231 {
1967
- __unlock_page_memcg(page->mem_cgroup);
2232
+ struct page *head = compound_head(page);
2233
+
2234
+ __unlock_page_memcg(head->mem_cgroup);
19682235 }
19692236 EXPORT_SYMBOL(unlock_page_memcg);
19702237
19712238 struct memcg_stock_pcp {
19722239 struct mem_cgroup *cached; /* this never be root cgroup */
19732240 unsigned int nr_pages;
2241
+
2242
+#ifdef CONFIG_MEMCG_KMEM
2243
+ struct obj_cgroup *cached_objcg;
2244
+ unsigned int nr_bytes;
2245
+#endif
2246
+
19742247 struct work_struct work;
19752248 unsigned long flags;
19762249 #define FLUSHING_CACHED_CHARGE 0
19772250 };
19782251 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19792252 static DEFINE_MUTEX(percpu_charge_mutex);
2253
+
2254
+#ifdef CONFIG_MEMCG_KMEM
2255
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2256
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2257
+ struct mem_cgroup *root_memcg);
2258
+
2259
+#else
2260
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2261
+{
2262
+}
2263
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2264
+ struct mem_cgroup *root_memcg)
2265
+{
2266
+ return false;
2267
+}
2268
+#endif
19802269
19812270 /**
19822271 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -2018,13 +2307,17 @@
20182307 {
20192308 struct mem_cgroup *old = stock->cached;
20202309
2310
+ if (!old)
2311
+ return;
2312
+
20212313 if (stock->nr_pages) {
20222314 page_counter_uncharge(&old->memory, stock->nr_pages);
20232315 if (do_memsw_account())
20242316 page_counter_uncharge(&old->memsw, stock->nr_pages);
2025
- css_put_many(&old->css, stock->nr_pages);
20262317 stock->nr_pages = 0;
20272318 }
2319
+
2320
+ css_put(&old->css);
20282321 stock->cached = NULL;
20292322 }
20302323
....@@ -2040,6 +2333,7 @@
20402333 local_irq_save(flags);
20412334
20422335 stock = this_cpu_ptr(&memcg_stock);
2336
+ drain_obj_stock(stock);
20432337 drain_stock(stock);
20442338 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20452339
....@@ -2060,6 +2354,7 @@
20602354 stock = this_cpu_ptr(&memcg_stock);
20612355 if (stock->cached != memcg) { /* reset if necessary */
20622356 drain_stock(stock);
2357
+ css_get(&memcg->css);
20632358 stock->cached = memcg;
20642359 }
20652360 stock->nr_pages += nr_pages;
....@@ -2087,34 +2382,37 @@
20872382 * as well as workers from this path always operate on the local
20882383 * per-cpu data. CPU up doesn't touch memcg_stock at all.
20892384 */
2090
- curcpu = get_cpu_light();
2385
+ curcpu = get_cpu();
20912386 for_each_online_cpu(cpu) {
20922387 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20932388 struct mem_cgroup *memcg;
2389
+ bool flush = false;
20942390
2391
+ rcu_read_lock();
20952392 memcg = stock->cached;
2096
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2097
- continue;
2098
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2099
- css_put(&memcg->css);
2100
- continue;
2101
- }
2102
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2393
+ if (memcg && stock->nr_pages &&
2394
+ mem_cgroup_is_descendant(memcg, root_memcg))
2395
+ flush = true;
2396
+ if (obj_stock_flush_required(stock, root_memcg))
2397
+ flush = true;
2398
+ rcu_read_unlock();
2399
+
2400
+ if (flush &&
2401
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21032402 if (cpu == curcpu)
21042403 drain_local_stock(&stock->work);
21052404 else
21062405 schedule_work_on(cpu, &stock->work);
21072406 }
2108
- css_put(&memcg->css);
21092407 }
2110
- put_cpu_light();
2408
+ put_cpu();
21112409 mutex_unlock(&percpu_charge_mutex);
21122410 }
21132411
21142412 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21152413 {
21162414 struct memcg_stock_pcp *stock;
2117
- struct mem_cgroup *memcg;
2415
+ struct mem_cgroup *memcg, *mi;
21182416
21192417 stock = &per_cpu(memcg_stock, cpu);
21202418 drain_stock(stock);
....@@ -2126,9 +2424,10 @@
21262424 int nid;
21272425 long x;
21282426
2129
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2427
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21302428 if (x)
2131
- atomic_long_add(x, &memcg->stat[i]);
2429
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2430
+ atomic_long_add(x, &memcg->vmstats[i]);
21322431
21332432 if (i >= NR_VM_NODE_STAT_ITEMS)
21342433 continue;
....@@ -2139,32 +2438,48 @@
21392438 pn = mem_cgroup_nodeinfo(memcg, nid);
21402439 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21412440 if (x)
2142
- atomic_long_add(x, &pn->lruvec_stat[i]);
2441
+ do {
2442
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2443
+ } while ((pn = parent_nodeinfo(pn, nid)));
21432444 }
21442445 }
21452446
21462447 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21472448 long x;
21482449
2149
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2450
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21502451 if (x)
2151
- atomic_long_add(x, &memcg->events[i]);
2452
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2453
+ atomic_long_add(x, &memcg->vmevents[i]);
21522454 }
21532455 }
21542456
21552457 return 0;
21562458 }
21572459
2158
-static void reclaim_high(struct mem_cgroup *memcg,
2159
- unsigned int nr_pages,
2160
- gfp_t gfp_mask)
2460
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2461
+ unsigned int nr_pages,
2462
+ gfp_t gfp_mask)
21612463 {
2464
+ unsigned long nr_reclaimed = 0;
2465
+
21622466 do {
2163
- if (page_counter_read(&memcg->memory) <= memcg->high)
2467
+ unsigned long pflags;
2468
+
2469
+ if (page_counter_read(&memcg->memory) <=
2470
+ READ_ONCE(memcg->memory.high))
21642471 continue;
2472
+
21652473 memcg_memory_event(memcg, MEMCG_HIGH);
2166
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2167
- } while ((memcg = parent_mem_cgroup(memcg)));
2474
+
2475
+ psi_memstall_enter(&pflags);
2476
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2477
+ gfp_mask, true);
2478
+ psi_memstall_leave(&pflags);
2479
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2480
+ !mem_cgroup_is_root(memcg));
2481
+
2482
+ return nr_reclaimed;
21682483 }
21692484
21702485 static void high_work_func(struct work_struct *work)
....@@ -2176,35 +2491,238 @@
21762491 }
21772492
21782493 /*
2494
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2495
+ * enough to still cause a significant slowdown in most cases, while still
2496
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2497
+ */
2498
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2499
+
2500
+/*
2501
+ * When calculating the delay, we use these either side of the exponentiation to
2502
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2503
+ * below.
2504
+ *
2505
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2506
+ * overage ratio to a delay.
2507
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2508
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2509
+ * to produce a reasonable delay curve.
2510
+ *
2511
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2512
+ * reasonable delay curve compared to precision-adjusted overage, not
2513
+ * penalising heavily at first, but still making sure that growth beyond the
2514
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2515
+ * example, with a high of 100 megabytes:
2516
+ *
2517
+ * +-------+------------------------+
2518
+ * | usage | time to allocate in ms |
2519
+ * +-------+------------------------+
2520
+ * | 100M | 0 |
2521
+ * | 101M | 6 |
2522
+ * | 102M | 25 |
2523
+ * | 103M | 57 |
2524
+ * | 104M | 102 |
2525
+ * | 105M | 159 |
2526
+ * | 106M | 230 |
2527
+ * | 107M | 313 |
2528
+ * | 108M | 409 |
2529
+ * | 109M | 518 |
2530
+ * | 110M | 639 |
2531
+ * | 111M | 774 |
2532
+ * | 112M | 921 |
2533
+ * | 113M | 1081 |
2534
+ * | 114M | 1254 |
2535
+ * | 115M | 1439 |
2536
+ * | 116M | 1638 |
2537
+ * | 117M | 1849 |
2538
+ * | 118M | 2000 |
2539
+ * | 119M | 2000 |
2540
+ * | 120M | 2000 |
2541
+ * +-------+------------------------+
2542
+ */
2543
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2544
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2545
+
2546
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2547
+{
2548
+ u64 overage;
2549
+
2550
+ if (usage <= high)
2551
+ return 0;
2552
+
2553
+ /*
2554
+ * Prevent division by 0 in overage calculation by acting as if
2555
+ * it was a threshold of 1 page
2556
+ */
2557
+ high = max(high, 1UL);
2558
+
2559
+ overage = usage - high;
2560
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2561
+ return div64_u64(overage, high);
2562
+}
2563
+
2564
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2565
+{
2566
+ u64 overage, max_overage = 0;
2567
+
2568
+ do {
2569
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2570
+ READ_ONCE(memcg->memory.high));
2571
+ max_overage = max(overage, max_overage);
2572
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2573
+ !mem_cgroup_is_root(memcg));
2574
+
2575
+ return max_overage;
2576
+}
2577
+
2578
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2579
+{
2580
+ u64 overage, max_overage = 0;
2581
+
2582
+ do {
2583
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2584
+ READ_ONCE(memcg->swap.high));
2585
+ if (overage)
2586
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2587
+ max_overage = max(overage, max_overage);
2588
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2589
+ !mem_cgroup_is_root(memcg));
2590
+
2591
+ return max_overage;
2592
+}
2593
+
2594
+/*
2595
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2596
+ * is exceeding its memory.high by checking both it and its ancestors.
2597
+ */
2598
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2599
+ unsigned int nr_pages,
2600
+ u64 max_overage)
2601
+{
2602
+ unsigned long penalty_jiffies;
2603
+
2604
+ if (!max_overage)
2605
+ return 0;
2606
+
2607
+ /*
2608
+ * We use overage compared to memory.high to calculate the number of
2609
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2610
+ * fairly lenient on small overages, and increasingly harsh when the
2611
+ * memcg in question makes it clear that it has no intention of stopping
2612
+ * its crazy behaviour, so we exponentially increase the delay based on
2613
+ * overage amount.
2614
+ */
2615
+ penalty_jiffies = max_overage * max_overage * HZ;
2616
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2617
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2618
+
2619
+ /*
2620
+ * Factor in the task's own contribution to the overage, such that four
2621
+ * N-sized allocations are throttled approximately the same as one
2622
+ * 4N-sized allocation.
2623
+ *
2624
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2625
+ * larger the current charge patch is than that.
2626
+ */
2627
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2628
+}
2629
+
2630
+/*
21792631 * Scheduled by try_charge() to be executed from the userland return path
21802632 * and reclaims memory over the high limit.
21812633 */
21822634 void mem_cgroup_handle_over_high(void)
21832635 {
2636
+ unsigned long penalty_jiffies;
2637
+ unsigned long pflags;
2638
+ unsigned long nr_reclaimed;
21842639 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2640
+ int nr_retries = MAX_RECLAIM_RETRIES;
21852641 struct mem_cgroup *memcg;
2642
+ bool in_retry = false;
21862643
21872644 if (likely(!nr_pages))
21882645 return;
21892646
21902647 memcg = get_mem_cgroup_from_mm(current->mm);
2191
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2192
- css_put(&memcg->css);
21932648 current->memcg_nr_pages_over_high = 0;
2649
+
2650
+retry_reclaim:
2651
+ /*
2652
+ * The allocating task should reclaim at least the batch size, but for
2653
+ * subsequent retries we only want to do what's necessary to prevent oom
2654
+ * or breaching resource isolation.
2655
+ *
2656
+ * This is distinct from memory.max or page allocator behaviour because
2657
+ * memory.high is currently batched, whereas memory.max and the page
2658
+ * allocator run every time an allocation is made.
2659
+ */
2660
+ nr_reclaimed = reclaim_high(memcg,
2661
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2662
+ GFP_KERNEL);
2663
+
2664
+ /*
2665
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2666
+ * allocators proactively to slow down excessive growth.
2667
+ */
2668
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2669
+ mem_find_max_overage(memcg));
2670
+
2671
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2672
+ swap_find_max_overage(memcg));
2673
+
2674
+ /*
2675
+ * Clamp the max delay per usermode return so as to still keep the
2676
+ * application moving forwards and also permit diagnostics, albeit
2677
+ * extremely slowly.
2678
+ */
2679
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2680
+
2681
+ /*
2682
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2683
+ * that it's not even worth doing, in an attempt to be nice to those who
2684
+ * go only a small amount over their memory.high value and maybe haven't
2685
+ * been aggressively reclaimed enough yet.
2686
+ */
2687
+ if (penalty_jiffies <= HZ / 100)
2688
+ goto out;
2689
+
2690
+ /*
2691
+ * If reclaim is making forward progress but we're still over
2692
+ * memory.high, we want to encourage that rather than doing allocator
2693
+ * throttling.
2694
+ */
2695
+ if (nr_reclaimed || nr_retries--) {
2696
+ in_retry = true;
2697
+ goto retry_reclaim;
2698
+ }
2699
+
2700
+ /*
2701
+ * If we exit early, we're guaranteed to die (since
2702
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2703
+ * need to account for any ill-begotten jiffies to pay them off later.
2704
+ */
2705
+ psi_memstall_enter(&pflags);
2706
+ schedule_timeout_killable(penalty_jiffies);
2707
+ psi_memstall_leave(&pflags);
2708
+
2709
+out:
2710
+ css_put(&memcg->css);
21942711 }
21952712
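From userspace, the effect of this path is easiest to see through the cgroup v2 files: breaching memory.high shows up as "high" entries in memory.events, and the schedule_timeout_killable() sleep above is accounted as memory pressure via psi_memstall_enter(). A minimal sketch, assuming a writable cgroup v2 child at /sys/fs/cgroup/demo that the calling task has already been moved into (both the path and that setup are assumptions, not something this patch provides):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Assumed path: a delegated cgroup v2 child this process already lives in. */
#define CG "/sys/fs/cgroup/demo/"

static void write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

static void dump_file(const char *path)
{
	char buf[4096];
	FILE *f = fopen(path, "r");
	size_t n;

	if (!f)
		return;
	printf("== %s ==\n", path);
	while ((n = fread(buf, 1, sizeof(buf) - 1, f)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	fclose(f);
}

int main(void)
{
	/* Cap memory.high well below what we are about to touch. */
	write_file(CG "memory.high", "67108864");	/* 64 MiB */

	size_t sz = 128UL << 20;			/* then touch 128 MiB */
	char *p = malloc(sz);

	if (p)
		memset(p, 0xa5, sz);			/* charging happens here */

	/* "high" breaches, plus the stall time spent in the sleeps above. */
	dump_file(CG "memory.events");
	dump_file(CG "memory.pressure");
	free(p);
	return 0;
}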
21962713 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21972714 unsigned int nr_pages)
21982715 {
21992716 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2200
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2717
+ int nr_retries = MAX_RECLAIM_RETRIES;
22012718 struct mem_cgroup *mem_over_limit;
22022719 struct page_counter *counter;
2720
+ enum oom_status oom_status;
22032721 unsigned long nr_reclaimed;
2722
+ bool passed_oom = false;
22042723 bool may_swap = true;
22052724 bool drained = false;
2206
- bool oomed = false;
2207
- enum oom_status oom_status;
2725
+ unsigned long pflags;
22082726
22092727 if (mem_cgroup_is_root(memcg))
22102728 return 0;
....@@ -2239,15 +2757,6 @@
22392757 goto force;
22402758
22412759 /*
2242
- * Unlike in global OOM situations, memcg is not in a physical
2243
- * memory shortage. Allow dying and OOM-killed tasks to
2244
- * bypass the last charges so that they can exit quickly and
2245
- * free their memory.
2246
- */
2247
- if (unlikely(should_force_charge()))
2248
- goto force;
2249
-
2250
- /*
22512760 * Prevent unbounded recursion when reclaim operations need to
22522761 * allocate memory. This might exceed the limits temporarily,
22532762 * but we prefer facilitating memory reclaim and getting back
....@@ -2264,8 +2773,10 @@
22642773
22652774 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22662775
2776
+ psi_memstall_enter(&pflags);
22672777 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22682778 gfp_mask, may_swap);
2779
+ psi_memstall_leave(&pflags);
22692780
22702781 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22712782 goto retry;
....@@ -2299,16 +2810,15 @@
22992810 if (nr_retries--)
23002811 goto retry;
23012812
2302
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2813
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23032814 goto nomem;
23042815
23052816 if (gfp_mask & __GFP_NOFAIL)
23062817 goto force;
23072818
2308
- if (fatal_signal_pending(current))
2309
- goto force;
2310
-
2311
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2819
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2820
+ if (passed_oom && task_is_dying())
2821
+ goto nomem;
23122822
23132823 /*
23142824 * keep retrying as long as the memcg oom killer is able to make
....@@ -2317,15 +2827,10 @@
23172827 */
23182828 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23192829 get_order(nr_pages * PAGE_SIZE));
2320
- switch (oom_status) {
2321
- case OOM_SUCCESS:
2322
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2323
- oomed = true;
2830
+ if (oom_status == OOM_SUCCESS) {
2831
+ passed_oom = true;
2832
+ nr_retries = MAX_RECLAIM_RETRIES;
23242833 goto retry;
2325
- case OOM_FAILED:
2326
- goto force;
2327
- default:
2328
- goto nomem;
23292834 }
23302835 nomem:
23312836 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2339,12 +2844,10 @@
23392844 page_counter_charge(&memcg->memory, nr_pages);
23402845 if (do_memsw_account())
23412846 page_counter_charge(&memcg->memsw, nr_pages);
2342
- css_get_many(&memcg->css, nr_pages);
23432847
23442848 return 0;
23452849
23462850 done_restock:
2347
- css_get_many(&memcg->css, batch);
23482851 if (batch > nr_pages)
23492852 refill_stock(memcg, batch - nr_pages);
23502853
....@@ -2358,12 +2861,32 @@
23582861 * reclaim, the cost of mismatch is negligible.
23592862 */
23602863 do {
2361
- if (page_counter_read(&memcg->memory) > memcg->high) {
2362
- /* Don't bother a random interrupted task */
2363
- if (in_interrupt()) {
2864
+ bool mem_high, swap_high;
2865
+
2866
+ mem_high = page_counter_read(&memcg->memory) >
2867
+ READ_ONCE(memcg->memory.high);
2868
+ swap_high = page_counter_read(&memcg->swap) >
2869
+ READ_ONCE(memcg->swap.high);
2870
+
2871
+ /* Don't bother a random interrupted task */
2872
+ if (in_interrupt()) {
2873
+ if (mem_high) {
23642874 schedule_work(&memcg->high_work);
23652875 break;
23662876 }
2877
+ continue;
2878
+ }
2879
+
2880
+ if (mem_high || swap_high) {
2881
+ /*
2882
+ * The allocating tasks in this cgroup will need to do
2883
+ * reclaim or be throttled to prevent further growth
2884
+ * of the memory or swap footprints.
2885
+ *
2886
+ * Target some best-effort fairness between the tasks,
2887
+ * and distribute reclaim work and delay penalties
2888
+ * based on how much each task is actually allocating.
2889
+ */
23672890 current->memcg_nr_pages_over_high += batch;
23682891 set_notify_resume(current);
23692892 break;
....@@ -2373,6 +2896,7 @@
23732896 return 0;
23742897 }
23752898
2899
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23762900 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23772901 {
23782902 if (mem_cgroup_is_root(memcg))
....@@ -2381,76 +2905,124 @@
23812905 page_counter_uncharge(&memcg->memory, nr_pages);
23822906 if (do_memsw_account())
23832907 page_counter_uncharge(&memcg->memsw, nr_pages);
2384
-
2385
- css_put_many(&memcg->css, nr_pages);
23862908 }
2909
+#endif
23872910
2388
-static void lock_page_lru(struct page *page, int *isolated)
2911
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23892912 {
2390
- struct zone *zone = page_zone(page);
2391
-
2392
- spin_lock_irq(zone_lru_lock(zone));
2393
- if (PageLRU(page)) {
2394
- struct lruvec *lruvec;
2395
-
2396
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2397
- ClearPageLRU(page);
2398
- del_page_from_lru_list(page, lruvec, page_lru(page));
2399
- *isolated = 1;
2400
- } else
2401
- *isolated = 0;
2402
-}
2403
-
2404
-static void unlock_page_lru(struct page *page, int isolated)
2405
-{
2406
- struct zone *zone = page_zone(page);
2407
-
2408
- if (isolated) {
2409
- struct lruvec *lruvec;
2410
-
2411
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2412
- VM_BUG_ON_PAGE(PageLRU(page), page);
2413
- SetPageLRU(page);
2414
- add_page_to_lru_list(page, lruvec, page_lru(page));
2415
- }
2416
- spin_unlock_irq(zone_lru_lock(zone));
2417
-}
2418
-
2419
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2420
- bool lrucare)
2421
-{
2422
- int isolated;
2423
-
24242913 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2425
-
24262914 /*
2427
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2428
- * may already be on some other mem_cgroup's LRU. Take care of it.
2429
- */
2430
- if (lrucare)
2431
- lock_page_lru(page, &isolated);
2432
-
2433
- /*
2434
- * Nobody should be changing or seriously looking at
2435
- * page->mem_cgroup at this point:
2915
+ * Any of the following ensures page->mem_cgroup stability:
24362916 *
2437
- * - the page is uncharged
2438
- *
2439
- * - the page is off-LRU
2440
- *
2441
- * - an anonymous fault has exclusive page access, except for
2442
- * a locked page table
2443
- *
2444
- * - a page cache insertion, a swapin fault, or a migration
2445
- * have the page locked
2917
+ * - the page lock
2918
+ * - LRU isolation
2919
+ * - lock_page_memcg()
2920
+ * - exclusive reference
24462921 */
24472922 page->mem_cgroup = memcg;
2448
-
2449
- if (lrucare)
2450
- unlock_page_lru(page, isolated);
24512923 }
24522924
24532925 #ifdef CONFIG_MEMCG_KMEM
2926
+/*
2927
+ * The allocated objcg pointers array is not accounted directly.
2928
+ * Moreover, it should not come from a DMA buffer and is not readily
2929
+ * reclaimable. So those GFP bits should be masked off.
2930
+ */
2931
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2932
+
2933
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2934
+ gfp_t gfp)
2935
+{
2936
+ unsigned int objects = objs_per_slab_page(s, page);
2937
+ void *vec;
2938
+
2939
+ gfp &= ~OBJCGS_CLEAR_MASK;
2940
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2941
+ page_to_nid(page));
2942
+ if (!vec)
2943
+ return -ENOMEM;
2944
+
2945
+ if (cmpxchg(&page->obj_cgroups, NULL,
2946
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2947
+ kfree(vec);
2948
+ else
2949
+ kmemleak_not_leak(vec);
2950
+
2951
+ return 0;
2952
+}
2953
+
2954
+/*
2955
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2956
+ *
2957
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2958
+ * cgroup_mutex, etc.
2959
+ */
2960
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2961
+{
2962
+ struct page *page;
2963
+
2964
+ if (mem_cgroup_disabled())
2965
+ return NULL;
2966
+
2967
+ page = virt_to_head_page(p);
2968
+
2969
+ /*
2970
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2971
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2972
+ * bit of the pointer is set.
2973
+ * The page->mem_cgroup pointer can be asynchronously changed
2974
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2975
+ * from a valid memcg pointer to objcg vector or back.
2976
+ */
2977
+ if (!page->mem_cgroup)
2978
+ return NULL;
2979
+
2980
+ /*
2981
+ * Slab objects are accounted individually, not per-page.
2982
+ * Memcg membership data for each individual object is saved in
2983
+ * the page->obj_cgroups.
2984
+ */
2985
+ if (page_has_obj_cgroups(page)) {
2986
+ struct obj_cgroup *objcg;
2987
+ unsigned int off;
2988
+
2989
+ off = obj_to_index(page->slab_cache, page, p);
2990
+ objcg = page_obj_cgroups(page)[off];
2991
+ if (objcg)
2992
+ return obj_cgroup_memcg(objcg);
2993
+
2994
+ return NULL;
2995
+ }
2996
+
2997
+ /* All other pages use page->mem_cgroup */
2998
+ return page->mem_cgroup;
2999
+}
3000
+
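Both memcg_alloc_page_obj_cgroups() and mem_cgroup_from_obj() lean on the same trick: the vector allocation is at least word aligned, so bit 0 of a valid pointer is always clear and can be used to mark "this is an obj_cgroup vector" rather than a plain memcg pointer. A standalone toy version of that tagging, with made-up names and purely for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Bit 0 distinguishes a tagged vector pointer from a plain object pointer. */
#define VECTOR_TAG 0x1UL

static void *tag_vector(void *vec)
{
	return (void *)((uintptr_t)vec | VECTOR_TAG);
}

static int is_vector(const void *p)
{
	return ((uintptr_t)p & VECTOR_TAG) != 0;
}

static void *untag(void *p)
{
	return (void *)((uintptr_t)p & ~VECTOR_TAG);
}

int main(void)
{
	long memcg = 42;				/* stands in for a struct mem_cgroup */
	void *objcg_vec = calloc(8, sizeof(void *));	/* stands in for obj_cgroup *[] */
	void *slot;

	/* A plain memcg pointer: bit 0 clear, used as-is. */
	slot = &memcg;
	assert(!is_vector(slot));

	/* A tagged vector pointer: bit 0 set, must be masked off before use. */
	slot = tag_vector(objcg_vec);
	assert(is_vector(slot));
	assert(untag(slot) == objcg_vec);

	printf("tagged %p -> untagged %p\n", slot, untag(slot));
	free(objcg_vec);
	return 0;
}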
3001
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3002
+{
3003
+ struct obj_cgroup *objcg = NULL;
3004
+ struct mem_cgroup *memcg;
3005
+
3006
+ if (memcg_kmem_bypass())
3007
+ return NULL;
3008
+
3009
+ rcu_read_lock();
3010
+ if (unlikely(active_memcg()))
3011
+ memcg = active_memcg();
3012
+ else
3013
+ memcg = mem_cgroup_from_task(current);
3014
+
3015
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3016
+ objcg = rcu_dereference(memcg->objcg);
3017
+ if (objcg && obj_cgroup_tryget(objcg))
3018
+ break;
3019
+ objcg = NULL;
3020
+ }
3021
+ rcu_read_unlock();
3022
+
3023
+ return objcg;
3024
+}
3025
+
24543026 static int memcg_alloc_cache_id(void)
24553027 {
24563028 int id, size;
....@@ -2476,9 +3048,7 @@
24763048 else if (size > MEMCG_CACHES_MAX_SIZE)
24773049 size = MEMCG_CACHES_MAX_SIZE;
24783050
2479
- err = memcg_update_all_caches(size);
2480
- if (!err)
2481
- err = memcg_update_all_list_lrus(size);
3051
+ err = memcg_update_all_list_lrus(size);
24823052 if (!err)
24833053 memcg_nr_cache_ids = size;
24843054
....@@ -2496,152 +3066,17 @@
24963066 ida_simple_remove(&memcg_cache_ida, id);
24973067 }
24983068
2499
-struct memcg_kmem_cache_create_work {
2500
- struct mem_cgroup *memcg;
2501
- struct kmem_cache *cachep;
2502
- struct work_struct work;
2503
-};
2504
-
2505
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2506
-{
2507
- struct memcg_kmem_cache_create_work *cw =
2508
- container_of(w, struct memcg_kmem_cache_create_work, work);
2509
- struct mem_cgroup *memcg = cw->memcg;
2510
- struct kmem_cache *cachep = cw->cachep;
2511
-
2512
- memcg_create_kmem_cache(memcg, cachep);
2513
-
2514
- css_put(&memcg->css);
2515
- kfree(cw);
2516
-}
2517
-
2518
-/*
2519
- * Enqueue the creation of a per-memcg kmem_cache.
2520
- */
2521
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2522
- struct kmem_cache *cachep)
2523
-{
2524
- struct memcg_kmem_cache_create_work *cw;
2525
-
2526
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2527
- if (!cw)
2528
- return;
2529
-
2530
- css_get(&memcg->css);
2531
-
2532
- cw->memcg = memcg;
2533
- cw->cachep = cachep;
2534
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2535
-
2536
- queue_work(memcg_kmem_cache_wq, &cw->work);
2537
-}
2538
-
2539
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2540
- struct kmem_cache *cachep)
2541
-{
2542
- /*
2543
- * We need to stop accounting when we kmalloc, because if the
2544
- * corresponding kmalloc cache is not yet created, the first allocation
2545
- * in __memcg_schedule_kmem_cache_create will recurse.
2546
- *
2547
- * However, it is better to enclose the whole function. Depending on
2548
- * the debugging options enabled, INIT_WORK(), for instance, can
2549
- * trigger an allocation. This too, will make us recurse. Because at
2550
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2551
- * the safest choice is to do it like this, wrapping the whole function.
2552
- */
2553
- current->memcg_kmem_skip_account = 1;
2554
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2555
- current->memcg_kmem_skip_account = 0;
2556
-}
2557
-
2558
-static inline bool memcg_kmem_bypass(void)
2559
-{
2560
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2561
- return true;
2562
- return false;
2563
-}
2564
-
25653069 /**
2566
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2567
- * @cachep: the original global kmem cache
2568
- *
2569
- * Return the kmem_cache we're supposed to use for a slab allocation.
2570
- * We try to use the current memcg's version of the cache.
2571
- *
2572
- * If the cache does not exist yet, if we are the first user of it, we
2573
- * create it asynchronously in a workqueue and let the current allocation
2574
- * go through with the original cache.
2575
- *
2576
- * This function takes a reference to the cache it returns to assure it
2577
- * won't get destroyed while we are working with it. Once the caller is
2578
- * done with it, memcg_kmem_put_cache() must be called to release the
2579
- * reference.
2580
- */
2581
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2582
-{
2583
- struct mem_cgroup *memcg;
2584
- struct kmem_cache *memcg_cachep;
2585
- int kmemcg_id;
2586
-
2587
- VM_BUG_ON(!is_root_cache(cachep));
2588
-
2589
- if (memcg_kmem_bypass())
2590
- return cachep;
2591
-
2592
- if (current->memcg_kmem_skip_account)
2593
- return cachep;
2594
-
2595
- memcg = get_mem_cgroup_from_current();
2596
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2597
- if (kmemcg_id < 0)
2598
- goto out;
2599
-
2600
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2601
- if (likely(memcg_cachep))
2602
- return memcg_cachep;
2603
-
2604
- /*
2605
- * If we are in a safe context (can wait, and not in interrupt
2606
- * context), we could be be predictable and return right away.
2607
- * This would guarantee that the allocation being performed
2608
- * already belongs in the new cache.
2609
- *
2610
- * However, there are some clashes that can arrive from locking.
2611
- * For instance, because we acquire the slab_mutex while doing
2612
- * memcg_create_kmem_cache, this means no further allocation
2613
- * could happen with the slab_mutex held. So it's better to
2614
- * defer everything.
2615
- */
2616
- memcg_schedule_kmem_cache_create(memcg, cachep);
2617
-out:
2618
- css_put(&memcg->css);
2619
- return cachep;
2620
-}
2621
-
2622
-/**
2623
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2624
- * @cachep: the cache returned by memcg_kmem_get_cache
2625
- */
2626
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2627
-{
2628
- if (!is_root_cache(cachep))
2629
- css_put(&cachep->memcg_params.memcg->css);
2630
-}
2631
-
2632
-/**
2633
- * memcg_kmem_charge_memcg: charge a kmem page
2634
- * @page: page to charge
2635
- * @gfp: reclaim mode
2636
- * @order: allocation order
3070
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26373071 * @memcg: memory cgroup to charge
3072
+ * @gfp: reclaim mode
3073
+ * @nr_pages: number of pages to charge
26383074 *
26393075 * Returns 0 on success, an error code on failure.
26403076 */
2641
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2642
- struct mem_cgroup *memcg)
3077
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3078
+ unsigned int nr_pages)
26433079 {
2644
- unsigned int nr_pages = 1 << order;
26453080 struct page_counter *counter;
26463081 int ret;
26473082
....@@ -2664,43 +3099,54 @@
26643099 cancel_charge(memcg, nr_pages);
26653100 return -ENOMEM;
26663101 }
2667
-
2668
- page->mem_cgroup = memcg;
2669
-
26703102 return 0;
26713103 }
26723104
26733105 /**
2674
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3106
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3107
+ * @memcg: memcg to uncharge
3108
+ * @nr_pages: number of pages to uncharge
3109
+ */
3110
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3111
+{
3112
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3113
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3114
+
3115
+ refill_stock(memcg, nr_pages);
3116
+}
3117
+
3118
+/**
3119
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26753120 * @page: page to charge
26763121 * @gfp: reclaim mode
26773122 * @order: allocation order
26783123 *
26793124 * Returns 0 on success, an error code on failure.
26803125 */
2681
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3126
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26823127 {
26833128 struct mem_cgroup *memcg;
26843129 int ret = 0;
26853130
2686
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2687
- return 0;
2688
-
26893131 memcg = get_mem_cgroup_from_current();
2690
- if (!mem_cgroup_is_root(memcg)) {
2691
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2692
- if (!ret)
3132
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3133
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3134
+ if (!ret) {
3135
+ page->mem_cgroup = memcg;
26933136 __SetPageKmemcg(page);
3137
+ return 0;
3138
+ }
3139
+ css_put(&memcg->css);
26943140 }
2695
- css_put(&memcg->css);
26963141 return ret;
26973142 }
3143
+
26983144 /**
2699
- * memcg_kmem_uncharge: uncharge a kmem page
3145
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
27003146 * @page: page to uncharge
27013147 * @order: allocation order
27023148 */
2703
-void memcg_kmem_uncharge(struct page *page, int order)
3149
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27043150 {
27053151 struct mem_cgroup *memcg = page->mem_cgroup;
27063152 unsigned int nr_pages = 1 << order;
....@@ -2709,43 +3155,179 @@
27093155 return;
27103156
27113157 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2712
-
2713
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2714
- page_counter_uncharge(&memcg->kmem, nr_pages);
2715
-
2716
- page_counter_uncharge(&memcg->memory, nr_pages);
2717
- if (do_memsw_account())
2718
- page_counter_uncharge(&memcg->memsw, nr_pages);
2719
-
3158
+ __memcg_kmem_uncharge(memcg, nr_pages);
27203159 page->mem_cgroup = NULL;
3160
+ css_put(&memcg->css);
27213161
27223162 /* slab pages do not have PageKmemcg flag set */
27233163 if (PageKmemcg(page))
27243164 __ClearPageKmemcg(page);
2725
-
2726
- css_put_many(&memcg->css, nr_pages);
27273165 }
2728
-#endif /* CONFIG_MEMCG_KMEM */
27293166
2730
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2731
-
2732
-/*
2733
- * Because tail pages are not marked as "used", set it. We're under
2734
- * zone_lru_lock and migration entries setup in all page mappings.
2735
- */
2736
-void mem_cgroup_split_huge_fixup(struct page *head)
3167
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27373168 {
2738
- int i;
3169
+ struct memcg_stock_pcp *stock;
3170
+ unsigned long flags;
3171
+ bool ret = false;
27393172
2740
- if (mem_cgroup_disabled())
3173
+ local_irq_save(flags);
3174
+
3175
+ stock = this_cpu_ptr(&memcg_stock);
3176
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3177
+ stock->nr_bytes -= nr_bytes;
3178
+ ret = true;
3179
+ }
3180
+
3181
+ local_irq_restore(flags);
3182
+
3183
+ return ret;
3184
+}
3185
+
3186
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3187
+{
3188
+ struct obj_cgroup *old = stock->cached_objcg;
3189
+
3190
+ if (!old)
27413191 return;
27423192
2743
- for (i = 1; i < HPAGE_PMD_NR; i++)
2744
- head[i].mem_cgroup = head->mem_cgroup;
3193
+ if (stock->nr_bytes) {
3194
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3195
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27453196
2746
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3197
+ if (nr_pages) {
3198
+ struct mem_cgroup *memcg;
3199
+
3200
+ rcu_read_lock();
3201
+retry:
3202
+ memcg = obj_cgroup_memcg(old);
3203
+ if (unlikely(!css_tryget(&memcg->css)))
3204
+ goto retry;
3205
+ rcu_read_unlock();
3206
+
3207
+ __memcg_kmem_uncharge(memcg, nr_pages);
3208
+ css_put(&memcg->css);
3209
+ }
3210
+
3211
+ /*
3212
+ * The leftover is flushed to the centralized per-memcg value.
3213
+ * On the next attempt to refill obj stock it will be moved
3214
+ * to a per-cpu stock (probably, on another CPU), see
3215
+ * refill_obj_stock().
3216
+ *
3217
+ * How often it's flushed is a trade-off between the memory
3218
+ * limit enforcement accuracy and potential CPU contention,
3219
+ * so it might be changed in the future.
3220
+ */
3221
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3222
+ stock->nr_bytes = 0;
3223
+ }
3224
+
3225
+ obj_cgroup_put(old);
3226
+ stock->cached_objcg = NULL;
27473227 }
2748
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3228
+
3229
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3230
+ struct mem_cgroup *root_memcg)
3231
+{
3232
+ struct mem_cgroup *memcg;
3233
+
3234
+ if (stock->cached_objcg) {
3235
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3236
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3237
+ return true;
3238
+ }
3239
+
3240
+ return false;
3241
+}
3242
+
3243
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3244
+{
3245
+ struct memcg_stock_pcp *stock;
3246
+ unsigned long flags;
3247
+
3248
+ local_irq_save(flags);
3249
+
3250
+ stock = this_cpu_ptr(&memcg_stock);
3251
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3252
+ drain_obj_stock(stock);
3253
+ obj_cgroup_get(objcg);
3254
+ stock->cached_objcg = objcg;
3255
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3256
+ }
3257
+ stock->nr_bytes += nr_bytes;
3258
+
3259
+ if (stock->nr_bytes > PAGE_SIZE)
3260
+ drain_obj_stock(stock);
3261
+
3262
+ local_irq_restore(flags);
3263
+}
3264
+
3265
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3266
+{
3267
+ struct mem_cgroup *memcg;
3268
+ unsigned int nr_pages, nr_bytes;
3269
+ int ret;
3270
+
3271
+ if (consume_obj_stock(objcg, size))
3272
+ return 0;
3273
+
3274
+ /*
3275
+ * In theory, memcg->nr_charged_bytes can have enough
3276
+ * pre-charged bytes to satisfy the allocation. However,
3277
+ * flushing memcg->nr_charged_bytes requires two atomic
3278
+ * operations, and memcg->nr_charged_bytes can't be big,
3279
+ * so it's better to ignore it and try to grab some new pages.
3280
+ * memcg->nr_charged_bytes will be flushed in
3281
+ * refill_obj_stock(), called from this function or
3282
+ * independently later.
3283
+ */
3284
+ rcu_read_lock();
3285
+retry:
3286
+ memcg = obj_cgroup_memcg(objcg);
3287
+ if (unlikely(!css_tryget(&memcg->css)))
3288
+ goto retry;
3289
+ rcu_read_unlock();
3290
+
3291
+ nr_pages = size >> PAGE_SHIFT;
3292
+ nr_bytes = size & (PAGE_SIZE - 1);
3293
+
3294
+ if (nr_bytes)
3295
+ nr_pages += 1;
3296
+
3297
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3298
+ if (!ret && nr_bytes)
3299
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3300
+
3301
+ css_put(&memcg->css);
3302
+ return ret;
3303
+}
3304
+
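In other words, the charge is rounded up to whole pages and the unused tail of the last page is immediately handed back to the per-cpu stock, so later sub-page allocations can be satisfied without touching the page counters. A small arithmetic sketch of that split (PAGE_SIZE assumed to be 4 KiB; not part of the patch):

#include <stdio.h>

#define PAGE_SIZE_ILLUSTRATIVE 4096UL	/* assumed 4 KiB pages */

/*
 * Mirror of the size split above: charge whole pages and hand the unused
 * remainder of the last page back to the per-cpu byte stock.
 */
static void split_charge(unsigned long size)
{
	unsigned long nr_pages = size / PAGE_SIZE_ILLUSTRATIVE;
	unsigned long nr_bytes = size % PAGE_SIZE_ILLUSTRATIVE;
	unsigned long refill = 0;

	if (nr_bytes) {
		nr_pages += 1;					/* round the charge up */
		refill = PAGE_SIZE_ILLUSTRATIVE - nr_bytes;	/* leftover goes to the stock */
	}
	printf("size %6lu -> charge %lu page(s), refill stock with %lu bytes\n",
	       size, nr_pages, refill);
}

int main(void)
{
	split_charge(100);	/* 1 page charged, 3996 bytes stocked */
	split_charge(4096);	/* exactly 1 page, nothing stocked */
	split_charge(5000);	/* 2 pages charged, 3192 bytes stocked */
	return 0;
}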
3305
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3306
+{
3307
+ refill_obj_stock(objcg, size);
3308
+}
3309
+
3310
+#endif /* CONFIG_MEMCG_KMEM */
3311
+
3312
+/*
3313
+ * Because head->mem_cgroup is not set on tails, set it now.
3314
+ */
3315
+void split_page_memcg(struct page *head, unsigned int nr)
3316
+{
3317
+ struct mem_cgroup *memcg = head->mem_cgroup;
3318
+ int kmemcg = PageKmemcg(head);
3319
+ int i;
3320
+
3321
+ if (mem_cgroup_disabled() || !memcg)
3322
+ return;
3323
+
3324
+ for (i = 1; i < nr; i++) {
3325
+ head[i].mem_cgroup = memcg;
3326
+ if (kmemcg)
3327
+ __SetPageKmemcg(head + i);
3328
+ }
3329
+ css_get_many(&memcg->css, nr - 1);
3330
+}
27493331
27503332 #ifdef CONFIG_MEMCG_SWAP
27513333 /**
....@@ -2807,7 +3389,7 @@
28073389 * Make sure that the new limit (memsw or memory limit) doesn't
28083390 * break our basic invariant rule memory.max <= memsw.max.
28093391 */
2810
- limits_invariant = memsw ? max >= memcg->memory.max :
3392
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28113393 max <= memcg->memsw.max;
28123394 if (!limits_invariant) {
28133395 mutex_unlock(&memcg_max_mutex);
....@@ -2928,7 +3510,7 @@
29283510 * Test whether @memcg has children, dead or alive. Note that this
29293511 * function doesn't care whether @memcg has use_hierarchy enabled and
29303512 * returns %true if there are child csses according to the cgroup
2931
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3513
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29323514 */
29333515 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29343516 {
....@@ -2947,7 +3529,7 @@
29473529 */
29483530 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29493531 {
2950
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3532
+ int nr_retries = MAX_RECLAIM_RETRIES;
29513533
29523534 /* we call try-to-free pages for make this cgroup empty */
29533535 lru_add_drain_all();
....@@ -3021,50 +3603,15 @@
30213603 return retval;
30223604 }
30233605
3024
-struct accumulated_stats {
3025
- unsigned long stat[MEMCG_NR_STAT];
3026
- unsigned long events[NR_VM_EVENT_ITEMS];
3027
- unsigned long lru_pages[NR_LRU_LISTS];
3028
- const unsigned int *stats_array;
3029
- const unsigned int *events_array;
3030
- int stats_size;
3031
- int events_size;
3032
-};
3033
-
3034
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3035
- struct accumulated_stats *acc)
3036
-{
3037
- struct mem_cgroup *mi;
3038
- int i;
3039
-
3040
- for_each_mem_cgroup_tree(mi, memcg) {
3041
- for (i = 0; i < acc->stats_size; i++)
3042
- acc->stat[i] += memcg_page_state(mi,
3043
- acc->stats_array ? acc->stats_array[i] : i);
3044
-
3045
- for (i = 0; i < acc->events_size; i++)
3046
- acc->events[i] += memcg_sum_events(mi,
3047
- acc->events_array ? acc->events_array[i] : i);
3048
-
3049
- for (i = 0; i < NR_LRU_LISTS; i++)
3050
- acc->lru_pages[i] +=
3051
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3052
- }
3053
-}
3054
-
30553606 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30563607 {
3057
- unsigned long val = 0;
3608
+ unsigned long val;
30583609
30593610 if (mem_cgroup_is_root(memcg)) {
3060
- struct mem_cgroup *iter;
3061
-
3062
- for_each_mem_cgroup_tree(iter, memcg) {
3063
- val += memcg_page_state(iter, MEMCG_CACHE);
3064
- val += memcg_page_state(iter, MEMCG_RSS);
3065
- if (swap)
3066
- val += memcg_page_state(iter, MEMCG_SWAP);
3067
- }
3611
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3612
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3613
+ if (swap)
3614
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30683615 } else {
30693616 if (!swap)
30703617 val = page_counter_read(&memcg->memory);
....@@ -3125,9 +3672,61 @@
31253672 }
31263673 }
31273674
3675
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3676
+{
3677
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3678
+ struct mem_cgroup *mi;
3679
+ int node, cpu, i;
3680
+
3681
+ for_each_online_cpu(cpu)
3682
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3683
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3684
+
3685
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3686
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3687
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3688
+
3689
+ for_each_node(node) {
3690
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3691
+ struct mem_cgroup_per_node *pi;
3692
+
3693
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3694
+ stat[i] = 0;
3695
+
3696
+ for_each_online_cpu(cpu)
3697
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3698
+ stat[i] += per_cpu(
3699
+ pn->lruvec_stat_cpu->count[i], cpu);
3700
+
3701
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3702
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3703
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3704
+ }
3705
+}
3706
+
3707
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3708
+{
3709
+ unsigned long events[NR_VM_EVENT_ITEMS];
3710
+ struct mem_cgroup *mi;
3711
+ int cpu, i;
3712
+
3713
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3714
+ events[i] = 0;
3715
+
3716
+ for_each_online_cpu(cpu)
3717
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3718
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3719
+ cpu);
3720
+
3721
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3722
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3723
+ atomic_long_add(events[i], &mi->vmevents[i]);
3724
+}
3725
+
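Both flush helpers follow the same shape: fold the per-CPU deltas into one local array, then walk from the cgroup up to the root adding that array into each level's aggregated counters, so ancestors stay consistent once the child's per-CPU state goes away. A miniature model of that shape, with toy types that are not the kernel's:

#include <stdio.h>

#define NCPUS	4
#define NSTATS	2

struct toy_memcg {
	struct toy_memcg *parent;
	long vmstats[NSTATS];		/* aggregated, hierarchical */
	long percpu[NCPUS][NSTATS];	/* per-CPU deltas */
	const char *name;
};

static void flush_percpu(struct toy_memcg *memcg)
{
	long stat[NSTATS] = { 0 };

	/* Sum the per-CPU deltas once... */
	for (int cpu = 0; cpu < NCPUS; cpu++)
		for (int i = 0; i < NSTATS; i++)
			stat[i] += memcg->percpu[cpu][i];

	/* ...then add the totals to this cgroup and every ancestor. */
	for (struct toy_memcg *mi = memcg; mi; mi = mi->parent)
		for (int i = 0; i < NSTATS; i++)
			mi->vmstats[i] += stat[i];
}

int main(void)
{
	struct toy_memcg root = { .name = "root" };
	struct toy_memcg child = { .parent = &root, .name = "child" };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	child.percpu[1][1] = -2;	/* deltas may be negative */

	flush_percpu(&child);
	for (int i = 0; i < NSTATS; i++)
		printf("stat[%d]: child=%ld root=%ld\n",
		       i, child.vmstats[i], root.vmstats[i]);
	return 0;
}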
31283726 #ifdef CONFIG_MEMCG_KMEM
31293727 static int memcg_online_kmem(struct mem_cgroup *memcg)
31303728 {
3729
+ struct obj_cgroup *objcg;
31313730 int memcg_id;
31323731
31333732 if (cgroup_memory_nokmem)
....@@ -3140,7 +3739,16 @@
31403739 if (memcg_id < 0)
31413740 return memcg_id;
31423741
3143
- static_branch_inc(&memcg_kmem_enabled_key);
3742
+ objcg = obj_cgroup_alloc();
3743
+ if (!objcg) {
3744
+ memcg_free_cache_id(memcg_id);
3745
+ return -ENOMEM;
3746
+ }
3747
+ objcg->memcg = memcg;
3748
+ rcu_assign_pointer(memcg->objcg, objcg);
3749
+
3750
+ static_branch_enable(&memcg_kmem_enabled_key);
3751
+
31443752 /*
31453753 * A memory cgroup is considered kmem-online as soon as it gets
31463754 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3149,7 +3757,6 @@
31493757 */
31503758 memcg->kmemcg_id = memcg_id;
31513759 memcg->kmem_state = KMEM_ONLINE;
3152
- INIT_LIST_HEAD(&memcg->kmem_caches);
31533760
31543761 return 0;
31553762 }
....@@ -3162,22 +3769,17 @@
31623769
31633770 if (memcg->kmem_state != KMEM_ONLINE)
31643771 return;
3165
- /*
3166
- * Clear the online state before clearing memcg_caches array
3167
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3168
- * guarantees that no cache will be created for this cgroup
3169
- * after we are done (see memcg_create_kmem_cache()).
3170
- */
3772
+
31713773 memcg->kmem_state = KMEM_ALLOCATED;
3172
-
3173
- memcg_deactivate_kmem_caches(memcg);
3174
-
3175
- kmemcg_id = memcg->kmemcg_id;
3176
- BUG_ON(kmemcg_id < 0);
31773774
31783775 parent = parent_mem_cgroup(memcg);
31793776 if (!parent)
31803777 parent = root_mem_cgroup;
3778
+
3779
+ memcg_reparent_objcgs(memcg, parent);
3780
+
3781
+ kmemcg_id = memcg->kmemcg_id;
3782
+ BUG_ON(kmemcg_id < 0);
31813783
31823784 /*
31833785 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3207,12 +3809,6 @@
32073809 /* css_alloc() failed, offlining didn't happen */
32083810 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32093811 memcg_offline_kmem(memcg);
3210
-
3211
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3212
- memcg_destroy_kmem_caches(memcg);
3213
- static_branch_dec(&memcg_kmem_enabled_key);
3214
- WARN_ON(page_counter_read(&memcg->kmem));
3215
- }
32163812 }
32173813 #else
32183814 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3303,6 +3899,9 @@
33033899 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33043900 break;
33053901 case _KMEM:
3902
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3903
+ "Please report your usecase to linux-mm@kvack.org if you "
3904
+ "depend on this functionality.\n");
33063905 ret = memcg_update_kmem_max(memcg, nr_pages);
33073906 break;
33083907 case _TCP:
....@@ -3367,6 +3966,10 @@
33673966 {
33683967 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
33693968
3969
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3970
+ "Please report your usecase to linux-mm@kvack.org if you "
3971
+ "depend on this functionality.\n");
3972
+
33703973 if (val & ~MOVE_MASK)
33713974 return -EINVAL;
33723975
....@@ -3388,6 +3991,49 @@
33883991 #endif
33893992
33903993 #ifdef CONFIG_NUMA
3994
+
3995
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3996
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3997
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3998
+
3999
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4000
+ int nid, unsigned int lru_mask, bool tree)
4001
+{
4002
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4003
+ unsigned long nr = 0;
4004
+ enum lru_list lru;
4005
+
4006
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4007
+
4008
+ for_each_lru(lru) {
4009
+ if (!(BIT(lru) & lru_mask))
4010
+ continue;
4011
+ if (tree)
4012
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4013
+ else
4014
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4015
+ }
4016
+ return nr;
4017
+}
4018
+
4019
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4020
+ unsigned int lru_mask,
4021
+ bool tree)
4022
+{
4023
+ unsigned long nr = 0;
4024
+ enum lru_list lru;
4025
+
4026
+ for_each_lru(lru) {
4027
+ if (!(BIT(lru) & lru_mask))
4028
+ continue;
4029
+ if (tree)
4030
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4031
+ else
4032
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4033
+ }
4034
+ return nr;
4035
+}
4036
+
33914037 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33924038 {
33934039 struct numa_stat {
....@@ -3403,40 +4049,60 @@
34034049 };
34044050 const struct numa_stat *stat;
34054051 int nid;
3406
- unsigned long nr;
3407
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4052
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34084053
34094054 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3410
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3411
- seq_printf(m, "%s=%lu", stat->name, nr);
3412
- for_each_node_state(nid, N_MEMORY) {
3413
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3414
- stat->lru_mask);
3415
- seq_printf(m, " N%d=%lu", nid, nr);
3416
- }
4055
+ seq_printf(m, "%s=%lu", stat->name,
4056
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4057
+ false));
4058
+ for_each_node_state(nid, N_MEMORY)
4059
+ seq_printf(m, " N%d=%lu", nid,
4060
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4061
+ stat->lru_mask, false));
34174062 seq_putc(m, '\n');
34184063 }
34194064
34204065 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3421
- struct mem_cgroup *iter;
34224066
3423
- nr = 0;
3424
- for_each_mem_cgroup_tree(iter, memcg)
3425
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3426
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3427
- for_each_node_state(nid, N_MEMORY) {
3428
- nr = 0;
3429
- for_each_mem_cgroup_tree(iter, memcg)
3430
- nr += mem_cgroup_node_nr_lru_pages(
3431
- iter, nid, stat->lru_mask);
3432
- seq_printf(m, " N%d=%lu", nid, nr);
3433
- }
4067
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4068
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4069
+ true));
4070
+ for_each_node_state(nid, N_MEMORY)
4071
+ seq_printf(m, " N%d=%lu", nid,
4072
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4073
+ stat->lru_mask, true));
34344074 seq_putc(m, '\n');
34354075 }
34364076
34374077 return 0;
34384078 }
34394079 #endif /* CONFIG_NUMA */
4080
+
4081
+static const unsigned int memcg1_stats[] = {
4082
+ NR_FILE_PAGES,
4083
+ NR_ANON_MAPPED,
4084
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4085
+ NR_ANON_THPS,
4086
+#endif
4087
+ NR_SHMEM,
4088
+ NR_FILE_MAPPED,
4089
+ NR_FILE_DIRTY,
4090
+ NR_WRITEBACK,
4091
+ MEMCG_SWAP,
4092
+};
4093
+
4094
+static const char *const memcg1_stat_names[] = {
4095
+ "cache",
4096
+ "rss",
4097
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4098
+ "rss_huge",
4099
+#endif
4100
+ "shmem",
4101
+ "mapped_file",
4102
+ "dirty",
4103
+ "writeback",
4104
+ "swap",
4105
+};
34404106
34414107 /* Universal VM events cgroup1 shows, original sort order */
34424108 static const unsigned int memcg1_events[] = {
....@@ -3446,45 +4112,42 @@
34464112 PGMAJFAULT,
34474113 };
34484114
3449
-static const char *const memcg1_event_names[] = {
3450
- "pgpgin",
3451
- "pgpgout",
3452
- "pgfault",
3453
- "pgmajfault",
3454
-};
3455
-
34564115 static int memcg_stat_show(struct seq_file *m, void *v)
34574116 {
3458
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4117
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34594118 unsigned long memory, memsw;
34604119 struct mem_cgroup *mi;
34614120 unsigned int i;
3462
- struct accumulated_stats acc;
34634121
34644122 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3465
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34664123
34674124 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4125
+ unsigned long nr;
4126
+
34684127 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34694128 continue;
3470
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3471
- memcg_page_state(memcg, memcg1_stats[i]) *
3472
- PAGE_SIZE);
4129
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4130
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4131
+ if (memcg1_stats[i] == NR_ANON_THPS)
4132
+ nr *= HPAGE_PMD_NR;
4133
+#endif
4134
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34734135 }
34744136
34754137 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3476
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3477
- memcg_sum_events(memcg, memcg1_events[i]));
4138
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4139
+ memcg_events_local(memcg, memcg1_events[i]));
34784140
34794141 for (i = 0; i < NR_LRU_LISTS; i++)
3480
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3481
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4142
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4143
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4144
+ PAGE_SIZE);
34824145
34834146 /* Hierarchical information */
34844147 memory = memsw = PAGE_COUNTER_MAX;
34854148 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3486
- memory = min(memory, mi->memory.max);
3487
- memsw = min(memsw, mi->memsw.max);
4149
+ memory = min(memory, READ_ONCE(mi->memory.max));
4150
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34884151 }
34894152 seq_printf(m, "hierarchical_memory_limit %llu\n",
34904153 (u64)memory * PAGE_SIZE);
....@@ -3492,49 +4155,45 @@
34924155 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34934156 (u64)memsw * PAGE_SIZE);
34944157
3495
- memset(&acc, 0, sizeof(acc));
3496
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3497
- acc.stats_array = memcg1_stats;
3498
- acc.events_size = ARRAY_SIZE(memcg1_events);
3499
- acc.events_array = memcg1_events;
3500
- accumulate_memcg_tree(memcg, &acc);
3501
-
35024158 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4159
+ unsigned long nr;
4160
+
35034161 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35044162 continue;
4163
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4164
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4165
+ if (memcg1_stats[i] == NR_ANON_THPS)
4166
+ nr *= HPAGE_PMD_NR;
4167
+#endif
35054168 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3506
- (u64)acc.stat[i] * PAGE_SIZE);
4169
+ (u64)nr * PAGE_SIZE);
35074170 }
35084171
35094172 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3510
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3511
- (u64)acc.events[i]);
4173
+ seq_printf(m, "total_%s %llu\n",
4174
+ vm_event_name(memcg1_events[i]),
4175
+ (u64)memcg_events(memcg, memcg1_events[i]));
35124176
35134177 for (i = 0; i < NR_LRU_LISTS; i++)
3514
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3515
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4178
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4179
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4180
+ PAGE_SIZE);
35164181
35174182 #ifdef CONFIG_DEBUG_VM
35184183 {
35194184 pg_data_t *pgdat;
35204185 struct mem_cgroup_per_node *mz;
3521
- struct zone_reclaim_stat *rstat;
3522
- unsigned long recent_rotated[2] = {0, 0};
3523
- unsigned long recent_scanned[2] = {0, 0};
4186
+ unsigned long anon_cost = 0;
4187
+ unsigned long file_cost = 0;
35244188
35254189 for_each_online_pgdat(pgdat) {
35264190 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3527
- rstat = &mz->lruvec.reclaim_stat;
35284191
3529
- recent_rotated[0] += rstat->recent_rotated[0];
3530
- recent_rotated[1] += rstat->recent_rotated[1];
3531
- recent_scanned[0] += rstat->recent_scanned[0];
3532
- recent_scanned[1] += rstat->recent_scanned[1];
4192
+ anon_cost += mz->lruvec.anon_cost;
4193
+ file_cost += mz->lruvec.file_cost;
35334194 }
3534
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3535
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3536
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3537
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4195
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4196
+ seq_printf(m, "file_cost %lu\n", file_cost);
35384197 }
35394198 #endif
35404199
....@@ -3554,7 +4213,7 @@
35544213 {
35554214 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
35564215
3557
- if (val > 100)
4216
+ if (val > 200)
35584217 return -EINVAL;
35594218
35604219 if (css->parent)
....@@ -3693,8 +4352,7 @@
36934352 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36944353
36954354 /* Allocate memory for new array of thresholds */
3696
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3697
- GFP_KERNEL);
4355
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36984356 if (!new) {
36994357 ret = -ENOMEM;
37004358 goto unlock;
....@@ -3702,17 +4360,16 @@
37024360 new->size = size;
37034361
37044362 /* Copy thresholds (if any) to new array */
3705
- if (thresholds->primary) {
3706
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3707
- sizeof(struct mem_cgroup_threshold));
3708
- }
4363
+ if (thresholds->primary)
4364
+ memcpy(new->entries, thresholds->primary->entries,
4365
+ flex_array_size(new, entries, size - 1));
37094366
37104367 /* Add new threshold */
37114368 new->entries[size - 1].eventfd = eventfd;
37124369 new->entries[size - 1].threshold = threshold;
37134370
37144371 /* Sort thresholds. Registering of new threshold isn't time-critical */
3715
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4372
+ sort(new->entries, size, sizeof(*new->entries),
37164373 compare_thresholds, NULL);
37174374
37184375 /* Find current threshold */
....@@ -3894,7 +4551,7 @@
38944551
38954552 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38964553 {
3897
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4554
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38984555
38994556 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
39004557 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3920,6 +4577,8 @@
39204577 }
39214578
39224579 #ifdef CONFIG_CGROUP_WRITEBACK
4580
+
4581
+#include <trace/events/writeback.h>
39234582
39244583 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39254584 {
....@@ -3952,11 +4611,11 @@
39524611 */
39534612 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39544613 {
3955
- long x = atomic_long_read(&memcg->stat[idx]);
4614
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39564615 int cpu;
39574616
39584617 for_each_online_cpu(cpu)
3959
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4618
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39604619 if (x < 0)
39614620 x = 0;
39624621 return x;
....@@ -3989,18 +4648,142 @@
39894648
39904649 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39914650
3992
- /* this should eventually include NR_UNSTABLE_NFS */
39934651 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3994
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3995
- (1 << LRU_ACTIVE_FILE));
4652
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4653
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39964654 *pheadroom = PAGE_COUNTER_MAX;
39974655
39984656 while ((parent = parent_mem_cgroup(memcg))) {
3999
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4657
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4658
+ READ_ONCE(memcg->memory.high));
40004659 unsigned long used = page_counter_read(&memcg->memory);
40014660
40024661 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40034662 memcg = parent;
4663
+ }
4664
+}
4665
+
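The headroom walk above can be read as: each ancestor contributes a ceiling of min(memory.max, memory.high), and writeback sees the smallest remaining gap anywhere on the path to the root, clamped at zero. A numeric sketch of that reduction (the values are made up, in pages):

#include <stdio.h>

struct level { unsigned long max, high, used; };

static unsigned long wb_headroom(const struct level *lv, int depth)
{
	unsigned long headroom = ~0UL;	/* PAGE_COUNTER_MAX stand-in */

	for (int i = 0; i < depth; i++) {
		unsigned long ceiling = lv[i].max < lv[i].high ? lv[i].max : lv[i].high;
		unsigned long used = lv[i].used < ceiling ? lv[i].used : ceiling;

		/* Keep the tightest "ceiling - used" seen on the path. */
		if (ceiling - used < headroom)
			headroom = ceiling - used;
	}
	return headroom;
}

int main(void)
{
	/* A tight parent limits the child even though the child has room. */
	struct level path[] = {
		{ .max = 1000, .high = 800, .used = 700 },	/* child: 100 left */
		{ .max =  750, .high = 750, .used = 740 },	/* parent: 10 left */
	};

	printf("headroom = %lu pages\n", wb_headroom(path, 2));
	return 0;
}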
4666
+/*
4667
+ * Foreign dirty flushing
4668
+ *
4669
+ * There's an inherent mismatch between memcg and writeback. The former
4670
+ * tracks ownership per-page while the latter per-inode. This was a
4671
+ * deliberate design decision because honoring per-page ownership in the
4672
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4673
+ * and deemed unnecessary given that write-sharing an inode across
4674
+ * different cgroups isn't a common use-case.
4675
+ *
4676
+ * Combined with inode majority-writer ownership switching, this works well
4677
+ * enough in most cases but there are some pathological cases. For
4678
+ * example, let's say there are two cgroups A and B which keep writing to
4679
+ * different but confined parts of the same inode. B owns the inode and
4680
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4681
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4682
+ * triggering background writeback. A will be slowed down without a way to
4683
+ * make writeback of the dirty pages happen.
4684
+ *
4685
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4686
+ * severely throttled after making some progress after each
4687
+ * dirty_expire_interval while the underlying IO device is almost
4688
+ * completely idle.
4689
+ *
4690
+ * Solving this problem completely requires matching the ownership tracking
4691
+ * granularities between memcg and writeback in either direction. However,
4692
+ * the more egregious behaviors can be avoided by simply remembering the
4693
+ * most recent foreign dirtying events and initiating remote flushes on
4694
+ * them when local writeback isn't enough to keep the memory clean enough.
4695
+ *
4696
+ * The following two functions implement such a mechanism. When a foreign
4697
+ * page - a page whose memcg and writeback ownerships don't match - is
4698
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4699
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4700
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4701
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4702
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4703
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4704
+ * limited to MEMCG_CGWB_FRN_CNT.
4705
+ *
4706
+ * The mechanism only remembers IDs and doesn't hold any object references.
4707
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4708
+ * records are lockless and racy.
4709
+ */
4710
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4711
+ struct bdi_writeback *wb)
4712
+{
4713
+ struct mem_cgroup *memcg = page->mem_cgroup;
4714
+ struct memcg_cgwb_frn *frn;
4715
+ u64 now = get_jiffies_64();
4716
+ u64 oldest_at = now;
4717
+ int oldest = -1;
4718
+ int i;
4719
+
4720
+ trace_track_foreign_dirty(page, wb);
4721
+
4722
+ /*
4723
+ * Pick the slot to use. If there is already a slot for @wb, keep
4724
+ * using it. If not, replace the oldest one which isn't being
4725
+ * written out.
4726
+ */
4727
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4728
+ frn = &memcg->cgwb_frn[i];
4729
+ if (frn->bdi_id == wb->bdi->id &&
4730
+ frn->memcg_id == wb->memcg_css->id)
4731
+ break;
4732
+ if (time_before64(frn->at, oldest_at) &&
4733
+ atomic_read(&frn->done.cnt) == 1) {
4734
+ oldest = i;
4735
+ oldest_at = frn->at;
4736
+ }
4737
+ }
4738
+
4739
+ if (i < MEMCG_CGWB_FRN_CNT) {
4740
+ /*
4741
+ * Re-using an existing one. Update timestamp lazily to
4742
+ * avoid making the cacheline hot. We want them to be
4743
+ * reasonably up-to-date and significantly shorter than
4744
+ * dirty_expire_interval as that's what expires the record.
4745
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4746
+ */
4747
+ unsigned long update_intv =
4748
+ min_t(unsigned long, HZ,
4749
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4750
+
4751
+ if (time_before64(frn->at, now - update_intv))
4752
+ frn->at = now;
4753
+ } else if (oldest >= 0) {
4754
+ /* replace the oldest free one */
4755
+ frn = &memcg->cgwb_frn[oldest];
4756
+ frn->bdi_id = wb->bdi->id;
4757
+ frn->memcg_id = wb->memcg_css->id;
4758
+ frn->at = now;
4759
+ }
4760
+}
4761
+
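The slot selection above amounts to a small fixed-size cache of recent foreign (bdi, memcg) pairs: reuse a matching slot, otherwise evict the oldest slot that has no flush in flight, and silently drop the event if every slot is busy, since being occasionally wrong is cheap here. A toy model of that policy, simplified and without the lazy timestamp update or jiffies wraparound handling:

#include <stdio.h>
#include <string.h>

#define NR_SLOTS 4

struct slot {
	int bdi_id, memcg_id;
	unsigned long at;	/* last time this pair dirtied a foreign page */
	int in_flight;		/* a flush is already running, don't reuse */
};

static void record(struct slot *s, int bdi_id, int memcg_id, unsigned long now)
{
	int oldest = -1, i;
	unsigned long oldest_at = now;

	for (i = 0; i < NR_SLOTS; i++) {
		if (s[i].bdi_id == bdi_id && s[i].memcg_id == memcg_id)
			break;				/* already tracked, reuse */
		if (s[i].at < oldest_at && !s[i].in_flight) {
			oldest = i;
			oldest_at = s[i].at;
		}
	}
	if (i < NR_SLOTS)
		s[i].at = now;				/* refresh the timestamp */
	else if (oldest >= 0)
		s[oldest] = (struct slot){ bdi_id, memcg_id, now, 0 };
	/* else: every slot is busy, drop the event */
}

int main(void)
{
	struct slot slots[NR_SLOTS];

	memset(slots, 0, sizeof(slots));
	record(slots, 8, 3, 100);	/* new pair -> takes an idle slot */
	record(slots, 8, 3, 150);	/* same pair -> timestamp refreshed */
	record(slots, 9, 5, 160);	/* different pair -> another slot */

	for (int i = 0; i < NR_SLOTS; i++)
		printf("slot %d: bdi=%d memcg=%d at=%lu\n",
		       i, slots[i].bdi_id, slots[i].memcg_id, slots[i].at);
	return 0;
}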
4762
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4763
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4764
+{
4765
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4766
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4767
+ u64 now = jiffies_64;
4768
+ int i;
4769
+
4770
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4771
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4772
+
4773
+ /*
4774
+ * If the record is older than dirty_expire_interval,
4775
+ * writeback on it has already started. No need to kick it
4776
+ * off again. Also, don't start a new one if there's
4777
+ * already one in flight.
4778
+ */
4779
+ if (time_after64(frn->at, now - intv) &&
4780
+ atomic_read(&frn->done.cnt) == 1) {
4781
+ frn->at = 0;
4782
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4783
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4784
+ WB_REASON_FOREIGN_FLUSH,
4785
+ &frn->done);
4786
+ }
40044787 }
40054788 }
40064789
....@@ -4123,6 +4906,7 @@
41234906 unsigned int efd, cfd;
41244907 struct fd efile;
41254908 struct fd cfile;
4909
+ struct dentry *cdentry;
41264910 const char *name;
41274911 char *endp;
41284912 int ret;
....@@ -4174,6 +4958,16 @@
41744958 goto out_put_cfile;
41754959
41764960 /*
4961
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4962
+ * file can't be renamed, it's safe to access its name afterwards.
4963
+ */
4964
+ cdentry = cfile.file->f_path.dentry;
4965
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4966
+ ret = -EINVAL;
4967
+ goto out_put_cfile;
4968
+ }
4969
+
4970
+ /*
41774971 * Determine the event callbacks and set them in @event. This used
41784972 * to be done via struct cftype but cgroup core no longer knows
41794973 * about these events. The following is crude but the whole thing
....@@ -4181,7 +4975,7 @@
41814975 *
41824976 * DO NOT ADD NEW FILES.
41834977 */
4184
- name = cfile.file->f_path.dentry->d_name.name;
4978
+ name = cdentry->d_name.name;
41854979
41864980 if (!strcmp(name, "memory.usage_in_bytes")) {
41874981 event->register_event = mem_cgroup_usage_register_event;
....@@ -4205,7 +4999,7 @@
42054999 * automatically removed on cgroup destruction but the removal is
42065000 * asynchronous, so take an extra ref on @css.
42075001 */
4208
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5002
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42095003 &memory_cgrp_subsys);
42105004 ret = -EINVAL;
42115005 if (IS_ERR(cfile_css))
....@@ -4340,12 +5134,10 @@
43405134 .write = mem_cgroup_reset,
43415135 .read_u64 = mem_cgroup_read_u64,
43425136 },
4343
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5137
+#if defined(CONFIG_MEMCG_KMEM) && \
5138
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43445139 {
43455140 .name = "kmem.slabinfo",
4346
- .seq_start = memcg_slab_start,
4347
- .seq_next = memcg_slab_next,
4348
- .seq_stop = memcg_slab_stop,
43495141 .seq_show = memcg_slab_show,
43505142 },
43515143 #endif
....@@ -4383,7 +5175,7 @@
43835175 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43845176 * memory-controlled cgroups to 64k.
43855177 *
4386
- * However, there usually are many references to the oflline CSS after
5178
+ * However, there usually are many references to the offline CSS after
43875179 * the cgroup has been destroyed, such as page cache or reclaimable
43885180 * slab objects, that don't need to hang on to the ID. We want to keep
43895181 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4404,31 +5196,26 @@
44045196 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44055197 {
44065198 if (memcg->id.id > 0) {
5199
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44075200 idr_remove(&mem_cgroup_idr, memcg->id.id);
44085201 memcg->id.id = 0;
44095202 }
44105203 }
44115204
4412
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5205
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5206
+ unsigned int n)
44135207 {
4414
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4415
- atomic_add(n, &memcg->id.ref);
5208
+ refcount_add(n, &memcg->id.ref);
44165209 }
44175210
44185211 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44195212 {
4420
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4421
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5213
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44225214 mem_cgroup_id_remove(memcg);
44235215
44245216 /* Memcg ID pins CSS */
44255217 css_put(&memcg->css);
44265218 }
4427
-}
4428
-
4429
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4430
-{
4431
- mem_cgroup_id_get_many(memcg, 1);
44325219 }
44335220
44345221 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4447,6 +5234,7 @@
44475234 WARN_ON_ONCE(!rcu_read_lock_held());
44485235 return idr_find(&mem_cgroup_idr, id);
44495236 }
5237
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44505238
44515239 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44525240 {
....@@ -4466,8 +5254,17 @@
44665254 if (!pn)
44675255 return 1;
44685256
4469
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5257
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5258
+ GFP_KERNEL_ACCOUNT);
5259
+ if (!pn->lruvec_stat_local) {
5260
+ kfree(pn);
5261
+ return 1;
5262
+ }
5263
+
5264
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5265
+ GFP_KERNEL_ACCOUNT);
44705266 if (!pn->lruvec_stat_cpu) {
5267
+ free_percpu(pn->lruvec_stat_local);
44715268 kfree(pn);
44725269 return 1;
44735270 }
....@@ -4489,6 +5286,7 @@
44895286 return;
44905287
44915288 free_percpu(pn->lruvec_stat_cpu);
5289
+ free_percpu(pn->lruvec_stat_local);
44925290 kfree(pn);
44935291 }
44945292
....@@ -4496,39 +5294,57 @@
44965294 {
44975295 int node;
44985296
5297
+ trace_android_vh_mem_cgroup_free(memcg);
44995298 for_each_node(node)
45005299 free_mem_cgroup_per_node_info(memcg, node);
4501
- free_percpu(memcg->stat_cpu);
5300
+ free_percpu(memcg->vmstats_percpu);
5301
+ free_percpu(memcg->vmstats_local);
45025302 kfree(memcg);
45035303 }
45045304
45055305 static void mem_cgroup_free(struct mem_cgroup *memcg)
45065306 {
45075307 memcg_wb_domain_exit(memcg);
5308
+ /*
5309
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5310
+ * on parent's and all ancestor levels.
5311
+ */
5312
+ memcg_flush_percpu_vmstats(memcg);
5313
+ memcg_flush_percpu_vmevents(memcg);
45085314 __mem_cgroup_free(memcg);
45095315 }
45105316
45115317 static struct mem_cgroup *mem_cgroup_alloc(void)
45125318 {
45135319 struct mem_cgroup *memcg;
4514
- size_t size;
5320
+ unsigned int size;
45155321 int node;
5322
+ int __maybe_unused i;
5323
+ long error = -ENOMEM;
45165324
45175325 size = sizeof(struct mem_cgroup);
45185326 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45195327
45205328 memcg = kzalloc(size, GFP_KERNEL);
45215329 if (!memcg)
4522
- return NULL;
5330
+ return ERR_PTR(error);
45235331
45245332 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45255333 1, MEM_CGROUP_ID_MAX,
45265334 GFP_KERNEL);
4527
- if (memcg->id.id < 0)
5335
+ if (memcg->id.id < 0) {
5336
+ error = memcg->id.id;
5337
+ goto fail;
5338
+ }
5339
+
5340
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5341
+ GFP_KERNEL_ACCOUNT);
5342
+ if (!memcg->vmstats_local)
45285343 goto fail;
45295344
4530
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4531
- if (!memcg->stat_cpu)
5345
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5346
+ GFP_KERNEL_ACCOUNT);
5347
+ if (!memcg->vmstats_percpu)
45325348 goto fail;
45335349
45345350 for_each_node(node)
....@@ -4539,7 +5355,6 @@
45395355 goto fail;
45405356
45415357 INIT_WORK(&memcg->high_work, high_work_func);
4542
- memcg->last_scanned_node = MAX_NUMNODES;
45435358 INIT_LIST_HEAD(&memcg->oom_notify);
45445359 mutex_init(&memcg->thresholds_lock);
45455360 spin_lock_init(&memcg->move_lock);
....@@ -4549,48 +5364,64 @@
45495364 memcg->socket_pressure = jiffies;
45505365 #ifdef CONFIG_MEMCG_KMEM
45515366 memcg->kmemcg_id = -1;
5367
+ INIT_LIST_HEAD(&memcg->objcg_list);
45525368 #endif
45535369 #ifdef CONFIG_CGROUP_WRITEBACK
45545370 INIT_LIST_HEAD(&memcg->cgwb_list);
5371
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5372
+ memcg->cgwb_frn[i].done =
5373
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5374
+#endif
5375
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5376
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5377
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5378
+ memcg->deferred_split_queue.split_queue_len = 0;
45555379 #endif
45565380 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5381
+ trace_android_vh_mem_cgroup_alloc(memcg);
45575382 return memcg;
45585383 fail:
45595384 mem_cgroup_id_remove(memcg);
45605385 __mem_cgroup_free(memcg);
4561
- return NULL;
5386
+ return ERR_PTR(error);
45625387 }
45635388
45645389 static struct cgroup_subsys_state * __ref
45655390 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45665391 {
45675392 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4568
- struct mem_cgroup *memcg;
5393
+ struct mem_cgroup *memcg, *old_memcg;
45695394 long error = -ENOMEM;
45705395
5396
+ old_memcg = set_active_memcg(parent);
45715397 memcg = mem_cgroup_alloc();
4572
- if (!memcg)
4573
- return ERR_PTR(error);
5398
+ set_active_memcg(old_memcg);
5399
+ if (IS_ERR(memcg))
5400
+ return ERR_CAST(memcg);
45745401
4575
- memcg->high = PAGE_COUNTER_MAX;
5402
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45765403 memcg->soft_limit = PAGE_COUNTER_MAX;
5404
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45775405 if (parent) {
45785406 memcg->swappiness = mem_cgroup_swappiness(parent);
45795407 memcg->oom_kill_disable = parent->oom_kill_disable;
45805408 }
4581
- if (parent && parent->use_hierarchy) {
5409
+ if (!parent) {
5410
+ page_counter_init(&memcg->memory, NULL);
5411
+ page_counter_init(&memcg->swap, NULL);
5412
+ page_counter_init(&memcg->kmem, NULL);
5413
+ page_counter_init(&memcg->tcpmem, NULL);
5414
+ } else if (parent->use_hierarchy) {
45825415 memcg->use_hierarchy = true;
45835416 page_counter_init(&memcg->memory, &parent->memory);
45845417 page_counter_init(&memcg->swap, &parent->swap);
4585
- page_counter_init(&memcg->memsw, &parent->memsw);
45865418 page_counter_init(&memcg->kmem, &parent->kmem);
45875419 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45885420 } else {
4589
- page_counter_init(&memcg->memory, NULL);
4590
- page_counter_init(&memcg->swap, NULL);
4591
- page_counter_init(&memcg->memsw, NULL);
4592
- page_counter_init(&memcg->kmem, NULL);
4593
- page_counter_init(&memcg->tcpmem, NULL);
5421
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5422
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5423
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5424
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45945425 /*
45955426 * Deeper hierarchy with use_hierarchy == false doesn't make
45965427 * much sense so let cgroup subsystem know about this
....@@ -4617,7 +5448,7 @@
46175448 fail:
46185449 mem_cgroup_id_remove(memcg);
46195450 mem_cgroup_free(memcg);
4620
- return ERR_PTR(-ENOMEM);
5451
+ return ERR_PTR(error);
46215452 }
46225453
46235454 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4635,8 +5466,9 @@
46355466 }
46365467
46375468 /* Online state pins memcg ID, memcg ID pins CSS */
4638
- atomic_set(&memcg->id.ref, 1);
5469
+ refcount_set(&memcg->id.ref, 1);
46395470 css_get(css);
5471
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46405472 return 0;
46415473 }
46425474
....@@ -4645,6 +5477,7 @@
46455477 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46465478 struct mem_cgroup_event *event, *tmp;
46475479
5480
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46485481 /*
46495482 * Unregister events and notify userspace.
46505483 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4663,6 +5496,8 @@
46635496 memcg_offline_kmem(memcg);
46645497 wb_memcg_offline(memcg);
46655498
5499
+ drain_all_stock(memcg);
5500
+
46665501 mem_cgroup_id_put(memcg);
46675502 }
46685503
....@@ -4676,7 +5511,12 @@
46765511 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46775512 {
46785513 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5514
+ int __maybe_unused i;
46795515
5516
+#ifdef CONFIG_CGROUP_WRITEBACK
5517
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5518
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5519
+#endif
46805520 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46815521 static_branch_dec(&memcg_sockets_enabled_key);
46825522
....@@ -4710,13 +5550,13 @@
47105550
47115551 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47125552 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4713
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47145553 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47155554 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47165555 page_counter_set_min(&memcg->memory, 0);
47175556 page_counter_set_low(&memcg->memory, 0);
4718
- memcg->high = PAGE_COUNTER_MAX;
5557
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47195558 memcg->soft_limit = PAGE_COUNTER_MAX;
5559
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47205560 memcg_wb_domain_size_changed(memcg);
47215561 }
47225562
....@@ -4759,7 +5599,7 @@
47595599 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47605600 unsigned long addr, pte_t ptent)
47615601 {
4762
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5602
+ struct page *page = vm_normal_page(vma, addr, ptent);
47635603
47645604 if (!page || !page_mapped(page))
47655605 return NULL;
....@@ -4810,8 +5650,7 @@
48105650 * we call find_get_page() with swapper_space directly.
48115651 */
48125652 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4813
- if (do_memsw_account())
4814
- entry->val = ent.val;
5653
+ entry->val = ent.val;
48155654
48165655 return page;
48175656 }
....@@ -4826,36 +5665,15 @@
48265665 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48275666 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48285667 {
4829
- struct page *page = NULL;
4830
- struct address_space *mapping;
4831
- pgoff_t pgoff;
4832
-
48335668 if (!vma->vm_file) /* anonymous vma */
48345669 return NULL;
48355670 if (!(mc.flags & MOVE_FILE))
48365671 return NULL;
48375672
4838
- mapping = vma->vm_file->f_mapping;
4839
- pgoff = linear_page_index(vma, addr);
4840
-
48415673 /* page is moved even if it's not RSS of this task(page-faulted). */
4842
-#ifdef CONFIG_SWAP
48435674 /* shmem/tmpfs may report page out on swap: account for that too. */
4844
- if (shmem_mapping(mapping)) {
4845
- page = find_get_entry(mapping, pgoff);
4846
- if (radix_tree_exceptional_entry(page)) {
4847
- swp_entry_t swp = radix_to_swp_entry(page);
4848
- if (do_memsw_account())
4849
- *entry = swp;
4850
- page = find_get_page(swap_address_space(swp),
4851
- swp_offset(swp));
4852
- }
4853
- } else
4854
- page = find_get_page(mapping, pgoff);
4855
-#else
4856
- page = find_get_page(mapping, pgoff);
4857
-#endif
4858
- return page;
5675
+ return find_get_incore_page(vma->vm_file->f_mapping,
5676
+ linear_page_index(vma, addr));
48595677 }
48605678
48615679 /**
....@@ -4875,10 +5693,10 @@
48755693 struct mem_cgroup *from,
48765694 struct mem_cgroup *to)
48775695 {
4878
- unsigned long flags;
4879
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5696
+ struct lruvec *from_vec, *to_vec;
5697
+ struct pglist_data *pgdat;
5698
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48805699 int ret;
4881
- bool anon;
48825700
48835701 VM_BUG_ON(from == to);
48845702 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4896,52 +5714,83 @@
48965714 if (page->mem_cgroup != from)
48975715 goto out_unlock;
48985716
4899
- anon = PageAnon(page);
5717
+ pgdat = page_pgdat(page);
5718
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5719
+ to_vec = mem_cgroup_lruvec(to, pgdat);
49005720
4901
- spin_lock_irqsave(&from->move_lock, flags);
5721
+ lock_page_memcg(page);
49025722
4903
- if (!anon && page_mapped(page)) {
4904
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4905
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4906
- }
5723
+ if (PageAnon(page)) {
5724
+ if (page_mapped(page)) {
5725
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5726
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5727
+ if (PageTransHuge(page)) {
5728
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5729
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5730
+ }
49075731
4908
- /*
4909
- * move_lock grabbed above and caller set from->moving_account, so
4910
- * mod_memcg_page_state will serialize updates to PageDirty.
4911
- * So mapping should be stable for dirty pages.
4912
- */
4913
- if (!anon && PageDirty(page)) {
4914
- struct address_space *mapping = page_mapping(page);
5732
+ }
5733
+ } else {
5734
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5735
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49155736
4916
- if (mapping_cap_account_dirty(mapping)) {
4917
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4918
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5737
+ if (PageSwapBacked(page)) {
5738
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5740
+ }
5741
+
5742
+ if (page_mapped(page)) {
5743
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5744
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5745
+ }
5746
+
5747
+ if (PageDirty(page)) {
5748
+ struct address_space *mapping = page_mapping(page);
5749
+
5750
+ if (mapping_can_writeback(mapping)) {
5751
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5752
+ -nr_pages);
5753
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5754
+ nr_pages);
5755
+ }
49195756 }
49205757 }
49215758
49225759 if (PageWriteback(page)) {
4923
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4924
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5760
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5761
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49255762 }
49265763
49275764 /*
5765
+ * All state has been migrated, let's switch to the new memcg.
5766
+ *
49285767 * It is safe to change page->mem_cgroup here because the page
4929
- * is referenced, charged, and isolated - we can't race with
4930
- * uncharging, charging, migration, or LRU putback.
5768
+ * is referenced, charged, isolated, and locked: we can't race
5769
+ * with (un)charging, migration, LRU putback, or anything else
5770
+ * that would rely on a stable page->mem_cgroup.
5771
+ *
5772
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5773
+ * to save space. As soon as we switch page->mem_cgroup to a
5774
+ * new memcg that isn't locked, the above state can change
5775
+ * concurrently again. Make sure we're truly done with it.
49315776 */
5777
+ smp_mb();
49325778
4933
- /* caller should have done css_get */
5779
+ css_get(&to->css);
5780
+ css_put(&from->css);
5781
+
49345782 page->mem_cgroup = to;
4935
- spin_unlock_irqrestore(&from->move_lock, flags);
5783
+
5784
+ __unlock_page_memcg(from);
49365785
49375786 ret = 0;
49385787
4939
- local_lock_irq(event_lock);
4940
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5788
+ local_irq_disable();
5789
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49415790 memcg_check_events(to, page);
4942
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5791
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49435792 memcg_check_events(from, page);
4944
- local_unlock_irq(event_lock);
5793
+ local_irq_enable();
49455794 out_unlock:
49465795 unlock_page(page);
49475796 out:
....@@ -4963,8 +5812,8 @@
49635812 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49645813 * target for charge migration. if @target is not NULL, the entry is stored
49655814 * in target->ent.
4966
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4967
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5815
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5816
+ * (so ZONE_DEVICE page and thus not on the lru).
49685817 * For now such a page is charged like a regular page would be, as for all
49695818 * intents and purposes it is just special memory taking the place of a
49705819 * regular page.
....@@ -4998,8 +5847,7 @@
49985847 */
49995848 if (page->mem_cgroup == mc.from) {
50005849 ret = MC_TARGET_PAGE;
5001
- if (is_device_private_page(page) ||
5002
- is_device_public_page(page))
5850
+ if (is_device_private_page(page))
50035851 ret = MC_TARGET_DEVICE;
50045852 if (target)
50055853 target->page = page;
....@@ -5070,8 +5918,8 @@
50705918 if (ptl) {
50715919 /*
50725920 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5073
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5074
- * MEMORY_DEVICE_PRIVATE but this might change.
5921
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5922
+ * this might change.
50755923 */
50765924 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50775925 mc.precharge += HPAGE_PMD_NR;
....@@ -5091,18 +5939,17 @@
50915939 return 0;
50925940 }
50935941
5942
+static const struct mm_walk_ops precharge_walk_ops = {
5943
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5944
+};
5945
+
50945946 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50955947 {
50965948 unsigned long precharge;
50975949
5098
- struct mm_walk mem_cgroup_count_precharge_walk = {
5099
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5100
- .mm = mm,
5101
- };
5102
- down_read(&mm->mmap_sem);
5103
- walk_page_range(0, mm->highest_vm_end,
5104
- &mem_cgroup_count_precharge_walk);
5105
- up_read(&mm->mmap_sem);
5950
+ mmap_read_lock(mm);
5951
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5952
+ mmap_read_unlock(mm);
51065953
51075954 precharge = mc.precharge;
51085955 mc.precharge = 0;
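This hunk converts from the old on-stack struct mm_walk to the mm_walk_ops interface from <linux/pagewalk.h>: the callbacks live in a const ops table, while the mm, the range and a private cookie are passed to walk_page_range() under mmap_lock. A stand-alone sketch of that pattern (the callback and data names are invented):

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	unsigned long *count = walk->private;	/* cookie handed to walk_page_range() */

	*count += (end - addr) >> PAGE_SHIFT;	/* count pages spanned by this pmd */
	return 0;				/* 0 means keep walking */
}

static const struct mm_walk_ops count_walk_ops = {
	.pmd_entry = count_pmd_entry,
};

static unsigned long count_address_space(struct mm_struct *mm)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, 0, mm->highest_vm_end, &count_walk_ops, &count);
	mmap_read_unlock(mm);
	return count;
}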
....@@ -5152,8 +5999,6 @@
51525999 */
51536000 if (!mem_cgroup_is_root(mc.to))
51546001 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5155
-
5156
- css_put_many(&mc.to->css, mc.moved_swap);
51576002
51586003 mc.moved_swap = 0;
51596004 }
....@@ -5315,7 +6160,7 @@
53156160 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53166161 case MC_TARGET_DEVICE:
53176162 device = true;
5318
- /* fall through */
6163
+ fallthrough;
53196164 case MC_TARGET_PAGE:
53206165 page = target.page;
53216166 /*
....@@ -5370,13 +6215,12 @@
53706215 return ret;
53716216 }
53726217
6218
+static const struct mm_walk_ops charge_walk_ops = {
6219
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6220
+};
6221
+
53736222 static void mem_cgroup_move_charge(void)
53746223 {
5375
- struct mm_walk mem_cgroup_move_charge_walk = {
5376
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5377
- .mm = mc.mm,
5378
- };
5379
-
53806224 lru_add_drain_all();
53816225 /*
53826226 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5386,9 +6230,9 @@
53866230 atomic_inc(&mc.from->moving_account);
53876231 synchronize_rcu();
53886232 retry:
5389
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6233
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53906234 /*
5391
- * Someone who are holding the mmap_sem might be waiting in
6235
+ * Someone who is holding the mmap_lock might be waiting in
53926236 * waitq. So we cancel all extra charges, wake up all waiters,
53936237 * and retry. Because we cancel precharges, we might not be able
53946238 * to move enough charges, but moving charge is a best-effort
....@@ -5402,9 +6246,10 @@
54026246 * When we have consumed all precharges and failed in doing
54036247 * additional charge, the page walk just aborts.
54046248 */
5405
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6249
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6250
+ NULL);
54066251
5407
- up_read(&mc.mm->mmap_sem);
6252
+ mmap_read_unlock(mc.mm);
54086253 atomic_dec(&mc.from->moving_account);
54096254 }
54106255
....@@ -5446,6 +6291,16 @@
54466291 root_mem_cgroup->use_hierarchy = false;
54476292 }
54486293
6294
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6295
+{
6296
+ if (value == PAGE_COUNTER_MAX)
6297
+ seq_puts(m, "max\n");
6298
+ else
6299
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6300
+
6301
+ return 0;
6302
+}
6303
+
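A worked example of the new helper: page_counter values are kept in pages, so with a 4 KiB PAGE_SIZE a memory.low of 25600 pages is shown as 25600 * 4096 = 104857600 (bytes), while a limit left at PAGE_COUNTER_MAX prints the literal string "max". The memory.min, memory.low, memory.high and memory.max show handlers below all funnel through this helper.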
54496304 static u64 memory_current_read(struct cgroup_subsys_state *css,
54506305 struct cftype *cft)
54516306 {
....@@ -5456,15 +6311,8 @@
54566311
54576312 static int memory_min_show(struct seq_file *m, void *v)
54586313 {
5459
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5460
- unsigned long min = READ_ONCE(memcg->memory.min);
5461
-
5462
- if (min == PAGE_COUNTER_MAX)
5463
- seq_puts(m, "max\n");
5464
- else
5465
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5466
-
5467
- return 0;
6314
+ return seq_puts_memcg_tunable(m,
6315
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54686316 }
54696317
54706318 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5486,15 +6334,8 @@
54866334
54876335 static int memory_low_show(struct seq_file *m, void *v)
54886336 {
5489
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5490
- unsigned long low = READ_ONCE(memcg->memory.low);
5491
-
5492
- if (low == PAGE_COUNTER_MAX)
5493
- seq_puts(m, "max\n");
5494
- else
5495
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5496
-
5497
- return 0;
6337
+ return seq_puts_memcg_tunable(m,
6338
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54986339 }
54996340
55006341 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5516,22 +6357,16 @@
55166357
55176358 static int memory_high_show(struct seq_file *m, void *v)
55186359 {
5519
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5520
- unsigned long high = READ_ONCE(memcg->high);
5521
-
5522
- if (high == PAGE_COUNTER_MAX)
5523
- seq_puts(m, "max\n");
5524
- else
5525
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5526
-
5527
- return 0;
6360
+ return seq_puts_memcg_tunable(m,
6361
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55286362 }
55296363
55306364 static ssize_t memory_high_write(struct kernfs_open_file *of,
55316365 char *buf, size_t nbytes, loff_t off)
55326366 {
55336367 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5534
- unsigned long nr_pages;
6368
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6369
+ bool drained = false;
55356370 unsigned long high;
55366371 int err;
55376372
....@@ -5540,12 +6375,30 @@
55406375 if (err)
55416376 return err;
55426377
5543
- memcg->high = high;
6378
+ page_counter_set_high(&memcg->memory, high);
55446379
5545
- nr_pages = page_counter_read(&memcg->memory);
5546
- if (nr_pages > high)
5547
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5548
- GFP_KERNEL, true);
6380
+ for (;;) {
6381
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6382
+ unsigned long reclaimed;
6383
+
6384
+ if (nr_pages <= high)
6385
+ break;
6386
+
6387
+ if (signal_pending(current))
6388
+ break;
6389
+
6390
+ if (!drained) {
6391
+ drain_all_stock(memcg);
6392
+ drained = true;
6393
+ continue;
6394
+ }
6395
+
6396
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6397
+ GFP_KERNEL, true);
6398
+
6399
+ if (!reclaimed && !nr_retries--)
6400
+ break;
6401
+ }
55496402
55506403 memcg_wb_domain_size_changed(memcg);
55516404 return nbytes;
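With this change, writing memory.high becomes synchronous best-effort reclaim: the kernel drains the per-CPU charge stocks once, then loops over try_to_free_mem_cgroup_pages() until usage drops below the new high, a signal arrives, or MAX_RECLAIM_RETRIES unproductive passes have elapsed. A userspace-side sketch of driving it (the cgroupfs path is illustrative only):

#include <stdio.h>

static int set_memory_high(const char *cgroup_path, unsigned long long bytes)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.high", cgroup_path);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the file also accepts "max" to remove the throttling threshold */
	fprintf(f, "%llu\n", bytes);
	return fclose(f) ? -1 : 0;	/* buffered write errors surface at fclose() */
}

/* e.g. set_memory_high("/sys/fs/cgroup/workload", 512ULL << 20); */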
....@@ -5553,22 +6406,15 @@
55536406
55546407 static int memory_max_show(struct seq_file *m, void *v)
55556408 {
5556
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5557
- unsigned long max = READ_ONCE(memcg->memory.max);
5558
-
5559
- if (max == PAGE_COUNTER_MAX)
5560
- seq_puts(m, "max\n");
5561
- else
5562
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5563
-
5564
- return 0;
6409
+ return seq_puts_memcg_tunable(m,
6410
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55656411 }
55666412
55676413 static ssize_t memory_max_write(struct kernfs_open_file *of,
55686414 char *buf, size_t nbytes, loff_t off)
55696415 {
55706416 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5571
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6417
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55726418 bool drained = false;
55736419 unsigned long max;
55746420 int err;
....@@ -5586,10 +6432,8 @@
55866432 if (nr_pages <= max)
55876433 break;
55886434
5589
- if (signal_pending(current)) {
5590
- err = -EINTR;
6435
+ if (signal_pending(current))
55916436 break;
5592
- }
55936437
55946438 if (!drained) {
55956439 drain_all_stock(memcg);
....@@ -5613,104 +6457,77 @@
56136457 return nbytes;
56146458 }
56156459
6460
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6461
+{
6462
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6463
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6464
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6465
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6466
+ seq_printf(m, "oom_kill %lu\n",
6467
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6468
+}
6469
+
56166470 static int memory_events_show(struct seq_file *m, void *v)
56176471 {
5618
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6472
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56196473
5620
- seq_printf(m, "low %lu\n",
5621
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5622
- seq_printf(m, "high %lu\n",
5623
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5624
- seq_printf(m, "max %lu\n",
5625
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5626
- seq_printf(m, "oom %lu\n",
5627
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5628
- seq_printf(m, "oom_kill %lu\n",
5629
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6474
+ __memory_events_show(m, memcg->memory_events);
6475
+ return 0;
6476
+}
56306477
6478
+static int memory_events_local_show(struct seq_file *m, void *v)
6479
+{
6480
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6481
+
6482
+ __memory_events_show(m, memcg->memory_events_local);
56316483 return 0;
56326484 }
56336485
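Both memory.events and the new memory.events.local print the same five counters, one "name value" pair per line; the .local file counts only events that hit this cgroup itself rather than anywhere in its subtree. Purely illustrative output:

low 0
high 1342
max 7
oom 1
oom_kill 0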
56346486 static int memory_stat_show(struct seq_file *m, void *v)
56356487 {
5636
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5637
- struct accumulated_stats acc;
5638
- int i;
6488
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6489
+ char *buf;
56396490
5640
- /*
5641
- * Provide statistics on the state of the memory subsystem as
5642
- * well as cumulative event counters that show past behavior.
5643
- *
5644
- * This list is ordered following a combination of these gradients:
5645
- * 1) generic big picture -> specifics and details
5646
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5647
- *
5648
- * Current memory state:
5649
- */
5650
-
5651
- memset(&acc, 0, sizeof(acc));
5652
- acc.stats_size = MEMCG_NR_STAT;
5653
- acc.events_size = NR_VM_EVENT_ITEMS;
5654
- accumulate_memcg_tree(memcg, &acc);
5655
-
5656
- seq_printf(m, "anon %llu\n",
5657
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5658
- seq_printf(m, "file %llu\n",
5659
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5660
- seq_printf(m, "kernel_stack %llu\n",
5661
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5662
- seq_printf(m, "slab %llu\n",
5663
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5664
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5665
- seq_printf(m, "sock %llu\n",
5666
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5667
-
5668
- seq_printf(m, "shmem %llu\n",
5669
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5670
- seq_printf(m, "file_mapped %llu\n",
5671
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5672
- seq_printf(m, "file_dirty %llu\n",
5673
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5674
- seq_printf(m, "file_writeback %llu\n",
5675
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5676
-
5677
- for (i = 0; i < NR_LRU_LISTS; i++)
5678
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5679
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5680
-
5681
- seq_printf(m, "slab_reclaimable %llu\n",
5682
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5683
- seq_printf(m, "slab_unreclaimable %llu\n",
5684
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5685
-
5686
- /* Accumulated memory events */
5687
-
5688
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5689
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5690
-
5691
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5692
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5693
- acc.events[PGSCAN_DIRECT]);
5694
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5695
- acc.events[PGSTEAL_DIRECT]);
5696
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5697
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5698
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5699
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5700
-
5701
- seq_printf(m, "workingset_refault %lu\n",
5702
- acc.stat[WORKINGSET_REFAULT]);
5703
- seq_printf(m, "workingset_activate %lu\n",
5704
- acc.stat[WORKINGSET_ACTIVATE]);
5705
- seq_printf(m, "workingset_nodereclaim %lu\n",
5706
- acc.stat[WORKINGSET_NODERECLAIM]);
5707
-
6491
+ buf = memory_stat_format(memcg);
6492
+ if (!buf)
6493
+ return -ENOMEM;
6494
+ seq_puts(m, buf);
6495
+ kfree(buf);
57086496 return 0;
57096497 }
57106498
6499
+#ifdef CONFIG_NUMA
6500
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6501
+{
6502
+ int i;
6503
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6504
+
6505
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6506
+ int nid;
6507
+
6508
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6509
+ continue;
6510
+
6511
+ seq_printf(m, "%s", memory_stats[i].name);
6512
+ for_each_node_state(nid, N_MEMORY) {
6513
+ u64 size;
6514
+ struct lruvec *lruvec;
6515
+
6516
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6517
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6518
+ size *= memory_stats[i].ratio;
6519
+ seq_printf(m, " N%d=%llu", nid, size);
6520
+ }
6521
+ seq_putc(m, '\n');
6522
+ }
6523
+
6524
+ return 0;
6525
+}
6526
+#endif
6527
+
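memory.numa_stat emits one line per node-level entry of the memory_stats[] table (defined elsewhere in this patch, not visible in this hunk), with a per-node byte count appended for every node that has memory. Illustrative output on a two-node machine, where "anon" and "file" are shown only as plausible entry names:

anon N0=1056768 N1=4096
file N0=22151168 N1=0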
57116528 static int memory_oom_group_show(struct seq_file *m, void *v)
57126529 {
5713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6530
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57146531
57156532 seq_printf(m, "%d\n", memcg->oom_group);
57166533
....@@ -5776,10 +6593,21 @@
57766593 .seq_show = memory_events_show,
57776594 },
57786595 {
5779
- .name = "stat",
6596
+ .name = "events.local",
57806597 .flags = CFTYPE_NOT_ON_ROOT,
6598
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6599
+ .seq_show = memory_events_local_show,
6600
+ },
6601
+ {
6602
+ .name = "stat",
57816603 .seq_show = memory_stat_show,
57826604 },
6605
+#ifdef CONFIG_NUMA
6606
+ {
6607
+ .name = "numa_stat",
6608
+ .seq_show = memory_numa_stat_show,
6609
+ },
6610
+#endif
57836611 {
57846612 .name = "oom.group",
57856613 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5805,6 +6633,122 @@
58056633 .early_init = 0,
58066634 };
58076635
6636
+/*
6637
+ * This function calculates an individual cgroup's effective
6638
+ * protection which is derived from its own memory.min/low, its
6639
+ * parent's and siblings' settings, as well as the actual memory
6640
+ * distribution in the tree.
6641
+ *
6642
+ * The following rules apply to the effective protection values:
6643
+ *
6644
+ * 1. At the first level of reclaim, effective protection is equal to
6645
+ * the declared protection in memory.min and memory.low.
6646
+ *
6647
+ * 2. To enable safe delegation of the protection configuration, at
6648
+ * subsequent levels the effective protection is capped to the
6649
+ * parent's effective protection.
6650
+ *
6651
+ * 3. To make complex and dynamic subtrees easier to configure, the
6652
+ * user is allowed to overcommit the declared protection at a given
6653
+ * level. If that is the case, the parent's effective protection is
6654
+ * distributed to the children in proportion to how much protection
6655
+ * they have declared and how much of it they are utilizing.
6656
+ *
6657
+ * This makes distribution proportional, but also work-conserving:
6658
+ * if one cgroup claims much more protection than it uses memory,
6659
+ * the unused remainder is available to its siblings.
6660
+ *
6661
+ * 4. Conversely, when the declared protection is undercommitted at a
6662
+ * given level, the distribution of the larger parental protection
6663
+ * budget is NOT proportional. A cgroup's protection from a sibling
6664
+ * is capped to its own memory.min/low setting.
6665
+ *
6666
+ * 5. However, to allow protecting recursive subtrees from each other
6667
+ * without having to declare each individual cgroup's fixed share
6668
+ * of the ancestor's claim to protection, any unutilized -
6669
+ * "floating" - protection from up the tree is distributed in
6670
+ * proportion to each cgroup's *usage*. This makes the protection
6671
+ * neutral wrt sibling cgroups and lets them compete freely over
6672
+ * the shared parental protection budget, but it protects the
6673
+ * subtree as a whole from neighboring subtrees.
6674
+ *
6675
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6676
+ * against immediate siblings whereas 5. is about protecting against
6677
+ * neighboring subtrees.
6678
+ */
6679
+static unsigned long effective_protection(unsigned long usage,
6680
+ unsigned long parent_usage,
6681
+ unsigned long setting,
6682
+ unsigned long parent_effective,
6683
+ unsigned long siblings_protected)
6684
+{
6685
+ unsigned long protected;
6686
+ unsigned long ep;
6687
+
6688
+ protected = min(usage, setting);
6689
+ /*
6690
+ * If all cgroups at this level combined claim and use more
6691
+ * protection than what the parent affords them, distribute
6692
+ * shares in proportion to utilization.
6693
+ *
6694
+ * We are using actual utilization rather than the statically
6695
+ * claimed protection in order to be work-conserving: claimed
6696
+ * but unused protection is available to siblings that would
6697
+ * otherwise get a smaller chunk than what they claimed.
6698
+ */
6699
+ if (siblings_protected > parent_effective)
6700
+ return protected * parent_effective / siblings_protected;
6701
+
6702
+ /*
6703
+ * Ok, utilized protection of all children is within what the
6704
+ * parent affords them, so we know whatever this child claims
6705
+ * and utilizes is effectively protected.
6706
+ *
6707
+ * If there is unprotected usage beyond this value, reclaim
6708
+ * will apply pressure in proportion to that amount.
6709
+ *
6710
+ * If there is unutilized protection, the cgroup will be fully
6711
+ * shielded from reclaim, but we do return a smaller value for
6712
+ * protection than what the group could enjoy in theory. This
6713
+ * is okay. With the overcommit distribution above, effective
6714
+ * protection is always dependent on how memory is actually
6715
+ * consumed among the siblings anyway.
6716
+ */
6717
+ ep = protected;
6718
+
6719
+ /*
6720
+ * If the children aren't claiming (all of) the protection
6721
+ * afforded to them by the parent, distribute the remainder in
6722
+ * proportion to the (unprotected) memory of each cgroup. That
6723
+ * way, cgroups that aren't explicitly prioritized wrt each
6724
+ * other compete freely over the allowance, but they are
6725
+ * collectively protected from neighboring trees.
6726
+ *
6727
+ * We're using unprotected memory for the weight so that if
6728
+ * some cgroups DO claim explicit protection, we don't protect
6729
+ * the same bytes twice.
6730
+ *
6731
+ * Check both usage and parent_usage against the respective
6732
+ * protected values. One should imply the other, but they
6733
+ * aren't read atomically - make sure the division is sane.
6734
+ */
6735
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6736
+ return ep;
6737
+ if (parent_effective > siblings_protected &&
6738
+ parent_usage > siblings_protected &&
6739
+ usage > protected) {
6740
+ unsigned long unclaimed;
6741
+
6742
+ unclaimed = parent_effective - siblings_protected;
6743
+ unclaimed *= usage - protected;
6744
+ unclaimed /= parent_usage - siblings_protected;
6745
+
6746
+ ep += unclaimed;
6747
+ }
6748
+
6749
+ return ep;
6750
+}
6751
+
58086752 /**
58096753 * mem_cgroup_protected - check if memory consumption is in the normal range
58106754 * @root: the top ancestor of the sub-tree being checked
....@@ -5812,259 +6756,125 @@
58126756 *
58136757 * WARNING: This function is not stateless! It can only be used as part
58146758 * of a top-down tree iteration, not for isolated queries.
5815
- *
5816
- * Returns one of the following:
5817
- * MEMCG_PROT_NONE: cgroup memory is not protected
5818
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5819
- * an unprotected supply of reclaimable memory from other cgroups.
5820
- * MEMCG_PROT_MIN: cgroup memory is protected
5821
- *
5822
- * @root is exclusive; it is never protected when looked at directly
5823
- *
5824
- * To provide a proper hierarchical behavior, effective memory.min/low values
5825
- * are used. Below is the description of how effective memory.low is calculated.
5826
- * Effective memory.min values is calculated in the same way.
5827
- *
5828
- * Effective memory.low is always equal or less than the original memory.low.
5829
- * If there is no memory.low overcommittment (which is always true for
5830
- * top-level memory cgroups), these two values are equal.
5831
- * Otherwise, it's a part of parent's effective memory.low,
5832
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5833
- * memory.low usages, where memory.low usage is the size of actually
5834
- * protected memory.
5835
- *
5836
- * low_usage
5837
- * elow = min( memory.low, parent->elow * ------------------ ),
5838
- * siblings_low_usage
5839
- *
5840
- * | memory.current, if memory.current < memory.low
5841
- * low_usage = |
5842
- | 0, otherwise.
5843
- *
5844
- *
5845
- * Such definition of the effective memory.low provides the expected
5846
- * hierarchical behavior: parent's memory.low value is limiting
5847
- * children, unprotected memory is reclaimed first and cgroups,
5848
- * which are not using their guarantee do not affect actual memory
5849
- * distribution.
5850
- *
5851
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5852
- *
5853
- * A A/memory.low = 2G, A/memory.current = 6G
5854
- * //\\
5855
- * BC DE B/memory.low = 3G B/memory.current = 2G
5856
- * C/memory.low = 1G C/memory.current = 2G
5857
- * D/memory.low = 0 D/memory.current = 2G
5858
- * E/memory.low = 10G E/memory.current = 0
5859
- *
5860
- * and the memory pressure is applied, the following memory distribution
5861
- * is expected (approximately):
5862
- *
5863
- * A/memory.current = 2G
5864
- *
5865
- * B/memory.current = 1.3G
5866
- * C/memory.current = 0.6G
5867
- * D/memory.current = 0
5868
- * E/memory.current = 0
5869
- *
5870
- * These calculations require constant tracking of the actual low usages
5871
- * (see propagate_protected_usage()), as well as recursive calculation of
5872
- * effective memory.low values. But as we do call mem_cgroup_protected()
5873
- * path for each memory cgroup top-down from the reclaim,
5874
- * it's possible to optimize this part, and save calculated elow
5875
- * for next usage. This part is intentionally racy, but it's ok,
5876
- * as memory.low is a best-effort mechanism.
58776759 */
5878
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5879
- struct mem_cgroup *memcg)
6760
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6761
+ struct mem_cgroup *memcg)
58806762 {
6763
+ unsigned long usage, parent_usage;
58816764 struct mem_cgroup *parent;
5882
- unsigned long emin, parent_emin;
5883
- unsigned long elow, parent_elow;
5884
- unsigned long usage;
58856765
58866766 if (mem_cgroup_disabled())
5887
- return MEMCG_PROT_NONE;
6767
+ return;
58886768
58896769 if (!root)
58906770 root = root_mem_cgroup;
6771
+
6772
+ /*
6773
+ * Effective values of the reclaim targets are ignored so they
6774
+ * can be stale. Have a look at mem_cgroup_protection for more
6775
+ * details.
6776
+ * TODO: calculation should be more robust so that we do not need
6777
+ * that special casing.
6778
+ */
58916779 if (memcg == root)
5892
- return MEMCG_PROT_NONE;
6780
+ return;
58936781
58946782 usage = page_counter_read(&memcg->memory);
58956783 if (!usage)
5896
- return MEMCG_PROT_NONE;
5897
-
5898
- emin = memcg->memory.min;
5899
- elow = memcg->memory.low;
6784
+ return;
59006785
59016786 parent = parent_mem_cgroup(memcg);
59026787 /* No parent means a non-hierarchical mode on v1 memcg */
59036788 if (!parent)
5904
- return MEMCG_PROT_NONE;
6789
+ return;
59056790
5906
- if (parent == root)
5907
- goto exit;
5908
-
5909
- parent_emin = READ_ONCE(parent->memory.emin);
5910
- emin = min(emin, parent_emin);
5911
- if (emin && parent_emin) {
5912
- unsigned long min_usage, siblings_min_usage;
5913
-
5914
- min_usage = min(usage, memcg->memory.min);
5915
- siblings_min_usage = atomic_long_read(
5916
- &parent->memory.children_min_usage);
5917
-
5918
- if (min_usage && siblings_min_usage)
5919
- emin = min(emin, parent_emin * min_usage /
5920
- siblings_min_usage);
6791
+ if (parent == root) {
6792
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6793
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6794
+ return;
59216795 }
59226796
5923
- parent_elow = READ_ONCE(parent->memory.elow);
5924
- elow = min(elow, parent_elow);
5925
- if (elow && parent_elow) {
5926
- unsigned long low_usage, siblings_low_usage;
6797
+ parent_usage = page_counter_read(&parent->memory);
59276798
5928
- low_usage = min(usage, memcg->memory.low);
5929
- siblings_low_usage = atomic_long_read(
5930
- &parent->memory.children_low_usage);
6799
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6800
+ READ_ONCE(memcg->memory.min),
6801
+ READ_ONCE(parent->memory.emin),
6802
+ atomic_long_read(&parent->memory.children_min_usage)));
59316803
5932
- if (low_usage && siblings_low_usage)
5933
- elow = min(elow, parent_elow * low_usage /
5934
- siblings_low_usage);
5935
- }
5936
-
5937
-exit:
5938
- memcg->memory.emin = emin;
5939
- memcg->memory.elow = elow;
5940
-
5941
- if (usage <= emin)
5942
- return MEMCG_PROT_MIN;
5943
- else if (usage <= elow)
5944
- return MEMCG_PROT_LOW;
5945
- else
5946
- return MEMCG_PROT_NONE;
6804
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6805
+ READ_ONCE(memcg->memory.low),
6806
+ READ_ONCE(parent->memory.elow),
6807
+ atomic_long_read(&parent->memory.children_low_usage)));
59476808 }
59486809
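The WARNING above is load-bearing: each child's emin/elow is derived from the parent's values computed just before it, so the function only makes sense inside a top-down tree walk. A minimal sketch of the expected consumer on the reclaim side (the helper usage here is an assumption; the real call sites live in mm/vmscan.c and are not part of this hunk):

static void example_reclaim_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, NULL);

	do {
		unsigned long usage;

		mem_cgroup_calculate_protection(root, memcg);

		usage = page_counter_read(&memcg->memory);
		if (usage <= READ_ONCE(memcg->memory.emin))
			continue;	/* hard-protected: skip this group entirely */

		/* ... scan this memcg's lruvecs, easing off below memory.elow ... */
	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}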
59496810 /**
5950
- * mem_cgroup_try_charge - try charging a page
6811
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59516812 * @page: page to charge
59526813 * @mm: mm context of the victim
59536814 * @gfp_mask: reclaim mode
5954
- * @memcgp: charged memcg return
5955
- * @compound: charge the page as compound or small page
59566815 *
59576816 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59586817 * pages according to @gfp_mask if necessary.
59596818 *
5960
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5961
- * Otherwise, an error code is returned.
5962
- *
5963
- * After page->mapping has been set up, the caller must finalize the
5964
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5965
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6819
+ * Returns 0 on success. Otherwise, an error code is returned.
59666820 */
5967
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5968
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5969
- bool compound)
6821
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6822
+ gfp_t gfp_mask)
59706823 {
6824
+ unsigned int nr_pages = thp_nr_pages(page);
59716825 struct mem_cgroup *memcg = NULL;
5972
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59736826 int ret = 0;
59746827
5975
- if (mem_cgroup_disabled())
5976
- goto out;
5977
-
59786828 if (PageSwapCache(page)) {
6829
+ swp_entry_t ent = { .val = page_private(page), };
6830
+ unsigned short id;
6831
+
59796832 /*
59806833 * Every swap fault against a single page tries to charge the
59816834 * page, bail as early as possible. shmem_unuse() encounters
5982
- * already charged pages, too. The USED bit is protected by
5983
- * the page lock, which serializes swap cache removal, which
6835
+ * already charged pages, too. page->mem_cgroup is protected
6836
+ * by the page lock, which serializes swap cache removal, which
59846837 * in turn serializes uncharging.
59856838 */
59866839 VM_BUG_ON_PAGE(!PageLocked(page), page);
59876840 if (compound_head(page)->mem_cgroup)
59886841 goto out;
59896842
5990
- if (do_swap_account) {
5991
- swp_entry_t ent = { .val = page_private(page), };
5992
- unsigned short id = lookup_swap_cgroup_id(ent);
5993
-
5994
- rcu_read_lock();
5995
- memcg = mem_cgroup_from_id(id);
5996
- if (memcg && !css_tryget_online(&memcg->css))
5997
- memcg = NULL;
5998
- rcu_read_unlock();
5999
- }
6843
+ id = lookup_swap_cgroup_id(ent);
6844
+ rcu_read_lock();
6845
+ memcg = mem_cgroup_from_id(id);
6846
+ if (memcg && !css_tryget_online(&memcg->css))
6847
+ memcg = NULL;
6848
+ rcu_read_unlock();
60006849 }
60016850
60026851 if (!memcg)
60036852 memcg = get_mem_cgroup_from_mm(mm);
60046853
60056854 ret = try_charge(memcg, gfp_mask, nr_pages);
6855
+ if (ret)
6856
+ goto out_put;
60066857
6007
- css_put(&memcg->css);
6008
-out:
6009
- *memcgp = memcg;
6010
- return ret;
6011
-}
6858
+ css_get(&memcg->css);
6859
+ commit_charge(page, memcg);
60126860
6013
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6014
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6015
- bool compound)
6016
-{
6017
- struct mem_cgroup *memcg;
6018
- int ret;
6019
-
6020
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6021
- memcg = *memcgp;
6022
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6023
- return ret;
6024
-}
6025
-
6026
-/**
6027
- * mem_cgroup_commit_charge - commit a page charge
6028
- * @page: page to charge
6029
- * @memcg: memcg to charge the page to
6030
- * @lrucare: page might be on LRU already
6031
- * @compound: charge the page as compound or small page
6032
- *
6033
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6034
- * after page->mapping has been set up. This must happen atomically
6035
- * as part of the page instantiation, i.e. under the page table lock
6036
- * for anonymous pages, under the page lock for page and swap cache.
6037
- *
6038
- * In addition, the page must not be on the LRU during the commit, to
6039
- * prevent racing with task migration. If it might be, use @lrucare.
6040
- *
6041
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6042
- */
6043
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6044
- bool lrucare, bool compound)
6045
-{
6046
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6047
-
6048
- VM_BUG_ON_PAGE(!page->mapping, page);
6049
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6050
-
6051
- if (mem_cgroup_disabled())
6052
- return;
6053
- /*
6054
- * Swap faults will attempt to charge the same page multiple
6055
- * times. But reuse_swap_page() might have removed the page
6056
- * from swapcache already, so we can't check PageSwapCache().
6057
- */
6058
- if (!memcg)
6059
- return;
6060
-
6061
- commit_charge(page, memcg, lrucare);
6062
-
6063
- local_lock_irq(event_lock);
6064
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6861
+ local_irq_disable();
6862
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60656863 memcg_check_events(memcg, page);
6066
- local_unlock_irq(event_lock);
6864
+ local_irq_enable();
60676865
6866
+ /*
6867
+ * Cgroup1's unified memory+swap counter has been charged with the
6868
+ * new swapcache page, finish the transfer by uncharging the swap
6869
+ * slot. The swap slot would also get uncharged when it dies, but
6870
+ * it can stick around indefinitely and we'd count the page twice
6871
+ * the entire time.
6872
+ *
6873
+ * Cgroup2 has separate resource counters for memory and swap,
6874
+ * so this is a non-issue here. Memory and swap charge lifetimes
6875
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6876
+ * page to memory here, and uncharge swap when the slot is freed.
6877
+ */
60686878 if (do_memsw_account() && PageSwapCache(page)) {
60696879 swp_entry_t entry = { .val = page_private(page) };
60706880 /*
....@@ -6074,42 +6884,18 @@
60746884 */
60756885 mem_cgroup_uncharge_swap(entry, nr_pages);
60766886 }
6077
-}
60786887
6079
-/**
6080
- * mem_cgroup_cancel_charge - cancel a page charge
6081
- * @page: page to charge
6082
- * @memcg: memcg to charge the page to
6083
- * @compound: charge the page as compound or small page
6084
- *
6085
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6086
- */
6087
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6088
- bool compound)
6089
-{
6090
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6091
-
6092
- if (mem_cgroup_disabled())
6093
- return;
6094
- /*
6095
- * Swap faults will attempt to charge the same page multiple
6096
- * times. But reuse_swap_page() might have removed the page
6097
- * from swapcache already, so we can't check PageSwapCache().
6098
- */
6099
- if (!memcg)
6100
- return;
6101
-
6102
- cancel_charge(memcg, nr_pages);
6888
+out_put:
6889
+ css_put(&memcg->css);
6890
+out:
6891
+ return ret;
61036892 }
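The old try_charge/commit/cancel triplet collapses into this single entry point; callers are expected to go through a thin mem_cgroup_charge() wrapper (added to memcontrol.h in this series, not visible in this hunk) that returns 0 immediately when the controller is disabled. A caller-side sketch under that assumption:

#include <linux/memcontrol.h>
#include <linux/mm.h>

static vm_fault_t example_fault_charge(struct page *page,
				       struct vm_area_struct *vma, gfp_t gfp)
{
	if (mem_cgroup_charge(page, vma->vm_mm, gfp))
		return VM_FAULT_OOM;	/* charge failed even after reclaim retries */

	/*
	 * Map the page as usual; when it is finally freed,
	 * mem_cgroup_uncharge() drops the charge again.
	 */
	return 0;
}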
61046893
61056894 struct uncharge_gather {
61066895 struct mem_cgroup *memcg;
6896
+ unsigned long nr_pages;
61076897 unsigned long pgpgout;
6108
- unsigned long nr_anon;
6109
- unsigned long nr_file;
61106898 unsigned long nr_kmem;
6111
- unsigned long nr_huge;
6112
- unsigned long nr_shmem;
61136899 struct page *dummy_page;
61146900 };
61156901
....@@ -6120,37 +6906,32 @@
61206906
61216907 static void uncharge_batch(const struct uncharge_gather *ug)
61226908 {
6123
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61246909 unsigned long flags;
61256910
61266911 if (!mem_cgroup_is_root(ug->memcg)) {
6127
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6912
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61286913 if (do_memsw_account())
6129
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6914
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61306915 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61316916 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61326917 memcg_oom_recover(ug->memcg);
61336918 }
61346919
6135
- local_lock_irqsave(event_lock, flags);
6136
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6137
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6138
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6139
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6920
+ local_irq_save(flags);
61406921 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6141
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6922
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61426923 memcg_check_events(ug->memcg, ug->dummy_page);
6143
- local_unlock_irqrestore(event_lock, flags);
6924
+ local_irq_restore(flags);
61446925
6145
- if (!mem_cgroup_is_root(ug->memcg))
6146
- css_put_many(&ug->memcg->css, nr_pages);
6926
+ /* drop reference from uncharge_page */
6927
+ css_put(&ug->memcg->css);
61476928 }
61486929
61496930 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61506931 {
6932
+ unsigned long nr_pages;
6933
+
61516934 VM_BUG_ON_PAGE(PageLRU(page), page);
6152
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6153
- !PageHWPoison(page) , page);
61546935
61556936 if (!page->mem_cgroup)
61566937 return;
....@@ -6167,30 +6948,24 @@
61676948 uncharge_gather_clear(ug);
61686949 }
61696950 ug->memcg = page->mem_cgroup;
6951
+
6952
+ /* pairs with css_put in uncharge_batch */
6953
+ css_get(&ug->memcg->css);
61706954 }
61716955
6172
- if (!PageKmemcg(page)) {
6173
- unsigned int nr_pages = 1;
6956
+ nr_pages = compound_nr(page);
6957
+ ug->nr_pages += nr_pages;
61746958
6175
- if (PageTransHuge(page)) {
6176
- nr_pages <<= compound_order(page);
6177
- ug->nr_huge += nr_pages;
6178
- }
6179
- if (PageAnon(page))
6180
- ug->nr_anon += nr_pages;
6181
- else {
6182
- ug->nr_file += nr_pages;
6183
- if (PageSwapBacked(page))
6184
- ug->nr_shmem += nr_pages;
6185
- }
6959
+ if (!PageKmemcg(page)) {
61866960 ug->pgpgout++;
61876961 } else {
6188
- ug->nr_kmem += 1 << compound_order(page);
6962
+ ug->nr_kmem += nr_pages;
61896963 __ClearPageKmemcg(page);
61906964 }
61916965
61926966 ug->dummy_page = page;
61936967 page->mem_cgroup = NULL;
6968
+ css_put(&ug->memcg->css);
61946969 }
61956970
61966971 static void uncharge_list(struct list_head *page_list)
....@@ -6219,18 +6994,14 @@
62196994 }
62206995
62216996 /**
6222
- * mem_cgroup_uncharge - uncharge a page
6997
+ * __mem_cgroup_uncharge - uncharge a page
62236998 * @page: page to uncharge
62246999 *
6225
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6226
- * mem_cgroup_commit_charge().
7000
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62277001 */
6228
-void mem_cgroup_uncharge(struct page *page)
7002
+void __mem_cgroup_uncharge(struct page *page)
62297003 {
62307004 struct uncharge_gather ug;
6231
-
6232
- if (mem_cgroup_disabled())
6233
- return;
62347005
62357006 /* Don't touch page->lru of any random page, pre-check: */
62367007 if (!page->mem_cgroup)
....@@ -6242,17 +7013,14 @@
62427013 }
62437014
62447015 /**
6245
- * mem_cgroup_uncharge_list - uncharge a list of page
7016
+ * __mem_cgroup_uncharge_list - uncharge a list of page
62467017 * @page_list: list of pages to uncharge
62477018 *
62487019 * Uncharge a list of pages previously charged with
6249
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7020
+ * __mem_cgroup_charge().
62507021 */
6251
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7022
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62527023 {
6253
- if (mem_cgroup_disabled())
6254
- return;
6255
-
62567024 if (!list_empty(page_list))
62577025 uncharge_list(page_list);
62587026 }
....@@ -6271,7 +7039,6 @@
62717039 {
62727040 struct mem_cgroup *memcg;
62737041 unsigned int nr_pages;
6274
- bool compound;
62757042 unsigned long flags;
62767043
62777044 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6293,20 +7060,19 @@
62937060 return;
62947061
62957062 /* Force-charge the new page. The old one will be freed soon */
6296
- compound = PageTransHuge(newpage);
6297
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7063
+ nr_pages = thp_nr_pages(newpage);
62987064
62997065 page_counter_charge(&memcg->memory, nr_pages);
63007066 if (do_memsw_account())
63017067 page_counter_charge(&memcg->memsw, nr_pages);
6302
- css_get_many(&memcg->css, nr_pages);
63037068
6304
- commit_charge(newpage, memcg, false);
7069
+ css_get(&memcg->css);
7070
+ commit_charge(newpage, memcg);
63057071
6306
- local_lock_irqsave(event_lock, flags);
6307
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7072
+ local_irq_save(flags);
7073
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63087074 memcg_check_events(memcg, newpage);
6309
- local_unlock_irqrestore(event_lock, flags);
7075
+ local_irq_restore(flags);
63107076 }
63117077
63127078 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6329,7 +7095,7 @@
63297095 goto out;
63307096 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63317097 goto out;
6332
- if (css_tryget_online(&memcg->css))
7098
+ if (css_tryget(&memcg->css))
63337099 sk->sk_memcg = memcg;
63347100 out:
63357101 rcu_read_unlock();
....@@ -6407,7 +7173,7 @@
64077173 if (!strcmp(token, "nokmem"))
64087174 cgroup_memory_nokmem = true;
64097175 }
6410
- return 0;
7176
+ return 1;
64117177 }
64127178 __setup("cgroup.memory=", cgroup_memory);
64137179
....@@ -6422,17 +7188,6 @@
64227188 static int __init mem_cgroup_init(void)
64237189 {
64247190 int cpu, node;
6425
-
6426
-#ifdef CONFIG_MEMCG_KMEM
6427
- /*
6428
- * Kmem cache creation is mostly done with the slab_mutex held,
6429
- * so use a workqueue with limited concurrency to avoid stalling
6430
- * all worker threads in case lots of cgroups are created and
6431
- * destroyed simultaneously.
6432
- */
6433
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6434
- BUG_ON(!memcg_kmem_cache_wq);
6435
-#endif
64367191
64377192 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64387193 memcg_hotplug_cpu_dead);
....@@ -6460,7 +7215,7 @@
64607215 #ifdef CONFIG_MEMCG_SWAP
64617216 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64627217 {
6463
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7218
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64647219 /*
64657220 * The root cgroup cannot be destroyed, so its refcount must
64667221 * always be >= 1.
....@@ -6488,12 +7243,14 @@
64887243 struct mem_cgroup *memcg, *swap_memcg;
64897244 unsigned int nr_entries;
64907245 unsigned short oldid;
6491
- unsigned long flags;
64927246
64937247 VM_BUG_ON_PAGE(PageLRU(page), page);
64947248 VM_BUG_ON_PAGE(page_count(page), page);
64957249
6496
- if (!do_memsw_account())
7250
+ if (mem_cgroup_disabled())
7251
+ return;
7252
+
7253
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64977254 return;
64987255
64997256 memcg = page->mem_cgroup;
....@@ -6508,7 +7265,7 @@
65087265 * ancestor for the swap instead and transfer the memory+swap charge.
65097266 */
65107267 swap_memcg = mem_cgroup_id_get_online(memcg);
6511
- nr_entries = hpage_nr_pages(page);
7268
+ nr_entries = thp_nr_pages(page);
65127269 /* Get references for the tail pages, too */
65137270 if (nr_entries > 1)
65147271 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6522,7 +7279,7 @@
65227279 if (!mem_cgroup_is_root(memcg))
65237280 page_counter_uncharge(&memcg->memory, nr_entries);
65247281
6525
- if (memcg != swap_memcg) {
7282
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65267283 if (!mem_cgroup_is_root(swap_memcg))
65277284 page_counter_charge(&swap_memcg->memsw, nr_entries);
65287285 page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -6534,21 +7291,15 @@
 	 * important here to have the interrupts disabled because it is the
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
-	local_lock_irqsave(event_lock, flags);
-#ifndef CONFIG_PREEMPT_RT_BASE
 	VM_BUG_ON(!irqs_disabled());
-#endif
-	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
-				     -nr_entries);
+	mem_cgroup_charge_statistics(memcg, page, -nr_entries);
 	memcg_check_events(memcg, page);
-	local_unlock_irqrestore(event_lock, flags);

-	if (!mem_cgroup_is_root(memcg))
-		css_put_many(&memcg->css, nr_entries);
+	css_put(&memcg->css);
 }

 /**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
  * @page: page being added to swap
  * @entry: swap entry to charge
  *
@@ -6556,14 +7307,14 @@
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-	unsigned int nr_pages = hpage_nr_pages(page);
+	unsigned int nr_pages = thp_nr_pages(page);
 	struct page_counter *counter;
 	struct mem_cgroup *memcg;
 	unsigned short oldid;

-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;

 	memcg = page->mem_cgroup;
@@ -6579,7 +7330,7 @@

 	memcg = mem_cgroup_id_get_online(memcg);

-	if (!mem_cgroup_is_root(memcg) &&
+	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -6598,23 +7349,20 @@
 }

 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
-
-	if (!do_swap_account)
-		return;

 	id = swap_cgroup_record(entry, 0, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg)) {
+		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 				page_counter_uncharge(&memcg->swap, nr_pages);
 			else
@@ -6630,7 +7378,7 @@
 {
 	long nr_swap_pages = get_nr_swap_pages();

-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return nr_swap_pages;
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
 		nr_swap_pages = min_t(long, nr_swap_pages,
@@ -6647,36 +7395,33 @@

 	if (vm_swap_full())
 		return true;
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;

 	memcg = page->mem_cgroup;
 	if (!memcg)
 		return false;

-	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+		unsigned long usage = page_counter_read(&memcg->swap);
+
+		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+		    usage * 2 >= READ_ONCE(memcg->swap.max))
 			return true;
+	}

 	return false;
 }

-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
 {
 	if (!strcmp(s, "1"))
-		really_do_swap_account = 1;
+		cgroup_memory_noswap = 0;
 	else if (!strcmp(s, "0"))
-		really_do_swap_account = 0;
+		cgroup_memory_noswap = 1;
 	return 1;
 }
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);

 static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
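Worth spelling out from the mem_cgroup_swap_full() rework above: a cgroup's swap is treated as effectively full once usage reaches half of the tighter of swap.high and swap.max, so swap cache belonging to that cgroup is reclaimed more eagerly. A hedged, userspace-style restatement of the predicate (purely illustrative, not kernel code):

/*
 * Hedged sketch: mirrors the usage * 2 >= limit check above. For example,
 * with swap.max = 1 GiB and swap.high unlimited, 512 MiB of charged swap
 * already counts as "swap full" for that cgroup.
 */
#include <stdbool.h>
#include <stdint.h>

static bool cgroup_swap_looks_full(uint64_t usage, uint64_t high, uint64_t max)
{
	return usage * 2 >= high || usage * 2 >= max;
}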
@@ -6686,17 +7431,33 @@
 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }

+static int swap_high_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+			       char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long high;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &high);
+	if (err)
+		return err;
+
+	page_counter_set_high(&memcg->swap, high);
+
+	return nbytes;
+}
+
 static int swap_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->swap.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
 }

 static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6718,8 +7479,10 @@

 static int swap_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

+	seq_printf(m, "high %lu\n",
+		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
 	seq_printf(m, "max %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
 	seq_printf(m, "fail %lu\n",
@@ -6733,6 +7496,12 @@
 		.name = "swap.current",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_current_read,
+	},
+	{
+		.name = "swap.high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_high_show,
+		.write = swap_high_write,
 	},
 	{
 		.name = "swap.max",
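With the swap.high entry registered above, a memory.swap.high file appears next to memory.swap.max in each non-root cgroup v2 group. A hedged userspace example of exercising it, assuming cgroup2 is mounted at /sys/fs/cgroup and that a child group named "example" already exists (both are assumptions for illustration, not part of this patch):

/* Hedged usage sketch: throttle a hypothetical cgroup's swap at 512 MiB. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/example/memory.swap.high";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The file accepts a byte count (with K/M/G suffix) or "max" to clear. */
	fprintf(f, "536870912\n");
	fclose(f);
	return 0;
}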
@@ -6749,7 +7518,7 @@
 	{ }	/* terminate */
 };

-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -6776,17 +7545,27 @@
 	{ },	/* terminate */
 };

+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), cgroup_memory_noswap can remain false
+ * even when memcg is disabled via the "cgroup_disable=memory" boot
+ * parameter, which may result in a premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() in corner cases.
+ */
 static int __init mem_cgroup_swap_init(void)
 {
-	if (!mem_cgroup_disabled() && really_do_swap_account) {
-		do_swap_account = 1;
-		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
-					       swap_files));
-		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-						  memsw_cgroup_files));
-	}
+	/* No memory control -> no swap control */
+	if (mem_cgroup_disabled())
+		cgroup_memory_noswap = true;
+
+	if (cgroup_memory_noswap)
+		return 0;
+
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
 	return 0;
 }
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);

 #endif /* CONFIG_MEMCG_SWAP */
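On the subsys_initcall() to core_initcall() move: initcalls run in numbered levels, and core_initcall (level 1) fires well before subsys_initcall (level 4), so cgroup_memory_noswap is settled before later init code can consult it. A hedged sketch of the ordering, using hypothetical init functions; the level numbers reflect include/linux/init.h:

/*
 * Hedged sketch: two hypothetical initcalls showing relative ordering only.
 * core_initcall() registers at level 1 and subsys_initcall() at level 4,
 * so example_early_init() runs first during boot.
 */
static int __init example_early_init(void)
{
	pr_info("runs at core_initcall time (level 1)\n");
	return 0;
}
core_initcall(example_early_init);

static int __init example_later_init(void)
{
	pr_info("runs at subsys_initcall time (level 4)\n");
	return 0;
}
subsys_initcall(example_later_init);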