2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/memcontrol.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /* memcontrol.c - Memory Controller
23 *
34 * Copyright IBM Corporation, 2007
....@@ -19,26 +20,17 @@
1920 * Lockless page tracking & accounting
2021 * Unified hierarchy configuration model
2122 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22
- *
23
- * This program is free software; you can redistribute it and/or modify
24
- * it under the terms of the GNU General Public License as published by
25
- * the Free Software Foundation; either version 2 of the License, or
26
- * (at your option) any later version.
27
- *
28
- * This program is distributed in the hope that it will be useful,
29
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
30
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31
- * GNU General Public License for more details.
3223 */
3324
3425 #include <linux/page_counter.h>
3526 #include <linux/memcontrol.h>
3627 #include <linux/cgroup.h>
37
-#include <linux/mm.h>
28
+#include <linux/pagewalk.h>
3829 #include <linux/sched/mm.h>
3930 #include <linux/shmem_fs.h>
4031 #include <linux/hugetlb.h>
4132 #include <linux/pagemap.h>
33
+#include <linux/vm_event_item.h>
4234 #include <linux/smp.h>
4335 #include <linux/page-flags.h>
4436 #include <linux/backing-dev.h>
....@@ -65,22 +57,26 @@
6557 #include <linux/lockdep.h>
6658 #include <linux/file.h>
6759 #include <linux/tracehook.h>
60
+#include <linux/psi.h>
61
+#include <linux/seq_buf.h>
6862 #include "internal.h"
6963 #include <net/sock.h>
7064 #include <net/ip.h>
7165 #include "slab.h"
72
-#include <linux/locallock.h>
66
+#include <linux/local_lock.h>
7367
7468 #include <linux/uaccess.h>
7569
7670 #include <trace/events/vmscan.h>
71
+#include <trace/hooks/mm.h>
7772
7873 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
7974 EXPORT_SYMBOL(memory_cgrp_subsys);
8075
8176 struct mem_cgroup *root_mem_cgroup __read_mostly;
8277
83
-#define MEM_CGROUP_RECLAIM_RETRIES 5
78
+/* Active memory cgroup to use from an interrupt context */
79
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
8480
8581 /* Socket memory accounting disabled? */
8682 static bool cgroup_memory_nosocket;
....@@ -90,30 +86,30 @@
9086
9187 /* Whether the swap controller is active */
9288 #ifdef CONFIG_MEMCG_SWAP
93
-int do_swap_account __read_mostly;
89
+bool cgroup_memory_noswap __read_mostly;
9490 #else
95
-#define do_swap_account 0
91
+#define cgroup_memory_noswap 1
9692 #endif
9793
98
-static DEFINE_LOCAL_IRQ_LOCK(event_lock);
94
+#ifdef CONFIG_CGROUP_WRITEBACK
95
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
96
+#endif
97
+
98
+struct event_lock {
99
+ local_lock_t l;
100
+};
101
+static DEFINE_PER_CPU(struct event_lock, event_lock) = {
102
+ .l = INIT_LOCAL_LOCK(l),
103
+};
99104
100105 /* Whether legacy memory+swap accounting is active */
101106 static bool do_memsw_account(void)
102107 {
103
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
108
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
104109 }
105
-
106
-static const char *const mem_cgroup_lru_names[] = {
107
- "inactive_anon",
108
- "active_anon",
109
- "inactive_file",
110
- "active_file",
111
- "unevictable",
112
-};
113110
114111 #define THRESHOLDS_EVENTS_TARGET 128
115112 #define SOFTLIMIT_EVENTS_TARGET 1024
116
-#define NUMAINFO_EVENTS_TARGET 1024
117113
118114 /*
119115 * Cgroups above their limits are maintained in a RB-Tree, independent of
....@@ -213,14 +209,6 @@
213209 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
214210 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
215211
216
-enum charge_type {
217
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
218
- MEM_CGROUP_CHARGE_TYPE_ANON,
219
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
220
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
221
- NR_CHARGE_TYPE,
222
-};
223
-
224212 /* for encoding cft->private value on file */
225213 enum res_type {
226214 _MEM,
....@@ -251,7 +239,7 @@
251239 iter != NULL; \
252240 iter = mem_cgroup_iter(NULL, iter, NULL))
253241
254
-static inline bool should_force_charge(void)
242
+static inline bool task_is_dying(void)
255243 {
256244 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
257245 (current->flags & PF_EXITING);
....@@ -271,8 +259,100 @@
271259 }
272260
273261 #ifdef CONFIG_MEMCG_KMEM
262
+static DEFINE_SPINLOCK(objcg_lock);
263
+
264
+static void obj_cgroup_release(struct percpu_ref *ref)
265
+{
266
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
267
+ struct mem_cgroup *memcg;
268
+ unsigned int nr_bytes;
269
+ unsigned int nr_pages;
270
+ unsigned long flags;
271
+
272
+ /*
273
+ * At this point all allocated objects are freed, and
274
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
275
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
276
+ *
277
+ * The following sequence can lead to it:
278
+ * 1) CPU0: objcg == stock->cached_objcg
279
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
280
+ * PAGE_SIZE bytes are charged
281
+ * 3) CPU1: a process from another memcg is allocating something,
282
+ * the stock is flushed,
283
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
284
+ * 4) CPU0: we release this object,
285
+ * 92 bytes are added to stock->nr_bytes
286
+ * 5) CPU0: stock is flushed,
287
+ * 92 bytes are added to objcg->nr_charged_bytes
288
+ *
289
+ * As a result, nr_charged_bytes == PAGE_SIZE.
290
+ * This page will be uncharged in obj_cgroup_release().
291
+ */
292
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
293
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
294
+ nr_pages = nr_bytes >> PAGE_SHIFT;
295
+
296
+ spin_lock_irqsave(&objcg_lock, flags);
297
+ memcg = obj_cgroup_memcg(objcg);
298
+ if (nr_pages)
299
+ __memcg_kmem_uncharge(memcg, nr_pages);
300
+ list_del(&objcg->list);
301
+ mem_cgroup_put(memcg);
302
+ spin_unlock_irqrestore(&objcg_lock, flags);
303
+
304
+ percpu_ref_exit(ref);
305
+ kfree_rcu(objcg, rcu);
306
+}
307
+
308
+static struct obj_cgroup *obj_cgroup_alloc(void)
309
+{
310
+ struct obj_cgroup *objcg;
311
+ int ret;
312
+
313
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
314
+ if (!objcg)
315
+ return NULL;
316
+
317
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
318
+ GFP_KERNEL);
319
+ if (ret) {
320
+ kfree(objcg);
321
+ return NULL;
322
+ }
323
+ INIT_LIST_HEAD(&objcg->list);
324
+ return objcg;
325
+}
326
+
327
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
328
+ struct mem_cgroup *parent)
329
+{
330
+ struct obj_cgroup *objcg, *iter;
331
+
332
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
333
+
334
+ spin_lock_irq(&objcg_lock);
335
+
336
+ /* Move active objcg to the parent's list */
337
+ xchg(&objcg->memcg, parent);
338
+ css_get(&parent->css);
339
+ list_add(&objcg->list, &parent->objcg_list);
340
+
341
+ /* Move already reparented objcgs to the parent's list */
342
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
343
+ css_get(&parent->css);
344
+ xchg(&iter->memcg, parent);
345
+ css_put(&memcg->css);
346
+ }
347
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
348
+
349
+ spin_unlock_irq(&objcg_lock);
350
+
351
+ percpu_ref_kill(&objcg->refcnt);
352
+}
353
+
274354 /*
275
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
355
+ * This will be used as a shrinker list's index.
276356 * The main reason for not using cgroup id for this:
277357 * this works better in sparse environments, where we have a lot of memcgs,
278358 * but only a few kmem-limited. Or also, if we have, for instance, 200
....@@ -315,14 +395,13 @@
315395
316396 /*
317397 * A lot of the calls to the cache allocation functions are expected to be
318
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
398
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
319399 * conditional to this static branch, we'll have to allow modules that do
320400 * kmem_cache_alloc and the like to see this symbol as well
321401 */
322402 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
323403 EXPORT_SYMBOL(memcg_kmem_enabled_key);
324
-
325
-struct workqueue_struct *memcg_kmem_cache_wq;
404
+#endif
326405
327406 static int memcg_shrinker_map_size;
328407 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
....@@ -347,7 +426,7 @@
347426 if (!old)
348427 return 0;
349428
350
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
429
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
351430 if (!new)
352431 return -ENOMEM;
353432
....@@ -391,7 +470,7 @@
391470 mutex_lock(&memcg_shrinker_map_mutex);
392471 size = memcg_shrinker_map_size;
393472 for_each_node(nid) {
394
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
473
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
395474 if (!map) {
396475 memcg_free_shrinker_maps(memcg);
397476 ret = -ENOMEM;
....@@ -448,14 +527,6 @@
448527 }
449528 }
450529
451
-#else /* CONFIG_MEMCG_KMEM */
452
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
453
-{
454
- return 0;
455
-}
456
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
457
-#endif /* CONFIG_MEMCG_KMEM */
458
-
459530 /**
460531 * mem_cgroup_css_from_page - css of the memcg associated with a page
461532 * @page: page of interest
....@@ -498,7 +569,17 @@
498569 unsigned long ino = 0;
499570
500571 rcu_read_lock();
501
- memcg = READ_ONCE(page->mem_cgroup);
572
+ memcg = page->mem_cgroup;
573
+
574
+ /*
575
+ * The lowest bit set means that memcg isn't a valid
576
+ * memcg pointer, but an obj_cgroups pointer.
577
+ * In this case the page is shared and doesn't belong
578
+ * to any specific memory cgroup.
579
+ */
580
+ if ((unsigned long) memcg & 0x1UL)
581
+ memcg = NULL;
582
+
502583 while (memcg && !(memcg->css.flags & CSS_ONLINE))
503584 memcg = parent_mem_cgroup(memcg);
504585 if (memcg)
....@@ -674,7 +755,7 @@
674755 */
675756 __mem_cgroup_remove_exceeded(mz, mctz);
676757 if (!soft_limit_excess(mz->memcg) ||
677
- !css_tryget_online(&mz->memcg->css))
758
+ !css_tryget(&mz->memcg->css))
678759 goto retry;
679760 done:
680761 return mz;
....@@ -691,33 +772,187 @@
691772 return mz;
692773 }
693774
694
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
695
- int event)
775
+/**
776
+ * __mod_memcg_state - update cgroup memory statistics
777
+ * @memcg: the memory cgroup
778
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
779
+ * @val: delta to add to the counter, can be negative
780
+ */
781
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
696782 {
697
- return atomic_long_read(&memcg->events[event]);
783
+ long x, threshold = MEMCG_CHARGE_BATCH;
784
+
785
+ if (mem_cgroup_disabled())
786
+ return;
787
+
788
+ if (memcg_stat_item_in_bytes(idx))
789
+ threshold <<= PAGE_SHIFT;
790
+
791
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
792
+ if (unlikely(abs(x) > threshold)) {
793
+ struct mem_cgroup *mi;
794
+
795
+ /*
796
+ * Batch local counters to keep them in sync with
797
+ * the hierarchical ones.
798
+ */
799
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
800
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
801
+ atomic_long_add(x, &mi->vmstats[idx]);
802
+ x = 0;
803
+ }
804
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
805
+}
806
+
807
+static struct mem_cgroup_per_node *
808
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
809
+{
810
+ struct mem_cgroup *parent;
811
+
812
+ parent = parent_mem_cgroup(pn->memcg);
813
+ if (!parent)
814
+ return NULL;
815
+ return mem_cgroup_nodeinfo(parent, nid);
816
+}
817
+
818
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
819
+ int val)
820
+{
821
+ struct mem_cgroup_per_node *pn;
822
+ struct mem_cgroup *memcg;
823
+ long x, threshold = MEMCG_CHARGE_BATCH;
824
+
825
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
826
+ memcg = pn->memcg;
827
+
828
+ preempt_disable_rt();
829
+ /* Update memcg */
830
+ __mod_memcg_state(memcg, idx, val);
831
+
832
+ /* Update lruvec */
833
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
834
+
835
+ if (vmstat_item_in_bytes(idx))
836
+ threshold <<= PAGE_SHIFT;
837
+
838
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
839
+ if (unlikely(abs(x) > threshold)) {
840
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
841
+ struct mem_cgroup_per_node *pi;
842
+
843
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
844
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
845
+ x = 0;
846
+ }
847
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
848
+ preempt_enable_rt();
849
+}
850
+
851
+/**
852
+ * __mod_lruvec_state - update lruvec memory statistics
853
+ * @lruvec: the lruvec
854
+ * @idx: the stat item
855
+ * @val: delta to add to the counter, can be negative
856
+ *
857
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
858
+ * function updates all three counters that are affected by a
859
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
860
+ */
861
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
862
+ int val)
863
+{
864
+ /* Update node */
865
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
866
+
867
+ /* Update memcg and lruvec */
868
+ if (!mem_cgroup_disabled())
869
+ __mod_memcg_lruvec_state(lruvec, idx, val);
870
+}
871
+
872
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
873
+{
874
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
875
+ struct mem_cgroup *memcg;
876
+ struct lruvec *lruvec;
877
+
878
+ rcu_read_lock();
879
+ memcg = mem_cgroup_from_obj(p);
880
+
881
+ /*
882
+ * Untracked pages have no memcg, no lruvec. Update only the
883
+ * node. If we reparent the slab objects to the root memcg,
884
+ * when we free the slab object, we need to update the per-memcg
885
+ * vmstats to keep it correct for the root memcg.
886
+ */
887
+ if (!memcg) {
888
+ __mod_node_page_state(pgdat, idx, val);
889
+ } else {
890
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
891
+ __mod_lruvec_state(lruvec, idx, val);
892
+ }
893
+ rcu_read_unlock();
894
+}
895
+
896
+void mod_memcg_obj_state(void *p, int idx, int val)
897
+{
898
+ struct mem_cgroup *memcg;
899
+
900
+ rcu_read_lock();
901
+ memcg = mem_cgroup_from_obj(p);
902
+ if (memcg)
903
+ mod_memcg_state(memcg, idx, val);
904
+ rcu_read_unlock();
905
+}
906
+
907
+/**
908
+ * __count_memcg_events - account VM events in a cgroup
909
+ * @memcg: the memory cgroup
910
+ * @idx: the event item
911
+ * @count: the number of events that occurred
912
+ */
913
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
914
+ unsigned long count)
915
+{
916
+ unsigned long x;
917
+
918
+ if (mem_cgroup_disabled())
919
+ return;
920
+
921
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
922
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
923
+ struct mem_cgroup *mi;
924
+
925
+ /*
926
+ * Batch local counters to keep them in sync with
927
+ * the hierarchical ones.
928
+ */
929
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
930
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
931
+ atomic_long_add(x, &mi->vmevents[idx]);
932
+ x = 0;
933
+ }
934
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
935
+}
936
+
937
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
938
+{
939
+ return atomic_long_read(&memcg->vmevents[event]);
940
+}
941
+
942
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
943
+{
944
+ long x = 0;
945
+ int cpu;
946
+
947
+ for_each_possible_cpu(cpu)
948
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
949
+ return x;
698950 }
699951
700952 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
701953 struct page *page,
702
- bool compound, int nr_pages)
954
+ int nr_pages)
703955 {
704
- /*
705
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
706
- * counted as CACHE even if it's on ANON LRU.
707
- */
708
- if (PageAnon(page))
709
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
710
- else {
711
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
712
- if (PageSwapBacked(page))
713
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
714
- }
715
-
716
- if (compound) {
717
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
718
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
719
- }
720
-
721956 /* pagein of a big page is an event. So, ignore page size */
722957 if (nr_pages > 0)
723958 __count_memcg_events(memcg, PGPGIN, 1);
....@@ -726,35 +961,7 @@
726961 nr_pages = -nr_pages; /* for event */
727962 }
728963
729
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
730
-}
731
-
732
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
733
- int nid, unsigned int lru_mask)
734
-{
735
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
736
- unsigned long nr = 0;
737
- enum lru_list lru;
738
-
739
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
740
-
741
- for_each_lru(lru) {
742
- if (!(BIT(lru) & lru_mask))
743
- continue;
744
- nr += mem_cgroup_get_lru_size(lruvec, lru);
745
- }
746
- return nr;
747
-}
748
-
749
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
750
- unsigned int lru_mask)
751
-{
752
- unsigned long nr = 0;
753
- int nid;
754
-
755
- for_each_node_state(nid, N_MEMORY)
756
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
757
- return nr;
964
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
758965 }
759966
760967 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -762,8 +969,8 @@
762969 {
763970 unsigned long val, next;
764971
765
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
766
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
972
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
973
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
767974 /* from time_after() in jiffies.h */
768975 if ((long)(next - val) < 0) {
769976 switch (target) {
....@@ -773,13 +980,10 @@
773980 case MEM_CGROUP_TARGET_SOFTLIMIT:
774981 next = val + SOFTLIMIT_EVENTS_TARGET;
775982 break;
776
- case MEM_CGROUP_TARGET_NUMAINFO:
777
- next = val + NUMAINFO_EVENTS_TARGET;
778
- break;
779983 default:
780984 break;
781985 }
782
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
986
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
783987 return true;
784988 }
785989 return false;
....@@ -795,21 +999,12 @@
795999 if (unlikely(mem_cgroup_event_ratelimit(memcg,
7961000 MEM_CGROUP_TARGET_THRESH))) {
7971001 bool do_softlimit;
798
- bool do_numainfo __maybe_unused;
7991002
8001003 do_softlimit = mem_cgroup_event_ratelimit(memcg,
8011004 MEM_CGROUP_TARGET_SOFTLIMIT);
802
-#if MAX_NUMNODES > 1
803
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
804
- MEM_CGROUP_TARGET_NUMAINFO);
805
-#endif
8061005 mem_cgroup_threshold(memcg);
8071006 if (unlikely(do_softlimit))
8081007 mem_cgroup_update_tree(memcg, page);
809
-#if MAX_NUMNODES > 1
810
- if (unlikely(do_numainfo))
811
- atomic_inc(&memcg->numainfo_events);
812
-#endif
8131008 }
8141009 }
8151010
....@@ -877,27 +1072,60 @@
8771072 return NULL;
8781073
8791074 rcu_read_lock();
880
- if (!memcg || !css_tryget_online(&memcg->css))
1075
+ /* Page should not get uncharged and freed memcg under us. */
1076
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8811077 memcg = root_mem_cgroup;
8821078 rcu_read_unlock();
8831079 return memcg;
8841080 }
8851081 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8861082
1083
+static __always_inline struct mem_cgroup *active_memcg(void)
1084
+{
1085
+ if (in_interrupt())
1086
+ return this_cpu_read(int_active_memcg);
1087
+ else
1088
+ return current->active_memcg;
1089
+}
1090
+
1091
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1092
+{
1093
+ struct mem_cgroup *memcg;
1094
+
1095
+ rcu_read_lock();
1096
+ memcg = active_memcg();
1097
+ /* remote memcg must hold a ref. */
1098
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1099
+ memcg = root_mem_cgroup;
1100
+ rcu_read_unlock();
1101
+
1102
+ return memcg;
1103
+}
1104
+
1105
+static __always_inline bool memcg_kmem_bypass(void)
1106
+{
1107
+ /* Allow remote memcg charging from any context. */
1108
+ if (unlikely(active_memcg()))
1109
+ return false;
1110
+
1111
+ /* Memcg to charge can't be determined. */
1112
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1113
+ return true;
1114
+
1115
+ return false;
1116
+}
1117
+
8871118 /**
888
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1119
+ * If active memcg is set, do not fall back to current->mm->memcg.
8891120 */
8901121 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8911122 {
892
- if (unlikely(current->active_memcg)) {
893
- struct mem_cgroup *memcg = root_mem_cgroup;
1123
+ if (memcg_kmem_bypass())
1124
+ return NULL;
8941125
895
- rcu_read_lock();
896
- if (css_tryget_online(&current->active_memcg->css))
897
- memcg = current->active_memcg;
898
- rcu_read_unlock();
899
- return memcg;
900
- }
1126
+ if (unlikely(active_memcg()))
1127
+ return get_active_memcg();
1128
+
9011129 return get_mem_cgroup_from_mm(current->mm);
9021130 }
9031131
....@@ -914,15 +1142,15 @@
9141142 * invocations for reference counting, or use mem_cgroup_iter_break()
9151143 * to cancel a hierarchy walk before the round-trip is complete.
9161144 *
917
- * Reclaimers can specify a node and a priority level in @reclaim to
918
- * divide up the memcgs in the hierarchy among all concurrent
919
- * reclaimers operating on the same node and priority.
1145
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1146
+ * in the hierarchy among all concurrent reclaimers operating on the
1147
+ * same node.
9201148 */
9211149 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9221150 struct mem_cgroup *prev,
9231151 struct mem_cgroup_reclaim_cookie *reclaim)
9241152 {
925
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1153
+ struct mem_cgroup_reclaim_iter *iter;
9261154 struct cgroup_subsys_state *css = NULL;
9271155 struct mem_cgroup *memcg = NULL;
9281156 struct mem_cgroup *pos = NULL;
....@@ -948,7 +1176,7 @@
9481176 struct mem_cgroup_per_node *mz;
9491177
9501178 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
951
- iter = &mz->iter[reclaim->priority];
1179
+ iter = &mz->iter;
9521180
9531181 if (prev && reclaim->generation != iter->generation)
9541182 goto out_unlock;
....@@ -1048,15 +1276,11 @@
10481276 struct mem_cgroup_reclaim_iter *iter;
10491277 struct mem_cgroup_per_node *mz;
10501278 int nid;
1051
- int i;
10521279
10531280 for_each_node(nid) {
10541281 mz = mem_cgroup_nodeinfo(from, nid);
1055
- for (i = 0; i <= DEF_PRIORITY; i++) {
1056
- iter = &mz->iter[i];
1057
- cmpxchg(&iter->position,
1058
- dead_memcg, NULL);
1059
- }
1282
+ iter = &mz->iter;
1283
+ cmpxchg(&iter->position, dead_memcg, NULL);
10601284 }
10611285 }
10621286
....@@ -1106,7 +1330,7 @@
11061330 struct css_task_iter it;
11071331 struct task_struct *task;
11081332
1109
- css_task_iter_start(&iter->css, 0, &it);
1333
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11101334 while (!ret && (task = css_task_iter_next(&it)))
11111335 ret = fn(task, arg);
11121336 css_task_iter_end(&it);
....@@ -1123,9 +1347,8 @@
11231347 * @page: the page
11241348 * @pgdat: pgdat of the page
11251349 *
1126
- * This function is only safe when following the LRU page isolation
1127
- * and putback protocol: the LRU lock must be held, and the page must
1128
- * either be PageLRU() or the caller must have isolated/allocated it.
1350
+ * This function relies on page->mem_cgroup being stable - see the
1351
+ * access rules in commit_charge().
11291352 */
11301353 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11311354 {
....@@ -1134,7 +1357,7 @@
11341357 struct lruvec *lruvec;
11351358
11361359 if (mem_cgroup_disabled()) {
1137
- lruvec = &pgdat->lruvec;
1360
+ lruvec = &pgdat->__lruvec;
11381361 goto out;
11391362 }
11401363
....@@ -1158,6 +1381,38 @@
11581381 lruvec->pgdat = pgdat;
11591382 return lruvec;
11601383 }
1384
+
1385
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1386
+{
1387
+ struct lruvec *lruvec;
1388
+
1389
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1390
+
1391
+ return lruvec;
1392
+}
1393
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1394
+
1395
+void do_traversal_all_lruvec(void)
1396
+{
1397
+ pg_data_t *pgdat;
1398
+
1399
+ for_each_online_pgdat(pgdat) {
1400
+ struct mem_cgroup *memcg = NULL;
1401
+
1402
+ spin_lock_irq(&pgdat->lru_lock);
1403
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1404
+ do {
1405
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1406
+
1407
+ trace_android_vh_do_traversal_lruvec(lruvec);
1408
+
1409
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1410
+ } while (memcg);
1411
+
1412
+ spin_unlock_irq(&pgdat->lru_lock);
1413
+ }
1414
+}
1415
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11611416
11621417 /**
11631418 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1198,32 +1453,6 @@
11981453 *lru_size += nr_pages;
11991454 }
12001455
1201
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1202
-{
1203
- struct mem_cgroup *task_memcg;
1204
- struct task_struct *p;
1205
- bool ret;
1206
-
1207
- p = find_lock_task_mm(task);
1208
- if (p) {
1209
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1210
- task_unlock(p);
1211
- } else {
1212
- /*
1213
- * All threads may have already detached their mm's, but the oom
1214
- * killer still needs to detect if they have already been oom
1215
- * killed to prevent needlessly killing additional tasks.
1216
- */
1217
- rcu_read_lock();
1218
- task_memcg = mem_cgroup_from_task(task);
1219
- css_get(&task_memcg->css);
1220
- rcu_read_unlock();
1221
- }
1222
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1223
- css_put(&task_memcg->css);
1224
- return ret;
1225
-}
1226
-
12271456 /**
12281457 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
12291458 * @memcg: the memory cgroup
....@@ -1245,7 +1474,7 @@
12451474 if (do_memsw_account()) {
12461475 count = page_counter_read(&memcg->memsw);
12471476 limit = READ_ONCE(memcg->memsw.max);
1248
- if (count <= limit)
1477
+ if (count < limit)
12491478 margin = min(margin, limit - count);
12501479 else
12511480 margin = 0;
....@@ -1299,85 +1528,199 @@
12991528 return false;
13001529 }
13011530
1302
-static const unsigned int memcg1_stats[] = {
1303
- MEMCG_CACHE,
1304
- MEMCG_RSS,
1305
- MEMCG_RSS_HUGE,
1306
- NR_SHMEM,
1307
- NR_FILE_MAPPED,
1308
- NR_FILE_DIRTY,
1309
- NR_WRITEBACK,
1310
- MEMCG_SWAP,
1531
+struct memory_stat {
1532
+ const char *name;
1533
+ unsigned int ratio;
1534
+ unsigned int idx;
13111535 };
13121536
1313
-static const char *const memcg1_stat_names[] = {
1314
- "cache",
1315
- "rss",
1316
- "rss_huge",
1317
- "shmem",
1318
- "mapped_file",
1319
- "dirty",
1320
- "writeback",
1321
- "swap",
1537
+static struct memory_stat memory_stats[] = {
1538
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1539
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1540
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1541
+ { "percpu", 1, MEMCG_PERCPU_B },
1542
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1543
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1544
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1545
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1546
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1547
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1548
+ /*
1549
+ * The ratio will be initialized in memory_stats_init(). Because
1550
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1551
+ * constant(e.g. powerpc).
1552
+ */
1553
+ { "anon_thp", 0, NR_ANON_THPS },
1554
+#endif
1555
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1556
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1557
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1558
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1559
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1560
+
1561
+ /*
1562
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1563
+ * together and slab_reclaimable must be in front.
1564
+ */
1565
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1566
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1567
+
1568
+ /* The memory events */
1569
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1570
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1571
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1572
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1573
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1574
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1575
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13221576 };
1577
+
1578
+static int __init memory_stats_init(void)
1579
+{
1580
+ int i;
1581
+
1582
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1583
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1584
+ if (memory_stats[i].idx == NR_ANON_THPS)
1585
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1586
+#endif
1587
+ VM_BUG_ON(!memory_stats[i].ratio);
1588
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1589
+ }
1590
+
1591
+ return 0;
1592
+}
1593
+pure_initcall(memory_stats_init);
1594
+
1595
+static char *memory_stat_format(struct mem_cgroup *memcg)
1596
+{
1597
+ struct seq_buf s;
1598
+ int i;
1599
+
1600
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1601
+ if (!s.buffer)
1602
+ return NULL;
1603
+
1604
+ /*
1605
+ * Provide statistics on the state of the memory subsystem as
1606
+ * well as cumulative event counters that show past behavior.
1607
+ *
1608
+ * This list is ordered following a combination of these gradients:
1609
+ * 1) generic big picture -> specifics and details
1610
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1611
+ *
1612
+ * Current memory state:
1613
+ */
1614
+
1615
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1616
+ u64 size;
1617
+
1618
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1619
+ size *= memory_stats[i].ratio;
1620
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1621
+
1622
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1623
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1624
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1625
+ seq_buf_printf(&s, "slab %llu\n", size);
1626
+ }
1627
+ }
1628
+
1629
+ /* Accumulated memory events */
1630
+
1631
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1632
+ memcg_events(memcg, PGFAULT));
1633
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1634
+ memcg_events(memcg, PGMAJFAULT));
1635
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1636
+ memcg_events(memcg, PGREFILL));
1637
+ seq_buf_printf(&s, "pgscan %lu\n",
1638
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1639
+ memcg_events(memcg, PGSCAN_DIRECT));
1640
+ seq_buf_printf(&s, "pgsteal %lu\n",
1641
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1642
+ memcg_events(memcg, PGSTEAL_DIRECT));
1643
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1644
+ memcg_events(memcg, PGACTIVATE));
1645
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1646
+ memcg_events(memcg, PGDEACTIVATE));
1647
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1648
+ memcg_events(memcg, PGLAZYFREE));
1649
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1650
+ memcg_events(memcg, PGLAZYFREED));
1651
+
1652
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1653
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1654
+ memcg_events(memcg, THP_FAULT_ALLOC));
1655
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1656
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1657
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1658
+
1659
+ /* The above should easily fit into one page */
1660
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1661
+
1662
+ return s.buffer;
1663
+}
13231664
13241665 #define K(x) ((x) << (PAGE_SHIFT-10))
13251666 /**
1326
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1667
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1668
+ * memory controller.
13271669 * @memcg: The memory cgroup that went over limit
13281670 * @p: Task that is going to be killed
13291671 *
13301672 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13311673 * enabled
13321674 */
1333
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1675
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13341676 {
1335
- struct mem_cgroup *iter;
1336
- unsigned int i;
1337
-
13381677 rcu_read_lock();
13391678
1679
+ if (memcg) {
1680
+ pr_cont(",oom_memcg=");
1681
+ pr_cont_cgroup_path(memcg->css.cgroup);
1682
+ } else
1683
+ pr_cont(",global_oom");
13401684 if (p) {
1341
- pr_info("Task in ");
1685
+ pr_cont(",task_memcg=");
13421686 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1343
- pr_cont(" killed as a result of limit of ");
1344
- } else {
1345
- pr_info("Memory limit reached of cgroup ");
13461687 }
1347
-
1348
- pr_cont_cgroup_path(memcg->css.cgroup);
1349
- pr_cont("\n");
1350
-
13511688 rcu_read_unlock();
1689
+}
1690
+
1691
+/**
1692
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1693
+ * memory controller.
1694
+ * @memcg: The memory cgroup that went over limit
1695
+ */
1696
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1697
+{
1698
+ char *buf;
13521699
13531700 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13541701 K((u64)page_counter_read(&memcg->memory)),
1355
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1356
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->memsw)),
1358
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1359
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1360
- K((u64)page_counter_read(&memcg->kmem)),
1361
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1362
-
1363
- for_each_mem_cgroup_tree(iter, memcg) {
1364
- pr_info("Memory cgroup stats for ");
1365
- pr_cont_cgroup_path(iter->css.cgroup);
1366
- pr_cont(":");
1367
-
1368
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1369
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1370
- continue;
1371
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1372
- K(memcg_page_state(iter, memcg1_stats[i])));
1373
- }
1374
-
1375
- for (i = 0; i < NR_LRU_LISTS; i++)
1376
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1377
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1378
-
1379
- pr_cont("\n");
1702
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1703
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1704
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->swap)),
1706
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1707
+ else {
1708
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1709
+ K((u64)page_counter_read(&memcg->memsw)),
1710
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1711
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1712
+ K((u64)page_counter_read(&memcg->kmem)),
1713
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13801714 }
1715
+
1716
+ pr_info("Memory cgroup stats for ");
1717
+ pr_cont_cgroup_path(memcg->css.cgroup);
1718
+ pr_cont(":");
1719
+ buf = memory_stat_format(memcg);
1720
+ if (!buf)
1721
+ return;
1722
+ pr_info("%s", buf);
1723
+ kfree(buf);
13811724 }
13821725
13831726 /*
....@@ -1385,19 +1728,26 @@
13851728 */
13861729 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13871730 {
1388
- unsigned long max;
1731
+ unsigned long max = READ_ONCE(memcg->memory.max);
13891732
1390
- max = memcg->memory.max;
1391
- if (mem_cgroup_swappiness(memcg)) {
1392
- unsigned long memsw_max;
1393
- unsigned long swap_max;
1733
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1734
+ if (mem_cgroup_swappiness(memcg))
1735
+ max += min(READ_ONCE(memcg->swap.max),
1736
+ (unsigned long)total_swap_pages);
1737
+ } else { /* v1 */
1738
+ if (mem_cgroup_swappiness(memcg)) {
1739
+ /* Calculate swap excess capacity from memsw limit */
1740
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13941741
1395
- memsw_max = memcg->memsw.max;
1396
- swap_max = memcg->swap.max;
1397
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1398
- max = min(max + swap_max, memsw_max);
1742
+ max += min(swap, (unsigned long)total_swap_pages);
1743
+ }
13991744 }
14001745 return max;
1746
+}
1747
+
1748
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1749
+{
1750
+ return page_counter_read(&memcg->memory);
14011751 }
14021752
14031753 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1410,112 +1760,24 @@
14101760 .gfp_mask = gfp_mask,
14111761 .order = order,
14121762 };
1413
- bool ret;
1763
+ bool ret = true;
14141764
14151765 if (mutex_lock_killable(&oom_lock))
14161766 return true;
1767
+
1768
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1769
+ goto unlock;
1770
+
14171771 /*
14181772 * A few threads which were not waiting at mutex_lock_killable() can
14191773 * fail to bail out. Therefore, check again after holding oom_lock.
14201774 */
1421
- ret = should_force_charge() || out_of_memory(&oc);
1775
+ ret = task_is_dying() || out_of_memory(&oc);
1776
+
1777
+unlock:
14221778 mutex_unlock(&oom_lock);
14231779 return ret;
14241780 }
1425
-
1426
-#if MAX_NUMNODES > 1
1427
-
1428
-/**
1429
- * test_mem_cgroup_node_reclaimable
1430
- * @memcg: the target memcg
1431
- * @nid: the node ID to be checked.
1432
- * @noswap : specify true here if the user wants flle only information.
1433
- *
1434
- * This function returns whether the specified memcg contains any
1435
- * reclaimable pages on a node. Returns true if there are any reclaimable
1436
- * pages in the node.
1437
- */
1438
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1439
- int nid, bool noswap)
1440
-{
1441
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1442
- return true;
1443
- if (noswap || !total_swap_pages)
1444
- return false;
1445
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1446
- return true;
1447
- return false;
1448
-
1449
-}
1450
-
1451
-/*
1452
- * Always updating the nodemask is not very good - even if we have an empty
1453
- * list or the wrong list here, we can start from some node and traverse all
1454
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1455
- *
1456
- */
1457
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1458
-{
1459
- int nid;
1460
- /*
1461
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1462
- * pagein/pageout changes since the last update.
1463
- */
1464
- if (!atomic_read(&memcg->numainfo_events))
1465
- return;
1466
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1467
- return;
1468
-
1469
- /* make a nodemask where this memcg uses memory from */
1470
- memcg->scan_nodes = node_states[N_MEMORY];
1471
-
1472
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1473
-
1474
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1475
- node_clear(nid, memcg->scan_nodes);
1476
- }
1477
-
1478
- atomic_set(&memcg->numainfo_events, 0);
1479
- atomic_set(&memcg->numainfo_updating, 0);
1480
-}
1481
-
1482
-/*
1483
- * Selecting a node where we start reclaim from. Because what we need is just
1484
- * reducing usage counter, start from anywhere is O,K. Considering
1485
- * memory reclaim from current node, there are pros. and cons.
1486
- *
1487
- * Freeing memory from current node means freeing memory from a node which
1488
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1489
- * hit limits, it will see a contention on a node. But freeing from remote
1490
- * node means more costs for memory reclaim because of memory latency.
1491
- *
1492
- * Now, we use round-robin. Better algorithm is welcomed.
1493
- */
1494
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1495
-{
1496
- int node;
1497
-
1498
- mem_cgroup_may_update_nodemask(memcg);
1499
- node = memcg->last_scanned_node;
1500
-
1501
- node = next_node_in(node, memcg->scan_nodes);
1502
- /*
1503
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1504
- * last time it really checked all the LRUs due to rate limiting.
1505
- * Fallback to the current node in that case for simplicity.
1506
- */
1507
- if (unlikely(node == MAX_NUMNODES))
1508
- node = numa_node_id();
1509
-
1510
- memcg->last_scanned_node = node;
1511
- return node;
1512
-}
1513
-#else
1514
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1515
-{
1516
- return 0;
1517
-}
1518
-#endif
15191781
15201782 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15211783 pg_data_t *pgdat,
....@@ -1529,7 +1791,6 @@
15291791 unsigned long nr_scanned;
15301792 struct mem_cgroup_reclaim_cookie reclaim = {
15311793 .pgdat = pgdat,
1532
- .priority = 0,
15331794 };
15341795
15351796 excess = soft_limit_excess(root_memcg);
....@@ -1624,7 +1885,7 @@
16241885 struct mem_cgroup *iter;
16251886
16261887 spin_lock(&memcg_oom_lock);
1627
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1888
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16281889 for_each_mem_cgroup_tree(iter, memcg)
16291890 iter->oom_lock = false;
16301891 spin_unlock(&memcg_oom_lock);
....@@ -1645,8 +1906,8 @@
16451906 struct mem_cgroup *iter;
16461907
16471908 /*
1648
- * When a new child is created while the hierarchy is under oom,
1649
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1909
+ * Be careful about under_oom underflows becase a child memcg
1910
+ * could have been added after mem_cgroup_mark_under_oom.
16501911 */
16511912 spin_lock(&memcg_oom_lock);
16521913 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1706,6 +1967,8 @@
17061967
17071968 if (order > PAGE_ALLOC_COSTLY_ORDER)
17081969 return OOM_SKIPPED;
1970
+
1971
+ memcg_memory_event(memcg, MEMCG_OOM);
17091972
17101973 /*
17111974 * We are in the middle of the charge context here, so we
....@@ -1854,6 +2117,14 @@
18542117 goto out;
18552118
18562119 /*
2120
+ * If the victim task has been asynchronously moved to a different
2121
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2122
+ * In this case it's better to ignore memory.group.oom.
2123
+ */
2124
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2125
+ goto out;
2126
+
2127
+ /*
18572128 * Traverse the memory cgroup hierarchy from the victim task's
18582129 * cgroup up to the OOMing cgroup (or root) to find the
18592130 * highest-level memory cgroup with oom.group set.
....@@ -1894,6 +2165,7 @@
18942165 */
18952166 struct mem_cgroup *lock_page_memcg(struct page *page)
18962167 {
2168
+ struct page *head = compound_head(page); /* rmap on tail pages */
18972169 struct mem_cgroup *memcg;
18982170 unsigned long flags;
18992171
....@@ -1913,7 +2185,7 @@
19132185 if (mem_cgroup_disabled())
19142186 return NULL;
19152187 again:
1916
- memcg = page->mem_cgroup;
2188
+ memcg = head->mem_cgroup;
19172189 if (unlikely(!memcg))
19182190 return NULL;
19192191
....@@ -1921,7 +2193,7 @@
19212193 return memcg;
19222194
19232195 spin_lock_irqsave(&memcg->move_lock, flags);
1924
- if (memcg != page->mem_cgroup) {
2196
+ if (memcg != head->mem_cgroup) {
19252197 spin_unlock_irqrestore(&memcg->move_lock, flags);
19262198 goto again;
19272199 }
....@@ -1964,19 +2236,44 @@
19642236 */
19652237 void unlock_page_memcg(struct page *page)
19662238 {
1967
- __unlock_page_memcg(page->mem_cgroup);
2239
+ struct page *head = compound_head(page);
2240
+
2241
+ __unlock_page_memcg(head->mem_cgroup);
19682242 }
19692243 EXPORT_SYMBOL(unlock_page_memcg);
19702244
19712245 struct memcg_stock_pcp {
2246
+ local_lock_t lock;
19722247 struct mem_cgroup *cached; /* this never be root cgroup */
19732248 unsigned int nr_pages;
2249
+
2250
+#ifdef CONFIG_MEMCG_KMEM
2251
+ struct obj_cgroup *cached_objcg;
2252
+ unsigned int nr_bytes;
2253
+#endif
2254
+
19742255 struct work_struct work;
19752256 unsigned long flags;
19762257 #define FLUSHING_CACHED_CHARGE 0
19772258 };
19782259 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19792260 static DEFINE_MUTEX(percpu_charge_mutex);
2261
+
2262
+#ifdef CONFIG_MEMCG_KMEM
2263
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2264
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2265
+ struct mem_cgroup *root_memcg);
2266
+
2267
+#else
2268
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2269
+{
2270
+}
2271
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2272
+ struct mem_cgroup *root_memcg)
2273
+{
2274
+ return false;
2275
+}
2276
+#endif
19802277
19812278 /**
19822279 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -1998,7 +2295,7 @@
19982295 if (nr_pages > MEMCG_CHARGE_BATCH)
19992296 return ret;
20002297
2001
- local_irq_save(flags);
2298
+ local_lock_irqsave(&memcg_stock.lock, flags);
20022299
20032300 stock = this_cpu_ptr(&memcg_stock);
20042301 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
....@@ -2006,7 +2303,7 @@
20062303 ret = true;
20072304 }
20082305
2009
- local_irq_restore(flags);
2306
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20102307
20112308 return ret;
20122309 }
....@@ -2018,13 +2315,17 @@
20182315 {
20192316 struct mem_cgroup *old = stock->cached;
20202317
2318
+ if (!old)
2319
+ return;
2320
+
20212321 if (stock->nr_pages) {
20222322 page_counter_uncharge(&old->memory, stock->nr_pages);
20232323 if (do_memsw_account())
20242324 page_counter_uncharge(&old->memsw, stock->nr_pages);
2025
- css_put_many(&old->css, stock->nr_pages);
20262325 stock->nr_pages = 0;
20272326 }
2327
+
2328
+ css_put(&old->css);
20282329 stock->cached = NULL;
20292330 }
20302331
....@@ -2037,13 +2338,14 @@
20372338 * The only protection from memory hotplug vs. drain_stock races is
20382339 * that we always operate on local CPU stock here with IRQ disabled
20392340 */
2040
- local_irq_save(flags);
2341
+ local_lock_irqsave(&memcg_stock.lock, flags);
20412342
20422343 stock = this_cpu_ptr(&memcg_stock);
2344
+ drain_obj_stock(stock);
20432345 drain_stock(stock);
20442346 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20452347
2046
- local_irq_restore(flags);
2348
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20472349 }
20482350
20492351 /*
....@@ -2055,11 +2357,12 @@
20552357 struct memcg_stock_pcp *stock;
20562358 unsigned long flags;
20572359
2058
- local_irq_save(flags);
2360
+ local_lock_irqsave(&memcg_stock.lock, flags);
20592361
20602362 stock = this_cpu_ptr(&memcg_stock);
20612363 if (stock->cached != memcg) { /* reset if necessary */
20622364 drain_stock(stock);
2365
+ css_get(&memcg->css);
20632366 stock->cached = memcg;
20642367 }
20652368 stock->nr_pages += nr_pages;
....@@ -2067,7 +2370,7 @@
20672370 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
20682371 drain_stock(stock);
20692372
2070
- local_irq_restore(flags);
2373
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20712374 }
20722375
20732376 /*
....@@ -2091,21 +2394,24 @@
20912394 for_each_online_cpu(cpu) {
20922395 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20932396 struct mem_cgroup *memcg;
2397
+ bool flush = false;
20942398
2399
+ rcu_read_lock();
20952400 memcg = stock->cached;
2096
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2097
- continue;
2098
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2099
- css_put(&memcg->css);
2100
- continue;
2101
- }
2102
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2401
+ if (memcg && stock->nr_pages &&
2402
+ mem_cgroup_is_descendant(memcg, root_memcg))
2403
+ flush = true;
2404
+ if (obj_stock_flush_required(stock, root_memcg))
2405
+ flush = true;
2406
+ rcu_read_unlock();
2407
+
2408
+ if (flush &&
2409
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21032410 if (cpu == curcpu)
21042411 drain_local_stock(&stock->work);
21052412 else
21062413 schedule_work_on(cpu, &stock->work);
21072414 }
2108
- css_put(&memcg->css);
21092415 }
21102416 put_cpu_light();
21112417 mutex_unlock(&percpu_charge_mutex);
....@@ -2114,7 +2420,7 @@
21142420 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21152421 {
21162422 struct memcg_stock_pcp *stock;
2117
- struct mem_cgroup *memcg;
2423
+ struct mem_cgroup *memcg, *mi;
21182424
21192425 stock = &per_cpu(memcg_stock, cpu);
21202426 drain_stock(stock);
....@@ -2126,9 +2432,10 @@
21262432 int nid;
21272433 long x;
21282434
2129
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2435
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21302436 if (x)
2131
- atomic_long_add(x, &memcg->stat[i]);
2437
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2438
+ atomic_long_add(x, &memcg->vmstats[i]);
21322439
21332440 if (i >= NR_VM_NODE_STAT_ITEMS)
21342441 continue;
....@@ -2139,32 +2446,48 @@
21392446 pn = mem_cgroup_nodeinfo(memcg, nid);
21402447 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21412448 if (x)
2142
- atomic_long_add(x, &pn->lruvec_stat[i]);
2449
+ do {
2450
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2451
+ } while ((pn = parent_nodeinfo(pn, nid)));
21432452 }
21442453 }
21452454
21462455 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21472456 long x;
21482457
2149
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2458
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21502459 if (x)
2151
- atomic_long_add(x, &memcg->events[i]);
2460
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2461
+ atomic_long_add(x, &memcg->vmevents[i]);
21522462 }
21532463 }
21542464
21552465 return 0;
21562466 }
21572467
2158
-static void reclaim_high(struct mem_cgroup *memcg,
2159
- unsigned int nr_pages,
2160
- gfp_t gfp_mask)
2468
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2469
+ unsigned int nr_pages,
2470
+ gfp_t gfp_mask)
21612471 {
2472
+ unsigned long nr_reclaimed = 0;
2473
+
21622474 do {
2163
- if (page_counter_read(&memcg->memory) <= memcg->high)
2475
+ unsigned long pflags;
2476
+
2477
+ if (page_counter_read(&memcg->memory) <=
2478
+ READ_ONCE(memcg->memory.high))
21642479 continue;
2480
+
21652481 memcg_memory_event(memcg, MEMCG_HIGH);
2166
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2167
- } while ((memcg = parent_mem_cgroup(memcg)));
2482
+
2483
+ psi_memstall_enter(&pflags);
2484
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2485
+ gfp_mask, true);
2486
+ psi_memstall_leave(&pflags);
2487
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2488
+ !mem_cgroup_is_root(memcg));
2489
+
2490
+ return nr_reclaimed;
21682491 }
21692492
21702493 static void high_work_func(struct work_struct *work)
....@@ -2176,35 +2499,238 @@
21762499 }
21772500
21782501 /*
2502
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2503
+ * enough to still cause a significant slowdown in most cases, while still
2504
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2505
+ */
2506
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2507
+
2508
+/*
2509
+ * When calculating the delay, we use these either side of the exponentiation to
2510
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2511
+ * below).
2512
+ *
2513
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2514
+ * overage ratio to a delay.
2515
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2516
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2517
+ * to produce a reasonable delay curve.
2518
+ *
2519
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2520
+ * reasonable delay curve compared to precision-adjusted overage, not
2521
+ * penalising heavily at first, but still making sure that growth beyond the
2522
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
2523
+ * example, with a high of 100 megabytes:
2524
+ *
2525
+ * +-------+------------------------+
2526
+ * | usage | time to allocate in ms |
2527
+ * +-------+------------------------+
2528
+ * | 100M | 0 |
2529
+ * | 101M | 6 |
2530
+ * | 102M | 25 |
2531
+ * | 103M | 57 |
2532
+ * | 104M | 102 |
2533
+ * | 105M | 159 |
2534
+ * | 106M | 230 |
2535
+ * | 107M | 313 |
2536
+ * | 108M | 409 |
2537
+ * | 109M | 518 |
2538
+ * | 110M | 639 |
2539
+ * | 111M | 774 |
2540
+ * | 112M | 921 |
2541
+ * | 113M | 1081 |
2542
+ * | 114M | 1254 |
2543
+ * | 115M | 1439 |
2544
+ * | 116M | 1638 |
2545
+ * | 117M | 1849 |
2546
+ * | 118M | 2000 |
2547
+ * | 119M | 2000 |
2548
+ * | 120M | 2000 |
2549
+ * +-------+------------------------+
2550
+ */
2551
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2552
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2553
+
2554
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2555
+{
2556
+ u64 overage;
2557
+
2558
+ if (usage <= high)
2559
+ return 0;
2560
+
2561
+ /*
2562
+ * Prevent division by 0 in overage calculation by acting as if
2563
+ * it was a threshold of 1 page
2564
+ */
2565
+ high = max(high, 1UL);
2566
+
2567
+ overage = usage - high;
2568
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2569
+ return div64_u64(overage, high);
2570
+}
2571
+
2572
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2573
+{
2574
+ u64 overage, max_overage = 0;
2575
+
2576
+ do {
2577
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2578
+ READ_ONCE(memcg->memory.high));
2579
+ max_overage = max(overage, max_overage);
2580
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2581
+ !mem_cgroup_is_root(memcg));
2582
+
2583
+ return max_overage;
2584
+}
2585
+
2586
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2587
+{
2588
+ u64 overage, max_overage = 0;
2589
+
2590
+ do {
2591
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2592
+ READ_ONCE(memcg->swap.high));
2593
+ if (overage)
2594
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2595
+ max_overage = max(overage, max_overage);
2596
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2597
+ !mem_cgroup_is_root(memcg));
2598
+
2599
+ return max_overage;
2600
+}
2601
+
2602
+/*
2603
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2604
+ * is exceeding its memory.high by checking both it and its ancestors.
2605
+ */
2606
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2607
+ unsigned int nr_pages,
2608
+ u64 max_overage)
2609
+{
2610
+ unsigned long penalty_jiffies;
2611
+
2612
+ if (!max_overage)
2613
+ return 0;
2614
+
2615
+ /*
2616
+ * We use overage compared to memory.high to calculate the number of
2617
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2618
+ * fairly lenient on small overages, and increasingly harsh when the
2619
+ * memcg in question makes it clear that it has no intention of stopping
2620
+ * its crazy behaviour, so we exponentially increase the delay based on
2621
+ * overage amount.
2622
+ */
2623
+ penalty_jiffies = max_overage * max_overage * HZ;
2624
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2625
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2626
+
2627
+ /*
2628
+ * Factor in the task's own contribution to the overage, such that four
2629
+ * N-sized allocations are throttled approximately the same as one
2630
+ * 4N-sized allocation.
2631
+ *
2632
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2633
+ * larger the current charge batch is than that.
2634
+ */
2635
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2636
+}
2637
+
2638
+/*
21792639 * Scheduled by try_charge() to be executed from the userland return path
21802640 * and reclaims memory over the high limit.
21812641 */
21822642 void mem_cgroup_handle_over_high(void)
21832643 {
2644
+ unsigned long penalty_jiffies;
2645
+ unsigned long pflags;
2646
+ unsigned long nr_reclaimed;
21842647 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2648
+ int nr_retries = MAX_RECLAIM_RETRIES;
21852649 struct mem_cgroup *memcg;
2650
+ bool in_retry = false;
21862651
21872652 if (likely(!nr_pages))
21882653 return;
21892654
21902655 memcg = get_mem_cgroup_from_mm(current->mm);
2191
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2192
- css_put(&memcg->css);
21932656 current->memcg_nr_pages_over_high = 0;
2657
+
2658
+retry_reclaim:
2659
+ /*
2660
+ * The allocating task should reclaim at least the batch size, but for
2661
+ * subsequent retries we only want to do what's necessary to prevent oom
2662
+ * or breaching resource isolation.
2663
+ *
2664
+ * This is distinct from memory.max or page allocator behaviour because
2665
+ * memory.high is currently batched, whereas memory.max and the page
2666
+ * allocator run every time an allocation is made.
2667
+ */
2668
+ nr_reclaimed = reclaim_high(memcg,
2669
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2670
+ GFP_KERNEL);
2671
+
2672
+ /*
2673
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2674
+ * allocators proactively to slow down excessive growth.
2675
+ */
2676
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2677
+ mem_find_max_overage(memcg));
2678
+
2679
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2680
+ swap_find_max_overage(memcg));
2681
+
2682
+ /*
2683
+ * Clamp the max delay per usermode return so as to still keep the
2684
+ * application moving forwards and also permit diagnostics, albeit
2685
+ * extremely slowly.
2686
+ */
2687
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2688
+
2689
+ /*
2690
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2691
+ * that it's not even worth doing, in an attempt to be nice to those who
2692
+ * go only a small amount over their memory.high value and maybe haven't
2693
+ * been aggressively reclaimed enough yet.
2694
+ */
2695
+ if (penalty_jiffies <= HZ / 100)
2696
+ goto out;
2697
+
2698
+ /*
2699
+ * If reclaim is making forward progress but we're still over
2700
+ * memory.high, we want to encourage that rather than doing allocator
2701
+ * throttling.
2702
+ */
2703
+ if (nr_reclaimed || nr_retries--) {
2704
+ in_retry = true;
2705
+ goto retry_reclaim;
2706
+ }
2707
+
2708
+ /*
2709
+ * If we exit early, we're guaranteed to die (since
2710
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2711
+ * need to account for any ill-begotten jiffies to pay them off later.
2712
+ */
2713
+ psi_memstall_enter(&pflags);
2714
+ schedule_timeout_killable(penalty_jiffies);
2715
+ psi_memstall_leave(&pflags);
2716
+
2717
+out:
2718
+ css_put(&memcg->css);
21942719 }
21952720
21962721 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21972722 unsigned int nr_pages)
21982723 {
21992724 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2200
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2725
+ int nr_retries = MAX_RECLAIM_RETRIES;
22012726 struct mem_cgroup *mem_over_limit;
22022727 struct page_counter *counter;
2728
+ enum oom_status oom_status;
22032729 unsigned long nr_reclaimed;
2730
+ bool passed_oom = false;
22042731 bool may_swap = true;
22052732 bool drained = false;
2206
- bool oomed = false;
2207
- enum oom_status oom_status;
2733
+ unsigned long pflags;
22082734
22092735 if (mem_cgroup_is_root(memcg))
22102736 return 0;
....@@ -2239,15 +2765,6 @@
22392765 goto force;
22402766
22412767 /*
2242
- * Unlike in global OOM situations, memcg is not in a physical
2243
- * memory shortage. Allow dying and OOM-killed tasks to
2244
- * bypass the last charges so that they can exit quickly and
2245
- * free their memory.
2246
- */
2247
- if (unlikely(should_force_charge()))
2248
- goto force;
2249
-
2250
- /*
22512768 * Prevent unbounded recursion when reclaim operations need to
22522769 * allocate memory. This might exceed the limits temporarily,
22532770 * but we prefer facilitating memory reclaim and getting back
....@@ -2264,8 +2781,10 @@
22642781
22652782 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22662783
2784
+ psi_memstall_enter(&pflags);
22672785 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22682786 gfp_mask, may_swap);
2787
+ psi_memstall_leave(&pflags);
22692788
22702789 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22712790 goto retry;
....@@ -2299,16 +2818,15 @@
22992818 if (nr_retries--)
23002819 goto retry;
23012820
2302
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2821
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23032822 goto nomem;
23042823
23052824 if (gfp_mask & __GFP_NOFAIL)
23062825 goto force;
23072826
2308
- if (fatal_signal_pending(current))
2309
- goto force;
2310
-
2311
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2827
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2828
+ if (passed_oom && task_is_dying())
2829
+ goto nomem;
23122830
23132831 /*
23142832 * keep retrying as long as the memcg oom killer is able to make
....@@ -2317,15 +2835,10 @@
23172835 */
23182836 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23192837 get_order(nr_pages * PAGE_SIZE));
2320
- switch (oom_status) {
2321
- case OOM_SUCCESS:
2322
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2323
- oomed = true;
2838
+ if (oom_status == OOM_SUCCESS) {
2839
+ passed_oom = true;
2840
+ nr_retries = MAX_RECLAIM_RETRIES;
23242841 goto retry;
2325
- case OOM_FAILED:
2326
- goto force;
2327
- default:
2328
- goto nomem;
23292842 }
23302843 nomem:
23312844 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2339,12 +2852,10 @@
23392852 page_counter_charge(&memcg->memory, nr_pages);
23402853 if (do_memsw_account())
23412854 page_counter_charge(&memcg->memsw, nr_pages);
2342
- css_get_many(&memcg->css, nr_pages);
23432855
23442856 return 0;
23452857
23462858 done_restock:
2347
- css_get_many(&memcg->css, batch);
23482859 if (batch > nr_pages)
23492860 refill_stock(memcg, batch - nr_pages);
23502861
....@@ -2358,12 +2869,32 @@
23582869 * reclaim, the cost of mismatch is negligible.
23592870 */
23602871 do {
2361
- if (page_counter_read(&memcg->memory) > memcg->high) {
2362
- /* Don't bother a random interrupted task */
2363
- if (in_interrupt()) {
2872
+ bool mem_high, swap_high;
2873
+
2874
+ mem_high = page_counter_read(&memcg->memory) >
2875
+ READ_ONCE(memcg->memory.high);
2876
+ swap_high = page_counter_read(&memcg->swap) >
2877
+ READ_ONCE(memcg->swap.high);
2878
+
2879
+ /* Don't bother a random interrupted task */
2880
+ if (in_interrupt()) {
2881
+ if (mem_high) {
23642882 schedule_work(&memcg->high_work);
23652883 break;
23662884 }
2885
+ continue;
2886
+ }
2887
+
2888
+ if (mem_high || swap_high) {
2889
+ /*
2890
+ * The allocating tasks in this cgroup will need to do
2891
+ * reclaim or be throttled to prevent further growth
2892
+ * of the memory or swap footprints.
2893
+ *
2894
+ * Target some best-effort fairness between the tasks,
2895
+ * and distribute reclaim work and delay penalties
2896
+ * based on how much each task is actually allocating.
2897
+ */
23672898 current->memcg_nr_pages_over_high += batch;
23682899 set_notify_resume(current);
23692900 break;
....@@ -2373,6 +2904,7 @@
23732904 return 0;
23742905 }
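
The batching at the top of try_charge() can be pictured with the toy model below (userland C, not kernel code): a request smaller than the batch still charges a whole batch against the counter and parks the excess locally. The consume_stock() fast path it mimics lives earlier in this file, and the 32-page batch size is an assumption matching MEMCG_CHARGE_BATCH.

#include <stdio.h>

#define CHARGE_BATCH 32			/* assumed MEMCG_CHARGE_BATCH */

static unsigned long counter;		/* stands in for the page_counter */
static unsigned long stock;		/* stands in for the per-cpu stock */

static void charge(unsigned int nr_pages)
{
	unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;

	if (stock >= nr_pages) {	/* consume_stock() fast path */
		stock -= nr_pages;
		return;
	}
	counter += batch;		/* page_counter charge */
	if (batch > nr_pages)
		stock += batch - nr_pages;	/* refill_stock() */
}

int main(void)
{
	charge(1);	/* charges a full batch, stocks the remainder */
	charge(8);	/* served entirely from the stock */
	charge(100);	/* larger than the batch: charged exactly */
	printf("counter=%lu stock=%lu\n", counter, stock);
	return 0;
}

With these numbers the first request charges 32 pages and leaves 31 in the stock, the second is served from the stock, and the oversized request is charged exactly.
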
23752906
2907
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23762908 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23772909 {
23782910 if (mem_cgroup_is_root(memcg))
....@@ -2381,76 +2913,124 @@
23812913 page_counter_uncharge(&memcg->memory, nr_pages);
23822914 if (do_memsw_account())
23832915 page_counter_uncharge(&memcg->memsw, nr_pages);
2384
-
2385
- css_put_many(&memcg->css, nr_pages);
23862916 }
2917
+#endif
23872918
2388
-static void lock_page_lru(struct page *page, int *isolated)
2919
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23892920 {
2390
- struct zone *zone = page_zone(page);
2391
-
2392
- spin_lock_irq(zone_lru_lock(zone));
2393
- if (PageLRU(page)) {
2394
- struct lruvec *lruvec;
2395
-
2396
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2397
- ClearPageLRU(page);
2398
- del_page_from_lru_list(page, lruvec, page_lru(page));
2399
- *isolated = 1;
2400
- } else
2401
- *isolated = 0;
2402
-}
2403
-
2404
-static void unlock_page_lru(struct page *page, int isolated)
2405
-{
2406
- struct zone *zone = page_zone(page);
2407
-
2408
- if (isolated) {
2409
- struct lruvec *lruvec;
2410
-
2411
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2412
- VM_BUG_ON_PAGE(PageLRU(page), page);
2413
- SetPageLRU(page);
2414
- add_page_to_lru_list(page, lruvec, page_lru(page));
2415
- }
2416
- spin_unlock_irq(zone_lru_lock(zone));
2417
-}
2418
-
2419
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2420
- bool lrucare)
2421
-{
2422
- int isolated;
2423
-
24242921 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2425
-
24262922 /*
2427
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2428
- * may already be on some other mem_cgroup's LRU. Take care of it.
2429
- */
2430
- if (lrucare)
2431
- lock_page_lru(page, &isolated);
2432
-
2433
- /*
2434
- * Nobody should be changing or seriously looking at
2435
- * page->mem_cgroup at this point:
2923
+ * Any of the following ensures page->mem_cgroup stability:
24362924 *
2437
- * - the page is uncharged
2438
- *
2439
- * - the page is off-LRU
2440
- *
2441
- * - an anonymous fault has exclusive page access, except for
2442
- * a locked page table
2443
- *
2444
- * - a page cache insertion, a swapin fault, or a migration
2445
- * have the page locked
2925
+ * - the page lock
2926
+ * - LRU isolation
2927
+ * - lock_page_memcg()
2928
+ * - exclusive reference
24462929 */
24472930 page->mem_cgroup = memcg;
2448
-
2449
- if (lrucare)
2450
- unlock_page_lru(page, isolated);
24512931 }
24522932
24532933 #ifdef CONFIG_MEMCG_KMEM
2934
+/*
2935
+ * The allocated objcg pointers array is not accounted directly.
2936
+ * Moreover, it should not come from a DMA buffer and is not readily
2937
+ * reclaimable. So those GFP bits should be masked off.
2938
+ */
2939
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2940
+
2941
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2942
+ gfp_t gfp)
2943
+{
2944
+ unsigned int objects = objs_per_slab_page(s, page);
2945
+ void *vec;
2946
+
2947
+ gfp &= ~OBJCGS_CLEAR_MASK;
2948
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2949
+ page_to_nid(page));
2950
+ if (!vec)
2951
+ return -ENOMEM;
2952
+
2953
+ if (cmpxchg(&page->obj_cgroups, NULL,
2954
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2955
+ kfree(vec);
2956
+ else
2957
+ kmemleak_not_leak(vec);
2958
+
2959
+ return 0;
2960
+}
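
The cmpxchg() above is a publish-or-discard pattern: several tasks may race to allocate the vector, exactly one installs it, the others free their copy. A minimal userland rendition with C11 atomics follows; the names are stand-ins, not the kernel API.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) slot;	/* stands in for page->obj_cgroups */

static int publish_vec(size_t objects)
{
	void *expected = NULL;
	void *vec = calloc(objects, sizeof(void *));
	void *tagged;

	if (!vec)
		return -1;
	/* tag bit 0 so readers can tell "vector" apart from a plain pointer */
	tagged = (void *)((uintptr_t)vec | 0x1UL);
	if (!atomic_compare_exchange_strong(&slot, &expected, tagged))
		free(vec);	/* somebody else installed theirs first */
	return 0;
}

int main(void)
{
	publish_vec(16);
	publish_vec(16);	/* loses the race and frees its copy */
	printf("tag bit set: %lu\n",
	       (unsigned long)((uintptr_t)atomic_load(&slot) & 0x1UL));
	return 0;
}
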
2961
+
2962
+/*
2963
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2964
+ *
2965
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2966
+ * cgroup_mutex, etc.
2967
+ */
2968
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2969
+{
2970
+ struct page *page;
2971
+
2972
+ if (mem_cgroup_disabled())
2973
+ return NULL;
2974
+
2975
+ page = virt_to_head_page(p);
2976
+
2977
+ /*
2978
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2979
+ * or a pointer to an obj_cgroup vector. In the latter case the lowest
2980
+ * bit of the pointer is set.
2981
+ * The page->mem_cgroup pointer can be asynchronously changed
2982
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2983
+ * from a valid memcg pointer to objcg vector or back.
2984
+ */
2985
+ if (!page->mem_cgroup)
2986
+ return NULL;
2987
+
2988
+ /*
2989
+ * Slab objects are accounted individually, not per-page.
2990
+ * Memcg membership data for each individual object is saved in
2991
+ * the page->obj_cgroups.
2992
+ */
2993
+ if (page_has_obj_cgroups(page)) {
2994
+ struct obj_cgroup *objcg;
2995
+ unsigned int off;
2996
+
2997
+ off = obj_to_index(page->slab_cache, page, p);
2998
+ objcg = page_obj_cgroups(page)[off];
2999
+ if (objcg)
3000
+ return obj_cgroup_memcg(objcg);
3001
+
3002
+ return NULL;
3003
+ }
3004
+
3005
+ /* All other pages use page->mem_cgroup */
3006
+ return page->mem_cgroup;
3007
+}
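
A userland sketch of the low-bit discrimination described in the comment above: bit 0 of the word selects between a plain memcg pointer and an obj_cgroup vector. The struct names here are illustrative stand-ins only.

#include <stdint.h>
#include <stdio.h>

struct memcg { const char *name; };
struct objcg { struct memcg *owner; };

static struct memcg *resolve(uintptr_t word, unsigned int off)
{
	if (!word)
		return NULL;
	if (word & 0x1UL) {			/* obj_cgroup vector */
		struct objcg **vec = (struct objcg **)(word & ~0x1UL);
		return vec[off] ? vec[off]->owner : NULL;
	}
	return (struct memcg *)word;		/* plain memcg pointer */
}

int main(void)
{
	struct memcg m = { "demo" };
	struct objcg o = { &m };
	struct objcg *vec[4] = { NULL, &o, NULL, NULL };

	printf("%s\n", resolve((uintptr_t)vec | 0x1UL, 1)->name);
	printf("%s\n", resolve((uintptr_t)&m, 0)->name);
	return 0;
}
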
3008
+
3009
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3010
+{
3011
+ struct obj_cgroup *objcg = NULL;
3012
+ struct mem_cgroup *memcg;
3013
+
3014
+ if (memcg_kmem_bypass())
3015
+ return NULL;
3016
+
3017
+ rcu_read_lock();
3018
+ if (unlikely(active_memcg()))
3019
+ memcg = active_memcg();
3020
+ else
3021
+ memcg = mem_cgroup_from_task(current);
3022
+
3023
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3024
+ objcg = rcu_dereference(memcg->objcg);
3025
+ if (objcg && obj_cgroup_tryget(objcg))
3026
+ break;
3027
+ objcg = NULL;
3028
+ }
3029
+ rcu_read_unlock();
3030
+
3031
+ return objcg;
3032
+}
3033
+
24543034 static int memcg_alloc_cache_id(void)
24553035 {
24563036 int id, size;
....@@ -2476,9 +3056,7 @@
24763056 else if (size > MEMCG_CACHES_MAX_SIZE)
24773057 size = MEMCG_CACHES_MAX_SIZE;
24783058
2479
- err = memcg_update_all_caches(size);
2480
- if (!err)
2481
- err = memcg_update_all_list_lrus(size);
3059
+ err = memcg_update_all_list_lrus(size);
24823060 if (!err)
24833061 memcg_nr_cache_ids = size;
24843062
....@@ -2496,152 +3074,17 @@
24963074 ida_simple_remove(&memcg_cache_ida, id);
24973075 }
24983076
2499
-struct memcg_kmem_cache_create_work {
2500
- struct mem_cgroup *memcg;
2501
- struct kmem_cache *cachep;
2502
- struct work_struct work;
2503
-};
2504
-
2505
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2506
-{
2507
- struct memcg_kmem_cache_create_work *cw =
2508
- container_of(w, struct memcg_kmem_cache_create_work, work);
2509
- struct mem_cgroup *memcg = cw->memcg;
2510
- struct kmem_cache *cachep = cw->cachep;
2511
-
2512
- memcg_create_kmem_cache(memcg, cachep);
2513
-
2514
- css_put(&memcg->css);
2515
- kfree(cw);
2516
-}
2517
-
2518
-/*
2519
- * Enqueue the creation of a per-memcg kmem_cache.
2520
- */
2521
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2522
- struct kmem_cache *cachep)
2523
-{
2524
- struct memcg_kmem_cache_create_work *cw;
2525
-
2526
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2527
- if (!cw)
2528
- return;
2529
-
2530
- css_get(&memcg->css);
2531
-
2532
- cw->memcg = memcg;
2533
- cw->cachep = cachep;
2534
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2535
-
2536
- queue_work(memcg_kmem_cache_wq, &cw->work);
2537
-}
2538
-
2539
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2540
- struct kmem_cache *cachep)
2541
-{
2542
- /*
2543
- * We need to stop accounting when we kmalloc, because if the
2544
- * corresponding kmalloc cache is not yet created, the first allocation
2545
- * in __memcg_schedule_kmem_cache_create will recurse.
2546
- *
2547
- * However, it is better to enclose the whole function. Depending on
2548
- * the debugging options enabled, INIT_WORK(), for instance, can
2549
- * trigger an allocation. This too, will make us recurse. Because at
2550
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2551
- * the safest choice is to do it like this, wrapping the whole function.
2552
- */
2553
- current->memcg_kmem_skip_account = 1;
2554
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2555
- current->memcg_kmem_skip_account = 0;
2556
-}
2557
-
2558
-static inline bool memcg_kmem_bypass(void)
2559
-{
2560
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2561
- return true;
2562
- return false;
2563
-}
2564
-
25653077 /**
2566
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2567
- * @cachep: the original global kmem cache
2568
- *
2569
- * Return the kmem_cache we're supposed to use for a slab allocation.
2570
- * We try to use the current memcg's version of the cache.
2571
- *
2572
- * If the cache does not exist yet, if we are the first user of it, we
2573
- * create it asynchronously in a workqueue and let the current allocation
2574
- * go through with the original cache.
2575
- *
2576
- * This function takes a reference to the cache it returns to assure it
2577
- * won't get destroyed while we are working with it. Once the caller is
2578
- * done with it, memcg_kmem_put_cache() must be called to release the
2579
- * reference.
2580
- */
2581
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2582
-{
2583
- struct mem_cgroup *memcg;
2584
- struct kmem_cache *memcg_cachep;
2585
- int kmemcg_id;
2586
-
2587
- VM_BUG_ON(!is_root_cache(cachep));
2588
-
2589
- if (memcg_kmem_bypass())
2590
- return cachep;
2591
-
2592
- if (current->memcg_kmem_skip_account)
2593
- return cachep;
2594
-
2595
- memcg = get_mem_cgroup_from_current();
2596
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2597
- if (kmemcg_id < 0)
2598
- goto out;
2599
-
2600
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2601
- if (likely(memcg_cachep))
2602
- return memcg_cachep;
2603
-
2604
- /*
2605
- * If we are in a safe context (can wait, and not in interrupt
2606
- * context), we could be be predictable and return right away.
2607
- * This would guarantee that the allocation being performed
2608
- * already belongs in the new cache.
2609
- *
2610
- * However, there are some clashes that can arrive from locking.
2611
- * For instance, because we acquire the slab_mutex while doing
2612
- * memcg_create_kmem_cache, this means no further allocation
2613
- * could happen with the slab_mutex held. So it's better to
2614
- * defer everything.
2615
- */
2616
- memcg_schedule_kmem_cache_create(memcg, cachep);
2617
-out:
2618
- css_put(&memcg->css);
2619
- return cachep;
2620
-}
2621
-
2622
-/**
2623
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2624
- * @cachep: the cache returned by memcg_kmem_get_cache
2625
- */
2626
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2627
-{
2628
- if (!is_root_cache(cachep))
2629
- css_put(&cachep->memcg_params.memcg->css);
2630
-}
2631
-
2632
-/**
2633
- * memcg_kmem_charge_memcg: charge a kmem page
2634
- * @page: page to charge
2635
- * @gfp: reclaim mode
2636
- * @order: allocation order
3078
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26373079 * @memcg: memory cgroup to charge
3080
+ * @gfp: reclaim mode
3081
+ * @nr_pages: number of pages to charge
26383082 *
26393083 * Returns 0 on success, an error code on failure.
26403084 */
2641
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2642
- struct mem_cgroup *memcg)
3085
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3086
+ unsigned int nr_pages)
26433087 {
2644
- unsigned int nr_pages = 1 << order;
26453088 struct page_counter *counter;
26463089 int ret;
26473090
....@@ -2664,43 +3107,54 @@
26643107 cancel_charge(memcg, nr_pages);
26653108 return -ENOMEM;
26663109 }
2667
-
2668
- page->mem_cgroup = memcg;
2669
-
26703110 return 0;
26713111 }
26723112
26733113 /**
2674
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3114
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3115
+ * @memcg: memcg to uncharge
3116
+ * @nr_pages: number of pages to uncharge
3117
+ */
3118
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3119
+{
3120
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3121
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3122
+
3123
+ refill_stock(memcg, nr_pages);
3124
+}
3125
+
3126
+/**
3127
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26753128 * @page: page to charge
26763129 * @gfp: reclaim mode
26773130 * @order: allocation order
26783131 *
26793132 * Returns 0 on success, an error code on failure.
26803133 */
2681
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3134
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26823135 {
26833136 struct mem_cgroup *memcg;
26843137 int ret = 0;
26853138
2686
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2687
- return 0;
2688
-
26893139 memcg = get_mem_cgroup_from_current();
2690
- if (!mem_cgroup_is_root(memcg)) {
2691
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2692
- if (!ret)
3140
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3141
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3142
+ if (!ret) {
3143
+ page->mem_cgroup = memcg;
26933144 __SetPageKmemcg(page);
3145
+ return 0;
3146
+ }
3147
+ css_put(&memcg->css);
26943148 }
2695
- css_put(&memcg->css);
26963149 return ret;
26973150 }
3151
+
26983152 /**
2699
- * memcg_kmem_uncharge: uncharge a kmem page
3153
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
27003154 * @page: page to uncharge
27013155 * @order: allocation order
27023156 */
2703
-void memcg_kmem_uncharge(struct page *page, int order)
3157
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27043158 {
27053159 struct mem_cgroup *memcg = page->mem_cgroup;
27063160 unsigned int nr_pages = 1 << order;
....@@ -2709,43 +3163,179 @@
27093163 return;
27103164
27113165 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2712
-
2713
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2714
- page_counter_uncharge(&memcg->kmem, nr_pages);
2715
-
2716
- page_counter_uncharge(&memcg->memory, nr_pages);
2717
- if (do_memsw_account())
2718
- page_counter_uncharge(&memcg->memsw, nr_pages);
2719
-
3166
+ __memcg_kmem_uncharge(memcg, nr_pages);
27203167 page->mem_cgroup = NULL;
3168
+ css_put(&memcg->css);
27213169
27223170 /* slab pages do not have PageKmemcg flag set */
27233171 if (PageKmemcg(page))
27243172 __ClearPageKmemcg(page);
2725
-
2726
- css_put_many(&memcg->css, nr_pages);
27273173 }
2728
-#endif /* CONFIG_MEMCG_KMEM */
27293174
2730
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2731
-
2732
-/*
2733
- * Because tail pages are not marked as "used", set it. We're under
2734
- * zone_lru_lock and migration entries setup in all page mappings.
2735
- */
2736
-void mem_cgroup_split_huge_fixup(struct page *head)
3175
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27373176 {
2738
- int i;
3177
+ struct memcg_stock_pcp *stock;
3178
+ unsigned long flags;
3179
+ bool ret = false;
27393180
2740
- if (mem_cgroup_disabled())
3181
+ local_lock_irqsave(&memcg_stock.lock, flags);
3182
+
3183
+ stock = this_cpu_ptr(&memcg_stock);
3184
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3185
+ stock->nr_bytes -= nr_bytes;
3186
+ ret = true;
3187
+ }
3188
+
3189
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3190
+
3191
+ return ret;
3192
+}
3193
+
3194
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3195
+{
3196
+ struct obj_cgroup *old = stock->cached_objcg;
3197
+
3198
+ if (!old)
27413199 return;
27423200
2743
- for (i = 1; i < HPAGE_PMD_NR; i++)
2744
- head[i].mem_cgroup = head->mem_cgroup;
3201
+ if (stock->nr_bytes) {
3202
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3203
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27453204
2746
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3205
+ if (nr_pages) {
3206
+ struct mem_cgroup *memcg;
3207
+
3208
+ rcu_read_lock();
3209
+retry:
3210
+ memcg = obj_cgroup_memcg(old);
3211
+ if (unlikely(!css_tryget(&memcg->css)))
3212
+ goto retry;
3213
+ rcu_read_unlock();
3214
+
3215
+ __memcg_kmem_uncharge(memcg, nr_pages);
3216
+ css_put(&memcg->css);
3217
+ }
3218
+
3219
+ /*
3220
+ * The leftover is flushed to the centralized per-memcg value.
3221
+ * On the next attempt to refill obj stock it will be moved
3222
+ * to a per-cpu stock (probably, on another CPU), see
3223
+ * refill_obj_stock().
3224
+ *
3225
+ * How often it's flushed is a trade-off between the memory
3226
+ * limit enforcement accuracy and potential CPU contention,
3227
+ * so it might be changed in the future.
3228
+ */
3229
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3230
+ stock->nr_bytes = 0;
3231
+ }
3232
+
3233
+ obj_cgroup_put(old);
3234
+ stock->cached_objcg = NULL;
27473235 }
2748
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3236
+
3237
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3238
+ struct mem_cgroup *root_memcg)
3239
+{
3240
+ struct mem_cgroup *memcg;
3241
+
3242
+ if (stock->cached_objcg) {
3243
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3244
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3245
+ return true;
3246
+ }
3247
+
3248
+ return false;
3249
+}
3250
+
3251
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3252
+{
3253
+ struct memcg_stock_pcp *stock;
3254
+ unsigned long flags;
3255
+
3256
+ local_lock_irqsave(&memcg_stock.lock, flags);
3257
+
3258
+ stock = this_cpu_ptr(&memcg_stock);
3259
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3260
+ drain_obj_stock(stock);
3261
+ obj_cgroup_get(objcg);
3262
+ stock->cached_objcg = objcg;
3263
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3264
+ }
3265
+ stock->nr_bytes += nr_bytes;
3266
+
3267
+ if (stock->nr_bytes > PAGE_SIZE)
3268
+ drain_obj_stock(stock);
3269
+
3270
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3271
+}
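
Taken together, consume_obj_stock(), drain_obj_stock() and refill_obj_stock() form a per-cpu byte cache in front of the page-granular charge path. The single-threaded model below shows the shape of that behaviour; it is a userland sketch with made-up types, and locking, RCU and reference counting are left out entirely.

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct stock {
	int owner;			/* stands in for cached_objcg */
	unsigned long nr_bytes;
};

static unsigned long charged_pages[2];	/* pages charged per "objcg" */
static unsigned long central_bytes[2];	/* stands in for nr_charged_bytes */

static int consume(struct stock *s, int owner, unsigned long nr_bytes)
{
	if (s->owner == owner && s->nr_bytes >= nr_bytes) {
		s->nr_bytes -= nr_bytes;
		return 1;
	}
	return 0;
}

static void drain(struct stock *s)
{
	unsigned long pages = s->nr_bytes / PAGE_SIZE;

	if (pages)
		charged_pages[s->owner] -= pages;	/* give pages back */
	central_bytes[s->owner] += s->nr_bytes % PAGE_SIZE; /* flush leftover */
	s->nr_bytes = 0;
}

static void refill(struct stock *s, int owner, unsigned long nr_bytes)
{
	if (s->owner != owner) {	/* reset if cached for someone else */
		drain(s);
		s->owner = owner;
		s->nr_bytes = central_bytes[owner];	/* like atomic_xchg(.., 0) */
		central_bytes[owner] = 0;
	}
	s->nr_bytes += nr_bytes;
	if (s->nr_bytes > PAGE_SIZE)
		drain(s);
}

int main(void)
{
	struct stock s = { .owner = 0 };
	int hit;

	charged_pages[0] = 8;		/* pages charged up front */
	refill(&s, 0, 300);		/* small frees pile up locally */
	hit = consume(&s, 0, 200);	/* a later sub-page charge hits the stock */
	refill(&s, 0, 5000);		/* big refill drains whole pages back */
	printf("hit=%d pages=%lu stock=%lu central=%lu\n",
	       hit, charged_pages[0], s.nr_bytes, central_bytes[0]);
	return 0;
}
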
3272
+
3273
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3274
+{
3275
+ struct mem_cgroup *memcg;
3276
+ unsigned int nr_pages, nr_bytes;
3277
+ int ret;
3278
+
3279
+ if (consume_obj_stock(objcg, size))
3280
+ return 0;
3281
+
3282
+ /*
3283
+ * In theory, memcg->nr_charged_bytes can have enough
3284
+ * pre-charged bytes to satisfy the allocation. However,
3285
+ * flushing memcg->nr_charged_bytes requires two atomic
3286
+ * operations, and memcg->nr_charged_bytes can't be big,
3287
+ * so it's better to ignore it and try to grab some new pages.
3288
+ * memcg->nr_charged_bytes will be flushed in
3289
+ * refill_obj_stock(), called from this function or
3290
+ * independently later.
3291
+ */
3292
+ rcu_read_lock();
3293
+retry:
3294
+ memcg = obj_cgroup_memcg(objcg);
3295
+ if (unlikely(!css_tryget(&memcg->css)))
3296
+ goto retry;
3297
+ rcu_read_unlock();
3298
+
3299
+ nr_pages = size >> PAGE_SHIFT;
3300
+ nr_bytes = size & (PAGE_SIZE - 1);
3301
+
3302
+ if (nr_bytes)
3303
+ nr_pages += 1;
3304
+
3305
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3306
+ if (!ret && nr_bytes)
3307
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3308
+
3309
+ css_put(&memcg->css);
3310
+ return ret;
3311
+}
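
The rounding obj_cgroup_charge() performs on a sub-page request can be seen in isolation below (userland C, assuming a 4 KiB page): whole pages are charged, and the unused tail of the last page is immediately handed back to the stock.

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
	size_t size = 700;			/* an arbitrary sub-page object size */
	size_t nr_pages = size >> PAGE_SHIFT;
	size_t nr_bytes = size & (PAGE_SIZE - 1);

	if (nr_bytes)
		nr_pages += 1;			/* charge a whole extra page... */

	printf("charge %zu page(s), refill stock with %zu bytes\n",
	       nr_pages, nr_bytes ? PAGE_SIZE - nr_bytes : 0);
	return 0;
}
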
3312
+
3313
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3314
+{
3315
+ refill_obj_stock(objcg, size);
3316
+}
3317
+
3318
+#endif /* CONFIG_MEMCG_KMEM */
3319
+
3320
+/*
3321
+ * Because head->mem_cgroup is not set on tails, set it now.
3322
+ */
3323
+void split_page_memcg(struct page *head, unsigned int nr)
3324
+{
3325
+ struct mem_cgroup *memcg = head->mem_cgroup;
3326
+ int kmemcg = PageKmemcg(head);
3327
+ int i;
3328
+
3329
+ if (mem_cgroup_disabled() || !memcg)
3330
+ return;
3331
+
3332
+ for (i = 1; i < nr; i++) {
3333
+ head[i].mem_cgroup = memcg;
3334
+ if (kmemcg)
3335
+ __SetPageKmemcg(head + i);
3336
+ }
3337
+ css_get_many(&memcg->css, nr - 1);
3338
+}
27493339
27503340 #ifdef CONFIG_MEMCG_SWAP
27513341 /**
....@@ -2807,7 +3397,7 @@
28073397 * Make sure that the new limit (memsw or memory limit) doesn't
28083398 * break our basic invariant rule memory.max <= memsw.max.
28093399 */
2810
- limits_invariant = memsw ? max >= memcg->memory.max :
3400
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28113401 max <= memcg->memsw.max;
28123402 if (!limits_invariant) {
28133403 mutex_unlock(&memcg_max_mutex);
....@@ -2928,7 +3518,7 @@
29283518 * Test whether @memcg has children, dead or alive. Note that this
29293519 * function doesn't care whether @memcg has use_hierarchy enabled and
29303520 * returns %true if there are child csses according to the cgroup
2931
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3521
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29323522 */
29333523 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29343524 {
....@@ -2947,7 +3537,7 @@
29473537 */
29483538 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29493539 {
2950
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3540
+ int nr_retries = MAX_RECLAIM_RETRIES;
29513541
29523542 /* we call try-to-free pages for make this cgroup empty */
29533543 lru_add_drain_all();
....@@ -3021,50 +3611,15 @@
30213611 return retval;
30223612 }
30233613
3024
-struct accumulated_stats {
3025
- unsigned long stat[MEMCG_NR_STAT];
3026
- unsigned long events[NR_VM_EVENT_ITEMS];
3027
- unsigned long lru_pages[NR_LRU_LISTS];
3028
- const unsigned int *stats_array;
3029
- const unsigned int *events_array;
3030
- int stats_size;
3031
- int events_size;
3032
-};
3033
-
3034
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3035
- struct accumulated_stats *acc)
3036
-{
3037
- struct mem_cgroup *mi;
3038
- int i;
3039
-
3040
- for_each_mem_cgroup_tree(mi, memcg) {
3041
- for (i = 0; i < acc->stats_size; i++)
3042
- acc->stat[i] += memcg_page_state(mi,
3043
- acc->stats_array ? acc->stats_array[i] : i);
3044
-
3045
- for (i = 0; i < acc->events_size; i++)
3046
- acc->events[i] += memcg_sum_events(mi,
3047
- acc->events_array ? acc->events_array[i] : i);
3048
-
3049
- for (i = 0; i < NR_LRU_LISTS; i++)
3050
- acc->lru_pages[i] +=
3051
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3052
- }
3053
-}
3054
-
30553614 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30563615 {
3057
- unsigned long val = 0;
3616
+ unsigned long val;
30583617
30593618 if (mem_cgroup_is_root(memcg)) {
3060
- struct mem_cgroup *iter;
3061
-
3062
- for_each_mem_cgroup_tree(iter, memcg) {
3063
- val += memcg_page_state(iter, MEMCG_CACHE);
3064
- val += memcg_page_state(iter, MEMCG_RSS);
3065
- if (swap)
3066
- val += memcg_page_state(iter, MEMCG_SWAP);
3067
- }
3619
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3620
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3621
+ if (swap)
3622
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30683623 } else {
30693624 if (!swap)
30703625 val = page_counter_read(&memcg->memory);
....@@ -3125,9 +3680,61 @@
31253680 }
31263681 }
31273682
3683
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3684
+{
3685
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3686
+ struct mem_cgroup *mi;
3687
+ int node, cpu, i;
3688
+
3689
+ for_each_online_cpu(cpu)
3690
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3691
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3692
+
3693
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3694
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3695
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3696
+
3697
+ for_each_node(node) {
3698
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3699
+ struct mem_cgroup_per_node *pi;
3700
+
3701
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3702
+ stat[i] = 0;
3703
+
3704
+ for_each_online_cpu(cpu)
3705
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3706
+ stat[i] += per_cpu(
3707
+ pn->lruvec_stat_cpu->count[i], cpu);
3708
+
3709
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3710
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3711
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3712
+ }
3713
+}
3714
+
3715
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3716
+{
3717
+ unsigned long events[NR_VM_EVENT_ITEMS];
3718
+ struct mem_cgroup *mi;
3719
+ int cpu, i;
3720
+
3721
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3722
+ events[i] = 0;
3723
+
3724
+ for_each_online_cpu(cpu)
3725
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3726
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3727
+ cpu);
3728
+
3729
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3730
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3731
+ atomic_long_add(events[i], &mi->vmevents[i]);
3732
+}
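
Both flush helpers above follow the same two-step shape: collapse the per-cpu copies into a single total, then push that total into this cgroup and every ancestor. A compact userland sketch with illustrative types, not the kernel's:

#include <stdio.h>

#define NR_CPUS  4
#define NR_STATS 3

struct group {
	struct group *parent;
	long percpu[NR_CPUS][NR_STATS];
	long vmstats[NR_STATS];
};

static void flush(struct group *g)
{
	long stat[NR_STATS] = { 0 };

	/* step 1: sum each counter over all "CPUs" */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int i = 0; i < NR_STATS; i++)
			stat[i] += g->percpu[cpu][i];

	/* step 2: propagate the totals up the hierarchy */
	for (struct group *a = g; a; a = a->parent)
		for (int i = 0; i < NR_STATS; i++)
			a->vmstats[i] += stat[i];
}

int main(void)
{
	struct group root = { 0 }, child = { .parent = &root };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	flush(&child);
	printf("child=%ld root=%ld\n", child.vmstats[0], root.vmstats[0]);
	return 0;
}
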
3733
+
31283734 #ifdef CONFIG_MEMCG_KMEM
31293735 static int memcg_online_kmem(struct mem_cgroup *memcg)
31303736 {
3737
+ struct obj_cgroup *objcg;
31313738 int memcg_id;
31323739
31333740 if (cgroup_memory_nokmem)
....@@ -3140,7 +3747,16 @@
31403747 if (memcg_id < 0)
31413748 return memcg_id;
31423749
3143
- static_branch_inc(&memcg_kmem_enabled_key);
3750
+ objcg = obj_cgroup_alloc();
3751
+ if (!objcg) {
3752
+ memcg_free_cache_id(memcg_id);
3753
+ return -ENOMEM;
3754
+ }
3755
+ objcg->memcg = memcg;
3756
+ rcu_assign_pointer(memcg->objcg, objcg);
3757
+
3758
+ static_branch_enable(&memcg_kmem_enabled_key);
3759
+
31443760 /*
31453761 * A memory cgroup is considered kmem-online as soon as it gets
31463762 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3149,7 +3765,6 @@
31493765 */
31503766 memcg->kmemcg_id = memcg_id;
31513767 memcg->kmem_state = KMEM_ONLINE;
3152
- INIT_LIST_HEAD(&memcg->kmem_caches);
31533768
31543769 return 0;
31553770 }
....@@ -3162,22 +3777,17 @@
31623777
31633778 if (memcg->kmem_state != KMEM_ONLINE)
31643779 return;
3165
- /*
3166
- * Clear the online state before clearing memcg_caches array
3167
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3168
- * guarantees that no cache will be created for this cgroup
3169
- * after we are done (see memcg_create_kmem_cache()).
3170
- */
3780
+
31713781 memcg->kmem_state = KMEM_ALLOCATED;
3172
-
3173
- memcg_deactivate_kmem_caches(memcg);
3174
-
3175
- kmemcg_id = memcg->kmemcg_id;
3176
- BUG_ON(kmemcg_id < 0);
31773782
31783783 parent = parent_mem_cgroup(memcg);
31793784 if (!parent)
31803785 parent = root_mem_cgroup;
3786
+
3787
+ memcg_reparent_objcgs(memcg, parent);
3788
+
3789
+ kmemcg_id = memcg->kmemcg_id;
3790
+ BUG_ON(kmemcg_id < 0);
31813791
31823792 /*
31833793 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3207,12 +3817,6 @@
32073817 /* css_alloc() failed, offlining didn't happen */
32083818 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32093819 memcg_offline_kmem(memcg);
3210
-
3211
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3212
- memcg_destroy_kmem_caches(memcg);
3213
- static_branch_dec(&memcg_kmem_enabled_key);
3214
- WARN_ON(page_counter_read(&memcg->kmem));
3215
- }
32163820 }
32173821 #else
32183822 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3303,6 +3907,9 @@
33033907 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33043908 break;
33053909 case _KMEM:
3910
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3911
+ "Please report your usecase to linux-mm@kvack.org if you "
3912
+ "depend on this functionality.\n");
33063913 ret = memcg_update_kmem_max(memcg, nr_pages);
33073914 break;
33083915 case _TCP:
....@@ -3388,6 +3995,49 @@
33883995 #endif
33893996
33903997 #ifdef CONFIG_NUMA
3998
+
3999
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
4000
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
4001
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
4002
+
4003
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4004
+ int nid, unsigned int lru_mask, bool tree)
4005
+{
4006
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4007
+ unsigned long nr = 0;
4008
+ enum lru_list lru;
4009
+
4010
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4011
+
4012
+ for_each_lru(lru) {
4013
+ if (!(BIT(lru) & lru_mask))
4014
+ continue;
4015
+ if (tree)
4016
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4017
+ else
4018
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4019
+ }
4020
+ return nr;
4021
+}
4022
+
4023
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4024
+ unsigned int lru_mask,
4025
+ bool tree)
4026
+{
4027
+ unsigned long nr = 0;
4028
+ enum lru_list lru;
4029
+
4030
+ for_each_lru(lru) {
4031
+ if (!(BIT(lru) & lru_mask))
4032
+ continue;
4033
+ if (tree)
4034
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4035
+ else
4036
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4037
+ }
4038
+ return nr;
4039
+}
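
The lru_mask convention used by these helpers is plain bit arithmetic: one bit per LRU list, summed only where the bit is set. A self-contained userland example follows; the enum order tracks lru_list, everything else is illustrative.

#include <stdio.h>

enum lru_list {
	INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE,
	NR_LRU_LISTS
};

#define BIT(n)		(1U << (n))
#define LRU_ALL_FILE	(BIT(INACTIVE_FILE) | BIT(ACTIVE_FILE))
#define LRU_ALL_ANON	(BIT(INACTIVE_ANON) | BIT(ACTIVE_ANON))
#define LRU_ALL		((1U << NR_LRU_LISTS) - 1)

static unsigned long nr_lru_pages(const unsigned long counts[NR_LRU_LISTS],
				  unsigned int lru_mask)
{
	unsigned long nr = 0;

	for (int lru = 0; lru < NR_LRU_LISTS; lru++)
		if (BIT(lru) & lru_mask)
			nr += counts[lru];
	return nr;
}

int main(void)
{
	unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

	printf("file=%lu anon=%lu all=%lu\n",
	       nr_lru_pages(counts, LRU_ALL_FILE),
	       nr_lru_pages(counts, LRU_ALL_ANON),
	       nr_lru_pages(counts, LRU_ALL));
	return 0;
}
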
4040
+
33914041 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33924042 {
33934043 struct numa_stat {
....@@ -3403,40 +4053,60 @@
34034053 };
34044054 const struct numa_stat *stat;
34054055 int nid;
3406
- unsigned long nr;
3407
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4056
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34084057
34094058 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3410
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3411
- seq_printf(m, "%s=%lu", stat->name, nr);
3412
- for_each_node_state(nid, N_MEMORY) {
3413
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3414
- stat->lru_mask);
3415
- seq_printf(m, " N%d=%lu", nid, nr);
3416
- }
4059
+ seq_printf(m, "%s=%lu", stat->name,
4060
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4061
+ false));
4062
+ for_each_node_state(nid, N_MEMORY)
4063
+ seq_printf(m, " N%d=%lu", nid,
4064
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4065
+ stat->lru_mask, false));
34174066 seq_putc(m, '\n');
34184067 }
34194068
34204069 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3421
- struct mem_cgroup *iter;
34224070
3423
- nr = 0;
3424
- for_each_mem_cgroup_tree(iter, memcg)
3425
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3426
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3427
- for_each_node_state(nid, N_MEMORY) {
3428
- nr = 0;
3429
- for_each_mem_cgroup_tree(iter, memcg)
3430
- nr += mem_cgroup_node_nr_lru_pages(
3431
- iter, nid, stat->lru_mask);
3432
- seq_printf(m, " N%d=%lu", nid, nr);
3433
- }
4071
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4072
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4073
+ true));
4074
+ for_each_node_state(nid, N_MEMORY)
4075
+ seq_printf(m, " N%d=%lu", nid,
4076
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4077
+ stat->lru_mask, true));
34344078 seq_putc(m, '\n');
34354079 }
34364080
34374081 return 0;
34384082 }
34394083 #endif /* CONFIG_NUMA */
4084
+
4085
+static const unsigned int memcg1_stats[] = {
4086
+ NR_FILE_PAGES,
4087
+ NR_ANON_MAPPED,
4088
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4089
+ NR_ANON_THPS,
4090
+#endif
4091
+ NR_SHMEM,
4092
+ NR_FILE_MAPPED,
4093
+ NR_FILE_DIRTY,
4094
+ NR_WRITEBACK,
4095
+ MEMCG_SWAP,
4096
+};
4097
+
4098
+static const char *const memcg1_stat_names[] = {
4099
+ "cache",
4100
+ "rss",
4101
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4102
+ "rss_huge",
4103
+#endif
4104
+ "shmem",
4105
+ "mapped_file",
4106
+ "dirty",
4107
+ "writeback",
4108
+ "swap",
4109
+};
34404110
34414111 /* Universal VM events cgroup1 shows, original sort order */
34424112 static const unsigned int memcg1_events[] = {
....@@ -3446,45 +4116,42 @@
34464116 PGMAJFAULT,
34474117 };
34484118
3449
-static const char *const memcg1_event_names[] = {
3450
- "pgpgin",
3451
- "pgpgout",
3452
- "pgfault",
3453
- "pgmajfault",
3454
-};
3455
-
34564119 static int memcg_stat_show(struct seq_file *m, void *v)
34574120 {
3458
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4121
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34594122 unsigned long memory, memsw;
34604123 struct mem_cgroup *mi;
34614124 unsigned int i;
3462
- struct accumulated_stats acc;
34634125
34644126 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3465
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34664127
34674128 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4129
+ unsigned long nr;
4130
+
34684131 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34694132 continue;
3470
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3471
- memcg_page_state(memcg, memcg1_stats[i]) *
3472
- PAGE_SIZE);
4133
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4134
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4135
+ if (memcg1_stats[i] == NR_ANON_THPS)
4136
+ nr *= HPAGE_PMD_NR;
4137
+#endif
4138
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34734139 }
34744140
34754141 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3476
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3477
- memcg_sum_events(memcg, memcg1_events[i]));
4142
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4143
+ memcg_events_local(memcg, memcg1_events[i]));
34784144
34794145 for (i = 0; i < NR_LRU_LISTS; i++)
3480
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3481
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4146
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4147
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4148
+ PAGE_SIZE);
34824149
34834150 /* Hierarchical information */
34844151 memory = memsw = PAGE_COUNTER_MAX;
34854152 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3486
- memory = min(memory, mi->memory.max);
3487
- memsw = min(memsw, mi->memsw.max);
4153
+ memory = min(memory, READ_ONCE(mi->memory.max));
4154
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34884155 }
34894156 seq_printf(m, "hierarchical_memory_limit %llu\n",
34904157 (u64)memory * PAGE_SIZE);
....@@ -3492,49 +4159,45 @@
34924159 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34934160 (u64)memsw * PAGE_SIZE);
34944161
3495
- memset(&acc, 0, sizeof(acc));
3496
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3497
- acc.stats_array = memcg1_stats;
3498
- acc.events_size = ARRAY_SIZE(memcg1_events);
3499
- acc.events_array = memcg1_events;
3500
- accumulate_memcg_tree(memcg, &acc);
3501
-
35024162 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4163
+ unsigned long nr;
4164
+
35034165 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35044166 continue;
4167
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4168
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4169
+ if (memcg1_stats[i] == NR_ANON_THPS)
4170
+ nr *= HPAGE_PMD_NR;
4171
+#endif
35054172 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3506
- (u64)acc.stat[i] * PAGE_SIZE);
4173
+ (u64)nr * PAGE_SIZE);
35074174 }
35084175
35094176 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3510
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3511
- (u64)acc.events[i]);
4177
+ seq_printf(m, "total_%s %llu\n",
4178
+ vm_event_name(memcg1_events[i]),
4179
+ (u64)memcg_events(memcg, memcg1_events[i]));
35124180
35134181 for (i = 0; i < NR_LRU_LISTS; i++)
3514
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3515
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4182
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4183
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4184
+ PAGE_SIZE);
35164185
35174186 #ifdef CONFIG_DEBUG_VM
35184187 {
35194188 pg_data_t *pgdat;
35204189 struct mem_cgroup_per_node *mz;
3521
- struct zone_reclaim_stat *rstat;
3522
- unsigned long recent_rotated[2] = {0, 0};
3523
- unsigned long recent_scanned[2] = {0, 0};
4190
+ unsigned long anon_cost = 0;
4191
+ unsigned long file_cost = 0;
35244192
35254193 for_each_online_pgdat(pgdat) {
35264194 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3527
- rstat = &mz->lruvec.reclaim_stat;
35284195
3529
- recent_rotated[0] += rstat->recent_rotated[0];
3530
- recent_rotated[1] += rstat->recent_rotated[1];
3531
- recent_scanned[0] += rstat->recent_scanned[0];
3532
- recent_scanned[1] += rstat->recent_scanned[1];
4196
+ anon_cost += mz->lruvec.anon_cost;
4197
+ file_cost += mz->lruvec.file_cost;
35334198 }
3534
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3535
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3536
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3537
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4199
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4200
+ seq_printf(m, "file_cost %lu\n", file_cost);
35384201 }
35394202 #endif
35404203
....@@ -3693,8 +4356,7 @@
36934356 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36944357
36954358 /* Allocate memory for new array of thresholds */
3696
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3697
- GFP_KERNEL);
4359
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36984360 if (!new) {
36994361 ret = -ENOMEM;
37004362 goto unlock;
....@@ -3702,17 +4364,16 @@
37024364 new->size = size;
37034365
37044366 /* Copy thresholds (if any) to new array */
3705
- if (thresholds->primary) {
3706
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3707
- sizeof(struct mem_cgroup_threshold));
3708
- }
4367
+ if (thresholds->primary)
4368
+ memcpy(new->entries, thresholds->primary->entries,
4369
+ flex_array_size(new, entries, size - 1));
37094370
37104371 /* Add new threshold */
37114372 new->entries[size - 1].eventfd = eventfd;
37124373 new->entries[size - 1].threshold = threshold;
37134374
37144375 /* Sort thresholds. Registering of new threshold isn't time-critical */
3715
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4376
+ sort(new->entries, size, sizeof(*new->entries),
37164377 compare_thresholds, NULL);
37174378
37184379 /* Find current threshold */
....@@ -3894,7 +4555,7 @@
38944555
38954556 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38964557 {
3897
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4558
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38984559
38994560 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
39004561 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3920,6 +4581,8 @@
39204581 }
39214582
39224583 #ifdef CONFIG_CGROUP_WRITEBACK
4584
+
4585
+#include <trace/events/writeback.h>
39234586
39244587 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39254588 {
....@@ -3952,11 +4615,11 @@
39524615 */
39534616 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39544617 {
3955
- long x = atomic_long_read(&memcg->stat[idx]);
4618
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39564619 int cpu;
39574620
39584621 for_each_online_cpu(cpu)
3959
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4622
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39604623 if (x < 0)
39614624 x = 0;
39624625 return x;
....@@ -3989,18 +4652,142 @@
39894652
39904653 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39914654
3992
- /* this should eventually include NR_UNSTABLE_NFS */
39934655 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3994
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3995
- (1 << LRU_ACTIVE_FILE));
4656
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4657
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39964658 *pheadroom = PAGE_COUNTER_MAX;
39974659
39984660 while ((parent = parent_mem_cgroup(memcg))) {
3999
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4661
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4662
+ READ_ONCE(memcg->memory.high));
40004663 unsigned long used = page_counter_read(&memcg->memory);
40014664
40024665 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40034666 memcg = parent;
4667
+ }
4668
+}
4669
+
4670
+/*
4671
+ * Foreign dirty flushing
4672
+ *
4673
+ * There's an inherent mismatch between memcg and writeback. The former
4674
+ * tracks ownership per-page while the latter per-inode. This was a
4675
+ * deliberate design decision because honoring per-page ownership in the
4676
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4677
+ * and deemed unnecessary given that write-sharing an inode across
4678
+ * different cgroups isn't a common use-case.
4679
+ *
4680
+ * Combined with inode majority-writer ownership switching, this works well
4681
+ * enough in most cases but there are some pathological cases. For
4682
+ * example, let's say there are two cgroups A and B which keep writing to
4683
+ * different but confined parts of the same inode. B owns the inode and
4684
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4685
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4686
+ * triggering background writeback. A will be slowed down without a way to
4687
+ * make writeback of the dirty pages happen.
4688
+ *
4689
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4690
+ * severely throttled after making some progress after each
4691
+ * dirty_expire_interval while the underlying IO device is almost
4692
+ * completely idle.
4693
+ *
4694
+ * Solving this problem completely requires matching the ownership tracking
4695
+ * granularities between memcg and writeback in either direction. However,
4696
+ * the more egregious behaviors can be avoided by simply remembering the
4697
+ * most recent foreign dirtying events and initiating remote flushes on
4698
+ * them when local writeback isn't enough to keep the memory clean enough.
4699
+ *
4700
+ * The following two functions implement such a mechanism. When a foreign
4701
+ * page - a page whose memcg and writeback ownerships don't match - is
4702
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4703
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4704
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4705
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4706
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4707
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4708
+ * limited to MEMCG_CGWB_FRN_CNT.
4709
+ *
4710
+ * The mechanism only remembers IDs and doesn't hold any object references.
4711
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4712
+ * records are lockless and racy.
4713
+ */
4714
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4715
+ struct bdi_writeback *wb)
4716
+{
4717
+ struct mem_cgroup *memcg = page->mem_cgroup;
4718
+ struct memcg_cgwb_frn *frn;
4719
+ u64 now = get_jiffies_64();
4720
+ u64 oldest_at = now;
4721
+ int oldest = -1;
4722
+ int i;
4723
+
4724
+ trace_track_foreign_dirty(page, wb);
4725
+
4726
+ /*
4727
+ * Pick the slot to use. If there is already a slot for @wb, keep
4728
+ * using it. If not replace the oldest one which isn't being
4729
+ * written out.
4730
+ */
4731
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4732
+ frn = &memcg->cgwb_frn[i];
4733
+ if (frn->bdi_id == wb->bdi->id &&
4734
+ frn->memcg_id == wb->memcg_css->id)
4735
+ break;
4736
+ if (time_before64(frn->at, oldest_at) &&
4737
+ atomic_read(&frn->done.cnt) == 1) {
4738
+ oldest = i;
4739
+ oldest_at = frn->at;
4740
+ }
4741
+ }
4742
+
4743
+ if (i < MEMCG_CGWB_FRN_CNT) {
4744
+ /*
4745
+ * Re-using an existing one. Update timestamp lazily to
4746
+ * avoid making the cacheline hot. We want them to be
4747
+ * reasonably up-to-date and significantly shorter than
4748
+ * dirty_expire_interval as that's what expires the record.
4749
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4750
+ */
4751
+ unsigned long update_intv =
4752
+ min_t(unsigned long, HZ,
4753
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4754
+
4755
+ if (time_before64(frn->at, now - update_intv))
4756
+ frn->at = now;
4757
+ } else if (oldest >= 0) {
4758
+ /* replace the oldest free one */
4759
+ frn = &memcg->cgwb_frn[oldest];
4760
+ frn->bdi_id = wb->bdi->id;
4761
+ frn->memcg_id = wb->memcg_css->id;
4762
+ frn->at = now;
4763
+ }
4764
+}
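
The slot-selection policy implemented above can be exercised on its own: reuse the slot that already matches (bdi_id, memcg_id), otherwise overwrite the oldest slot that is not in flight. A userland model follows; it is simplified in that the lazy timestamp refresh and the in-flight tracking via done.cnt are reduced to plain fields.

#include <stdio.h>

#define FRN_CNT 4	/* stands in for MEMCG_CGWB_FRN_CNT */

struct frn {
	int bdi_id, memcg_id;
	unsigned long long at;	/* timestamp, 0 means never used */
	int in_flight;		/* stands in for done.cnt != 1 */
};

static void track(struct frn frns[FRN_CNT], int bdi_id, int memcg_id,
		  unsigned long long now)
{
	int i, oldest = -1;
	unsigned long long oldest_at = now;

	for (i = 0; i < FRN_CNT; i++) {
		if (frns[i].bdi_id == bdi_id && frns[i].memcg_id == memcg_id)
			break;			/* already have a slot for @wb */
		if (frns[i].at < oldest_at && !frns[i].in_flight) {
			oldest = i;
			oldest_at = frns[i].at;
		}
	}
	if (i < FRN_CNT) {
		frns[i].at = now;		/* refresh the existing record */
	} else if (oldest >= 0) {
		frns[oldest].bdi_id = bdi_id;	/* replace the oldest free one */
		frns[oldest].memcg_id = memcg_id;
		frns[oldest].at = now;
	}
}

int main(void)
{
	struct frn frns[FRN_CNT] = { 0 };

	track(frns, 1, 42, 100);
	track(frns, 2, 42, 110);
	track(frns, 1, 42, 120);	/* reuses slot 0 */
	printf("slot0: bdi=%d at=%llu\n", frns[0].bdi_id, frns[0].at);
	return 0;
}
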
4765
+
4766
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4767
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4768
+{
4769
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4770
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4771
+ u64 now = jiffies_64;
4772
+ int i;
4773
+
4774
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4775
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4776
+
4777
+ /*
4778
+ * If the record is older than dirty_expire_interval,
4779
+ * writeback on it has already started. No need to kick it
4780
+ * off again. Also, don't start a new one if there's
4781
+ * already one in flight.
4782
+ */
4783
+ if (time_after64(frn->at, now - intv) &&
4784
+ atomic_read(&frn->done.cnt) == 1) {
4785
+ frn->at = 0;
4786
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4787
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4788
+ WB_REASON_FOREIGN_FLUSH,
4789
+ &frn->done);
4790
+ }
40044791 }
40054792 }
40064793
....@@ -4123,6 +4910,7 @@
41234910 unsigned int efd, cfd;
41244911 struct fd efile;
41254912 struct fd cfile;
4913
+ struct dentry *cdentry;
41264914 const char *name;
41274915 char *endp;
41284916 int ret;
....@@ -4174,6 +4962,16 @@
41744962 goto out_put_cfile;
41754963
41764964 /*
4965
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4966
+ * file can't be renamed, it's safe to access its name afterwards.
4967
+ */
4968
+ cdentry = cfile.file->f_path.dentry;
4969
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4970
+ ret = -EINVAL;
4971
+ goto out_put_cfile;
4972
+ }
4973
+
4974
+ /*
41774975 * Determine the event callbacks and set them in @event. This used
41784976 * to be done via struct cftype but cgroup core no longer knows
41794977 * about these events. The following is crude but the whole thing
....@@ -4181,7 +4979,7 @@
41814979 *
41824980 * DO NOT ADD NEW FILES.
41834981 */
4184
- name = cfile.file->f_path.dentry->d_name.name;
4982
+ name = cdentry->d_name.name;
41854983
41864984 if (!strcmp(name, "memory.usage_in_bytes")) {
41874985 event->register_event = mem_cgroup_usage_register_event;
....@@ -4205,7 +5003,7 @@
42055003 * automatically removed on cgroup destruction but the removal is
42065004 * asynchronous, so take an extra ref on @css.
42075005 */
4208
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5006
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42095007 &memory_cgrp_subsys);
42105008 ret = -EINVAL;
42115009 if (IS_ERR(cfile_css))
....@@ -4340,12 +5138,10 @@
43405138 .write = mem_cgroup_reset,
43415139 .read_u64 = mem_cgroup_read_u64,
43425140 },
4343
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5141
+#if defined(CONFIG_MEMCG_KMEM) && \
5142
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43445143 {
43455144 .name = "kmem.slabinfo",
4346
- .seq_start = memcg_slab_start,
4347
- .seq_next = memcg_slab_next,
4348
- .seq_stop = memcg_slab_stop,
43495145 .seq_show = memcg_slab_show,
43505146 },
43515147 #endif
....@@ -4383,7 +5179,7 @@
43835179 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43845180 * memory-controlled cgroups to 64k.
43855181 *
4386
- * However, there usually are many references to the oflline CSS after
5182
+ * However, there usually are many references to the offline CSS after
43875183 * the cgroup has been destroyed, such as page cache or reclaimable
43885184 * slab objects, that don't need to hang on to the ID. We want to keep
43895185 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4404,31 +5200,26 @@
44045200 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44055201 {
44065202 if (memcg->id.id > 0) {
5203
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44075204 idr_remove(&mem_cgroup_idr, memcg->id.id);
44085205 memcg->id.id = 0;
44095206 }
44105207 }
44115208
4412
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5209
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5210
+ unsigned int n)
44135211 {
4414
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4415
- atomic_add(n, &memcg->id.ref);
5212
+ refcount_add(n, &memcg->id.ref);
44165213 }
44175214
44185215 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44195216 {
4420
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4421
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5217
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44225218 mem_cgroup_id_remove(memcg);
44235219
44245220 /* Memcg ID pins CSS */
44255221 css_put(&memcg->css);
44265222 }
4427
-}
4428
-
4429
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4430
-{
4431
- mem_cgroup_id_get_many(memcg, 1);
44325223 }
44335224
44345225 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4447,6 +5238,7 @@
44475238 WARN_ON_ONCE(!rcu_read_lock_held());
44485239 return idr_find(&mem_cgroup_idr, id);
44495240 }
5241
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44505242
44515243 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44525244 {
....@@ -4466,8 +5258,17 @@
44665258 if (!pn)
44675259 return 1;
44685260
4469
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5261
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5262
+ GFP_KERNEL_ACCOUNT);
5263
+ if (!pn->lruvec_stat_local) {
5264
+ kfree(pn);
5265
+ return 1;
5266
+ }
5267
+
5268
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5269
+ GFP_KERNEL_ACCOUNT);
44705270 if (!pn->lruvec_stat_cpu) {
5271
+ free_percpu(pn->lruvec_stat_local);
44715272 kfree(pn);
44725273 return 1;
44735274 }
....@@ -4489,6 +5290,7 @@
44895290 return;
44905291
44915292 free_percpu(pn->lruvec_stat_cpu);
5293
+ free_percpu(pn->lruvec_stat_local);
44925294 kfree(pn);
44935295 }
44945296
....@@ -4496,39 +5298,57 @@
44965298 {
44975299 int node;
44985300
5301
+ trace_android_vh_mem_cgroup_free(memcg);
44995302 for_each_node(node)
45005303 free_mem_cgroup_per_node_info(memcg, node);
4501
- free_percpu(memcg->stat_cpu);
5304
+ free_percpu(memcg->vmstats_percpu);
5305
+ free_percpu(memcg->vmstats_local);
45025306 kfree(memcg);
45035307 }
45045308
45055309 static void mem_cgroup_free(struct mem_cgroup *memcg)
45065310 {
45075311 memcg_wb_domain_exit(memcg);
5312
+ /*
5313
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5314
+ * on parent's and all ancestor levels.
5315
+ */
5316
+ memcg_flush_percpu_vmstats(memcg);
5317
+ memcg_flush_percpu_vmevents(memcg);
45085318 __mem_cgroup_free(memcg);
45095319 }
45105320
45115321 static struct mem_cgroup *mem_cgroup_alloc(void)
45125322 {
45135323 struct mem_cgroup *memcg;
4514
- size_t size;
5324
+ unsigned int size;
45155325 int node;
5326
+ int __maybe_unused i;
5327
+ long error = -ENOMEM;
45165328
45175329 size = sizeof(struct mem_cgroup);
45185330 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45195331
45205332 memcg = kzalloc(size, GFP_KERNEL);
45215333 if (!memcg)
4522
- return NULL;
5334
+ return ERR_PTR(error);
45235335
45245336 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45255337 1, MEM_CGROUP_ID_MAX,
45265338 GFP_KERNEL);
4527
- if (memcg->id.id < 0)
5339
+ if (memcg->id.id < 0) {
5340
+ error = memcg->id.id;
5341
+ goto fail;
5342
+ }
5343
+
5344
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5345
+ GFP_KERNEL_ACCOUNT);
5346
+ if (!memcg->vmstats_local)
45285347 goto fail;
45295348
4530
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4531
- if (!memcg->stat_cpu)
5349
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5350
+ GFP_KERNEL_ACCOUNT);
5351
+ if (!memcg->vmstats_percpu)
45325352 goto fail;
45335353
45345354 for_each_node(node)
....@@ -4539,7 +5359,6 @@
45395359 goto fail;
45405360
45415361 INIT_WORK(&memcg->high_work, high_work_func);
4542
- memcg->last_scanned_node = MAX_NUMNODES;
45435362 INIT_LIST_HEAD(&memcg->oom_notify);
45445363 mutex_init(&memcg->thresholds_lock);
45455364 spin_lock_init(&memcg->move_lock);
....@@ -4549,48 +5368,64 @@
45495368 memcg->socket_pressure = jiffies;
45505369 #ifdef CONFIG_MEMCG_KMEM
45515370 memcg->kmemcg_id = -1;
5371
+ INIT_LIST_HEAD(&memcg->objcg_list);
45525372 #endif
45535373 #ifdef CONFIG_CGROUP_WRITEBACK
45545374 INIT_LIST_HEAD(&memcg->cgwb_list);
5375
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5376
+ memcg->cgwb_frn[i].done =
5377
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5378
+#endif
5379
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5380
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5381
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5382
+ memcg->deferred_split_queue.split_queue_len = 0;
45555383 #endif
45565384 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5385
+ trace_android_vh_mem_cgroup_alloc(memcg);
45575386 return memcg;
45585387 fail:
45595388 mem_cgroup_id_remove(memcg);
45605389 __mem_cgroup_free(memcg);
4561
- return NULL;
5390
+ return ERR_PTR(error);
45625391 }
45635392
45645393 static struct cgroup_subsys_state * __ref
45655394 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45665395 {
45675396 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4568
- struct mem_cgroup *memcg;
5397
+ struct mem_cgroup *memcg, *old_memcg;
45695398 long error = -ENOMEM;
45705399
5400
+ old_memcg = set_active_memcg(parent);
45715401 memcg = mem_cgroup_alloc();
4572
- if (!memcg)
4573
- return ERR_PTR(error);
5402
+ set_active_memcg(old_memcg);
5403
+ if (IS_ERR(memcg))
5404
+ return ERR_CAST(memcg);
45745405
4575
- memcg->high = PAGE_COUNTER_MAX;
5406
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45765407 memcg->soft_limit = PAGE_COUNTER_MAX;
5408
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45775409 if (parent) {
45785410 memcg->swappiness = mem_cgroup_swappiness(parent);
45795411 memcg->oom_kill_disable = parent->oom_kill_disable;
45805412 }
4581
- if (parent && parent->use_hierarchy) {
5413
+ if (!parent) {
5414
+ page_counter_init(&memcg->memory, NULL);
5415
+ page_counter_init(&memcg->swap, NULL);
5416
+ page_counter_init(&memcg->kmem, NULL);
5417
+ page_counter_init(&memcg->tcpmem, NULL);
5418
+ } else if (parent->use_hierarchy) {
45825419 memcg->use_hierarchy = true;
45835420 page_counter_init(&memcg->memory, &parent->memory);
45845421 page_counter_init(&memcg->swap, &parent->swap);
4585
- page_counter_init(&memcg->memsw, &parent->memsw);
45865422 page_counter_init(&memcg->kmem, &parent->kmem);
45875423 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45885424 } else {
4589
- page_counter_init(&memcg->memory, NULL);
4590
- page_counter_init(&memcg->swap, NULL);
4591
- page_counter_init(&memcg->memsw, NULL);
4592
- page_counter_init(&memcg->kmem, NULL);
4593
- page_counter_init(&memcg->tcpmem, NULL);
5425
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5426
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5427
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5428
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45945429 /*
45955430 * Deeper hierachy with use_hierarchy == false doesn't make
45965431 * much sense so let cgroup subsystem know about this
....@@ -4617,7 +5452,7 @@
46175452 fail:
46185453 mem_cgroup_id_remove(memcg);
46195454 mem_cgroup_free(memcg);
4620
- return ERR_PTR(-ENOMEM);
5455
+ return ERR_PTR(error);
46215456 }
46225457
46235458 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4635,8 +5470,9 @@
46355470 }
46365471
46375472 /* Online state pins memcg ID, memcg ID pins CSS */
4638
- atomic_set(&memcg->id.ref, 1);
5473
+ refcount_set(&memcg->id.ref, 1);
46395474 css_get(css);
5475
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46405476 return 0;
46415477 }
46425478
....@@ -4645,6 +5481,7 @@
46455481 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46465482 struct mem_cgroup_event *event, *tmp;
46475483
5484
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46485485 /*
46495486 * Unregister events and notify userspace.
46505487 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4663,6 +5500,8 @@
46635500 memcg_offline_kmem(memcg);
46645501 wb_memcg_offline(memcg);
46655502
5503
+ drain_all_stock(memcg);
5504
+
46665505 mem_cgroup_id_put(memcg);
46675506 }
46685507
....@@ -4676,7 +5515,12 @@
46765515 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46775516 {
46785517 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5518
+ int __maybe_unused i;
46795519
5520
+#ifdef CONFIG_CGROUP_WRITEBACK
5521
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5522
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5523
+#endif
46805524 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46815525 static_branch_dec(&memcg_sockets_enabled_key);
46825526
....@@ -4710,13 +5554,13 @@
47105554
47115555 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47125556 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4713
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47145557 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47155558 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47165559 page_counter_set_min(&memcg->memory, 0);
47175560 page_counter_set_low(&memcg->memory, 0);
4718
- memcg->high = PAGE_COUNTER_MAX;
5561
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47195562 memcg->soft_limit = PAGE_COUNTER_MAX;
5563
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47205564 memcg_wb_domain_size_changed(memcg);
47215565 }
47225566
....@@ -4759,7 +5603,7 @@
47595603 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47605604 unsigned long addr, pte_t ptent)
47615605 {
4762
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5606
+ struct page *page = vm_normal_page(vma, addr, ptent);
47635607
47645608 if (!page || !page_mapped(page))
47655609 return NULL;
....@@ -4810,8 +5654,7 @@
48105654 * we call find_get_page() with swapper_space directly.
48115655 */
48125656 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4813
- if (do_memsw_account())
4814
- entry->val = ent.val;
5657
+ entry->val = ent.val;
48155658
48165659 return page;
48175660 }
....@@ -4826,36 +5669,15 @@
48265669 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48275670 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48285671 {
4829
- struct page *page = NULL;
4830
- struct address_space *mapping;
4831
- pgoff_t pgoff;
4832
-
48335672 if (!vma->vm_file) /* anonymous vma */
48345673 return NULL;
48355674 if (!(mc.flags & MOVE_FILE))
48365675 return NULL;
48375676
4838
- mapping = vma->vm_file->f_mapping;
4839
- pgoff = linear_page_index(vma, addr);
4840
-
48415677 /* page is moved even if it's not RSS of this task(page-faulted). */
4842
-#ifdef CONFIG_SWAP
48435678 /* shmem/tmpfs may report page out on swap: account for that too. */
4844
- if (shmem_mapping(mapping)) {
4845
- page = find_get_entry(mapping, pgoff);
4846
- if (radix_tree_exceptional_entry(page)) {
4847
- swp_entry_t swp = radix_to_swp_entry(page);
4848
- if (do_memsw_account())
4849
- *entry = swp;
4850
- page = find_get_page(swap_address_space(swp),
4851
- swp_offset(swp));
4852
- }
4853
- } else
4854
- page = find_get_page(mapping, pgoff);
4855
-#else
4856
- page = find_get_page(mapping, pgoff);
4857
-#endif
4858
- return page;
5679
+ return find_get_incore_page(vma->vm_file->f_mapping,
5680
+ linear_page_index(vma, addr));
48595681 }
48605682
48615683 /**
....@@ -4875,10 +5697,10 @@
48755697 struct mem_cgroup *from,
48765698 struct mem_cgroup *to)
48775699 {
4878
- unsigned long flags;
4879
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5700
+ struct lruvec *from_vec, *to_vec;
5701
+ struct pglist_data *pgdat;
5702
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48805703 int ret;
4881
- bool anon;
48825704
48835705 VM_BUG_ON(from == to);
48845706 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4896,52 +5718,83 @@
48965718 if (page->mem_cgroup != from)
48975719 goto out_unlock;
48985720
4899
- anon = PageAnon(page);
5721
+ pgdat = page_pgdat(page);
5722
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5723
+ to_vec = mem_cgroup_lruvec(to, pgdat);
49005724
4901
- spin_lock_irqsave(&from->move_lock, flags);
5725
+ lock_page_memcg(page);
49025726
4903
- if (!anon && page_mapped(page)) {
4904
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4905
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4906
- }
5727
+ if (PageAnon(page)) {
5728
+ if (page_mapped(page)) {
5729
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5730
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5731
+ if (PageTransHuge(page)) {
5732
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5733
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5734
+ }
49075735
4908
- /*
4909
- * move_lock grabbed above and caller set from->moving_account, so
4910
- * mod_memcg_page_state will serialize updates to PageDirty.
4911
- * So mapping should be stable for dirty pages.
4912
- */
4913
- if (!anon && PageDirty(page)) {
4914
- struct address_space *mapping = page_mapping(page);
5736
+ }
5737
+ } else {
5738
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49155740
4916
- if (mapping_cap_account_dirty(mapping)) {
4917
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4918
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5741
+ if (PageSwapBacked(page)) {
5742
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5743
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5744
+ }
5745
+
5746
+ if (page_mapped(page)) {
5747
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5748
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5749
+ }
5750
+
5751
+ if (PageDirty(page)) {
5752
+ struct address_space *mapping = page_mapping(page);
5753
+
5754
+ if (mapping_can_writeback(mapping)) {
5755
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5756
+ -nr_pages);
5757
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5758
+ nr_pages);
5759
+ }
49195760 }
49205761 }
49215762
49225763 if (PageWriteback(page)) {
4923
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4924
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5764
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5765
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49255766 }
49265767
49275768 /*
5769
+ * All state has been migrated, let's switch to the new memcg.
5770
+ *
49285771 * It is safe to change page->mem_cgroup here because the page
4929
- * is referenced, charged, and isolated - we can't race with
4930
- * uncharging, charging, migration, or LRU putback.
5772
+ * is referenced, charged, isolated, and locked: we can't race
5773
+ * with (un)charging, migration, LRU putback, or anything else
5774
+ * that would rely on a stable page->mem_cgroup.
5775
+ *
5776
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5777
+ * to save space. As soon as we switch page->mem_cgroup to a
5778
+ * new memcg that isn't locked, the above state can change
5779
+ * concurrently again. Make sure we're truly done with it.
49315780 */
5781
+ smp_mb();
49325782
4933
- /* caller should have done css_get */
5783
+ css_get(&to->css);
5784
+ css_put(&from->css);
5785
+
49345786 page->mem_cgroup = to;
4935
- spin_unlock_irqrestore(&from->move_lock, flags);
5787
+
5788
+ __unlock_page_memcg(from);
49365789
49375790 ret = 0;
49385791
4939
- local_lock_irq(event_lock);
4940
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5792
+ local_lock_irq(&event_lock.l);
5793
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49415794 memcg_check_events(to, page);
4942
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5795
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49435796 memcg_check_events(from, page);
4944
- local_unlock_irq(event_lock);
5797
+ local_unlock_irq(&event_lock.l);
49455798 out_unlock:
49465799 unlock_page(page);
49475800 out:
....@@ -4963,8 +5816,8 @@
49635816 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49645817 * target for charge migration. if @target is not NULL, the entry is stored
49655818 * in target->ent.
4966
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4967
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5819
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5820
+ * (so ZONE_DEVICE page and thus not on the lru).
49685821 * For now we such page is charge like a regular page would be as for all
49695822 * intent and purposes it is just special memory taking the place of a
49705823 * regular page.
....@@ -4998,8 +5851,7 @@
49985851 */
49995852 if (page->mem_cgroup == mc.from) {
50005853 ret = MC_TARGET_PAGE;
5001
- if (is_device_private_page(page) ||
5002
- is_device_public_page(page))
5854
+ if (is_device_private_page(page))
50035855 ret = MC_TARGET_DEVICE;
50045856 if (target)
50055857 target->page = page;
....@@ -5070,8 +5922,8 @@
50705922 if (ptl) {
50715923 /*
50725924 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5073
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5074
- * MEMORY_DEVICE_PRIVATE but this might change.
5925
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5926
+ * this might change.
50755927 */
50765928 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50775929 mc.precharge += HPAGE_PMD_NR;
....@@ -5091,18 +5943,17 @@
50915943 return 0;
50925944 }
50935945
5946
+static const struct mm_walk_ops precharge_walk_ops = {
5947
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5948
+};
5949
+
50945950 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50955951 {
50965952 unsigned long precharge;
50975953
5098
- struct mm_walk mem_cgroup_count_precharge_walk = {
5099
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5100
- .mm = mm,
5101
- };
5102
- down_read(&mm->mmap_sem);
5103
- walk_page_range(0, mm->highest_vm_end,
5104
- &mem_cgroup_count_precharge_walk);
5105
- up_read(&mm->mmap_sem);
5954
+ mmap_read_lock(mm);
5955
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5956
+ mmap_read_unlock(mm);
51065957
51075958 precharge = mc.precharge;
51085959 mc.precharge = 0;
....@@ -5152,8 +6003,6 @@
51526003 */
51536004 if (!mem_cgroup_is_root(mc.to))
51546005 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5155
-
5156
- css_put_many(&mc.to->css, mc.moved_swap);
51576006
51586007 mc.moved_swap = 0;
51596008 }
....@@ -5315,7 +6164,7 @@
53156164 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53166165 case MC_TARGET_DEVICE:
53176166 device = true;
5318
- /* fall through */
6167
+ fallthrough;
53196168 case MC_TARGET_PAGE:
53206169 page = target.page;
53216170 /*
....@@ -5370,13 +6219,12 @@
53706219 return ret;
53716220 }
53726221
6222
+static const struct mm_walk_ops charge_walk_ops = {
6223
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6224
+};
6225
+
53736226 static void mem_cgroup_move_charge(void)
53746227 {
5375
- struct mm_walk mem_cgroup_move_charge_walk = {
5376
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5377
- .mm = mc.mm,
5378
- };
5379
-
53806228 lru_add_drain_all();
53816229 /*
53826230 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5386,9 +6234,9 @@
53866234 atomic_inc(&mc.from->moving_account);
53876235 synchronize_rcu();
53886236 retry:
5389
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6237
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53906238 /*
5391
- * Someone who are holding the mmap_sem might be waiting in
6239
+ * Someone who are holding the mmap_lock might be waiting in
53926240 * waitq. So we cancel all extra charges, wake up all waiters,
53936241 * and retry. Because we cancel precharges, we might not be able
53946242 * to move enough charges, but moving charge is a best-effort
....@@ -5402,9 +6250,10 @@
54026250 * When we have consumed all precharges and failed in doing
54036251 * additional charge, the page walk just aborts.
54046252 */
5405
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6253
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6254
+ NULL);
54066255
5407
- up_read(&mc.mm->mmap_sem);
6256
+ mmap_read_unlock(mc.mm);
54086257 atomic_dec(&mc.from->moving_account);
54096258 }
54106259
....@@ -5446,6 +6295,16 @@
54466295 root_mem_cgroup->use_hierarchy = false;
54476296 }
54486297
6298
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6299
+{
6300
+ if (value == PAGE_COUNTER_MAX)
6301
+ seq_puts(m, "max\n");
6302
+ else
6303
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6304
+
6305
+ return 0;
6306
+}
6307
+
54496308 static u64 memory_current_read(struct cgroup_subsys_state *css,
54506309 struct cftype *cft)
54516310 {
....@@ -5456,15 +6315,8 @@
54566315
54576316 static int memory_min_show(struct seq_file *m, void *v)
54586317 {
5459
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5460
- unsigned long min = READ_ONCE(memcg->memory.min);
5461
-
5462
- if (min == PAGE_COUNTER_MAX)
5463
- seq_puts(m, "max\n");
5464
- else
5465
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5466
-
5467
- return 0;
6318
+ return seq_puts_memcg_tunable(m,
6319
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54686320 }
54696321
54706322 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5486,15 +6338,8 @@
54866338
54876339 static int memory_low_show(struct seq_file *m, void *v)
54886340 {
5489
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5490
- unsigned long low = READ_ONCE(memcg->memory.low);
5491
-
5492
- if (low == PAGE_COUNTER_MAX)
5493
- seq_puts(m, "max\n");
5494
- else
5495
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5496
-
5497
- return 0;
6341
+ return seq_puts_memcg_tunable(m,
6342
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54986343 }
54996344
55006345 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5516,22 +6361,16 @@
55166361
55176362 static int memory_high_show(struct seq_file *m, void *v)
55186363 {
5519
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5520
- unsigned long high = READ_ONCE(memcg->high);
5521
-
5522
- if (high == PAGE_COUNTER_MAX)
5523
- seq_puts(m, "max\n");
5524
- else
5525
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5526
-
5527
- return 0;
6364
+ return seq_puts_memcg_tunable(m,
6365
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55286366 }
55296367
55306368 static ssize_t memory_high_write(struct kernfs_open_file *of,
55316369 char *buf, size_t nbytes, loff_t off)
55326370 {
55336371 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5534
- unsigned long nr_pages;
6372
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6373
+ bool drained = false;
55356374 unsigned long high;
55366375 int err;
55376376
....@@ -5540,12 +6379,30 @@
55406379 if (err)
55416380 return err;
55426381
5543
- memcg->high = high;
6382
+ page_counter_set_high(&memcg->memory, high);
55446383
5545
- nr_pages = page_counter_read(&memcg->memory);
5546
- if (nr_pages > high)
5547
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5548
- GFP_KERNEL, true);
6384
+ for (;;) {
6385
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6386
+ unsigned long reclaimed;
6387
+
6388
+ if (nr_pages <= high)
6389
+ break;
6390
+
6391
+ if (signal_pending(current))
6392
+ break;
6393
+
6394
+ if (!drained) {
6395
+ drain_all_stock(memcg);
6396
+ drained = true;
6397
+ continue;
6398
+ }
6399
+
6400
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6401
+ GFP_KERNEL, true);
6402
+
6403
+ if (!reclaimed && !nr_retries--)
6404
+ break;
6405
+ }
55496406
55506407 memcg_wb_domain_size_changed(memcg);
55516408 return nbytes;
....@@ -5553,22 +6410,15 @@
55536410
55546411 static int memory_max_show(struct seq_file *m, void *v)
55556412 {
5556
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5557
- unsigned long max = READ_ONCE(memcg->memory.max);
5558
-
5559
- if (max == PAGE_COUNTER_MAX)
5560
- seq_puts(m, "max\n");
5561
- else
5562
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5563
-
5564
- return 0;
6413
+ return seq_puts_memcg_tunable(m,
6414
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55656415 }
55666416
55676417 static ssize_t memory_max_write(struct kernfs_open_file *of,
55686418 char *buf, size_t nbytes, loff_t off)
55696419 {
55706420 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5571
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6421
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55726422 bool drained = false;
55736423 unsigned long max;
55746424 int err;
....@@ -5586,10 +6436,8 @@
55866436 if (nr_pages <= max)
55876437 break;
55886438
5589
- if (signal_pending(current)) {
5590
- err = -EINTR;
6439
+ if (signal_pending(current))
55916440 break;
5592
- }
55936441
55946442 if (!drained) {
55956443 drain_all_stock(memcg);
....@@ -5613,104 +6461,77 @@
56136461 return nbytes;
56146462 }
56156463
6464
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6465
+{
6466
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6467
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6468
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6469
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6470
+ seq_printf(m, "oom_kill %lu\n",
6471
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6472
+}
6473
+
56166474 static int memory_events_show(struct seq_file *m, void *v)
56176475 {
5618
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6476
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56196477
5620
- seq_printf(m, "low %lu\n",
5621
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5622
- seq_printf(m, "high %lu\n",
5623
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5624
- seq_printf(m, "max %lu\n",
5625
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5626
- seq_printf(m, "oom %lu\n",
5627
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5628
- seq_printf(m, "oom_kill %lu\n",
5629
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6478
+ __memory_events_show(m, memcg->memory_events);
6479
+ return 0;
6480
+}
56306481
6482
+static int memory_events_local_show(struct seq_file *m, void *v)
6483
+{
6484
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6485
+
6486
+ __memory_events_show(m, memcg->memory_events_local);
56316487 return 0;
56326488 }
56336489
56346490 static int memory_stat_show(struct seq_file *m, void *v)
56356491 {
5636
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5637
- struct accumulated_stats acc;
5638
- int i;
6492
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6493
+ char *buf;
56396494
5640
- /*
5641
- * Provide statistics on the state of the memory subsystem as
5642
- * well as cumulative event counters that show past behavior.
5643
- *
5644
- * This list is ordered following a combination of these gradients:
5645
- * 1) generic big picture -> specifics and details
5646
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5647
- *
5648
- * Current memory state:
5649
- */
5650
-
5651
- memset(&acc, 0, sizeof(acc));
5652
- acc.stats_size = MEMCG_NR_STAT;
5653
- acc.events_size = NR_VM_EVENT_ITEMS;
5654
- accumulate_memcg_tree(memcg, &acc);
5655
-
5656
- seq_printf(m, "anon %llu\n",
5657
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5658
- seq_printf(m, "file %llu\n",
5659
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5660
- seq_printf(m, "kernel_stack %llu\n",
5661
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5662
- seq_printf(m, "slab %llu\n",
5663
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5664
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5665
- seq_printf(m, "sock %llu\n",
5666
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5667
-
5668
- seq_printf(m, "shmem %llu\n",
5669
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5670
- seq_printf(m, "file_mapped %llu\n",
5671
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5672
- seq_printf(m, "file_dirty %llu\n",
5673
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5674
- seq_printf(m, "file_writeback %llu\n",
5675
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5676
-
5677
- for (i = 0; i < NR_LRU_LISTS; i++)
5678
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5679
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5680
-
5681
- seq_printf(m, "slab_reclaimable %llu\n",
5682
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5683
- seq_printf(m, "slab_unreclaimable %llu\n",
5684
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5685
-
5686
- /* Accumulated memory events */
5687
-
5688
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5689
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5690
-
5691
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5692
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5693
- acc.events[PGSCAN_DIRECT]);
5694
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5695
- acc.events[PGSTEAL_DIRECT]);
5696
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5697
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5698
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5699
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5700
-
5701
- seq_printf(m, "workingset_refault %lu\n",
5702
- acc.stat[WORKINGSET_REFAULT]);
5703
- seq_printf(m, "workingset_activate %lu\n",
5704
- acc.stat[WORKINGSET_ACTIVATE]);
5705
- seq_printf(m, "workingset_nodereclaim %lu\n",
5706
- acc.stat[WORKINGSET_NODERECLAIM]);
5707
-
6495
+ buf = memory_stat_format(memcg);
6496
+ if (!buf)
6497
+ return -ENOMEM;
6498
+ seq_puts(m, buf);
6499
+ kfree(buf);
57086500 return 0;
57096501 }
57106502
6503
+#ifdef CONFIG_NUMA
6504
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6505
+{
6506
+ int i;
6507
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6508
+
6509
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6510
+ int nid;
6511
+
6512
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6513
+ continue;
6514
+
6515
+ seq_printf(m, "%s", memory_stats[i].name);
6516
+ for_each_node_state(nid, N_MEMORY) {
6517
+ u64 size;
6518
+ struct lruvec *lruvec;
6519
+
6520
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6521
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6522
+ size *= memory_stats[i].ratio;
6523
+ seq_printf(m, " N%d=%llu", nid, size);
6524
+ }
6525
+ seq_putc(m, '\n');
6526
+ }
6527
+
6528
+ return 0;
6529
+}
6530
+#endif
6531
+
57116532 static int memory_oom_group_show(struct seq_file *m, void *v)
57126533 {
5713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6534
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57146535
57156536 seq_printf(m, "%d\n", memcg->oom_group);
57166537
....@@ -5776,10 +6597,21 @@
57766597 .seq_show = memory_events_show,
57776598 },
57786599 {
5779
- .name = "stat",
6600
+ .name = "events.local",
57806601 .flags = CFTYPE_NOT_ON_ROOT,
6602
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6603
+ .seq_show = memory_events_local_show,
6604
+ },
6605
+ {
6606
+ .name = "stat",
57816607 .seq_show = memory_stat_show,
57826608 },
6609
+#ifdef CONFIG_NUMA
6610
+ {
6611
+ .name = "numa_stat",
6612
+ .seq_show = memory_numa_stat_show,
6613
+ },
6614
+#endif
57836615 {
57846616 .name = "oom.group",
57856617 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5805,6 +6637,122 @@
58056637 .early_init = 0,
58066638 };
58076639
6640
+/*
6641
+ * This function calculates an individual cgroup's effective
6642
+ * protection which is derived from its own memory.min/low, its
6643
+ * parent's and siblings' settings, as well as the actual memory
6644
+ * distribution in the tree.
6645
+ *
6646
+ * The following rules apply to the effective protection values:
6647
+ *
6648
+ * 1. At the first level of reclaim, effective protection is equal to
6649
+ * the declared protection in memory.min and memory.low.
6650
+ *
6651
+ * 2. To enable safe delegation of the protection configuration, at
6652
+ * subsequent levels the effective protection is capped to the
6653
+ * parent's effective protection.
6654
+ *
6655
+ * 3. To make complex and dynamic subtrees easier to configure, the
6656
+ * user is allowed to overcommit the declared protection at a given
6657
+ * level. If that is the case, the parent's effective protection is
6658
+ * distributed to the children in proportion to how much protection
6659
+ * they have declared and how much of it they are utilizing.
6660
+ *
6661
+ * This makes distribution proportional, but also work-conserving:
6662
+ * if one cgroup claims much more protection than it uses memory,
6663
+ * the unused remainder is available to its siblings.
6664
+ *
6665
+ * 4. Conversely, when the declared protection is undercommitted at a
6666
+ * given level, the distribution of the larger parental protection
6667
+ * budget is NOT proportional. A cgroup's protection from a sibling
6668
+ * is capped to its own memory.min/low setting.
6669
+ *
6670
+ * 5. However, to allow protecting recursive subtrees from each other
6671
+ * without having to declare each individual cgroup's fixed share
6672
+ * of the ancestor's claim to protection, any unutilized -
6673
+ * "floating" - protection from up the tree is distributed in
6674
+ * proportion to each cgroup's *usage*. This makes the protection
6675
+ * neutral wrt sibling cgroups and lets them compete freely over
6676
+ * the shared parental protection budget, but it protects the
6677
+ * subtree as a whole from neighboring subtrees.
6678
+ *
6679
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6680
+ * against immediate siblings whereas 5. is about protecting against
6681
+ * neighboring subtrees.
6682
+ */
6683
+static unsigned long effective_protection(unsigned long usage,
6684
+ unsigned long parent_usage,
6685
+ unsigned long setting,
6686
+ unsigned long parent_effective,
6687
+ unsigned long siblings_protected)
6688
+{
6689
+ unsigned long protected;
6690
+ unsigned long ep;
6691
+
6692
+ protected = min(usage, setting);
6693
+ /*
6694
+ * If all cgroups at this level combined claim and use more
6695
+ * protection than what the parent affords them, distribute
6696
+ * shares in proportion to utilization.
6697
+ *
6698
+ * We are using actual utilization rather than the statically
6699
+ * claimed protection in order to be work-conserving: claimed
6700
+ * but unused protection is available to siblings that would
6701
+ * otherwise get a smaller chunk than what they claimed.
6702
+ */
6703
+ if (siblings_protected > parent_effective)
6704
+ return protected * parent_effective / siblings_protected;
6705
+
6706
+ /*
6707
+ * Ok, utilized protection of all children is within what the
6708
+ * parent affords them, so we know whatever this child claims
6709
+ * and utilizes is effectively protected.
6710
+ *
6711
+ * If there is unprotected usage beyond this value, reclaim
6712
+ * will apply pressure in proportion to that amount.
6713
+ *
6714
+ * If there is unutilized protection, the cgroup will be fully
6715
+ * shielded from reclaim, but we do return a smaller value for
6716
+ * protection than what the group could enjoy in theory. This
6717
+ * is okay. With the overcommit distribution above, effective
6718
+ * protection is always dependent on how memory is actually
6719
+ * consumed among the siblings anyway.
6720
+ */
6721
+ ep = protected;
6722
+
6723
+ /*
6724
+ * If the children aren't claiming (all of) the protection
6725
+ * afforded to them by the parent, distribute the remainder in
6726
+ * proportion to the (unprotected) memory of each cgroup. That
6727
+ * way, cgroups that aren't explicitly prioritized wrt each
6728
+ * other compete freely over the allowance, but they are
6729
+ * collectively protected from neighboring trees.
6730
+ *
6731
+ * We're using unprotected memory for the weight so that if
6732
+ * some cgroups DO claim explicit protection, we don't protect
6733
+ * the same bytes twice.
6734
+ *
6735
+ * Check both usage and parent_usage against the respective
6736
+ * protected values. One should imply the other, but they
6737
+ * aren't read atomically - make sure the division is sane.
6738
+ */
6739
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6740
+ return ep;
6741
+ if (parent_effective > siblings_protected &&
6742
+ parent_usage > siblings_protected &&
6743
+ usage > protected) {
6744
+ unsigned long unclaimed;
6745
+
6746
+ unclaimed = parent_effective - siblings_protected;
6747
+ unclaimed *= usage - protected;
6748
+ unclaimed /= parent_usage - siblings_protected;
6749
+
6750
+ ep += unclaimed;
6751
+ }
6752
+
6753
+ return ep;
6754
+}
6755
+
58086756 /**
58096757 * mem_cgroup_protected - check if memory consumption is in the normal range
58106758 * @root: the top ancestor of the sub-tree being checked
....@@ -5812,259 +6760,125 @@
58126760 *
58136761 * WARNING: This function is not stateless! It can only be used as part
58146762 * of a top-down tree iteration, not for isolated queries.
5815
- *
5816
- * Returns one of the following:
5817
- * MEMCG_PROT_NONE: cgroup memory is not protected
5818
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5819
- * an unprotected supply of reclaimable memory from other cgroups.
5820
- * MEMCG_PROT_MIN: cgroup memory is protected
5821
- *
5822
- * @root is exclusive; it is never protected when looked at directly
5823
- *
5824
- * To provide a proper hierarchical behavior, effective memory.min/low values
5825
- * are used. Below is the description of how effective memory.low is calculated.
5826
- * Effective memory.min values is calculated in the same way.
5827
- *
5828
- * Effective memory.low is always equal or less than the original memory.low.
5829
- * If there is no memory.low overcommittment (which is always true for
5830
- * top-level memory cgroups), these two values are equal.
5831
- * Otherwise, it's a part of parent's effective memory.low,
5832
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5833
- * memory.low usages, where memory.low usage is the size of actually
5834
- * protected memory.
5835
- *
5836
- * low_usage
5837
- * elow = min( memory.low, parent->elow * ------------------ ),
5838
- * siblings_low_usage
5839
- *
5840
- * | memory.current, if memory.current < memory.low
5841
- * low_usage = |
5842
- | 0, otherwise.
5843
- *
5844
- *
5845
- * Such definition of the effective memory.low provides the expected
5846
- * hierarchical behavior: parent's memory.low value is limiting
5847
- * children, unprotected memory is reclaimed first and cgroups,
5848
- * which are not using their guarantee do not affect actual memory
5849
- * distribution.
5850
- *
5851
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5852
- *
5853
- * A A/memory.low = 2G, A/memory.current = 6G
5854
- * //\\
5855
- * BC DE B/memory.low = 3G B/memory.current = 2G
5856
- * C/memory.low = 1G C/memory.current = 2G
5857
- * D/memory.low = 0 D/memory.current = 2G
5858
- * E/memory.low = 10G E/memory.current = 0
5859
- *
5860
- * and the memory pressure is applied, the following memory distribution
5861
- * is expected (approximately):
5862
- *
5863
- * A/memory.current = 2G
5864
- *
5865
- * B/memory.current = 1.3G
5866
- * C/memory.current = 0.6G
5867
- * D/memory.current = 0
5868
- * E/memory.current = 0
5869
- *
5870
- * These calculations require constant tracking of the actual low usages
5871
- * (see propagate_protected_usage()), as well as recursive calculation of
5872
- * effective memory.low values. But as we do call mem_cgroup_protected()
5873
- * path for each memory cgroup top-down from the reclaim,
5874
- * it's possible to optimize this part, and save calculated elow
5875
- * for next usage. This part is intentionally racy, but it's ok,
5876
- * as memory.low is a best-effort mechanism.
58776763 */
5878
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5879
- struct mem_cgroup *memcg)
6764
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6765
+ struct mem_cgroup *memcg)
58806766 {
6767
+ unsigned long usage, parent_usage;
58816768 struct mem_cgroup *parent;
5882
- unsigned long emin, parent_emin;
5883
- unsigned long elow, parent_elow;
5884
- unsigned long usage;
58856769
58866770 if (mem_cgroup_disabled())
5887
- return MEMCG_PROT_NONE;
6771
+ return;
58886772
58896773 if (!root)
58906774 root = root_mem_cgroup;
6775
+
6776
+ /*
6777
+ * Effective values of the reclaim targets are ignored so they
6778
+ * can be stale. Have a look at mem_cgroup_protection for more
6779
+ * details.
6780
+ * TODO: calculation should be more robust so that we do not need
6781
+ * that special casing.
6782
+ */
58916783 if (memcg == root)
5892
- return MEMCG_PROT_NONE;
6784
+ return;
58936785
58946786 usage = page_counter_read(&memcg->memory);
58956787 if (!usage)
5896
- return MEMCG_PROT_NONE;
5897
-
5898
- emin = memcg->memory.min;
5899
- elow = memcg->memory.low;
6788
+ return;
59006789
59016790 parent = parent_mem_cgroup(memcg);
59026791 /* No parent means a non-hierarchical mode on v1 memcg */
59036792 if (!parent)
5904
- return MEMCG_PROT_NONE;
6793
+ return;
59056794
5906
- if (parent == root)
5907
- goto exit;
5908
-
5909
- parent_emin = READ_ONCE(parent->memory.emin);
5910
- emin = min(emin, parent_emin);
5911
- if (emin && parent_emin) {
5912
- unsigned long min_usage, siblings_min_usage;
5913
-
5914
- min_usage = min(usage, memcg->memory.min);
5915
- siblings_min_usage = atomic_long_read(
5916
- &parent->memory.children_min_usage);
5917
-
5918
- if (min_usage && siblings_min_usage)
5919
- emin = min(emin, parent_emin * min_usage /
5920
- siblings_min_usage);
6795
+ if (parent == root) {
6796
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6797
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6798
+ return;
59216799 }
59226800
5923
- parent_elow = READ_ONCE(parent->memory.elow);
5924
- elow = min(elow, parent_elow);
5925
- if (elow && parent_elow) {
5926
- unsigned long low_usage, siblings_low_usage;
6801
+ parent_usage = page_counter_read(&parent->memory);
59276802
5928
- low_usage = min(usage, memcg->memory.low);
5929
- siblings_low_usage = atomic_long_read(
5930
- &parent->memory.children_low_usage);
6803
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6804
+ READ_ONCE(memcg->memory.min),
6805
+ READ_ONCE(parent->memory.emin),
6806
+ atomic_long_read(&parent->memory.children_min_usage)));
59316807
5932
- if (low_usage && siblings_low_usage)
5933
- elow = min(elow, parent_elow * low_usage /
5934
- siblings_low_usage);
5935
- }
5936
-
5937
-exit:
5938
- memcg->memory.emin = emin;
5939
- memcg->memory.elow = elow;
5940
-
5941
- if (usage <= emin)
5942
- return MEMCG_PROT_MIN;
5943
- else if (usage <= elow)
5944
- return MEMCG_PROT_LOW;
5945
- else
5946
- return MEMCG_PROT_NONE;
6808
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6809
+ READ_ONCE(memcg->memory.low),
6810
+ READ_ONCE(parent->memory.elow),
6811
+ atomic_long_read(&parent->memory.children_low_usage)));
59476812 }
59486813
59496814 /**
5950
- * mem_cgroup_try_charge - try charging a page
6815
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59516816 * @page: page to charge
59526817 * @mm: mm context of the victim
59536818 * @gfp_mask: reclaim mode
5954
- * @memcgp: charged memcg return
5955
- * @compound: charge the page as compound or small page
59566819 *
59576820 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59586821 * pages according to @gfp_mask if necessary.
59596822 *
5960
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5961
- * Otherwise, an error code is returned.
5962
- *
5963
- * After page->mapping has been set up, the caller must finalize the
5964
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5965
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6823
+ * Returns 0 on success. Otherwise, an error code is returned.
59666824 */
5967
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5968
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5969
- bool compound)
6825
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6826
+ gfp_t gfp_mask)
59706827 {
6828
+ unsigned int nr_pages = thp_nr_pages(page);
59716829 struct mem_cgroup *memcg = NULL;
5972
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59736830 int ret = 0;
59746831
5975
- if (mem_cgroup_disabled())
5976
- goto out;
5977
-
59786832 if (PageSwapCache(page)) {
6833
+ swp_entry_t ent = { .val = page_private(page), };
6834
+ unsigned short id;
6835
+
59796836 /*
59806837 * Every swap fault against a single page tries to charge the
59816838 * page, bail as early as possible. shmem_unuse() encounters
5982
- * already charged pages, too. The USED bit is protected by
5983
- * the page lock, which serializes swap cache removal, which
6839
+ * already charged pages, too. page->mem_cgroup is protected
6840
+ * by the page lock, which serializes swap cache removal, which
59846841 * in turn serializes uncharging.
59856842 */
59866843 VM_BUG_ON_PAGE(!PageLocked(page), page);
59876844 if (compound_head(page)->mem_cgroup)
59886845 goto out;
59896846
5990
- if (do_swap_account) {
5991
- swp_entry_t ent = { .val = page_private(page), };
5992
- unsigned short id = lookup_swap_cgroup_id(ent);
5993
-
5994
- rcu_read_lock();
5995
- memcg = mem_cgroup_from_id(id);
5996
- if (memcg && !css_tryget_online(&memcg->css))
5997
- memcg = NULL;
5998
- rcu_read_unlock();
5999
- }
6847
+ id = lookup_swap_cgroup_id(ent);
6848
+ rcu_read_lock();
6849
+ memcg = mem_cgroup_from_id(id);
6850
+ if (memcg && !css_tryget_online(&memcg->css))
6851
+ memcg = NULL;
6852
+ rcu_read_unlock();
60006853 }
60016854
60026855 if (!memcg)
60036856 memcg = get_mem_cgroup_from_mm(mm);
60046857
60056858 ret = try_charge(memcg, gfp_mask, nr_pages);
6859
+ if (ret)
6860
+ goto out_put;
60066861
6007
- css_put(&memcg->css);
6008
-out:
6009
- *memcgp = memcg;
6010
- return ret;
6011
-}
6862
+ css_get(&memcg->css);
6863
+ commit_charge(page, memcg);
60126864
6013
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6014
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6015
- bool compound)
6016
-{
6017
- struct mem_cgroup *memcg;
6018
- int ret;
6019
-
6020
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6021
- memcg = *memcgp;
6022
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6023
- return ret;
6024
-}
6025
-
6026
-/**
6027
- * mem_cgroup_commit_charge - commit a page charge
6028
- * @page: page to charge
6029
- * @memcg: memcg to charge the page to
6030
- * @lrucare: page might be on LRU already
6031
- * @compound: charge the page as compound or small page
6032
- *
6033
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6034
- * after page->mapping has been set up. This must happen atomically
6035
- * as part of the page instantiation, i.e. under the page table lock
6036
- * for anonymous pages, under the page lock for page and swap cache.
6037
- *
6038
- * In addition, the page must not be on the LRU during the commit, to
6039
- * prevent racing with task migration. If it might be, use @lrucare.
6040
- *
6041
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6042
- */
6043
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6044
- bool lrucare, bool compound)
6045
-{
6046
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6047
-
6048
- VM_BUG_ON_PAGE(!page->mapping, page);
6049
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6050
-
6051
- if (mem_cgroup_disabled())
6052
- return;
6053
- /*
6054
- * Swap faults will attempt to charge the same page multiple
6055
- * times. But reuse_swap_page() might have removed the page
6056
- * from swapcache already, so we can't check PageSwapCache().
6057
- */
6058
- if (!memcg)
6059
- return;
6060
-
6061
- commit_charge(page, memcg, lrucare);
6062
-
6063
- local_lock_irq(event_lock);
6064
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6865
+ local_lock_irq(&event_lock.l);
6866
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60656867 memcg_check_events(memcg, page);
6066
- local_unlock_irq(event_lock);
6868
+ local_unlock_irq(&event_lock.l);
60676869
6870
+ /*
6871
+ * Cgroup1's unified memory+swap counter has been charged with the
6872
+ * new swapcache page, finish the transfer by uncharging the swap
6873
+ * slot. The swap slot would also get uncharged when it dies, but
6874
+ * it can stick around indefinitely and we'd count the page twice
6875
+ * the entire time.
6876
+ *
6877
+ * Cgroup2 has separate resource counters for memory and swap,
6878
+ * so this is a non-issue here. Memory and swap charge lifetimes
6879
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6880
+ * page to memory here, and uncharge swap when the slot is freed.
6881
+ */
60686882 if (do_memsw_account() && PageSwapCache(page)) {
60696883 swp_entry_t entry = { .val = page_private(page) };
60706884 /*
....@@ -6074,42 +6888,18 @@
60746888 */
60756889 mem_cgroup_uncharge_swap(entry, nr_pages);
60766890 }
6077
-}
60786891
6079
-/**
6080
- * mem_cgroup_cancel_charge - cancel a page charge
6081
- * @page: page to charge
6082
- * @memcg: memcg to charge the page to
6083
- * @compound: charge the page as compound or small page
6084
- *
6085
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6086
- */
6087
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6088
- bool compound)
6089
-{
6090
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6091
-
6092
- if (mem_cgroup_disabled())
6093
- return;
6094
- /*
6095
- * Swap faults will attempt to charge the same page multiple
6096
- * times. But reuse_swap_page() might have removed the page
6097
- * from swapcache already, so we can't check PageSwapCache().
6098
- */
6099
- if (!memcg)
6100
- return;
6101
-
6102
- cancel_charge(memcg, nr_pages);
6892
+out_put:
6893
+ css_put(&memcg->css);
6894
+out:
6895
+ return ret;
61036896 }
61046897
61056898 struct uncharge_gather {
61066899 struct mem_cgroup *memcg;
6900
+ unsigned long nr_pages;
61076901 unsigned long pgpgout;
6108
- unsigned long nr_anon;
6109
- unsigned long nr_file;
61106902 unsigned long nr_kmem;
6111
- unsigned long nr_huge;
6112
- unsigned long nr_shmem;
61136903 struct page *dummy_page;
61146904 };
61156905
....@@ -6120,37 +6910,32 @@
61206910
61216911 static void uncharge_batch(const struct uncharge_gather *ug)
61226912 {
6123
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61246913 unsigned long flags;
61256914
61266915 if (!mem_cgroup_is_root(ug->memcg)) {
6127
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6916
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61286917 if (do_memsw_account())
6129
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6918
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61306919 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61316920 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61326921 memcg_oom_recover(ug->memcg);
61336922 }
61346923
6135
- local_lock_irqsave(event_lock, flags);
6136
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6137
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6138
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6139
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6924
+ local_lock_irqsave(&event_lock.l, flags);
61406925 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6141
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6926
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61426927 memcg_check_events(ug->memcg, ug->dummy_page);
6143
- local_unlock_irqrestore(event_lock, flags);
6928
+ local_unlock_irqrestore(&event_lock.l, flags);
61446929
6145
- if (!mem_cgroup_is_root(ug->memcg))
6146
- css_put_many(&ug->memcg->css, nr_pages);
6930
+ /* drop reference from uncharge_page */
6931
+ css_put(&ug->memcg->css);
61476932 }
61486933
61496934 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61506935 {
6936
+ unsigned long nr_pages;
6937
+
61516938 VM_BUG_ON_PAGE(PageLRU(page), page);
6152
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6153
- !PageHWPoison(page) , page);
61546939
61556940 if (!page->mem_cgroup)
61566941 return;
....@@ -6167,30 +6952,24 @@
61676952 uncharge_gather_clear(ug);
61686953 }
61696954 ug->memcg = page->mem_cgroup;
6955
+
6956
+ /* pairs with css_put in uncharge_batch */
6957
+ css_get(&ug->memcg->css);
61706958 }
61716959
6172
- if (!PageKmemcg(page)) {
6173
- unsigned int nr_pages = 1;
6960
+ nr_pages = compound_nr(page);
6961
+ ug->nr_pages += nr_pages;
61746962
6175
- if (PageTransHuge(page)) {
6176
- nr_pages <<= compound_order(page);
6177
- ug->nr_huge += nr_pages;
6178
- }
6179
- if (PageAnon(page))
6180
- ug->nr_anon += nr_pages;
6181
- else {
6182
- ug->nr_file += nr_pages;
6183
- if (PageSwapBacked(page))
6184
- ug->nr_shmem += nr_pages;
6185
- }
6963
+ if (!PageKmemcg(page)) {
61866964 ug->pgpgout++;
61876965 } else {
6188
- ug->nr_kmem += 1 << compound_order(page);
6966
+ ug->nr_kmem += nr_pages;
61896967 __ClearPageKmemcg(page);
61906968 }
61916969
61926970 ug->dummy_page = page;
61936971 page->mem_cgroup = NULL;
6972
+ css_put(&ug->memcg->css);
61946973 }
61956974
61966975 static void uncharge_list(struct list_head *page_list)
....@@ -6219,18 +6998,14 @@
62196998 }
62206999
62217000 /**
6222
- * mem_cgroup_uncharge - uncharge a page
7001
+ * __mem_cgroup_uncharge - uncharge a page
62237002 * @page: page to uncharge
62247003 *
6225
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6226
- * mem_cgroup_commit_charge().
7004
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62277005 */
6228
-void mem_cgroup_uncharge(struct page *page)
7006
+void __mem_cgroup_uncharge(struct page *page)
62297007 {
62307008 struct uncharge_gather ug;
6231
-
6232
- if (mem_cgroup_disabled())
6233
- return;
62347009
62357010 /* Don't touch page->lru of any random page, pre-check: */
62367011 if (!page->mem_cgroup)
....@@ -6242,17 +7017,14 @@
62427017 }
62437018
62447019 /**
6245
- * mem_cgroup_uncharge_list - uncharge a list of page
7020
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
62467021 * @page_list: list of pages to uncharge
62477022 *
62487023 * Uncharge a list of pages previously charged with
6249
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7024
+ * __mem_cgroup_charge().
62507025 */
6251
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7026
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62527027 {
6253
- if (mem_cgroup_disabled())
6254
- return;
6255
-
62567028 if (!list_empty(page_list))
62577029 uncharge_list(page_list);
62587030 }
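Note that the mem_cgroup_disabled() early returns are dropped from the renamed __mem_cgroup_charge(), __mem_cgroup_uncharge() and __mem_cgroup_uncharge_list() bodies above. The __-prefix convention suggests those checks move into thin inline wrappers outside this file; the sketch below shows that assumed pattern — the wrapper names and their placement are not taken from this diff:

/*
 * Hypothetical inline wrappers (assumed to live in the memcontrol
 * header, not shown in this hunk) that keep the mem_cgroup_disabled()
 * fast path in the callers instead of in the __ functions above.
 */
static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
				    gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	return __mem_cgroup_charge(page, mm, gfp_mask);
}

static inline void mem_cgroup_uncharge(struct page *page)
{
	if (mem_cgroup_disabled())
		return;
	__mem_cgroup_uncharge(page);
}

static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;
	__mem_cgroup_uncharge_list(page_list);
}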
....@@ -6271,7 +7043,6 @@
62717043 {
62727044 struct mem_cgroup *memcg;
62737045 unsigned int nr_pages;
6274
- bool compound;
62757046 unsigned long flags;
62767047
62777048 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6293,20 +7064,19 @@
62937064 return;
62947065
62957066 /* Force-charge the new page. The old one will be freed soon */
6296
- compound = PageTransHuge(newpage);
6297
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7067
+ nr_pages = thp_nr_pages(newpage);
62987068
62997069 page_counter_charge(&memcg->memory, nr_pages);
63007070 if (do_memsw_account())
63017071 page_counter_charge(&memcg->memsw, nr_pages);
6302
- css_get_many(&memcg->css, nr_pages);
63037072
6304
- commit_charge(newpage, memcg, false);
7073
+ css_get(&memcg->css);
7074
+ commit_charge(newpage, memcg);
63057075
6306
- local_lock_irqsave(event_lock, flags);
6307
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7076
+ local_lock_irqsave(&event_lock.l, flags);
7077
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63087078 memcg_check_events(memcg, newpage);
6309
- local_unlock_irqrestore(event_lock, flags);
7079
+ local_unlock_irqrestore(&event_lock.l, flags);
63107080 }
63117081
63127082 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6329,7 +7099,7 @@
63297099 goto out;
63307100 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63317101 goto out;
6332
- if (css_tryget_online(&memcg->css))
7102
+ if (css_tryget(&memcg->css))
63337103 sk->sk_memcg = memcg;
63347104 out:
63357105 rcu_read_unlock();
....@@ -6407,7 +7177,7 @@
64077177 if (!strcmp(token, "nokmem"))
64087178 cgroup_memory_nokmem = true;
64097179 }
6410
- return 0;
7180
+ return 1;
64117181 }
64127182 __setup("cgroup.memory=", cgroup_memory);
64137183
....@@ -6423,23 +7193,16 @@
64237193 {
64247194 int cpu, node;
64257195
6426
-#ifdef CONFIG_MEMCG_KMEM
6427
- /*
6428
- * Kmem cache creation is mostly done with the slab_mutex held,
6429
- * so use a workqueue with limited concurrency to avoid stalling
6430
- * all worker threads in case lots of cgroups are created and
6431
- * destroyed simultaneously.
6432
- */
6433
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6434
- BUG_ON(!memcg_kmem_cache_wq);
6435
-#endif
6436
-
64377196 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64387197 memcg_hotplug_cpu_dead);
64397198
6440
- for_each_possible_cpu(cpu)
6441
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6442
- drain_local_stock);
7199
+ for_each_possible_cpu(cpu) {
7200
+ struct memcg_stock_pcp *stock;
7201
+
7202
+ stock = per_cpu_ptr(&memcg_stock, cpu);
7203
+ INIT_WORK(&stock->work, drain_local_stock);
7204
+ local_lock_init(&stock->lock);
7205
+ }
64437206
64447207 for_each_node(node) {
64457208 struct mem_cgroup_tree_per_node *rtpn;
....@@ -6460,7 +7223,7 @@
64607223 #ifdef CONFIG_MEMCG_SWAP
64617224 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64627225 {
6463
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7226
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64647227 /*
64657228 * The root cgroup cannot be destroyed, so it's refcount must
64667229 * always be >= 1.
....@@ -6493,7 +7256,10 @@
64937256 VM_BUG_ON_PAGE(PageLRU(page), page);
64947257 VM_BUG_ON_PAGE(page_count(page), page);
64957258
6496
- if (!do_memsw_account())
7259
+ if (mem_cgroup_disabled())
7260
+ return;
7261
+
7262
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64977263 return;
64987264
64997265 memcg = page->mem_cgroup;
....@@ -6508,7 +7274,7 @@
65087274 * ancestor for the swap instead and transfer the memory+swap charge.
65097275 */
65107276 swap_memcg = mem_cgroup_id_get_online(memcg);
6511
- nr_entries = hpage_nr_pages(page);
7277
+ nr_entries = thp_nr_pages(page);
65127278 /* Get references for the tail pages, too */
65137279 if (nr_entries > 1)
65147280 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6522,7 +7288,7 @@
65227288 if (!mem_cgroup_is_root(memcg))
65237289 page_counter_uncharge(&memcg->memory, nr_entries);
65247290
6525
- if (memcg != swap_memcg) {
7291
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65267292 if (!mem_cgroup_is_root(swap_memcg))
65277293 page_counter_charge(&swap_memcg->memsw, nr_entries);
65287294 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6534,21 +7300,19 @@
65347300 * important here to have the interrupts disabled because it is the
65357301 * only synchronisation we have for updating the per-CPU variables.
65367302 */
6537
- local_lock_irqsave(event_lock, flags);
6538
-#ifndef CONFIG_PREEMPT_RT_BASE
7303
+ local_lock_irqsave(&event_lock.l, flags);
7304
+#ifndef CONFIG_PREEMPT_RT
65397305 VM_BUG_ON(!irqs_disabled());
65407306 #endif
6541
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6542
- -nr_entries);
7307
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65437308 memcg_check_events(memcg, page);
6544
- local_unlock_irqrestore(event_lock, flags);
7309
+ local_unlock_irqrestore(&event_lock.l, flags);
65457310
6546
- if (!mem_cgroup_is_root(memcg))
6547
- css_put_many(&memcg->css, nr_entries);
7311
+ css_put(&memcg->css);
65487312 }
65497313
65507314 /**
6551
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7315
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65527316 * @page: page being added to swap
65537317 * @entry: swap entry to charge
65547318 *
....@@ -6556,14 +7320,14 @@
65567320 *
65577321 * Returns 0 on success, -ENOMEM on failure.
65587322 */
6559
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7323
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
65607324 {
6561
- unsigned int nr_pages = hpage_nr_pages(page);
7325
+ unsigned int nr_pages = thp_nr_pages(page);
65627326 struct page_counter *counter;
65637327 struct mem_cgroup *memcg;
65647328 unsigned short oldid;
65657329
6566
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7330
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
65677331 return 0;
65687332
65697333 memcg = page->mem_cgroup;
....@@ -6579,7 +7343,7 @@
65797343
65807344 memcg = mem_cgroup_id_get_online(memcg);
65817345
6582
- if (!mem_cgroup_is_root(memcg) &&
7346
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
65837347 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
65847348 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
65857349 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
....@@ -6598,23 +7362,20 @@
65987362 }
65997363
66007364 /**
6601
- * mem_cgroup_uncharge_swap - uncharge swap space
7365
+ * __mem_cgroup_uncharge_swap - uncharge swap space
66027366 * @entry: swap entry to uncharge
66037367 * @nr_pages: the amount of swap space to uncharge
66047368 */
6605
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7369
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
66067370 {
66077371 struct mem_cgroup *memcg;
66087372 unsigned short id;
6609
-
6610
- if (!do_swap_account)
6611
- return;
66127373
66137374 id = swap_cgroup_record(entry, 0, nr_pages);
66147375 rcu_read_lock();
66157376 memcg = mem_cgroup_from_id(id);
66167377 if (memcg) {
6617
- if (!mem_cgroup_is_root(memcg)) {
7378
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
66187379 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
66197380 page_counter_uncharge(&memcg->swap, nr_pages);
66207381 else
....@@ -6630,7 +7391,7 @@
66307391 {
66317392 long nr_swap_pages = get_nr_swap_pages();
66327393
6633
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7394
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66347395 return nr_swap_pages;
66357396 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
66367397 nr_swap_pages = min_t(long, nr_swap_pages,
....@@ -6647,36 +7408,33 @@
66477408
66487409 if (vm_swap_full())
66497410 return true;
6650
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7411
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66517412 return false;
66527413
66537414 memcg = page->mem_cgroup;
66547415 if (!memcg)
66557416 return false;
66567417
6657
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6658
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7418
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7419
+ unsigned long usage = page_counter_read(&memcg->swap);
7420
+
7421
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7422
+ usage * 2 >= READ_ONCE(memcg->swap.max))
66597423 return true;
7424
+ }
66607425
66617426 return false;
66627427 }
66637428
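The rewritten loop above treats a cgroup's swap as effectively full once usage reaches half of either swap.high or swap.max, the per-cgroup counterpart of the global vm_swap_full() check at the top of the function. A standalone restatement of that half-of-limit test (hypothetical helper, values in pages as in the kernel):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mirror of the "usage * 2 >= limit" test used above. */
static bool swap_half_full(unsigned long usage, unsigned long high,
			   unsigned long max)
{
	return usage * 2 >= high || usage * 2 >= max;
}

int main(void)
{
	/* With swap.max at 1024 pages and swap.high left very large,
	 * the check trips once 512 pages of swap are in use. */
	printf("%d\n", swap_half_full(512, ~0UL, 1024));	/* prints 1 */
	printf("%d\n", swap_half_full(511, ~0UL, 1024));	/* prints 0 */
	return 0;
}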
6664
-/* for remember boot option*/
6665
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
6666
-static int really_do_swap_account __initdata = 1;
6667
-#else
6668
-static int really_do_swap_account __initdata;
6669
-#endif
6670
-
6671
-static int __init enable_swap_account(char *s)
7429
+static int __init setup_swap_account(char *s)
66727430 {
66737431 if (!strcmp(s, "1"))
6674
- really_do_swap_account = 1;
7432
+ cgroup_memory_noswap = 0;
66757433 else if (!strcmp(s, "0"))
6676
- really_do_swap_account = 0;
7434
+ cgroup_memory_noswap = 1;
66777435 return 1;
66787436 }
6679
-__setup("swapaccount=", enable_swap_account);
7437
+__setup("swapaccount=", setup_swap_account);
66807438
66817439 static u64 swap_current_read(struct cgroup_subsys_state *css,
66827440 struct cftype *cft)
....@@ -6686,17 +7444,33 @@
66867444 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
66877445 }
66887446
7447
+static int swap_high_show(struct seq_file *m, void *v)
7448
+{
7449
+ return seq_puts_memcg_tunable(m,
7450
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7451
+}
7452
+
7453
+static ssize_t swap_high_write(struct kernfs_open_file *of,
7454
+ char *buf, size_t nbytes, loff_t off)
7455
+{
7456
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7457
+ unsigned long high;
7458
+ int err;
7459
+
7460
+ buf = strstrip(buf);
7461
+ err = page_counter_memparse(buf, "max", &high);
7462
+ if (err)
7463
+ return err;
7464
+
7465
+ page_counter_set_high(&memcg->swap, high);
7466
+
7467
+ return nbytes;
7468
+}
7469
+
66897470 static int swap_max_show(struct seq_file *m, void *v)
66907471 {
6691
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6692
- unsigned long max = READ_ONCE(memcg->swap.max);
6693
-
6694
- if (max == PAGE_COUNTER_MAX)
6695
- seq_puts(m, "max\n");
6696
- else
6697
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6698
-
6699
- return 0;
7472
+ return seq_puts_memcg_tunable(m,
7473
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
67007474 }
67017475
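swap_high_show() and swap_high_write() above back a new cgroup v2 interface file, memory.swap.high: page_counter_memparse() accepts a byte value (with an optional K/M/G suffix) or the literal "max", and CFTYPE_NOT_ON_ROOT keeps the file out of the root cgroup. A hedged userspace sketch that sets a 256 MiB threshold, where the cgroup path is an assumption:

/* Sketch only: adjust the path to your cgroup v2 mount point and group. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/memory.swap.high";
	const char *value = "256M\n";	/* writing "max" clears the threshold */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, value, strlen(value)) != (ssize_t)strlen(value))
		perror("write");
	close(fd);
	return 0;
}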
67027476 static ssize_t swap_max_write(struct kernfs_open_file *of,
....@@ -6718,8 +7492,10 @@
67187492
67197493 static int swap_events_show(struct seq_file *m, void *v)
67207494 {
6721
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
7495
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
67227496
7497
+ seq_printf(m, "high %lu\n",
7498
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
67237499 seq_printf(m, "max %lu\n",
67247500 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
67257501 seq_printf(m, "fail %lu\n",
....@@ -6733,6 +7509,12 @@
67337509 .name = "swap.current",
67347510 .flags = CFTYPE_NOT_ON_ROOT,
67357511 .read_u64 = swap_current_read,
7512
+ },
7513
+ {
7514
+ .name = "swap.high",
7515
+ .flags = CFTYPE_NOT_ON_ROOT,
7516
+ .seq_show = swap_high_show,
7517
+ .write = swap_high_write,
67367518 },
67377519 {
67387520 .name = "swap.max",
....@@ -6749,7 +7531,7 @@
67497531 { } /* terminate */
67507532 };
67517533
6752
-static struct cftype memsw_cgroup_files[] = {
7534
+static struct cftype memsw_files[] = {
67537535 {
67547536 .name = "memsw.usage_in_bytes",
67557537 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
....@@ -6776,17 +7558,27 @@
67767558 { }, /* terminate */
67777559 };
67787560
7561
+/*
7562
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7563
+ * instead of a core_initcall(), cgroup_memory_noswap would remain
7564
+ * false even when memcg is disabled via the "cgroup_disable=memory"
7565
+ * boot parameter. This may result in a premature OOPS inside
7566
+ * mem_cgroup_get_nr_swap_pages() in corner cases.
7567
+ */
67797568 static int __init mem_cgroup_swap_init(void)
67807569 {
6781
- if (!mem_cgroup_disabled() && really_do_swap_account) {
6782
- do_swap_account = 1;
6783
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6784
- swap_files));
6785
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6786
- memsw_cgroup_files));
6787
- }
7570
+ /* No memory control -> no swap control */
7571
+ if (mem_cgroup_disabled())
7572
+ cgroup_memory_noswap = true;
7573
+
7574
+ if (cgroup_memory_noswap)
7575
+ return 0;
7576
+
7577
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7578
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7579
+
67887580 return 0;
67897581 }
6790
-subsys_initcall(mem_cgroup_swap_init);
7582
+core_initcall(mem_cgroup_swap_init);
67917583
67927584 #endif /* CONFIG_MEMCG_SWAP */
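Because swap_events_show() now emits a "high" line next to "max" and "fail", memory.swap.events reports how many times a group's swap usage went over its memory.swap.high threshold. A hedged sketch that dumps the file for a hypothetical group:

/* Sketch only: the path assumes a cgroup v2 hierarchy with the memory
 * controller enabled; adjust to your mount point and group. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/memory.swap.events";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expect "high <n>", "max <n>" and "fail <n>" lines. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}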