2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/memcontrol.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /* memcontrol.c - Memory Controller
23 *
34 * Copyright IBM Corporation, 2007
....@@ -19,26 +20,17 @@
1920 * Lockless page tracking & accounting
2021 * Unified hierarchy configuration model
2122 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22
- *
23
- * This program is free software; you can redistribute it and/or modify
24
- * it under the terms of the GNU General Public License as published by
25
- * the Free Software Foundation; either version 2 of the License, or
26
- * (at your option) any later version.
27
- *
28
- * This program is distributed in the hope that it will be useful,
29
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
30
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31
- * GNU General Public License for more details.
3223 */
3324
3425 #include <linux/page_counter.h>
3526 #include <linux/memcontrol.h>
3627 #include <linux/cgroup.h>
37
-#include <linux/mm.h>
28
+#include <linux/pagewalk.h>
3829 #include <linux/sched/mm.h>
3930 #include <linux/shmem_fs.h>
4031 #include <linux/hugetlb.h>
4132 #include <linux/pagemap.h>
33
+#include <linux/vm_event_item.h>
4234 #include <linux/smp.h>
4335 #include <linux/page-flags.h>
4436 #include <linux/backing-dev.h>
....@@ -65,22 +57,26 @@
6557 #include <linux/lockdep.h>
6658 #include <linux/file.h>
6759 #include <linux/tracehook.h>
60
+#include <linux/psi.h>
61
+#include <linux/seq_buf.h>
6862 #include "internal.h"
6963 #include <net/sock.h>
7064 #include <net/ip.h>
7165 #include "slab.h"
72
-#include <linux/locallock.h>
66
+#include <linux/local_lock.h>
7367
7468 #include <linux/uaccess.h>
7569
7670 #include <trace/events/vmscan.h>
71
+#include <trace/hooks/mm.h>
7772
7873 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
7974 EXPORT_SYMBOL(memory_cgrp_subsys);
8075
8176 struct mem_cgroup *root_mem_cgroup __read_mostly;
8277
83
-#define MEM_CGROUP_RECLAIM_RETRIES 5
78
+/* Active memory cgroup to use from an interrupt context */
79
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
8480
8581 /* Socket memory accounting disabled? */
8682 static bool cgroup_memory_nosocket;
....@@ -90,30 +86,30 @@
9086
9187 /* Whether the swap controller is active */
9288 #ifdef CONFIG_MEMCG_SWAP
93
-int do_swap_account __read_mostly;
89
+bool cgroup_memory_noswap __read_mostly;
9490 #else
95
-#define do_swap_account 0
91
+#define cgroup_memory_noswap 1
9692 #endif
9793
98
-static DEFINE_LOCAL_IRQ_LOCK(event_lock);
94
+#ifdef CONFIG_CGROUP_WRITEBACK
95
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
96
+#endif
97
+
98
+struct event_lock {
99
+ local_lock_t l;
100
+};
101
+static DEFINE_PER_CPU(struct event_lock, event_lock) = {
102
+ .l = INIT_LOCAL_LOCK(l),
103
+};
99104
100105 /* Whether legacy memory+swap accounting is active */
101106 static bool do_memsw_account(void)
102107 {
103
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
108
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
104109 }
105
-
106
-static const char *const mem_cgroup_lru_names[] = {
107
- "inactive_anon",
108
- "active_anon",
109
- "inactive_file",
110
- "active_file",
111
- "unevictable",
112
-};
113110
114111 #define THRESHOLDS_EVENTS_TARGET 128
115112 #define SOFTLIMIT_EVENTS_TARGET 1024
116
-#define NUMAINFO_EVENTS_TARGET 1024
117113
118114 /*
119115 * Cgroups above their limits are maintained in a RB-Tree, independent of
....@@ -213,14 +209,6 @@
213209 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
214210 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
215211
216
-enum charge_type {
217
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
218
- MEM_CGROUP_CHARGE_TYPE_ANON,
219
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
220
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
221
- NR_CHARGE_TYPE,
222
-};
223
-
224212 /* for encoding cft->private value on file */
225213 enum res_type {
226214 _MEM,
....@@ -251,7 +239,7 @@
251239 iter != NULL; \
252240 iter = mem_cgroup_iter(NULL, iter, NULL))
253241
254
-static inline bool should_force_charge(void)
242
+static inline bool task_is_dying(void)
255243 {
256244 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
257245 (current->flags & PF_EXITING);
....@@ -271,8 +259,100 @@
271259 }
272260
273261 #ifdef CONFIG_MEMCG_KMEM
262
+static DEFINE_SPINLOCK(objcg_lock);
263
+
264
+static void obj_cgroup_release(struct percpu_ref *ref)
265
+{
266
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
267
+ struct mem_cgroup *memcg;
268
+ unsigned int nr_bytes;
269
+ unsigned int nr_pages;
270
+ unsigned long flags;
271
+
272
+ /*
273
+ * At this point all allocated objects are freed, and
274
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
275
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
276
+ *
277
+ * The following sequence can lead to it:
278
+ * 1) CPU0: objcg == stock->cached_objcg
279
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
280
+ * PAGE_SIZE bytes are charged
281
+ * 3) CPU1: a process from another memcg is allocating something,
282
+ * the stock is flushed,
283
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
284
+ * 4) CPU0: we release this object,
285
+ * 92 bytes are added to stock->nr_bytes
286
+ * 5) CPU0: stock is flushed,
287
+ * 92 bytes are added to objcg->nr_charged_bytes
288
+ *
289
+ * As a result, nr_charged_bytes == PAGE_SIZE.
290
+ * This page will be uncharged in obj_cgroup_release().
291
+ */
292
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
293
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
294
+ nr_pages = nr_bytes >> PAGE_SHIFT;
295
+
296
+ spin_lock_irqsave(&objcg_lock, flags);
297
+ memcg = obj_cgroup_memcg(objcg);
298
+ if (nr_pages)
299
+ __memcg_kmem_uncharge(memcg, nr_pages);
300
+ list_del(&objcg->list);
301
+ mem_cgroup_put(memcg);
302
+ spin_unlock_irqrestore(&objcg_lock, flags);
303
+
304
+ percpu_ref_exit(ref);
305
+ kfree_rcu(objcg, rcu);
306
+}
307
+
308
+static struct obj_cgroup *obj_cgroup_alloc(void)
309
+{
310
+ struct obj_cgroup *objcg;
311
+ int ret;
312
+
313
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
314
+ if (!objcg)
315
+ return NULL;
316
+
317
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
318
+ GFP_KERNEL);
319
+ if (ret) {
320
+ kfree(objcg);
321
+ return NULL;
322
+ }
323
+ INIT_LIST_HEAD(&objcg->list);
324
+ return objcg;
325
+}
326
+
327
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
328
+ struct mem_cgroup *parent)
329
+{
330
+ struct obj_cgroup *objcg, *iter;
331
+
332
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
333
+
334
+ spin_lock_irq(&objcg_lock);
335
+
336
+ /* Move active objcg to the parent's list */
337
+ xchg(&objcg->memcg, parent);
338
+ css_get(&parent->css);
339
+ list_add(&objcg->list, &parent->objcg_list);
340
+
341
+ /* Move already reparented objcgs to the parent's list */
342
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
343
+ css_get(&parent->css);
344
+ xchg(&iter->memcg, parent);
345
+ css_put(&memcg->css);
346
+ }
347
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
348
+
349
+ spin_unlock_irq(&objcg_lock);
350
+
351
+ percpu_ref_kill(&objcg->refcnt);
352
+}
353
+
274354 /*
275
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
355
+ * This will be used as a shrinker list's index.
276356 * The main reason for not using cgroup id for this:
277357 * this works better in sparse environments, where we have a lot of memcgs,
278358 * but only a few kmem-limited. Or also, if we have, for instance, 200
....@@ -315,14 +395,13 @@
315395
316396 /*
317397 * A lot of the calls to the cache allocation functions are expected to be
318
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
398
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
319399 * conditional to this static branch, we'll have to allow modules that do
320400 * kmem_cache_alloc and the like to see this symbol as well
321401 */
322402 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
323403 EXPORT_SYMBOL(memcg_kmem_enabled_key);
324
-
325
-struct workqueue_struct *memcg_kmem_cache_wq;
404
+#endif
326405
327406 static int memcg_shrinker_map_size;
328407 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
....@@ -347,7 +426,7 @@
347426 if (!old)
348427 return 0;
349428
350
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
429
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
351430 if (!new)
352431 return -ENOMEM;
353432
....@@ -391,7 +470,7 @@
391470 mutex_lock(&memcg_shrinker_map_mutex);
392471 size = memcg_shrinker_map_size;
393472 for_each_node(nid) {
394
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
473
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
395474 if (!map) {
396475 memcg_free_shrinker_maps(memcg);
397476 ret = -ENOMEM;
....@@ -448,14 +527,6 @@
448527 }
449528 }
450529
451
-#else /* CONFIG_MEMCG_KMEM */
452
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
453
-{
454
- return 0;
455
-}
456
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
457
-#endif /* CONFIG_MEMCG_KMEM */
458
-
459530 /**
460531 * mem_cgroup_css_from_page - css of the memcg associated with a page
461532 * @page: page of interest
....@@ -498,7 +569,17 @@
498569 unsigned long ino = 0;
499570
500571 rcu_read_lock();
501
- memcg = READ_ONCE(page->mem_cgroup);
572
+ memcg = page->mem_cgroup;
573
+
574
+ /*
575
+ * The lowest bit set means that memcg isn't a valid
576
+ * memcg pointer, but an obj_cgroups pointer.
577
+ * In this case the page is shared and doesn't belong
578
+ * to any specific memory cgroup.
579
+ */
580
+ if ((unsigned long) memcg & 0x1UL)
581
+ memcg = NULL;
582
+
502583 while (memcg && !(memcg->css.flags & CSS_ONLINE))
503584 memcg = parent_mem_cgroup(memcg);
504585 if (memcg)
....@@ -674,7 +755,7 @@
674755 */
675756 __mem_cgroup_remove_exceeded(mz, mctz);
676757 if (!soft_limit_excess(mz->memcg) ||
677
- !css_tryget_online(&mz->memcg->css))
758
+ !css_tryget(&mz->memcg->css))
678759 goto retry;
679760 done:
680761 return mz;
....@@ -691,33 +772,187 @@
691772 return mz;
692773 }
693774
694
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
695
- int event)
775
+/**
776
+ * __mod_memcg_state - update cgroup memory statistics
777
+ * @memcg: the memory cgroup
778
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
779
+ * @val: delta to add to the counter, can be negative
780
+ */
781
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
696782 {
697
- return atomic_long_read(&memcg->events[event]);
783
+ long x, threshold = MEMCG_CHARGE_BATCH;
784
+
785
+ if (mem_cgroup_disabled())
786
+ return;
787
+
788
+ if (memcg_stat_item_in_bytes(idx))
789
+ threshold <<= PAGE_SHIFT;
790
+
791
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
792
+ if (unlikely(abs(x) > threshold)) {
793
+ struct mem_cgroup *mi;
794
+
795
+ /*
796
+ * Batch local counters to keep them in sync with
797
+ * the hierarchical ones.
798
+ */
799
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
800
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
801
+ atomic_long_add(x, &mi->vmstats[idx]);
802
+ x = 0;
803
+ }
804
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
805
+}
806
+
807
+static struct mem_cgroup_per_node *
808
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
809
+{
810
+ struct mem_cgroup *parent;
811
+
812
+ parent = parent_mem_cgroup(pn->memcg);
813
+ if (!parent)
814
+ return NULL;
815
+ return mem_cgroup_nodeinfo(parent, nid);
816
+}
817
+
818
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
819
+ int val)
820
+{
821
+ struct mem_cgroup_per_node *pn;
822
+ struct mem_cgroup *memcg;
823
+ long x, threshold = MEMCG_CHARGE_BATCH;
824
+
825
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
826
+ memcg = pn->memcg;
827
+
828
+ preempt_disable_rt();
829
+ /* Update memcg */
830
+ __mod_memcg_state(memcg, idx, val);
831
+
832
+ /* Update lruvec */
833
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
834
+
835
+ if (vmstat_item_in_bytes(idx))
836
+ threshold <<= PAGE_SHIFT;
837
+
838
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
839
+ if (unlikely(abs(x) > threshold)) {
840
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
841
+ struct mem_cgroup_per_node *pi;
842
+
843
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
844
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
845
+ x = 0;
846
+ }
847
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
848
+ preempt_enable_rt();
849
+}
850
+
851
+/**
852
+ * __mod_lruvec_state - update lruvec memory statistics
853
+ * @lruvec: the lruvec
854
+ * @idx: the stat item
855
+ * @val: delta to add to the counter, can be negative
856
+ *
857
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
858
+ * function updates all three counters that are affected by a
859
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
860
+ */
861
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
862
+ int val)
863
+{
864
+ /* Update node */
865
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
866
+
867
+ /* Update memcg and lruvec */
868
+ if (!mem_cgroup_disabled())
869
+ __mod_memcg_lruvec_state(lruvec, idx, val);
870
+}
871
+
872
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
873
+{
874
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
875
+ struct mem_cgroup *memcg;
876
+ struct lruvec *lruvec;
877
+
878
+ rcu_read_lock();
879
+ memcg = mem_cgroup_from_obj(p);
880
+
881
+ /*
882
+ * Untracked pages have no memcg, no lruvec. Update only the
883
+ * node. If we reparent the slab objects to the root memcg,
884
+ * when we free the slab object, we need to update the per-memcg
885
+ * vmstats to keep it correct for the root memcg.
886
+ */
887
+ if (!memcg) {
888
+ __mod_node_page_state(pgdat, idx, val);
889
+ } else {
890
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
891
+ __mod_lruvec_state(lruvec, idx, val);
892
+ }
893
+ rcu_read_unlock();
894
+}
895
+
896
+void mod_memcg_obj_state(void *p, int idx, int val)
897
+{
898
+ struct mem_cgroup *memcg;
899
+
900
+ rcu_read_lock();
901
+ memcg = mem_cgroup_from_obj(p);
902
+ if (memcg)
903
+ mod_memcg_state(memcg, idx, val);
904
+ rcu_read_unlock();
905
+}
906
+
907
+/**
908
+ * __count_memcg_events - account VM events in a cgroup
909
+ * @memcg: the memory cgroup
910
+ * @idx: the event item
911
+ * @count: the number of events that occurred
912
+ */
913
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
914
+ unsigned long count)
915
+{
916
+ unsigned long x;
917
+
918
+ if (mem_cgroup_disabled())
919
+ return;
920
+
921
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
922
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
923
+ struct mem_cgroup *mi;
924
+
925
+ /*
926
+ * Batch local counters to keep them in sync with
927
+ * the hierarchical ones.
928
+ */
929
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
930
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
931
+ atomic_long_add(x, &mi->vmevents[idx]);
932
+ x = 0;
933
+ }
934
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
935
+}
936
+
937
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
938
+{
939
+ return atomic_long_read(&memcg->vmevents[event]);
940
+}
941
+
942
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
943
+{
944
+ long x = 0;
945
+ int cpu;
946
+
947
+ for_each_possible_cpu(cpu)
948
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
949
+ return x;
698950 }
699951
700952 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
701953 struct page *page,
702
- bool compound, int nr_pages)
954
+ int nr_pages)
703955 {
704
- /*
705
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
706
- * counted as CACHE even if it's on ANON LRU.
707
- */
708
- if (PageAnon(page))
709
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
710
- else {
711
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
712
- if (PageSwapBacked(page))
713
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
714
- }
715
-
716
- if (compound) {
717
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
718
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
719
- }
720
-
721956 /* pagein of a big page is an event. So, ignore page size */
722957 if (nr_pages > 0)
723958 __count_memcg_events(memcg, PGPGIN, 1);
....@@ -726,35 +961,7 @@
726961 nr_pages = -nr_pages; /* for event */
727962 }
728963
729
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
730
-}
731
-
732
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
733
- int nid, unsigned int lru_mask)
734
-{
735
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
736
- unsigned long nr = 0;
737
- enum lru_list lru;
738
-
739
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
740
-
741
- for_each_lru(lru) {
742
- if (!(BIT(lru) & lru_mask))
743
- continue;
744
- nr += mem_cgroup_get_lru_size(lruvec, lru);
745
- }
746
- return nr;
747
-}
748
-
749
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
750
- unsigned int lru_mask)
751
-{
752
- unsigned long nr = 0;
753
- int nid;
754
-
755
- for_each_node_state(nid, N_MEMORY)
756
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
757
- return nr;
964
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
758965 }
759966
760967 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -762,8 +969,8 @@
762969 {
763970 unsigned long val, next;
764971
765
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
766
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
972
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
973
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
767974 /* from time_after() in jiffies.h */
768975 if ((long)(next - val) < 0) {
769976 switch (target) {
....@@ -773,13 +980,10 @@
773980 case MEM_CGROUP_TARGET_SOFTLIMIT:
774981 next = val + SOFTLIMIT_EVENTS_TARGET;
775982 break;
776
- case MEM_CGROUP_TARGET_NUMAINFO:
777
- next = val + NUMAINFO_EVENTS_TARGET;
778
- break;
779983 default:
780984 break;
781985 }
782
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
986
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
783987 return true;
784988 }
785989 return false;
....@@ -795,21 +999,12 @@
795999 if (unlikely(mem_cgroup_event_ratelimit(memcg,
7961000 MEM_CGROUP_TARGET_THRESH))) {
7971001 bool do_softlimit;
798
- bool do_numainfo __maybe_unused;
7991002
8001003 do_softlimit = mem_cgroup_event_ratelimit(memcg,
8011004 MEM_CGROUP_TARGET_SOFTLIMIT);
802
-#if MAX_NUMNODES > 1
803
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
804
- MEM_CGROUP_TARGET_NUMAINFO);
805
-#endif
8061005 mem_cgroup_threshold(memcg);
8071006 if (unlikely(do_softlimit))
8081007 mem_cgroup_update_tree(memcg, page);
809
-#if MAX_NUMNODES > 1
810
- if (unlikely(do_numainfo))
811
- atomic_inc(&memcg->numainfo_events);
812
-#endif
8131008 }
8141009 }
8151010
....@@ -877,27 +1072,60 @@
8771072 return NULL;
8781073
8791074 rcu_read_lock();
880
- if (!memcg || !css_tryget_online(&memcg->css))
1075
+ /* Page should not get uncharged and freed memcg under us. */
1076
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8811077 memcg = root_mem_cgroup;
8821078 rcu_read_unlock();
8831079 return memcg;
8841080 }
8851081 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8861082
1083
+static __always_inline struct mem_cgroup *active_memcg(void)
1084
+{
1085
+ if (in_interrupt())
1086
+ return this_cpu_read(int_active_memcg);
1087
+ else
1088
+ return current->active_memcg;
1089
+}
1090
+
1091
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1092
+{
1093
+ struct mem_cgroup *memcg;
1094
+
1095
+ rcu_read_lock();
1096
+ memcg = active_memcg();
1097
+ /* remote memcg must hold a ref. */
1098
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1099
+ memcg = root_mem_cgroup;
1100
+ rcu_read_unlock();
1101
+
1102
+ return memcg;
1103
+}
1104
+
1105
+static __always_inline bool memcg_kmem_bypass(void)
1106
+{
1107
+ /* Allow remote memcg charging from any context. */
1108
+ if (unlikely(active_memcg()))
1109
+ return false;
1110
+
1111
+ /* Memcg to charge can't be determined. */
1112
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1113
+ return true;
1114
+
1115
+ return false;
1116
+}
1117
+
8871118 /**
888
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1119
+ * If active memcg is set, do not fall back to current->mm->memcg.
8891120 */
8901121 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8911122 {
892
- if (unlikely(current->active_memcg)) {
893
- struct mem_cgroup *memcg = root_mem_cgroup;
1123
+ if (memcg_kmem_bypass())
1124
+ return NULL;
8941125
895
- rcu_read_lock();
896
- if (css_tryget_online(&current->active_memcg->css))
897
- memcg = current->active_memcg;
898
- rcu_read_unlock();
899
- return memcg;
900
- }
1126
+ if (unlikely(active_memcg()))
1127
+ return get_active_memcg();
1128
+
9011129 return get_mem_cgroup_from_mm(current->mm);
9021130 }
9031131
....@@ -914,15 +1142,15 @@
9141142 * invocations for reference counting, or use mem_cgroup_iter_break()
9151143 * to cancel a hierarchy walk before the round-trip is complete.
9161144 *
917
- * Reclaimers can specify a node and a priority level in @reclaim to
918
- * divide up the memcgs in the hierarchy among all concurrent
919
- * reclaimers operating on the same node and priority.
1145
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1146
+ * in the hierarchy among all concurrent reclaimers operating on the
1147
+ * same node.
9201148 */
9211149 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9221150 struct mem_cgroup *prev,
9231151 struct mem_cgroup_reclaim_cookie *reclaim)
9241152 {
925
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1153
+ struct mem_cgroup_reclaim_iter *iter;
9261154 struct cgroup_subsys_state *css = NULL;
9271155 struct mem_cgroup *memcg = NULL;
9281156 struct mem_cgroup *pos = NULL;
....@@ -948,7 +1176,7 @@
9481176 struct mem_cgroup_per_node *mz;
9491177
9501178 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
951
- iter = &mz->iter[reclaim->priority];
1179
+ iter = &mz->iter;
9521180
9531181 if (prev && reclaim->generation != iter->generation)
9541182 goto out_unlock;
....@@ -1048,15 +1276,11 @@
10481276 struct mem_cgroup_reclaim_iter *iter;
10491277 struct mem_cgroup_per_node *mz;
10501278 int nid;
1051
- int i;
10521279
10531280 for_each_node(nid) {
10541281 mz = mem_cgroup_nodeinfo(from, nid);
1055
- for (i = 0; i <= DEF_PRIORITY; i++) {
1056
- iter = &mz->iter[i];
1057
- cmpxchg(&iter->position,
1058
- dead_memcg, NULL);
1059
- }
1282
+ iter = &mz->iter;
1283
+ cmpxchg(&iter->position, dead_memcg, NULL);
10601284 }
10611285 }
10621286
....@@ -1106,7 +1330,7 @@
11061330 struct css_task_iter it;
11071331 struct task_struct *task;
11081332
1109
- css_task_iter_start(&iter->css, 0, &it);
1333
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11101334 while (!ret && (task = css_task_iter_next(&it)))
11111335 ret = fn(task, arg);
11121336 css_task_iter_end(&it);
....@@ -1123,9 +1347,8 @@
11231347 * @page: the page
11241348 * @pgdat: pgdat of the page
11251349 *
1126
- * This function is only safe when following the LRU page isolation
1127
- * and putback protocol: the LRU lock must be held, and the page must
1128
- * either be PageLRU() or the caller must have isolated/allocated it.
1350
+ * This function relies on page->mem_cgroup being stable - see the
1351
+ * access rules in commit_charge().
11291352 */
11301353 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11311354 {
....@@ -1134,7 +1357,7 @@
11341357 struct lruvec *lruvec;
11351358
11361359 if (mem_cgroup_disabled()) {
1137
- lruvec = &pgdat->lruvec;
1360
+ lruvec = &pgdat->__lruvec;
11381361 goto out;
11391362 }
11401363
....@@ -1158,6 +1381,38 @@
11581381 lruvec->pgdat = pgdat;
11591382 return lruvec;
11601383 }
1384
+
1385
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1386
+{
1387
+ struct lruvec *lruvec;
1388
+
1389
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1390
+
1391
+ return lruvec;
1392
+}
1393
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1394
+
1395
+void do_traversal_all_lruvec(void)
1396
+{
1397
+ pg_data_t *pgdat;
1398
+
1399
+ for_each_online_pgdat(pgdat) {
1400
+ struct mem_cgroup *memcg = NULL;
1401
+
1402
+ spin_lock_irq(&pgdat->lru_lock);
1403
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1404
+ do {
1405
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1406
+
1407
+ trace_android_vh_do_traversal_lruvec(lruvec);
1408
+
1409
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1410
+ } while (memcg);
1411
+
1412
+ spin_unlock_irq(&pgdat->lru_lock);
1413
+ }
1414
+}
1415
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11611416
11621417 /**
11631418 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1198,32 +1453,6 @@
11981453 *lru_size += nr_pages;
11991454 }
12001455
1201
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1202
-{
1203
- struct mem_cgroup *task_memcg;
1204
- struct task_struct *p;
1205
- bool ret;
1206
-
1207
- p = find_lock_task_mm(task);
1208
- if (p) {
1209
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1210
- task_unlock(p);
1211
- } else {
1212
- /*
1213
- * All threads may have already detached their mm's, but the oom
1214
- * killer still needs to detect if they have already been oom
1215
- * killed to prevent needlessly killing additional tasks.
1216
- */
1217
- rcu_read_lock();
1218
- task_memcg = mem_cgroup_from_task(task);
1219
- css_get(&task_memcg->css);
1220
- rcu_read_unlock();
1221
- }
1222
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1223
- css_put(&task_memcg->css);
1224
- return ret;
1225
-}
1226
-
12271456 /**
12281457 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
12291458 * @memcg: the memory cgroup
....@@ -1245,7 +1474,7 @@
12451474 if (do_memsw_account()) {
12461475 count = page_counter_read(&memcg->memsw);
12471476 limit = READ_ONCE(memcg->memsw.max);
1248
- if (count <= limit)
1477
+ if (count < limit)
12491478 margin = min(margin, limit - count);
12501479 else
12511480 margin = 0;
....@@ -1299,85 +1528,199 @@
12991528 return false;
13001529 }
13011530
1302
-static const unsigned int memcg1_stats[] = {
1303
- MEMCG_CACHE,
1304
- MEMCG_RSS,
1305
- MEMCG_RSS_HUGE,
1306
- NR_SHMEM,
1307
- NR_FILE_MAPPED,
1308
- NR_FILE_DIRTY,
1309
- NR_WRITEBACK,
1310
- MEMCG_SWAP,
1531
+struct memory_stat {
1532
+ const char *name;
1533
+ unsigned int ratio;
1534
+ unsigned int idx;
13111535 };
13121536
1313
-static const char *const memcg1_stat_names[] = {
1314
- "cache",
1315
- "rss",
1316
- "rss_huge",
1317
- "shmem",
1318
- "mapped_file",
1319
- "dirty",
1320
- "writeback",
1321
- "swap",
1537
+static struct memory_stat memory_stats[] = {
1538
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1539
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1540
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1541
+ { "percpu", 1, MEMCG_PERCPU_B },
1542
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1543
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1544
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1545
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1546
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1547
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1548
+ /*
1549
+ * The ratio will be initialized in memory_stats_init(). Because
1550
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1551
+ * constant(e.g. powerpc).
1552
+ */
1553
+ { "anon_thp", 0, NR_ANON_THPS },
1554
+#endif
1555
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1556
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1557
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1558
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1559
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1560
+
1561
+ /*
1562
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1563
+ * together and slab_reclaimable must be in front.
1564
+ */
1565
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1566
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1567
+
1568
+ /* The memory events */
1569
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1570
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1571
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1572
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1573
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1574
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1575
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13221576 };
1577
+
1578
+static int __init memory_stats_init(void)
1579
+{
1580
+ int i;
1581
+
1582
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1583
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1584
+ if (memory_stats[i].idx == NR_ANON_THPS)
1585
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1586
+#endif
1587
+ VM_BUG_ON(!memory_stats[i].ratio);
1588
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1589
+ }
1590
+
1591
+ return 0;
1592
+}
1593
+pure_initcall(memory_stats_init);
1594
+
1595
+static char *memory_stat_format(struct mem_cgroup *memcg)
1596
+{
1597
+ struct seq_buf s;
1598
+ int i;
1599
+
1600
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1601
+ if (!s.buffer)
1602
+ return NULL;
1603
+
1604
+ /*
1605
+ * Provide statistics on the state of the memory subsystem as
1606
+ * well as cumulative event counters that show past behavior.
1607
+ *
1608
+ * This list is ordered following a combination of these gradients:
1609
+ * 1) generic big picture -> specifics and details
1610
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1611
+ *
1612
+ * Current memory state:
1613
+ */
1614
+
1615
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1616
+ u64 size;
1617
+
1618
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1619
+ size *= memory_stats[i].ratio;
1620
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1621
+
1622
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1623
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1624
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1625
+ seq_buf_printf(&s, "slab %llu\n", size);
1626
+ }
1627
+ }
1628
+
1629
+ /* Accumulated memory events */
1630
+
1631
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1632
+ memcg_events(memcg, PGFAULT));
1633
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1634
+ memcg_events(memcg, PGMAJFAULT));
1635
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1636
+ memcg_events(memcg, PGREFILL));
1637
+ seq_buf_printf(&s, "pgscan %lu\n",
1638
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1639
+ memcg_events(memcg, PGSCAN_DIRECT));
1640
+ seq_buf_printf(&s, "pgsteal %lu\n",
1641
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1642
+ memcg_events(memcg, PGSTEAL_DIRECT));
1643
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1644
+ memcg_events(memcg, PGACTIVATE));
1645
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1646
+ memcg_events(memcg, PGDEACTIVATE));
1647
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1648
+ memcg_events(memcg, PGLAZYFREE));
1649
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1650
+ memcg_events(memcg, PGLAZYFREED));
1651
+
1652
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1653
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1654
+ memcg_events(memcg, THP_FAULT_ALLOC));
1655
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1656
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1657
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1658
+
1659
+ /* The above should easily fit into one page */
1660
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1661
+
1662
+ return s.buffer;
1663
+}
13231664
13241665 #define K(x) ((x) << (PAGE_SHIFT-10))
13251666 /**
1326
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1667
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1668
+ * memory controller.
13271669 * @memcg: The memory cgroup that went over limit
13281670 * @p: Task that is going to be killed
13291671 *
13301672 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13311673 * enabled
13321674 */
1333
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1675
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13341676 {
1335
- struct mem_cgroup *iter;
1336
- unsigned int i;
1337
-
13381677 rcu_read_lock();
13391678
1679
+ if (memcg) {
1680
+ pr_cont(",oom_memcg=");
1681
+ pr_cont_cgroup_path(memcg->css.cgroup);
1682
+ } else
1683
+ pr_cont(",global_oom");
13401684 if (p) {
1341
- pr_info("Task in ");
1685
+ pr_cont(",task_memcg=");
13421686 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1343
- pr_cont(" killed as a result of limit of ");
1344
- } else {
1345
- pr_info("Memory limit reached of cgroup ");
13461687 }
1347
-
1348
- pr_cont_cgroup_path(memcg->css.cgroup);
1349
- pr_cont("\n");
1350
-
13511688 rcu_read_unlock();
1689
+}
1690
+
1691
+/**
1692
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1693
+ * memory controller.
1694
+ * @memcg: The memory cgroup that went over limit
1695
+ */
1696
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1697
+{
1698
+ char *buf;
13521699
13531700 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13541701 K((u64)page_counter_read(&memcg->memory)),
1355
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1356
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->memsw)),
1358
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1359
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1360
- K((u64)page_counter_read(&memcg->kmem)),
1361
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1362
-
1363
- for_each_mem_cgroup_tree(iter, memcg) {
1364
- pr_info("Memory cgroup stats for ");
1365
- pr_cont_cgroup_path(iter->css.cgroup);
1366
- pr_cont(":");
1367
-
1368
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1369
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1370
- continue;
1371
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1372
- K(memcg_page_state(iter, memcg1_stats[i])));
1373
- }
1374
-
1375
- for (i = 0; i < NR_LRU_LISTS; i++)
1376
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1377
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1378
-
1379
- pr_cont("\n");
1702
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1703
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1704
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->swap)),
1706
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1707
+ else {
1708
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1709
+ K((u64)page_counter_read(&memcg->memsw)),
1710
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1711
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1712
+ K((u64)page_counter_read(&memcg->kmem)),
1713
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13801714 }
1715
+
1716
+ pr_info("Memory cgroup stats for ");
1717
+ pr_cont_cgroup_path(memcg->css.cgroup);
1718
+ pr_cont(":");
1719
+ buf = memory_stat_format(memcg);
1720
+ if (!buf)
1721
+ return;
1722
+ pr_info("%s", buf);
1723
+ kfree(buf);
13811724 }
13821725
13831726 /*
....@@ -1385,19 +1728,26 @@
13851728 */
13861729 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13871730 {
1388
- unsigned long max;
1731
+ unsigned long max = READ_ONCE(memcg->memory.max);
13891732
1390
- max = memcg->memory.max;
1391
- if (mem_cgroup_swappiness(memcg)) {
1392
- unsigned long memsw_max;
1393
- unsigned long swap_max;
1733
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1734
+ if (mem_cgroup_swappiness(memcg))
1735
+ max += min(READ_ONCE(memcg->swap.max),
1736
+ (unsigned long)total_swap_pages);
1737
+ } else { /* v1 */
1738
+ if (mem_cgroup_swappiness(memcg)) {
1739
+ /* Calculate swap excess capacity from memsw limit */
1740
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13941741
1395
- memsw_max = memcg->memsw.max;
1396
- swap_max = memcg->swap.max;
1397
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1398
- max = min(max + swap_max, memsw_max);
1742
+ max += min(swap, (unsigned long)total_swap_pages);
1743
+ }
13991744 }
14001745 return max;
1746
+}
1747
+
1748
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1749
+{
1750
+ return page_counter_read(&memcg->memory);
14011751 }
14021752
14031753 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1410,112 +1760,24 @@
14101760 .gfp_mask = gfp_mask,
14111761 .order = order,
14121762 };
1413
- bool ret;
1763
+ bool ret = true;
14141764
14151765 if (mutex_lock_killable(&oom_lock))
14161766 return true;
1767
+
1768
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1769
+ goto unlock;
1770
+
14171771 /*
14181772 * A few threads which were not waiting at mutex_lock_killable() can
14191773 * fail to bail out. Therefore, check again after holding oom_lock.
14201774 */
1421
- ret = should_force_charge() || out_of_memory(&oc);
1775
+ ret = task_is_dying() || out_of_memory(&oc);
1776
+
1777
+unlock:
14221778 mutex_unlock(&oom_lock);
14231779 return ret;
14241780 }
1425
-
1426
-#if MAX_NUMNODES > 1
1427
-
1428
-/**
1429
- * test_mem_cgroup_node_reclaimable
1430
- * @memcg: the target memcg
1431
- * @nid: the node ID to be checked.
1432
- * @noswap : specify true here if the user wants flle only information.
1433
- *
1434
- * This function returns whether the specified memcg contains any
1435
- * reclaimable pages on a node. Returns true if there are any reclaimable
1436
- * pages in the node.
1437
- */
1438
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1439
- int nid, bool noswap)
1440
-{
1441
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1442
- return true;
1443
- if (noswap || !total_swap_pages)
1444
- return false;
1445
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1446
- return true;
1447
- return false;
1448
-
1449
-}
1450
-
1451
-/*
1452
- * Always updating the nodemask is not very good - even if we have an empty
1453
- * list or the wrong list here, we can start from some node and traverse all
1454
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1455
- *
1456
- */
1457
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1458
-{
1459
- int nid;
1460
- /*
1461
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1462
- * pagein/pageout changes since the last update.
1463
- */
1464
- if (!atomic_read(&memcg->numainfo_events))
1465
- return;
1466
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1467
- return;
1468
-
1469
- /* make a nodemask where this memcg uses memory from */
1470
- memcg->scan_nodes = node_states[N_MEMORY];
1471
-
1472
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1473
-
1474
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1475
- node_clear(nid, memcg->scan_nodes);
1476
- }
1477
-
1478
- atomic_set(&memcg->numainfo_events, 0);
1479
- atomic_set(&memcg->numainfo_updating, 0);
1480
-}
1481
-
1482
-/*
1483
- * Selecting a node where we start reclaim from. Because what we need is just
1484
- * reducing usage counter, start from anywhere is O,K. Considering
1485
- * memory reclaim from current node, there are pros. and cons.
1486
- *
1487
- * Freeing memory from current node means freeing memory from a node which
1488
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1489
- * hit limits, it will see a contention on a node. But freeing from remote
1490
- * node means more costs for memory reclaim because of memory latency.
1491
- *
1492
- * Now, we use round-robin. Better algorithm is welcomed.
1493
- */
1494
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1495
-{
1496
- int node;
1497
-
1498
- mem_cgroup_may_update_nodemask(memcg);
1499
- node = memcg->last_scanned_node;
1500
-
1501
- node = next_node_in(node, memcg->scan_nodes);
1502
- /*
1503
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1504
- * last time it really checked all the LRUs due to rate limiting.
1505
- * Fallback to the current node in that case for simplicity.
1506
- */
1507
- if (unlikely(node == MAX_NUMNODES))
1508
- node = numa_node_id();
1509
-
1510
- memcg->last_scanned_node = node;
1511
- return node;
1512
-}
1513
-#else
1514
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1515
-{
1516
- return 0;
1517
-}
1518
-#endif
15191781
15201782 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15211783 pg_data_t *pgdat,
....@@ -1529,7 +1791,6 @@
15291791 unsigned long nr_scanned;
15301792 struct mem_cgroup_reclaim_cookie reclaim = {
15311793 .pgdat = pgdat,
1532
- .priority = 0,
15331794 };
15341795
15351796 excess = soft_limit_excess(root_memcg);
....@@ -1624,7 +1885,7 @@
16241885 struct mem_cgroup *iter;
16251886
16261887 spin_lock(&memcg_oom_lock);
1627
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1888
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16281889 for_each_mem_cgroup_tree(iter, memcg)
16291890 iter->oom_lock = false;
16301891 spin_unlock(&memcg_oom_lock);
....@@ -1645,8 +1906,8 @@
16451906 struct mem_cgroup *iter;
16461907
16471908 /*
1648
- * When a new child is created while the hierarchy is under oom,
1649
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1909
+ * Be careful about under_oom underflows becase a child memcg
1910
+ * could have been added after mem_cgroup_mark_under_oom.
16501911 */
16511912 spin_lock(&memcg_oom_lock);
16521913 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1706,6 +1967,8 @@
17061967
17071968 if (order > PAGE_ALLOC_COSTLY_ORDER)
17081969 return OOM_SKIPPED;
1970
+
1971
+ memcg_memory_event(memcg, MEMCG_OOM);
17091972
17101973 /*
17111974 * We are in the middle of the charge context here, so we
....@@ -1854,6 +2117,14 @@
18542117 goto out;
18552118
18562119 /*
2120
+ * If the victim task has been asynchronously moved to a different
2121
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2122
+ * In this case it's better to ignore memory.group.oom.
2123
+ */
2124
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2125
+ goto out;
2126
+
2127
+ /*
18572128 * Traverse the memory cgroup hierarchy from the victim task's
18582129 * cgroup up to the OOMing cgroup (or root) to find the
18592130 * highest-level memory cgroup with oom.group set.
....@@ -1894,6 +2165,7 @@
18942165 */
18952166 struct mem_cgroup *lock_page_memcg(struct page *page)
18962167 {
2168
+ struct page *head = compound_head(page); /* rmap on tail pages */
18972169 struct mem_cgroup *memcg;
18982170 unsigned long flags;
18992171
....@@ -1913,7 +2185,7 @@
19132185 if (mem_cgroup_disabled())
19142186 return NULL;
19152187 again:
1916
- memcg = page->mem_cgroup;
2188
+ memcg = head->mem_cgroup;
19172189 if (unlikely(!memcg))
19182190 return NULL;
19192191
....@@ -1921,7 +2193,7 @@
19212193 return memcg;
19222194
19232195 spin_lock_irqsave(&memcg->move_lock, flags);
1924
- if (memcg != page->mem_cgroup) {
2196
+ if (memcg != head->mem_cgroup) {
19252197 spin_unlock_irqrestore(&memcg->move_lock, flags);
19262198 goto again;
19272199 }
....@@ -1964,19 +2236,44 @@
19642236 */
19652237 void unlock_page_memcg(struct page *page)
19662238 {
1967
- __unlock_page_memcg(page->mem_cgroup);
2239
+ struct page *head = compound_head(page);
2240
+
2241
+ __unlock_page_memcg(head->mem_cgroup);
19682242 }
19692243 EXPORT_SYMBOL(unlock_page_memcg);
19702244
19712245 struct memcg_stock_pcp {
2246
+ local_lock_t lock;
19722247 struct mem_cgroup *cached; /* this never be root cgroup */
19732248 unsigned int nr_pages;
2249
+
2250
+#ifdef CONFIG_MEMCG_KMEM
2251
+ struct obj_cgroup *cached_objcg;
2252
+ unsigned int nr_bytes;
2253
+#endif
2254
+
19742255 struct work_struct work;
19752256 unsigned long flags;
19762257 #define FLUSHING_CACHED_CHARGE 0
19772258 };
19782259 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19792260 static DEFINE_MUTEX(percpu_charge_mutex);
2261
+
2262
+#ifdef CONFIG_MEMCG_KMEM
2263
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2264
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2265
+ struct mem_cgroup *root_memcg);
2266
+
2267
+#else
2268
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2269
+{
2270
+}
2271
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2272
+ struct mem_cgroup *root_memcg)
2273
+{
2274
+ return false;
2275
+}
2276
+#endif
19802277
19812278 /**
19822279 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -1998,7 +2295,7 @@
19982295 if (nr_pages > MEMCG_CHARGE_BATCH)
19992296 return ret;
20002297
2001
- local_irq_save(flags);
2298
+ local_lock_irqsave(&memcg_stock.lock, flags);
20022299
20032300 stock = this_cpu_ptr(&memcg_stock);
20042301 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
....@@ -2006,7 +2303,7 @@
20062303 ret = true;
20072304 }
20082305
2009
- local_irq_restore(flags);
2306
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20102307
20112308 return ret;
20122309 }
....@@ -2018,13 +2315,17 @@
20182315 {
20192316 struct mem_cgroup *old = stock->cached;
20202317
2318
+ if (!old)
2319
+ return;
2320
+
20212321 if (stock->nr_pages) {
20222322 page_counter_uncharge(&old->memory, stock->nr_pages);
20232323 if (do_memsw_account())
20242324 page_counter_uncharge(&old->memsw, stock->nr_pages);
2025
- css_put_many(&old->css, stock->nr_pages);
20262325 stock->nr_pages = 0;
20272326 }
2327
+
2328
+ css_put(&old->css);
20282329 stock->cached = NULL;
20292330 }
20302331
....@@ -2037,13 +2338,14 @@
20372338 * The only protection from memory hotplug vs. drain_stock races is
20382339 * that we always operate on local CPU stock here with IRQ disabled
20392340 */
2040
- local_irq_save(flags);
2341
+ local_lock_irqsave(&memcg_stock.lock, flags);
20412342
20422343 stock = this_cpu_ptr(&memcg_stock);
2344
+ drain_obj_stock(stock);
20432345 drain_stock(stock);
20442346 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20452347
2046
- local_irq_restore(flags);
2348
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20472349 }
20482350
20492351 /*
....@@ -2055,11 +2357,12 @@
20552357 struct memcg_stock_pcp *stock;
20562358 unsigned long flags;
20572359
2058
- local_irq_save(flags);
2360
+ local_lock_irqsave(&memcg_stock.lock, flags);
20592361
20602362 stock = this_cpu_ptr(&memcg_stock);
20612363 if (stock->cached != memcg) { /* reset if necessary */
20622364 drain_stock(stock);
2365
+ css_get(&memcg->css);
20632366 stock->cached = memcg;
20642367 }
20652368 stock->nr_pages += nr_pages;
....@@ -2067,7 +2370,7 @@
20672370 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
20682371 drain_stock(stock);
20692372
2070
- local_irq_restore(flags);
2373
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20712374 }
20722375
20732376 /*
....@@ -2091,21 +2394,24 @@
20912394 for_each_online_cpu(cpu) {
20922395 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20932396 struct mem_cgroup *memcg;
2397
+ bool flush = false;
20942398
2399
+ rcu_read_lock();
20952400 memcg = stock->cached;
2096
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2097
- continue;
2098
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2099
- css_put(&memcg->css);
2100
- continue;
2101
- }
2102
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2401
+ if (memcg && stock->nr_pages &&
2402
+ mem_cgroup_is_descendant(memcg, root_memcg))
2403
+ flush = true;
2404
+ if (obj_stock_flush_required(stock, root_memcg))
2405
+ flush = true;
2406
+ rcu_read_unlock();
2407
+
2408
+ if (flush &&
2409
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21032410 if (cpu == curcpu)
21042411 drain_local_stock(&stock->work);
21052412 else
21062413 schedule_work_on(cpu, &stock->work);
21072414 }
2108
- css_put(&memcg->css);
21092415 }
21102416 put_cpu_light();
21112417 mutex_unlock(&percpu_charge_mutex);
....@@ -2114,7 +2420,7 @@
21142420 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21152421 {
21162422 struct memcg_stock_pcp *stock;
2117
- struct mem_cgroup *memcg;
2423
+ struct mem_cgroup *memcg, *mi;
21182424
21192425 stock = &per_cpu(memcg_stock, cpu);
21202426 drain_stock(stock);
....@@ -2126,9 +2432,10 @@
21262432 int nid;
21272433 long x;
21282434
2129
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2435
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21302436 if (x)
2131
- atomic_long_add(x, &memcg->stat[i]);
2437
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2438
+ atomic_long_add(x, &memcg->vmstats[i]);
21322439
21332440 if (i >= NR_VM_NODE_STAT_ITEMS)
21342441 continue;
....@@ -2139,32 +2446,48 @@
21392446 pn = mem_cgroup_nodeinfo(memcg, nid);
21402447 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21412448 if (x)
2142
- atomic_long_add(x, &pn->lruvec_stat[i]);
2449
+ do {
2450
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2451
+ } while ((pn = parent_nodeinfo(pn, nid)));
21432452 }
21442453 }
21452454
21462455 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21472456 long x;
21482457
2149
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2458
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21502459 if (x)
2151
- atomic_long_add(x, &memcg->events[i]);
2460
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2461
+ atomic_long_add(x, &memcg->vmevents[i]);
21522462 }
21532463 }
21542464
21552465 return 0;
21562466 }
21572467
2158
-static void reclaim_high(struct mem_cgroup *memcg,
2159
- unsigned int nr_pages,
2160
- gfp_t gfp_mask)
2468
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2469
+ unsigned int nr_pages,
2470
+ gfp_t gfp_mask)
21612471 {
2472
+ unsigned long nr_reclaimed = 0;
2473
+
21622474 do {
2163
- if (page_counter_read(&memcg->memory) <= memcg->high)
2475
+ unsigned long pflags;
2476
+
2477
+ if (page_counter_read(&memcg->memory) <=
2478
+ READ_ONCE(memcg->memory.high))
21642479 continue;
2480
+
21652481 memcg_memory_event(memcg, MEMCG_HIGH);
2166
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2167
- } while ((memcg = parent_mem_cgroup(memcg)));
2482
+
2483
+ psi_memstall_enter(&pflags);
2484
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2485
+ gfp_mask, true);
2486
+ psi_memstall_leave(&pflags);
2487
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2488
+ !mem_cgroup_is_root(memcg));
2489
+
2490
+ return nr_reclaimed;
21682491 }
21692492
21702493 static void high_work_func(struct work_struct *work)
....@@ -2176,35 +2499,238 @@
21762499 }
21772500
21782501 /*
2502
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2503
+ * enough to still cause a significant slowdown in most cases, while still
2504
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2505
+ */
2506
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2507
+
2508
+/*
2509
+ * When calculating the delay, we use these either side of the exponentiation to
2510
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2511
+ * below).
2512
+ *
2513
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2514
+ * overage ratio to a delay.
2515
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2516
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2517
+ * to produce a reasonable delay curve.
2518
+ *
2519
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2520
+ * reasonable delay curve compared to precision-adjusted overage, not
2521
+ * penalising heavily at first, but still making sure that growth beyond the
2522
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
2523
+ * example, with a high of 100 megabytes:
2524
+ *
2525
+ * +-------+------------------------+
2526
+ * | usage | time to allocate in ms |
2527
+ * +-------+------------------------+
2528
+ * | 100M | 0 |
2529
+ * | 101M | 6 |
2530
+ * | 102M | 25 |
2531
+ * | 103M | 57 |
2532
+ * | 104M | 102 |
2533
+ * | 105M | 159 |
2534
+ * | 106M | 230 |
2535
+ * | 107M | 313 |
2536
+ * | 108M | 409 |
2537
+ * | 109M | 518 |
2538
+ * | 110M | 639 |
2539
+ * | 111M | 774 |
2540
+ * | 112M | 921 |
2541
+ * | 113M | 1081 |
2542
+ * | 114M | 1254 |
2543
+ * | 115M | 1439 |
2544
+ * | 116M | 1638 |
2545
+ * | 117M | 1849 |
2546
+ * | 118M | 2000 |
2547
+ * | 119M | 2000 |
2548
+ * | 120M | 2000 |
2549
+ * +-------+------------------------+
2550
+ */
2551
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2552
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2553
+
2554
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2555
+{
2556
+ u64 overage;
2557
+
2558
+ if (usage <= high)
2559
+ return 0;
2560
+
2561
+ /*
2562
+ * Prevent division by 0 in overage calculation by acting as if
2563
+ * it was a threshold of 1 page
2564
+ */
2565
+ high = max(high, 1UL);
2566
+
2567
+ overage = usage - high;
2568
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2569
+ return div64_u64(overage, high);
2570
+}
2571
+
2572
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2573
+{
2574
+ u64 overage, max_overage = 0;
2575
+
2576
+ do {
2577
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2578
+ READ_ONCE(memcg->memory.high));
2579
+ max_overage = max(overage, max_overage);
2580
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2581
+ !mem_cgroup_is_root(memcg));
2582
+
2583
+ return max_overage;
2584
+}
2585
+
2586
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2587
+{
2588
+ u64 overage, max_overage = 0;
2589
+
2590
+ do {
2591
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2592
+ READ_ONCE(memcg->swap.high));
2593
+ if (overage)
2594
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2595
+ max_overage = max(overage, max_overage);
2596
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2597
+ !mem_cgroup_is_root(memcg));
2598
+
2599
+ return max_overage;
2600
+}
2601
+
2602
+/*
2603
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2604
+ * is exceeding its memory.high by checking both it and its ancestors.
2605
+ */
2606
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2607
+ unsigned int nr_pages,
2608
+ u64 max_overage)
2609
+{
2610
+ unsigned long penalty_jiffies;
2611
+
2612
+ if (!max_overage)
2613
+ return 0;
2614
+
2615
+ /*
2616
+ * We use overage compared to memory.high to calculate the number of
2617
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2618
+ * fairly lenient on small overages, and increasingly harsh when the
2619
+ * memcg in question makes it clear that it has no intention of stopping
2620
+ * its crazy behaviour, so we exponentially increase the delay based on
2621
+ * overage amount.
2622
+ */
2623
+ penalty_jiffies = max_overage * max_overage * HZ;
2624
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2625
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2626
+
2627
+ /*
2628
+ * Factor in the task's own contribution to the overage, such that four
2629
+ * N-sized allocations are throttled approximately the same as one
2630
+ * 4N-sized allocation.
2631
+ *
2632
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2633
+ * larger the current charge batch is than that.
2634
+ */
2635
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2636
+}
2637
+
2638
+/*
21792639 * Scheduled by try_charge() to be executed from the userland return path
21802640 * and reclaims memory over the high limit.
21812641 */
21822642 void mem_cgroup_handle_over_high(void)
21832643 {
2644
+ unsigned long penalty_jiffies;
2645
+ unsigned long pflags;
2646
+ unsigned long nr_reclaimed;
21842647 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2648
+ int nr_retries = MAX_RECLAIM_RETRIES;
21852649 struct mem_cgroup *memcg;
2650
+ bool in_retry = false;
21862651
21872652 if (likely(!nr_pages))
21882653 return;
21892654
21902655 memcg = get_mem_cgroup_from_mm(current->mm);
2191
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2192
- css_put(&memcg->css);
21932656 current->memcg_nr_pages_over_high = 0;
2657
+
2658
+retry_reclaim:
2659
+ /*
2660
+ * The allocating task should reclaim at least the batch size, but for
2661
+ * subsequent retries we only want to do what's necessary to prevent oom
2662
+ * or breaching resource isolation.
2663
+ *
2664
+ * This is distinct from memory.max or page allocator behaviour because
2665
+ * memory.high is currently batched, whereas memory.max and the page
2666
+ * allocator run every time an allocation is made.
2667
+ */
2668
+ nr_reclaimed = reclaim_high(memcg,
2669
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2670
+ GFP_KERNEL);
2671
+
2672
+ /*
2673
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2674
+ * allocators proactively to slow down excessive growth.
2675
+ */
2676
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2677
+ mem_find_max_overage(memcg));
2678
+
2679
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2680
+ swap_find_max_overage(memcg));
2681
+
2682
+ /*
2683
+ * Clamp the max delay per usermode return so as to still keep the
2684
+ * application moving forwards and also permit diagnostics, albeit
2685
+ * extremely slowly.
2686
+ */
2687
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2688
+
2689
+ /*
2690
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2691
+ * that it's not even worth doing, in an attempt to be nice to those who
2692
+ * go only a small amount over their memory.high value and maybe haven't
2693
+ * been aggressively reclaimed enough yet.
2694
+ */
2695
+ if (penalty_jiffies <= HZ / 100)
2696
+ goto out;
2697
+
2698
+ /*
2699
+ * If reclaim is making forward progress but we're still over
2700
+ * memory.high, we want to encourage that rather than doing allocator
2701
+ * throttling.
2702
+ */
2703
+ if (nr_reclaimed || nr_retries--) {
2704
+ in_retry = true;
2705
+ goto retry_reclaim;
2706
+ }
2707
+
2708
+ /*
2709
+ * If we exit early, we're guaranteed to die (since
2710
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2711
+ * need to account for any ill-begotten jiffies to pay them off later.
2712
+ */
2713
+ psi_memstall_enter(&pflags);
2714
+ schedule_timeout_killable(penalty_jiffies);
2715
+ psi_memstall_leave(&pflags);
2716
+
2717
+out:
2718
+ css_put(&memcg->css);
21942719 }
21952720
21962721 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21972722 unsigned int nr_pages)
21982723 {
21992724 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2200
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2725
+ int nr_retries = MAX_RECLAIM_RETRIES;
22012726 struct mem_cgroup *mem_over_limit;
22022727 struct page_counter *counter;
2728
+ enum oom_status oom_status;
22032729 unsigned long nr_reclaimed;
2730
+ bool passed_oom = false;
22042731 bool may_swap = true;
22052732 bool drained = false;
2206
- bool oomed = false;
2207
- enum oom_status oom_status;
2733
+ unsigned long pflags;
22082734
22092735 if (mem_cgroup_is_root(memcg))
22102736 return 0;
....@@ -2239,15 +2765,6 @@
22392765 goto force;
22402766
22412767 /*
2242
- * Unlike in global OOM situations, memcg is not in a physical
2243
- * memory shortage. Allow dying and OOM-killed tasks to
2244
- * bypass the last charges so that they can exit quickly and
2245
- * free their memory.
2246
- */
2247
- if (unlikely(should_force_charge()))
2248
- goto force;
2249
-
2250
- /*
22512768 * Prevent unbounded recursion when reclaim operations need to
22522769 * allocate memory. This might exceed the limits temporarily,
22532770 * but we prefer facilitating memory reclaim and getting back
....@@ -2264,8 +2781,10 @@
22642781
22652782 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22662783
2784
+ psi_memstall_enter(&pflags);
22672785 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22682786 gfp_mask, may_swap);
2787
+ psi_memstall_leave(&pflags);
22692788
22702789 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22712790 goto retry;
....@@ -2299,16 +2818,15 @@
22992818 if (nr_retries--)
23002819 goto retry;
23012820
2302
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2821
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23032822 goto nomem;
23042823
23052824 if (gfp_mask & __GFP_NOFAIL)
23062825 goto force;
23072826
2308
- if (fatal_signal_pending(current))
2309
- goto force;
2310
-
2311
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2827
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2828
+ if (passed_oom && task_is_dying())
2829
+ goto nomem;
23122830
23132831 /*
23142832 * keep retrying as long as the memcg oom killer is able to make
....@@ -2317,15 +2835,10 @@
23172835 */
23182836 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23192837 get_order(nr_pages * PAGE_SIZE));
2320
- switch (oom_status) {
2321
- case OOM_SUCCESS:
2322
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2323
- oomed = true;
2838
+ if (oom_status == OOM_SUCCESS) {
2839
+ passed_oom = true;
2840
+ nr_retries = MAX_RECLAIM_RETRIES;
23242841 goto retry;
2325
- case OOM_FAILED:
2326
- goto force;
2327
- default:
2328
- goto nomem;
23292842 }
23302843 nomem:
23312844 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2339,12 +2852,10 @@
23392852 page_counter_charge(&memcg->memory, nr_pages);
23402853 if (do_memsw_account())
23412854 page_counter_charge(&memcg->memsw, nr_pages);
2342
- css_get_many(&memcg->css, nr_pages);
23432855
23442856 return 0;
23452857
23462858 done_restock:
2347
- css_get_many(&memcg->css, batch);
23482859 if (batch > nr_pages)
23492860 refill_stock(memcg, batch - nr_pages);
23502861
....@@ -2358,12 +2869,32 @@
23582869 * reclaim, the cost of mismatch is negligible.
23592870 */
23602871 do {
2361
- if (page_counter_read(&memcg->memory) > memcg->high) {
2362
- /* Don't bother a random interrupted task */
2363
- if (in_interrupt()) {
2872
+ bool mem_high, swap_high;
2873
+
2874
+ mem_high = page_counter_read(&memcg->memory) >
2875
+ READ_ONCE(memcg->memory.high);
2876
+ swap_high = page_counter_read(&memcg->swap) >
2877
+ READ_ONCE(memcg->swap.high);
2878
+
2879
+ /* Don't bother a random interrupted task */
2880
+ if (in_interrupt()) {
2881
+ if (mem_high) {
23642882 schedule_work(&memcg->high_work);
23652883 break;
23662884 }
2885
+ continue;
2886
+ }
2887
+
2888
+ if (mem_high || swap_high) {
2889
+ /*
2890
+ * The allocating tasks in this cgroup will need to do
2891
+ * reclaim or be throttled to prevent further growth
2892
+ * of the memory or swap footprints.
2893
+ *
2894
+ * Target some best-effort fairness between the tasks,
2895
+ * and distribute reclaim work and delay penalties
2896
+ * based on how much each task is actually allocating.
2897
+ */
23672898 current->memcg_nr_pages_over_high += batch;
23682899 set_notify_resume(current);
23692900 break;
....@@ -2373,6 +2904,7 @@
23732904 return 0;
23742905 }
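
The batching at the top of try_charge() can be pictured with the toy model below (userland C, not kernel code): a request smaller than the batch still charges a whole batch against the counter and parks the excess locally. The consume_stock() fast path it mimics lives earlier in this file, and the 32-page batch size is an assumption matching MEMCG_CHARGE_BATCH.

#include <stdio.h>

#define CHARGE_BATCH 32			/* assumed MEMCG_CHARGE_BATCH */

static unsigned long counter;		/* stands in for the page_counter */
static unsigned long stock;		/* stands in for the per-cpu stock */

static void charge(unsigned int nr_pages)
{
	unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;

	if (stock >= nr_pages) {	/* consume_stock() fast path */
		stock -= nr_pages;
		return;
	}
	counter += batch;		/* page_counter charge */
	if (batch > nr_pages)
		stock += batch - nr_pages;	/* refill_stock() */
}

int main(void)
{
	charge(1);	/* charges a full batch, stocks the remainder */
	charge(8);	/* served entirely from the stock */
	charge(100);	/* larger than the batch: charged exactly */
	printf("counter=%lu stock=%lu\n", counter, stock);
	return 0;
}

With these numbers the first request charges 32 pages and leaves 31 in the stock, the second is served from the stock, and the oversized request is charged exactly.
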
23752906
2907
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23762908 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23772909 {
23782910 if (mem_cgroup_is_root(memcg))
....@@ -2381,76 +2913,124 @@
23812913 page_counter_uncharge(&memcg->memory, nr_pages);
23822914 if (do_memsw_account())
23832915 page_counter_uncharge(&memcg->memsw, nr_pages);
2384
-
2385
- css_put_many(&memcg->css, nr_pages);
23862916 }
2917
+#endif
23872918
2388
-static void lock_page_lru(struct page *page, int *isolated)
2919
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23892920 {
2390
- struct zone *zone = page_zone(page);
2391
-
2392
- spin_lock_irq(zone_lru_lock(zone));
2393
- if (PageLRU(page)) {
2394
- struct lruvec *lruvec;
2395
-
2396
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2397
- ClearPageLRU(page);
2398
- del_page_from_lru_list(page, lruvec, page_lru(page));
2399
- *isolated = 1;
2400
- } else
2401
- *isolated = 0;
2402
-}
2403
-
2404
-static void unlock_page_lru(struct page *page, int isolated)
2405
-{
2406
- struct zone *zone = page_zone(page);
2407
-
2408
- if (isolated) {
2409
- struct lruvec *lruvec;
2410
-
2411
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2412
- VM_BUG_ON_PAGE(PageLRU(page), page);
2413
- SetPageLRU(page);
2414
- add_page_to_lru_list(page, lruvec, page_lru(page));
2415
- }
2416
- spin_unlock_irq(zone_lru_lock(zone));
2417
-}
2418
-
2419
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2420
- bool lrucare)
2421
-{
2422
- int isolated;
2423
-
24242921 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2425
-
24262922 /*
2427
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2428
- * may already be on some other mem_cgroup's LRU. Take care of it.
2429
- */
2430
- if (lrucare)
2431
- lock_page_lru(page, &isolated);
2432
-
2433
- /*
2434
- * Nobody should be changing or seriously looking at
2435
- * page->mem_cgroup at this point:
2923
+ * Any of the following ensures page->mem_cgroup stability:
24362924 *
2437
- * - the page is uncharged
2438
- *
2439
- * - the page is off-LRU
2440
- *
2441
- * - an anonymous fault has exclusive page access, except for
2442
- * a locked page table
2443
- *
2444
- * - a page cache insertion, a swapin fault, or a migration
2445
- * have the page locked
2925
+ * - the page lock
2926
+ * - LRU isolation
2927
+ * - lock_page_memcg()
2928
+ * - exclusive reference
24462929 */
24472930 page->mem_cgroup = memcg;
2448
-
2449
- if (lrucare)
2450
- unlock_page_lru(page, isolated);
24512931 }
24522932
24532933 #ifdef CONFIG_MEMCG_KMEM
2934
+/*
2935
+ * The allocated objcg pointers array is not accounted directly.
2936
+ * Moreover, it should not come from a DMA buffer and is not readily
2937
+ * reclaimable. So those GFP bits should be masked off.
2938
+ */
2939
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2940
+
2941
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2942
+ gfp_t gfp)
2943
+{
2944
+ unsigned int objects = objs_per_slab_page(s, page);
2945
+ void *vec;
2946
+
2947
+ gfp &= ~OBJCGS_CLEAR_MASK;
2948
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2949
+ page_to_nid(page));
2950
+ if (!vec)
2951
+ return -ENOMEM;
2952
+
2953
+ if (cmpxchg(&page->obj_cgroups, NULL,
2954
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2955
+ kfree(vec);
2956
+ else
2957
+ kmemleak_not_leak(vec);
2958
+
2959
+ return 0;
2960
+}
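
The cmpxchg() above is a publish-or-discard pattern: several tasks may race to allocate the vector, exactly one installs it, the others free their copy. A minimal userland rendition with C11 atomics follows; the names are stand-ins, not the kernel API.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) slot;	/* stands in for page->obj_cgroups */

static int publish_vec(size_t objects)
{
	void *expected = NULL;
	void *vec = calloc(objects, sizeof(void *));
	void *tagged;

	if (!vec)
		return -1;
	/* tag bit 0 so readers can tell "vector" apart from a plain pointer */
	tagged = (void *)((uintptr_t)vec | 0x1UL);
	if (!atomic_compare_exchange_strong(&slot, &expected, tagged))
		free(vec);	/* somebody else installed theirs first */
	return 0;
}

int main(void)
{
	publish_vec(16);
	publish_vec(16);	/* loses the race and frees its copy */
	printf("tag bit set: %lu\n",
	       (unsigned long)((uintptr_t)atomic_load(&slot) & 0x1UL));
	return 0;
}
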
2961
+
2962
+/*
2963
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2964
+ *
2965
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2966
+ * cgroup_mutex, etc.
2967
+ */
2968
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2969
+{
2970
+ struct page *page;
2971
+
2972
+ if (mem_cgroup_disabled())
2973
+ return NULL;
2974
+
2975
+ page = virt_to_head_page(p);
2976
+
2977
+ /*
2978
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2979
+ * or a pointer to an obj_cgroup vector. In the latter case the lowest
2980
+ * bit of the pointer is set.
2981
+ * The page->mem_cgroup pointer can be asynchronously changed
2982
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2983
+ * from a valid memcg pointer to objcg vector or back.
2984
+ */
2985
+ if (!page->mem_cgroup)
2986
+ return NULL;
2987
+
2988
+ /*
2989
+ * Slab objects are accounted individually, not per-page.
2990
+ * Memcg membership data for each individual object is saved in
2991
+ * the page->obj_cgroups.
2992
+ */
2993
+ if (page_has_obj_cgroups(page)) {
2994
+ struct obj_cgroup *objcg;
2995
+ unsigned int off;
2996
+
2997
+ off = obj_to_index(page->slab_cache, page, p);
2998
+ objcg = page_obj_cgroups(page)[off];
2999
+ if (objcg)
3000
+ return obj_cgroup_memcg(objcg);
3001
+
3002
+ return NULL;
3003
+ }
3004
+
3005
+ /* All other pages use page->mem_cgroup */
3006
+ return page->mem_cgroup;
3007
+}
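
A userland sketch of the low-bit discrimination described in the comment above: bit 0 of the word selects between a plain memcg pointer and an obj_cgroup vector. The struct names here are illustrative stand-ins only.

#include <stdint.h>
#include <stdio.h>

struct memcg { const char *name; };
struct objcg { struct memcg *owner; };

static struct memcg *resolve(uintptr_t word, unsigned int off)
{
	if (!word)
		return NULL;
	if (word & 0x1UL) {			/* obj_cgroup vector */
		struct objcg **vec = (struct objcg **)(word & ~0x1UL);
		return vec[off] ? vec[off]->owner : NULL;
	}
	return (struct memcg *)word;		/* plain memcg pointer */
}

int main(void)
{
	struct memcg m = { "demo" };
	struct objcg o = { &m };
	struct objcg *vec[4] = { NULL, &o, NULL, NULL };

	printf("%s\n", resolve((uintptr_t)vec | 0x1UL, 1)->name);
	printf("%s\n", resolve((uintptr_t)&m, 0)->name);
	return 0;
}
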
3008
+
3009
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3010
+{
3011
+ struct obj_cgroup *objcg = NULL;
3012
+ struct mem_cgroup *memcg;
3013
+
3014
+ if (memcg_kmem_bypass())
3015
+ return NULL;
3016
+
3017
+ rcu_read_lock();
3018
+ if (unlikely(active_memcg()))
3019
+ memcg = active_memcg();
3020
+ else
3021
+ memcg = mem_cgroup_from_task(current);
3022
+
3023
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3024
+ objcg = rcu_dereference(memcg->objcg);
3025
+ if (objcg && obj_cgroup_tryget(objcg))
3026
+ break;
3027
+ objcg = NULL;
3028
+ }
3029
+ rcu_read_unlock();
3030
+
3031
+ return objcg;
3032
+}
3033
+
24543034 static int memcg_alloc_cache_id(void)
24553035 {
24563036 int id, size;
....@@ -2476,9 +3056,7 @@
24763056 else if (size > MEMCG_CACHES_MAX_SIZE)
24773057 size = MEMCG_CACHES_MAX_SIZE;
24783058
2479
- err = memcg_update_all_caches(size);
2480
- if (!err)
2481
- err = memcg_update_all_list_lrus(size);
3059
+ err = memcg_update_all_list_lrus(size);
24823060 if (!err)
24833061 memcg_nr_cache_ids = size;
24843062
....@@ -2496,152 +3074,17 @@
24963074 ida_simple_remove(&memcg_cache_ida, id);
24973075 }
24983076
2499
-struct memcg_kmem_cache_create_work {
2500
- struct mem_cgroup *memcg;
2501
- struct kmem_cache *cachep;
2502
- struct work_struct work;
2503
-};
2504
-
2505
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2506
-{
2507
- struct memcg_kmem_cache_create_work *cw =
2508
- container_of(w, struct memcg_kmem_cache_create_work, work);
2509
- struct mem_cgroup *memcg = cw->memcg;
2510
- struct kmem_cache *cachep = cw->cachep;
2511
-
2512
- memcg_create_kmem_cache(memcg, cachep);
2513
-
2514
- css_put(&memcg->css);
2515
- kfree(cw);
2516
-}
2517
-
2518
-/*
2519
- * Enqueue the creation of a per-memcg kmem_cache.
2520
- */
2521
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2522
- struct kmem_cache *cachep)
2523
-{
2524
- struct memcg_kmem_cache_create_work *cw;
2525
-
2526
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2527
- if (!cw)
2528
- return;
2529
-
2530
- css_get(&memcg->css);
2531
-
2532
- cw->memcg = memcg;
2533
- cw->cachep = cachep;
2534
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2535
-
2536
- queue_work(memcg_kmem_cache_wq, &cw->work);
2537
-}
2538
-
2539
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2540
- struct kmem_cache *cachep)
2541
-{
2542
- /*
2543
- * We need to stop accounting when we kmalloc, because if the
2544
- * corresponding kmalloc cache is not yet created, the first allocation
2545
- * in __memcg_schedule_kmem_cache_create will recurse.
2546
- *
2547
- * However, it is better to enclose the whole function. Depending on
2548
- * the debugging options enabled, INIT_WORK(), for instance, can
2549
- * trigger an allocation. This too, will make us recurse. Because at
2550
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2551
- * the safest choice is to do it like this, wrapping the whole function.
2552
- */
2553
- current->memcg_kmem_skip_account = 1;
2554
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2555
- current->memcg_kmem_skip_account = 0;
2556
-}
2557
-
2558
-static inline bool memcg_kmem_bypass(void)
2559
-{
2560
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2561
- return true;
2562
- return false;
2563
-}
2564
-
25653077 /**
2566
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2567
- * @cachep: the original global kmem cache
2568
- *
2569
- * Return the kmem_cache we're supposed to use for a slab allocation.
2570
- * We try to use the current memcg's version of the cache.
2571
- *
2572
- * If the cache does not exist yet, if we are the first user of it, we
2573
- * create it asynchronously in a workqueue and let the current allocation
2574
- * go through with the original cache.
2575
- *
2576
- * This function takes a reference to the cache it returns to assure it
2577
- * won't get destroyed while we are working with it. Once the caller is
2578
- * done with it, memcg_kmem_put_cache() must be called to release the
2579
- * reference.
2580
- */
2581
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2582
-{
2583
- struct mem_cgroup *memcg;
2584
- struct kmem_cache *memcg_cachep;
2585
- int kmemcg_id;
2586
-
2587
- VM_BUG_ON(!is_root_cache(cachep));
2588
-
2589
- if (memcg_kmem_bypass())
2590
- return cachep;
2591
-
2592
- if (current->memcg_kmem_skip_account)
2593
- return cachep;
2594
-
2595
- memcg = get_mem_cgroup_from_current();
2596
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2597
- if (kmemcg_id < 0)
2598
- goto out;
2599
-
2600
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2601
- if (likely(memcg_cachep))
2602
- return memcg_cachep;
2603
-
2604
- /*
2605
- * If we are in a safe context (can wait, and not in interrupt
2606
- * context), we could be be predictable and return right away.
2607
- * This would guarantee that the allocation being performed
2608
- * already belongs in the new cache.
2609
- *
2610
- * However, there are some clashes that can arrive from locking.
2611
- * For instance, because we acquire the slab_mutex while doing
2612
- * memcg_create_kmem_cache, this means no further allocation
2613
- * could happen with the slab_mutex held. So it's better to
2614
- * defer everything.
2615
- */
2616
- memcg_schedule_kmem_cache_create(memcg, cachep);
2617
-out:
2618
- css_put(&memcg->css);
2619
- return cachep;
2620
-}
2621
-
2622
-/**
2623
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2624
- * @cachep: the cache returned by memcg_kmem_get_cache
2625
- */
2626
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2627
-{
2628
- if (!is_root_cache(cachep))
2629
- css_put(&cachep->memcg_params.memcg->css);
2630
-}
2631
-
2632
-/**
2633
- * memcg_kmem_charge_memcg: charge a kmem page
2634
- * @page: page to charge
2635
- * @gfp: reclaim mode
2636
- * @order: allocation order
3078
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26373079 * @memcg: memory cgroup to charge
3080
+ * @gfp: reclaim mode
3081
+ * @nr_pages: number of pages to charge
26383082 *
26393083 * Returns 0 on success, an error code on failure.
26403084 */
2641
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2642
- struct mem_cgroup *memcg)
3085
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3086
+ unsigned int nr_pages)
26433087 {
2644
- unsigned int nr_pages = 1 << order;
26453088 struct page_counter *counter;
26463089 int ret;
26473090
....@@ -2664,43 +3107,54 @@
26643107 cancel_charge(memcg, nr_pages);
26653108 return -ENOMEM;
26663109 }
2667
-
2668
- page->mem_cgroup = memcg;
2669
-
26703110 return 0;
26713111 }
26723112
26733113 /**
2674
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3114
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3115
+ * @memcg: memcg to uncharge
3116
+ * @nr_pages: number of pages to uncharge
3117
+ */
3118
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3119
+{
3120
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3121
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3122
+
3123
+ refill_stock(memcg, nr_pages);
3124
+}
3125
+
3126
+/**
3127
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26753128 * @page: page to charge
26763129 * @gfp: reclaim mode
26773130 * @order: allocation order
26783131 *
26793132 * Returns 0 on success, an error code on failure.
26803133 */
2681
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3134
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26823135 {
26833136 struct mem_cgroup *memcg;
26843137 int ret = 0;
26853138
2686
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2687
- return 0;
2688
-
26893139 memcg = get_mem_cgroup_from_current();
2690
- if (!mem_cgroup_is_root(memcg)) {
2691
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2692
- if (!ret)
3140
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3141
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3142
+ if (!ret) {
3143
+ page->mem_cgroup = memcg;
26933144 __SetPageKmemcg(page);
3145
+ return 0;
3146
+ }
3147
+ css_put(&memcg->css);
26943148 }
2695
- css_put(&memcg->css);
26963149 return ret;
26973150 }
3151
+
26983152 /**
2699
- * memcg_kmem_uncharge: uncharge a kmem page
3153
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
27003154 * @page: page to uncharge
27013155 * @order: allocation order
27023156 */
2703
-void memcg_kmem_uncharge(struct page *page, int order)
3157
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27043158 {
27053159 struct mem_cgroup *memcg = page->mem_cgroup;
27063160 unsigned int nr_pages = 1 << order;
....@@ -2709,43 +3163,179 @@
27093163 return;
27103164
27113165 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2712
-
2713
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2714
- page_counter_uncharge(&memcg->kmem, nr_pages);
2715
-
2716
- page_counter_uncharge(&memcg->memory, nr_pages);
2717
- if (do_memsw_account())
2718
- page_counter_uncharge(&memcg->memsw, nr_pages);
2719
-
3166
+ __memcg_kmem_uncharge(memcg, nr_pages);
27203167 page->mem_cgroup = NULL;
3168
+ css_put(&memcg->css);
27213169
27223170 /* slab pages do not have PageKmemcg flag set */
27233171 if (PageKmemcg(page))
27243172 __ClearPageKmemcg(page);
2725
-
2726
- css_put_many(&memcg->css, nr_pages);
27273173 }
2728
-#endif /* CONFIG_MEMCG_KMEM */
27293174
2730
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2731
-
2732
-/*
2733
- * Because tail pages are not marked as "used", set it. We're under
2734
- * zone_lru_lock and migration entries setup in all page mappings.
2735
- */
2736
-void mem_cgroup_split_huge_fixup(struct page *head)
3175
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27373176 {
2738
- int i;
3177
+ struct memcg_stock_pcp *stock;
3178
+ unsigned long flags;
3179
+ bool ret = false;
27393180
2740
- if (mem_cgroup_disabled())
3181
+ local_lock_irqsave(&memcg_stock.lock, flags);
3182
+
3183
+ stock = this_cpu_ptr(&memcg_stock);
3184
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3185
+ stock->nr_bytes -= nr_bytes;
3186
+ ret = true;
3187
+ }
3188
+
3189
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3190
+
3191
+ return ret;
3192
+}
3193
+
3194
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3195
+{
3196
+ struct obj_cgroup *old = stock->cached_objcg;
3197
+
3198
+ if (!old)
27413199 return;
27423200
2743
- for (i = 1; i < HPAGE_PMD_NR; i++)
2744
- head[i].mem_cgroup = head->mem_cgroup;
3201
+ if (stock->nr_bytes) {
3202
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3203
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27453204
2746
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3205
+ if (nr_pages) {
3206
+ struct mem_cgroup *memcg;
3207
+
3208
+ rcu_read_lock();
3209
+retry:
3210
+ memcg = obj_cgroup_memcg(old);
3211
+ if (unlikely(!css_tryget(&memcg->css)))
3212
+ goto retry;
3213
+ rcu_read_unlock();
3214
+
3215
+ __memcg_kmem_uncharge(memcg, nr_pages);
3216
+ css_put(&memcg->css);
3217
+ }
3218
+
3219
+ /*
3220
+ * The leftover is flushed to the centralized per-memcg value.
3221
+ * On the next attempt to refill obj stock it will be moved
3222
+ * to a per-cpu stock (probably, on another CPU), see
3223
+ * refill_obj_stock().
3224
+ *
3225
+ * How often it's flushed is a trade-off between the memory
3226
+ * limit enforcement accuracy and potential CPU contention,
3227
+ * so it might be changed in the future.
3228
+ */
3229
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3230
+ stock->nr_bytes = 0;
3231
+ }
3232
+
3233
+ obj_cgroup_put(old);
3234
+ stock->cached_objcg = NULL;
27473235 }
2748
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3236
+
3237
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3238
+ struct mem_cgroup *root_memcg)
3239
+{
3240
+ struct mem_cgroup *memcg;
3241
+
3242
+ if (stock->cached_objcg) {
3243
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3244
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3245
+ return true;
3246
+ }
3247
+
3248
+ return false;
3249
+}
3250
+
3251
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3252
+{
3253
+ struct memcg_stock_pcp *stock;
3254
+ unsigned long flags;
3255
+
3256
+ local_lock_irqsave(&memcg_stock.lock, flags);
3257
+
3258
+ stock = this_cpu_ptr(&memcg_stock);
3259
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3260
+ drain_obj_stock(stock);
3261
+ obj_cgroup_get(objcg);
3262
+ stock->cached_objcg = objcg;
3263
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3264
+ }
3265
+ stock->nr_bytes += nr_bytes;
3266
+
3267
+ if (stock->nr_bytes > PAGE_SIZE)
3268
+ drain_obj_stock(stock);
3269
+
3270
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3271
+}
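
Taken together, consume_obj_stock(), drain_obj_stock() and refill_obj_stock() form a per-cpu byte cache in front of the page-granular charge path. The single-threaded model below shows the shape of that behaviour; it is a userland sketch with made-up types, and locking, RCU and reference counting are left out entirely.

#include <stdio.h>

#define PAGE_SIZE 4096UL

struct stock {
	int owner;			/* stands in for cached_objcg */
	unsigned long nr_bytes;
};

static unsigned long charged_pages[2];	/* pages charged per "objcg" */
static unsigned long central_bytes[2];	/* stands in for nr_charged_bytes */

static int consume(struct stock *s, int owner, unsigned long nr_bytes)
{
	if (s->owner == owner && s->nr_bytes >= nr_bytes) {
		s->nr_bytes -= nr_bytes;
		return 1;
	}
	return 0;
}

static void drain(struct stock *s)
{
	unsigned long pages = s->nr_bytes / PAGE_SIZE;

	if (pages)
		charged_pages[s->owner] -= pages;	/* give pages back */
	central_bytes[s->owner] += s->nr_bytes % PAGE_SIZE; /* flush leftover */
	s->nr_bytes = 0;
}

static void refill(struct stock *s, int owner, unsigned long nr_bytes)
{
	if (s->owner != owner) {	/* reset if cached for someone else */
		drain(s);
		s->owner = owner;
		s->nr_bytes = central_bytes[owner];	/* like atomic_xchg(.., 0) */
		central_bytes[owner] = 0;
	}
	s->nr_bytes += nr_bytes;
	if (s->nr_bytes > PAGE_SIZE)
		drain(s);
}

int main(void)
{
	struct stock s = { .owner = 0 };
	int hit;

	charged_pages[0] = 8;		/* pages charged up front */
	refill(&s, 0, 300);		/* small frees pile up locally */
	hit = consume(&s, 0, 200);	/* a later sub-page charge hits the stock */
	refill(&s, 0, 5000);		/* big refill drains whole pages back */
	printf("hit=%d pages=%lu stock=%lu central=%lu\n",
	       hit, charged_pages[0], s.nr_bytes, central_bytes[0]);
	return 0;
}
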
3272
+
3273
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3274
+{
3275
+ struct mem_cgroup *memcg;
3276
+ unsigned int nr_pages, nr_bytes;
3277
+ int ret;
3278
+
3279
+ if (consume_obj_stock(objcg, size))
3280
+ return 0;
3281
+
3282
+ /*
3283
+ * In theory, memcg->nr_charged_bytes can have enough
3284
+ * pre-charged bytes to satisfy the allocation. However,
3285
+ * flushing memcg->nr_charged_bytes requires two atomic
3286
+ * operations, and memcg->nr_charged_bytes can't be big,
3287
+ * so it's better to ignore it and try to grab some new pages.
3288
+ * memcg->nr_charged_bytes will be flushed in
3289
+ * refill_obj_stock(), called from this function or
3290
+ * independently later.
3291
+ */
3292
+ rcu_read_lock();
3293
+retry:
3294
+ memcg = obj_cgroup_memcg(objcg);
3295
+ if (unlikely(!css_tryget(&memcg->css)))
3296
+ goto retry;
3297
+ rcu_read_unlock();
3298
+
3299
+ nr_pages = size >> PAGE_SHIFT;
3300
+ nr_bytes = size & (PAGE_SIZE - 1);
3301
+
3302
+ if (nr_bytes)
3303
+ nr_pages += 1;
3304
+
3305
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3306
+ if (!ret && nr_bytes)
3307
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3308
+
3309
+ css_put(&memcg->css);
3310
+ return ret;
3311
+}
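
The rounding obj_cgroup_charge() performs on a sub-page request can be seen in isolation below (userland C, assuming a 4 KiB page): whole pages are charged, and the unused tail of the last page is immediately handed back to the stock.

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
	size_t size = 700;			/* an arbitrary sub-page object size */
	size_t nr_pages = size >> PAGE_SHIFT;
	size_t nr_bytes = size & (PAGE_SIZE - 1);

	if (nr_bytes)
		nr_pages += 1;			/* charge a whole extra page... */

	printf("charge %zu page(s), refill stock with %zu bytes\n",
	       nr_pages, nr_bytes ? PAGE_SIZE - nr_bytes : 0);
	return 0;
}
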
3312
+
3313
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3314
+{
3315
+ refill_obj_stock(objcg, size);
3316
+}
3317
+
3318
+#endif /* CONFIG_MEMCG_KMEM */
3319
+
3320
+/*
3321
+ * Because head->mem_cgroup is not set on tails, set it now.
3322
+ */
3323
+void split_page_memcg(struct page *head, unsigned int nr)
3324
+{
3325
+ struct mem_cgroup *memcg = head->mem_cgroup;
3326
+ int kmemcg = PageKmemcg(head);
3327
+ int i;
3328
+
3329
+ if (mem_cgroup_disabled() || !memcg)
3330
+ return;
3331
+
3332
+ for (i = 1; i < nr; i++) {
3333
+ head[i].mem_cgroup = memcg;
3334
+ if (kmemcg)
3335
+ __SetPageKmemcg(head + i);
3336
+ }
3337
+ css_get_many(&memcg->css, nr - 1);
3338
+}
27493339
27503340 #ifdef CONFIG_MEMCG_SWAP
27513341 /**
....@@ -2807,7 +3397,7 @@
28073397 * Make sure that the new limit (memsw or memory limit) doesn't
28083398 * break our basic invariant rule memory.max <= memsw.max.
28093399 */
2810
- limits_invariant = memsw ? max >= memcg->memory.max :
3400
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28113401 max <= memcg->memsw.max;
28123402 if (!limits_invariant) {
28133403 mutex_unlock(&memcg_max_mutex);
....@@ -2928,7 +3518,7 @@
29283518 * Test whether @memcg has children, dead or alive. Note that this
29293519 * function doesn't care whether @memcg has use_hierarchy enabled and
29303520 * returns %true if there are child csses according to the cgroup
2931
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3521
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29323522 */
29333523 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29343524 {
....@@ -2947,7 +3537,7 @@
29473537 */
29483538 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29493539 {
2950
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3540
+ int nr_retries = MAX_RECLAIM_RETRIES;
29513541
29523542 /* we call try-to-free pages for make this cgroup empty */
29533543 lru_add_drain_all();
....@@ -3021,50 +3611,15 @@
30213611 return retval;
30223612 }
30233613
3024
-struct accumulated_stats {
3025
- unsigned long stat[MEMCG_NR_STAT];
3026
- unsigned long events[NR_VM_EVENT_ITEMS];
3027
- unsigned long lru_pages[NR_LRU_LISTS];
3028
- const unsigned int *stats_array;
3029
- const unsigned int *events_array;
3030
- int stats_size;
3031
- int events_size;
3032
-};
3033
-
3034
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3035
- struct accumulated_stats *acc)
3036
-{
3037
- struct mem_cgroup *mi;
3038
- int i;
3039
-
3040
- for_each_mem_cgroup_tree(mi, memcg) {
3041
- for (i = 0; i < acc->stats_size; i++)
3042
- acc->stat[i] += memcg_page_state(mi,
3043
- acc->stats_array ? acc->stats_array[i] : i);
3044
-
3045
- for (i = 0; i < acc->events_size; i++)
3046
- acc->events[i] += memcg_sum_events(mi,
3047
- acc->events_array ? acc->events_array[i] : i);
3048
-
3049
- for (i = 0; i < NR_LRU_LISTS; i++)
3050
- acc->lru_pages[i] +=
3051
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3052
- }
3053
-}
3054
-
30553614 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30563615 {
3057
- unsigned long val = 0;
3616
+ unsigned long val;
30583617
30593618 if (mem_cgroup_is_root(memcg)) {
3060
- struct mem_cgroup *iter;
3061
-
3062
- for_each_mem_cgroup_tree(iter, memcg) {
3063
- val += memcg_page_state(iter, MEMCG_CACHE);
3064
- val += memcg_page_state(iter, MEMCG_RSS);
3065
- if (swap)
3066
- val += memcg_page_state(iter, MEMCG_SWAP);
3067
- }
3619
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3620
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3621
+ if (swap)
3622
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30683623 } else {
30693624 if (!swap)
30703625 val = page_counter_read(&memcg->memory);
....@@ -3125,9 +3680,61 @@
31253680 }
31263681 }
31273682
3683
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3684
+{
3685
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3686
+ struct mem_cgroup *mi;
3687
+ int node, cpu, i;
3688
+
3689
+ for_each_online_cpu(cpu)
3690
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3691
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3692
+
3693
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3694
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3695
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3696
+
3697
+ for_each_node(node) {
3698
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3699
+ struct mem_cgroup_per_node *pi;
3700
+
3701
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3702
+ stat[i] = 0;
3703
+
3704
+ for_each_online_cpu(cpu)
3705
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3706
+ stat[i] += per_cpu(
3707
+ pn->lruvec_stat_cpu->count[i], cpu);
3708
+
3709
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3710
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3711
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3712
+ }
3713
+}
3714
+
3715
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3716
+{
3717
+ unsigned long events[NR_VM_EVENT_ITEMS];
3718
+ struct mem_cgroup *mi;
3719
+ int cpu, i;
3720
+
3721
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3722
+ events[i] = 0;
3723
+
3724
+ for_each_online_cpu(cpu)
3725
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3726
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3727
+ cpu);
3728
+
3729
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3730
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3731
+ atomic_long_add(events[i], &mi->vmevents[i]);
3732
+}
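
Both flush helpers above follow the same two-step shape: collapse the per-cpu copies into a single total, then push that total into this cgroup and every ancestor. A compact userland sketch with illustrative types, not the kernel's:

#include <stdio.h>

#define NR_CPUS  4
#define NR_STATS 3

struct group {
	struct group *parent;
	long percpu[NR_CPUS][NR_STATS];
	long vmstats[NR_STATS];
};

static void flush(struct group *g)
{
	long stat[NR_STATS] = { 0 };

	/* step 1: sum each counter over all "CPUs" */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		for (int i = 0; i < NR_STATS; i++)
			stat[i] += g->percpu[cpu][i];

	/* step 2: propagate the totals up the hierarchy */
	for (struct group *a = g; a; a = a->parent)
		for (int i = 0; i < NR_STATS; i++)
			a->vmstats[i] += stat[i];
}

int main(void)
{
	struct group root = { 0 }, child = { .parent = &root };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	flush(&child);
	printf("child=%ld root=%ld\n", child.vmstats[0], root.vmstats[0]);
	return 0;
}
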
3733
+
31283734 #ifdef CONFIG_MEMCG_KMEM
31293735 static int memcg_online_kmem(struct mem_cgroup *memcg)
31303736 {
3737
+ struct obj_cgroup *objcg;
31313738 int memcg_id;
31323739
31333740 if (cgroup_memory_nokmem)
....@@ -3140,7 +3747,16 @@
31403747 if (memcg_id < 0)
31413748 return memcg_id;
31423749
3143
- static_branch_inc(&memcg_kmem_enabled_key);
3750
+ objcg = obj_cgroup_alloc();
3751
+ if (!objcg) {
3752
+ memcg_free_cache_id(memcg_id);
3753
+ return -ENOMEM;
3754
+ }
3755
+ objcg->memcg = memcg;
3756
+ rcu_assign_pointer(memcg->objcg, objcg);
3757
+
3758
+ static_branch_enable(&memcg_kmem_enabled_key);
3759
+
31443760 /*
31453761 * A memory cgroup is considered kmem-online as soon as it gets
31463762 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3149,7 +3765,6 @@
31493765 */
31503766 memcg->kmemcg_id = memcg_id;
31513767 memcg->kmem_state = KMEM_ONLINE;
3152
- INIT_LIST_HEAD(&memcg->kmem_caches);
31533768
31543769 return 0;
31553770 }
....@@ -3162,22 +3777,17 @@
31623777
31633778 if (memcg->kmem_state != KMEM_ONLINE)
31643779 return;
3165
- /*
3166
- * Clear the online state before clearing memcg_caches array
3167
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3168
- * guarantees that no cache will be created for this cgroup
3169
- * after we are done (see memcg_create_kmem_cache()).
3170
- */
3780
+
31713781 memcg->kmem_state = KMEM_ALLOCATED;
3172
-
3173
- memcg_deactivate_kmem_caches(memcg);
3174
-
3175
- kmemcg_id = memcg->kmemcg_id;
3176
- BUG_ON(kmemcg_id < 0);
31773782
31783783 parent = parent_mem_cgroup(memcg);
31793784 if (!parent)
31803785 parent = root_mem_cgroup;
3786
+
3787
+ memcg_reparent_objcgs(memcg, parent);
3788
+
3789
+ kmemcg_id = memcg->kmemcg_id;
3790
+ BUG_ON(kmemcg_id < 0);
31813791
31823792 /*
31833793 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3207,12 +3817,6 @@
32073817 /* css_alloc() failed, offlining didn't happen */
32083818 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32093819 memcg_offline_kmem(memcg);
3210
-
3211
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3212
- memcg_destroy_kmem_caches(memcg);
3213
- static_branch_dec(&memcg_kmem_enabled_key);
3214
- WARN_ON(page_counter_read(&memcg->kmem));
3215
- }
32163820 }
32173821 #else
32183822 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3303,6 +3907,9 @@
33033907 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33043908 break;
33053909 case _KMEM:
3910
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3911
+ "Please report your usecase to linux-mm@kvack.org if you "
3912
+ "depend on this functionality.\n");
33063913 ret = memcg_update_kmem_max(memcg, nr_pages);
33073914 break;
33083915 case _TCP:
....@@ -3388,6 +3995,49 @@
33883995 #endif
33893996
33903997 #ifdef CONFIG_NUMA
3998
+
3999
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
4000
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
4001
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
4002
+
4003
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4004
+ int nid, unsigned int lru_mask, bool tree)
4005
+{
4006
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4007
+ unsigned long nr = 0;
4008
+ enum lru_list lru;
4009
+
4010
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4011
+
4012
+ for_each_lru(lru) {
4013
+ if (!(BIT(lru) & lru_mask))
4014
+ continue;
4015
+ if (tree)
4016
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4017
+ else
4018
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4019
+ }
4020
+ return nr;
4021
+}
4022
+
4023
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4024
+ unsigned int lru_mask,
4025
+ bool tree)
4026
+{
4027
+ unsigned long nr = 0;
4028
+ enum lru_list lru;
4029
+
4030
+ for_each_lru(lru) {
4031
+ if (!(BIT(lru) & lru_mask))
4032
+ continue;
4033
+ if (tree)
4034
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4035
+ else
4036
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4037
+ }
4038
+ return nr;
4039
+}
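
The lru_mask convention used by these helpers is plain bit arithmetic: one bit per LRU list, summed only where the bit is set. A self-contained userland example follows; the enum order tracks lru_list, everything else is illustrative.

#include <stdio.h>

enum lru_list {
	INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE,
	NR_LRU_LISTS
};

#define BIT(n)		(1U << (n))
#define LRU_ALL_FILE	(BIT(INACTIVE_FILE) | BIT(ACTIVE_FILE))
#define LRU_ALL_ANON	(BIT(INACTIVE_ANON) | BIT(ACTIVE_ANON))
#define LRU_ALL		((1U << NR_LRU_LISTS) - 1)

static unsigned long nr_lru_pages(const unsigned long counts[NR_LRU_LISTS],
				  unsigned int lru_mask)
{
	unsigned long nr = 0;

	for (int lru = 0; lru < NR_LRU_LISTS; lru++)
		if (BIT(lru) & lru_mask)
			nr += counts[lru];
	return nr;
}

int main(void)
{
	unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

	printf("file=%lu anon=%lu all=%lu\n",
	       nr_lru_pages(counts, LRU_ALL_FILE),
	       nr_lru_pages(counts, LRU_ALL_ANON),
	       nr_lru_pages(counts, LRU_ALL));
	return 0;
}
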
4040
+
33914041 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33924042 {
33934043 struct numa_stat {
....@@ -3403,40 +4053,60 @@
34034053 };
34044054 const struct numa_stat *stat;
34054055 int nid;
3406
- unsigned long nr;
3407
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4056
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34084057
34094058 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3410
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3411
- seq_printf(m, "%s=%lu", stat->name, nr);
3412
- for_each_node_state(nid, N_MEMORY) {
3413
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3414
- stat->lru_mask);
3415
- seq_printf(m, " N%d=%lu", nid, nr);
3416
- }
4059
+ seq_printf(m, "%s=%lu", stat->name,
4060
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4061
+ false));
4062
+ for_each_node_state(nid, N_MEMORY)
4063
+ seq_printf(m, " N%d=%lu", nid,
4064
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4065
+ stat->lru_mask, false));
34174066 seq_putc(m, '\n');
34184067 }
34194068
34204069 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3421
- struct mem_cgroup *iter;
34224070
3423
- nr = 0;
3424
- for_each_mem_cgroup_tree(iter, memcg)
3425
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3426
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3427
- for_each_node_state(nid, N_MEMORY) {
3428
- nr = 0;
3429
- for_each_mem_cgroup_tree(iter, memcg)
3430
- nr += mem_cgroup_node_nr_lru_pages(
3431
- iter, nid, stat->lru_mask);
3432
- seq_printf(m, " N%d=%lu", nid, nr);
3433
- }
4071
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4072
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4073
+ true));
4074
+ for_each_node_state(nid, N_MEMORY)
4075
+ seq_printf(m, " N%d=%lu", nid,
4076
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4077
+ stat->lru_mask, true));
34344078 seq_putc(m, '\n');
34354079 }
34364080
34374081 return 0;
34384082 }
34394083 #endif /* CONFIG_NUMA */
4084
+
4085
+static const unsigned int memcg1_stats[] = {
4086
+ NR_FILE_PAGES,
4087
+ NR_ANON_MAPPED,
4088
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4089
+ NR_ANON_THPS,
4090
+#endif
4091
+ NR_SHMEM,
4092
+ NR_FILE_MAPPED,
4093
+ NR_FILE_DIRTY,
4094
+ NR_WRITEBACK,
4095
+ MEMCG_SWAP,
4096
+};
4097
+
4098
+static const char *const memcg1_stat_names[] = {
4099
+ "cache",
4100
+ "rss",
4101
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4102
+ "rss_huge",
4103
+#endif
4104
+ "shmem",
4105
+ "mapped_file",
4106
+ "dirty",
4107
+ "writeback",
4108
+ "swap",
4109
+};
34404110
34414111 /* Universal VM events cgroup1 shows, original sort order */
34424112 static const unsigned int memcg1_events[] = {
....@@ -3446,45 +4116,42 @@
34464116 PGMAJFAULT,
34474117 };
34484118
3449
-static const char *const memcg1_event_names[] = {
3450
- "pgpgin",
3451
- "pgpgout",
3452
- "pgfault",
3453
- "pgmajfault",
3454
-};
3455
-
34564119 static int memcg_stat_show(struct seq_file *m, void *v)
34574120 {
3458
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4121
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34594122 unsigned long memory, memsw;
34604123 struct mem_cgroup *mi;
34614124 unsigned int i;
3462
- struct accumulated_stats acc;
34634125
34644126 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3465
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34664127
34674128 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4129
+ unsigned long nr;
4130
+
34684131 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34694132 continue;
3470
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3471
- memcg_page_state(memcg, memcg1_stats[i]) *
3472
- PAGE_SIZE);
4133
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4134
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4135
+ if (memcg1_stats[i] == NR_ANON_THPS)
4136
+ nr *= HPAGE_PMD_NR;
4137
+#endif
4138
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34734139 }
34744140
34754141 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3476
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3477
- memcg_sum_events(memcg, memcg1_events[i]));
4142
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4143
+ memcg_events_local(memcg, memcg1_events[i]));
34784144
34794145 for (i = 0; i < NR_LRU_LISTS; i++)
3480
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3481
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4146
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4147
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4148
+ PAGE_SIZE);
34824149
34834150 /* Hierarchical information */
34844151 memory = memsw = PAGE_COUNTER_MAX;
34854152 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3486
- memory = min(memory, mi->memory.max);
3487
- memsw = min(memsw, mi->memsw.max);
4153
+ memory = min(memory, READ_ONCE(mi->memory.max));
4154
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34884155 }
34894156 seq_printf(m, "hierarchical_memory_limit %llu\n",
34904157 (u64)memory * PAGE_SIZE);
....@@ -3492,49 +4159,45 @@
34924159 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34934160 (u64)memsw * PAGE_SIZE);
34944161
3495
- memset(&acc, 0, sizeof(acc));
3496
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3497
- acc.stats_array = memcg1_stats;
3498
- acc.events_size = ARRAY_SIZE(memcg1_events);
3499
- acc.events_array = memcg1_events;
3500
- accumulate_memcg_tree(memcg, &acc);
3501
-
35024162 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4163
+ unsigned long nr;
4164
+
35034165 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35044166 continue;
4167
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4168
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4169
+ if (memcg1_stats[i] == NR_ANON_THPS)
4170
+ nr *= HPAGE_PMD_NR;
4171
+#endif
35054172 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3506
- (u64)acc.stat[i] * PAGE_SIZE);
4173
+ (u64)nr * PAGE_SIZE);
35074174 }
35084175
35094176 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3510
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3511
- (u64)acc.events[i]);
4177
+ seq_printf(m, "total_%s %llu\n",
4178
+ vm_event_name(memcg1_events[i]),
4179
+ (u64)memcg_events(memcg, memcg1_events[i]));
35124180
35134181 for (i = 0; i < NR_LRU_LISTS; i++)
3514
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3515
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4182
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4183
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4184
+ PAGE_SIZE);
35164185
35174186 #ifdef CONFIG_DEBUG_VM
35184187 {
35194188 pg_data_t *pgdat;
35204189 struct mem_cgroup_per_node *mz;
3521
- struct zone_reclaim_stat *rstat;
3522
- unsigned long recent_rotated[2] = {0, 0};
3523
- unsigned long recent_scanned[2] = {0, 0};
4190
+ unsigned long anon_cost = 0;
4191
+ unsigned long file_cost = 0;
35244192
35254193 for_each_online_pgdat(pgdat) {
35264194 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3527
- rstat = &mz->lruvec.reclaim_stat;
35284195
3529
- recent_rotated[0] += rstat->recent_rotated[0];
3530
- recent_rotated[1] += rstat->recent_rotated[1];
3531
- recent_scanned[0] += rstat->recent_scanned[0];
3532
- recent_scanned[1] += rstat->recent_scanned[1];
4196
+ anon_cost += mz->lruvec.anon_cost;
4197
+ file_cost += mz->lruvec.file_cost;
35334198 }
3534
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3535
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3536
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3537
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4199
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4200
+ seq_printf(m, "file_cost %lu\n", file_cost);
35384201 }
35394202 #endif
35404203
....@@ -3693,8 +4356,7 @@
36934356 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36944357
36954358 /* Allocate memory for new array of thresholds */
3696
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3697
- GFP_KERNEL);
4359
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36984360 if (!new) {
36994361 ret = -ENOMEM;
37004362 goto unlock;
....@@ -3702,17 +4364,16 @@
37024364 new->size = size;
37034365
37044366 /* Copy thresholds (if any) to new array */
3705
- if (thresholds->primary) {
3706
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3707
- sizeof(struct mem_cgroup_threshold));
3708
- }
4367
+ if (thresholds->primary)
4368
+ memcpy(new->entries, thresholds->primary->entries,
4369
+ flex_array_size(new, entries, size - 1));
37094370
37104371 /* Add new threshold */
37114372 new->entries[size - 1].eventfd = eventfd;
37124373 new->entries[size - 1].threshold = threshold;
37134374
37144375 /* Sort thresholds. Registering of new threshold isn't time-critical */
3715
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4376
+ sort(new->entries, size, sizeof(*new->entries),
37164377 compare_thresholds, NULL);
37174378
37184379 /* Find current threshold */
....@@ -3894,7 +4555,7 @@
38944555
38954556 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38964557 {
3897
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4558
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38984559
38994560 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
39004561 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3920,6 +4581,8 @@
39204581 }
39214582
39224583 #ifdef CONFIG_CGROUP_WRITEBACK
4584
+
4585
+#include <trace/events/writeback.h>
39234586
39244587 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39254588 {
....@@ -3952,11 +4615,11 @@
39524615 */
39534616 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39544617 {
3955
- long x = atomic_long_read(&memcg->stat[idx]);
4618
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39564619 int cpu;
39574620
39584621 for_each_online_cpu(cpu)
3959
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4622
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39604623 if (x < 0)
39614624 x = 0;
39624625 return x;
....@@ -3989,18 +4652,142 @@
39894652
39904653 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39914654
3992
- /* this should eventually include NR_UNSTABLE_NFS */
39934655 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3994
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3995
- (1 << LRU_ACTIVE_FILE));
4656
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4657
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39964658 *pheadroom = PAGE_COUNTER_MAX;
39974659
39984660 while ((parent = parent_mem_cgroup(memcg))) {
3999
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4661
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4662
+ READ_ONCE(memcg->memory.high));
40004663 unsigned long used = page_counter_read(&memcg->memory);
40014664
40024665 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40034666 memcg = parent;
4667
+ }
4668
+}
4669
+
4670
+/*
4671
+ * Foreign dirty flushing
4672
+ *
4673
+ * There's an inherent mismatch between memcg and writeback. The former
4674
+ * tracks ownership per-page while the latter per-inode. This was a
4675
+ * deliberate design decision because honoring per-page ownership in the
4676
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4677
+ * and deemed unnecessary given that write-sharing an inode across
4678
+ * different cgroups isn't a common use-case.
4679
+ *
4680
+ * Combined with inode majority-writer ownership switching, this works well
4681
+ * enough in most cases but there are some pathological cases. For
4682
+ * example, let's say there are two cgroups A and B which keep writing to
4683
+ * different but confined parts of the same inode. B owns the inode and
4684
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4685
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4686
+ * triggering background writeback. A will be slowed down without a way to
4687
+ * make writeback of the dirty pages happen.
4688
+ *
4689
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4690
+ * severely throttled after making some progress after each
4691
+ * dirty_expire_interval while the underlying IO device is almost
4692
+ * completely idle.
4693
+ *
4694
+ * Solving this problem completely requires matching the ownership tracking
4695
+ * granularities between memcg and writeback in either direction. However,
4696
+ * the more egregious behaviors can be avoided by simply remembering the
4697
+ * most recent foreign dirtying events and initiating remote flushes on
4698
+ * them when local writeback isn't enough to keep the memory clean enough.
4699
+ *
4700
+ * The following two functions implement such a mechanism. When a foreign
4701
+ * page - a page whose memcg and writeback ownerships don't match - is
4702
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4703
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4704
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4705
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4706
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4707
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4708
+ * limited to MEMCG_CGWB_FRN_CNT.
4709
+ *
4710
+ * The mechanism only remembers IDs and doesn't hold any object references.
4711
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4712
+ * records are lockless and racy.
4713
+ */
4714
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4715
+ struct bdi_writeback *wb)
4716
+{
4717
+ struct mem_cgroup *memcg = page->mem_cgroup;
4718
+ struct memcg_cgwb_frn *frn;
4719
+ u64 now = get_jiffies_64();
4720
+ u64 oldest_at = now;
4721
+ int oldest = -1;
4722
+ int i;
4723
+
4724
+ trace_track_foreign_dirty(page, wb);
4725
+
4726
+ /*
4727
+ * Pick the slot to use. If there is already a slot for @wb, keep
4728
+ * using it. If not replace the oldest one which isn't being
4729
+ * written out.
4730
+ */
4731
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4732
+ frn = &memcg->cgwb_frn[i];
4733
+ if (frn->bdi_id == wb->bdi->id &&
4734
+ frn->memcg_id == wb->memcg_css->id)
4735
+ break;
4736
+ if (time_before64(frn->at, oldest_at) &&
4737
+ atomic_read(&frn->done.cnt) == 1) {
4738
+ oldest = i;
4739
+ oldest_at = frn->at;
4740
+ }
4741
+ }
4742
+
4743
+ if (i < MEMCG_CGWB_FRN_CNT) {
4744
+ /*
4745
+ * Re-using an existing one. Update timestamp lazily to
4746
+ * avoid making the cacheline hot. We want them to be
4747
+ * reasonably up-to-date and significantly shorter than
4748
+ * dirty_expire_interval as that's what expires the record.
4749
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4750
+ */
4751
+ unsigned long update_intv =
4752
+ min_t(unsigned long, HZ,
4753
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4754
+
4755
+ if (time_before64(frn->at, now - update_intv))
4756
+ frn->at = now;
4757
+ } else if (oldest >= 0) {
4758
+ /* replace the oldest free one */
4759
+ frn = &memcg->cgwb_frn[oldest];
4760
+ frn->bdi_id = wb->bdi->id;
4761
+ frn->memcg_id = wb->memcg_css->id;
4762
+ frn->at = now;
4763
+ }
4764
+}
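
The slot-selection policy implemented above can be exercised on its own: reuse the slot that already matches (bdi_id, memcg_id), otherwise overwrite the oldest slot that is not in flight. A userland model follows; it is simplified in that the lazy timestamp refresh and the in-flight tracking via done.cnt are reduced to plain fields.

#include <stdio.h>

#define FRN_CNT 4	/* stands in for MEMCG_CGWB_FRN_CNT */

struct frn {
	int bdi_id, memcg_id;
	unsigned long long at;	/* timestamp, 0 means never used */
	int in_flight;		/* stands in for done.cnt != 1 */
};

static void track(struct frn frns[FRN_CNT], int bdi_id, int memcg_id,
		  unsigned long long now)
{
	int i, oldest = -1;
	unsigned long long oldest_at = now;

	for (i = 0; i < FRN_CNT; i++) {
		if (frns[i].bdi_id == bdi_id && frns[i].memcg_id == memcg_id)
			break;			/* already have a slot for @wb */
		if (frns[i].at < oldest_at && !frns[i].in_flight) {
			oldest = i;
			oldest_at = frns[i].at;
		}
	}
	if (i < FRN_CNT) {
		frns[i].at = now;		/* refresh the existing record */
	} else if (oldest >= 0) {
		frns[oldest].bdi_id = bdi_id;	/* replace the oldest free one */
		frns[oldest].memcg_id = memcg_id;
		frns[oldest].at = now;
	}
}

int main(void)
{
	struct frn frns[FRN_CNT] = { 0 };

	track(frns, 1, 42, 100);
	track(frns, 2, 42, 110);
	track(frns, 1, 42, 120);	/* reuses slot 0 */
	printf("slot0: bdi=%d at=%llu\n", frns[0].bdi_id, frns[0].at);
	return 0;
}
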
4765
+
4766
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4767
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4768
+{
4769
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4770
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4771
+ u64 now = jiffies_64;
4772
+ int i;
4773
+
4774
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4775
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4776
+
4777
+ /*
4778
+ * If the record is older than dirty_expire_interval,
4779
+ * writeback on it has already started. No need to kick it
4780
+ * off again. Also, don't start a new one if there's
4781
+ * already one in flight.
4782
+ */
4783
+ if (time_after64(frn->at, now - intv) &&
4784
+ atomic_read(&frn->done.cnt) == 1) {
4785
+ frn->at = 0;
4786
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4787
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4788
+ WB_REASON_FOREIGN_FLUSH,
4789
+ &frn->done);
4790
+ }
40044791 }
40054792 }
40064793
....@@ -4123,6 +4910,7 @@
41234910 unsigned int efd, cfd;
41244911 struct fd efile;
41254912 struct fd cfile;
4913
+ struct dentry *cdentry;
41264914 const char *name;
41274915 char *endp;
41284916 int ret;
....@@ -4174,6 +4962,16 @@
41744962 goto out_put_cfile;
41754963
41764964 /*
4965
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4966
+ * file can't be renamed, it's safe to access its name afterwards.
4967
+ */
4968
+ cdentry = cfile.file->f_path.dentry;
4969
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4970
+ ret = -EINVAL;
4971
+ goto out_put_cfile;
4972
+ }
4973
+
4974
+ /*
41774975 * Determine the event callbacks and set them in @event. This used
41784976 * to be done via struct cftype but cgroup core no longer knows
41794977 * about these events. The following is crude but the whole thing
....@@ -4181,7 +4979,7 @@
41814979 *
41824980 * DO NOT ADD NEW FILES.
41834981 */
4184
- name = cfile.file->f_path.dentry->d_name.name;
4982
+ name = cdentry->d_name.name;
41854983
41864984 if (!strcmp(name, "memory.usage_in_bytes")) {
41874985 event->register_event = mem_cgroup_usage_register_event;
....@@ -4205,7 +5003,7 @@
42055003 * automatically removed on cgroup destruction but the removal is
42065004 * asynchronous, so take an extra ref on @css.
42075005 */
4208
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5006
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42095007 &memory_cgrp_subsys);
42105008 ret = -EINVAL;
42115009 if (IS_ERR(cfile_css))
....@@ -4340,12 +5138,10 @@
43405138 .write = mem_cgroup_reset,
43415139 .read_u64 = mem_cgroup_read_u64,
43425140 },
4343
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5141
+#if defined(CONFIG_MEMCG_KMEM) && \
5142
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43445143 {
43455144 .name = "kmem.slabinfo",
4346
- .seq_start = memcg_slab_start,
4347
- .seq_next = memcg_slab_next,
4348
- .seq_stop = memcg_slab_stop,
43495145 .seq_show = memcg_slab_show,
43505146 },
43515147 #endif
....@@ -4383,7 +5179,7 @@
43835179 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43845180 * memory-controlled cgroups to 64k.
43855181 *
4386
- * However, there usually are many references to the oflline CSS after
5182
+ * However, there usually are many references to the offline CSS after
43875183 * the cgroup has been destroyed, such as page cache or reclaimable
43885184 * slab objects, that don't need to hang on to the ID. We want to keep
43895185 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4404,31 +5200,26 @@
44045200 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44055201 {
44065202 if (memcg->id.id > 0) {
5203
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44075204 idr_remove(&mem_cgroup_idr, memcg->id.id);
44085205 memcg->id.id = 0;
44095206 }
44105207 }
44115208
4412
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5209
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5210
+ unsigned int n)
44135211 {
4414
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4415
- atomic_add(n, &memcg->id.ref);
5212
+ refcount_add(n, &memcg->id.ref);
44165213 }
44175214
44185215 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44195216 {
4420
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4421
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5217
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44225218 mem_cgroup_id_remove(memcg);
44235219
44245220 /* Memcg ID pins CSS */
44255221 css_put(&memcg->css);
44265222 }
4427
-}
4428
-
4429
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4430
-{
4431
- mem_cgroup_id_get_many(memcg, 1);
44325223 }
44335224
44345225 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4447,6 +5238,7 @@
44475238 WARN_ON_ONCE(!rcu_read_lock_held());
44485239 return idr_find(&mem_cgroup_idr, id);
44495240 }
5241
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44505242
44515243 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44525244 {
....@@ -4466,8 +5258,17 @@
44665258 if (!pn)
44675259 return 1;
44685260
4469
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5261
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5262
+ GFP_KERNEL_ACCOUNT);
5263
+ if (!pn->lruvec_stat_local) {
5264
+ kfree(pn);
5265
+ return 1;
5266
+ }
5267
+
5268
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5269
+ GFP_KERNEL_ACCOUNT);
44705270 if (!pn->lruvec_stat_cpu) {
5271
+ free_percpu(pn->lruvec_stat_local);
44715272 kfree(pn);
44725273 return 1;
44735274 }
....@@ -4489,6 +5290,7 @@
44895290 return;
44905291
44915292 free_percpu(pn->lruvec_stat_cpu);
5293
+ free_percpu(pn->lruvec_stat_local);
44925294 kfree(pn);
44935295 }
44945296
....@@ -4496,39 +5298,57 @@
44965298 {
44975299 int node;
44985300
5301
+ trace_android_vh_mem_cgroup_free(memcg);
44995302 for_each_node(node)
45005303 free_mem_cgroup_per_node_info(memcg, node);
4501
- free_percpu(memcg->stat_cpu);
5304
+ free_percpu(memcg->vmstats_percpu);
5305
+ free_percpu(memcg->vmstats_local);
45025306 kfree(memcg);
45035307 }
45045308
45055309 static void mem_cgroup_free(struct mem_cgroup *memcg)
45065310 {
45075311 memcg_wb_domain_exit(memcg);
5312
+ /*
5313
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5314
+ * on parent's and all ancestor levels.
5315
+ */
5316
+ memcg_flush_percpu_vmstats(memcg);
5317
+ memcg_flush_percpu_vmevents(memcg);
45085318 __mem_cgroup_free(memcg);
45095319 }
45105320
45115321 static struct mem_cgroup *mem_cgroup_alloc(void)
45125322 {
45135323 struct mem_cgroup *memcg;
4514
- size_t size;
5324
+ unsigned int size;
45155325 int node;
5326
+ int __maybe_unused i;
5327
+ long error = -ENOMEM;
45165328
45175329 size = sizeof(struct mem_cgroup);
45185330 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45195331
45205332 memcg = kzalloc(size, GFP_KERNEL);
45215333 if (!memcg)
4522
- return NULL;
5334
+ return ERR_PTR(error);
45235335
45245336 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45255337 1, MEM_CGROUP_ID_MAX,
45265338 GFP_KERNEL);
4527
- if (memcg->id.id < 0)
5339
+ if (memcg->id.id < 0) {
5340
+ error = memcg->id.id;
5341
+ goto fail;
5342
+ }
5343
+
5344
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5345
+ GFP_KERNEL_ACCOUNT);
5346
+ if (!memcg->vmstats_local)
45285347 goto fail;
45295348
4530
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4531
- if (!memcg->stat_cpu)
5349
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5350
+ GFP_KERNEL_ACCOUNT);
5351
+ if (!memcg->vmstats_percpu)
45325352 goto fail;
45335353
45345354 for_each_node(node)
....@@ -4539,7 +5359,6 @@
45395359 goto fail;
45405360
45415361 INIT_WORK(&memcg->high_work, high_work_func);
4542
- memcg->last_scanned_node = MAX_NUMNODES;
45435362 INIT_LIST_HEAD(&memcg->oom_notify);
45445363 mutex_init(&memcg->thresholds_lock);
45455364 spin_lock_init(&memcg->move_lock);
....@@ -4549,48 +5368,64 @@
45495368 memcg->socket_pressure = jiffies;
45505369 #ifdef CONFIG_MEMCG_KMEM
45515370 memcg->kmemcg_id = -1;
5371
+ INIT_LIST_HEAD(&memcg->objcg_list);
45525372 #endif
45535373 #ifdef CONFIG_CGROUP_WRITEBACK
45545374 INIT_LIST_HEAD(&memcg->cgwb_list);
5375
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5376
+ memcg->cgwb_frn[i].done =
5377
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5378
+#endif
5379
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5380
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5381
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5382
+ memcg->deferred_split_queue.split_queue_len = 0;
45555383 #endif
45565384 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5385
+ trace_android_vh_mem_cgroup_alloc(memcg);
45575386 return memcg;
45585387 fail:
45595388 mem_cgroup_id_remove(memcg);
45605389 __mem_cgroup_free(memcg);
4561
- return NULL;
5390
+ return ERR_PTR(error);
45625391 }
45635392
45645393 static struct cgroup_subsys_state * __ref
45655394 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45665395 {
45675396 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4568
- struct mem_cgroup *memcg;
5397
+ struct mem_cgroup *memcg, *old_memcg;
45695398 long error = -ENOMEM;
45705399
5400
+ old_memcg = set_active_memcg(parent);
45715401 memcg = mem_cgroup_alloc();
4572
- if (!memcg)
4573
- return ERR_PTR(error);
5402
+ set_active_memcg(old_memcg);
5403
+ if (IS_ERR(memcg))
5404
+ return ERR_CAST(memcg);
45745405
4575
- memcg->high = PAGE_COUNTER_MAX;
5406
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45765407 memcg->soft_limit = PAGE_COUNTER_MAX;
5408
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45775409 if (parent) {
45785410 memcg->swappiness = mem_cgroup_swappiness(parent);
45795411 memcg->oom_kill_disable = parent->oom_kill_disable;
45805412 }
4581
- if (parent && parent->use_hierarchy) {
5413
+ if (!parent) {
5414
+ page_counter_init(&memcg->memory, NULL);
5415
+ page_counter_init(&memcg->swap, NULL);
5416
+ page_counter_init(&memcg->kmem, NULL);
5417
+ page_counter_init(&memcg->tcpmem, NULL);
5418
+ } else if (parent->use_hierarchy) {
45825419 memcg->use_hierarchy = true;
45835420 page_counter_init(&memcg->memory, &parent->memory);
45845421 page_counter_init(&memcg->swap, &parent->swap);
4585
- page_counter_init(&memcg->memsw, &parent->memsw);
45865422 page_counter_init(&memcg->kmem, &parent->kmem);
45875423 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45885424 } else {
4589
- page_counter_init(&memcg->memory, NULL);
4590
- page_counter_init(&memcg->swap, NULL);
4591
- page_counter_init(&memcg->memsw, NULL);
4592
- page_counter_init(&memcg->kmem, NULL);
4593
- page_counter_init(&memcg->tcpmem, NULL);
5425
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5426
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5427
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5428
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45945429 /*
45955430 * Deeper hierachy with use_hierarchy == false doesn't make
45965431 * much sense so let cgroup subsystem know about this
....@@ -4617,7 +5452,7 @@
46175452 fail:
46185453 mem_cgroup_id_remove(memcg);
46195454 mem_cgroup_free(memcg);
4620
- return ERR_PTR(-ENOMEM);
5455
+ return ERR_PTR(error);
46215456 }
46225457
46235458 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4635,8 +5470,9 @@
46355470 }
46365471
46375472 /* Online state pins memcg ID, memcg ID pins CSS */
4638
- atomic_set(&memcg->id.ref, 1);
5473
+ refcount_set(&memcg->id.ref, 1);
46395474 css_get(css);
5475
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46405476 return 0;
46415477 }
46425478
....@@ -4645,6 +5481,7 @@
46455481 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46465482 struct mem_cgroup_event *event, *tmp;
46475483
5484
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46485485 /*
46495486 * Unregister events and notify userspace.
46505487 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4663,6 +5500,8 @@
46635500 memcg_offline_kmem(memcg);
46645501 wb_memcg_offline(memcg);
46655502
5503
+ drain_all_stock(memcg);
5504
+
46665505 mem_cgroup_id_put(memcg);
46675506 }
46685507
....@@ -4676,7 +5515,12 @@
46765515 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46775516 {
46785517 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5518
+ int __maybe_unused i;
46795519
5520
+#ifdef CONFIG_CGROUP_WRITEBACK
5521
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5522
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5523
+#endif
46805524 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46815525 static_branch_dec(&memcg_sockets_enabled_key);
46825526
....@@ -4710,13 +5554,13 @@
47105554
47115555 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47125556 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4713
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47145557 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47155558 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47165559 page_counter_set_min(&memcg->memory, 0);
47175560 page_counter_set_low(&memcg->memory, 0);
4718
- memcg->high = PAGE_COUNTER_MAX;
5561
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47195562 memcg->soft_limit = PAGE_COUNTER_MAX;
5563
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47205564 memcg_wb_domain_size_changed(memcg);
47215565 }
47225566
....@@ -4759,7 +5603,7 @@
47595603 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47605604 unsigned long addr, pte_t ptent)
47615605 {
4762
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5606
+ struct page *page = vm_normal_page(vma, addr, ptent);
47635607
47645608 if (!page || !page_mapped(page))
47655609 return NULL;
....@@ -4810,8 +5654,7 @@
48105654 * we call find_get_page() with swapper_space directly.
48115655 */
48125656 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4813
- if (do_memsw_account())
4814
- entry->val = ent.val;
5657
+ entry->val = ent.val;
48155658
48165659 return page;
48175660 }
....@@ -4826,36 +5669,15 @@
48265669 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48275670 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48285671 {
4829
- struct page *page = NULL;
4830
- struct address_space *mapping;
4831
- pgoff_t pgoff;
4832
-
48335672 if (!vma->vm_file) /* anonymous vma */
48345673 return NULL;
48355674 if (!(mc.flags & MOVE_FILE))
48365675 return NULL;
48375676
4838
- mapping = vma->vm_file->f_mapping;
4839
- pgoff = linear_page_index(vma, addr);
4840
-
48415677 /* page is moved even if it's not RSS of this task(page-faulted). */
4842
-#ifdef CONFIG_SWAP
48435678 /* shmem/tmpfs may report page out on swap: account for that too. */
4844
- if (shmem_mapping(mapping)) {
4845
- page = find_get_entry(mapping, pgoff);
4846
- if (radix_tree_exceptional_entry(page)) {
4847
- swp_entry_t swp = radix_to_swp_entry(page);
4848
- if (do_memsw_account())
4849
- *entry = swp;
4850
- page = find_get_page(swap_address_space(swp),
4851
- swp_offset(swp));
4852
- }
4853
- } else
4854
- page = find_get_page(mapping, pgoff);
4855
-#else
4856
- page = find_get_page(mapping, pgoff);
4857
-#endif
4858
- return page;
5679
+ return find_get_incore_page(vma->vm_file->f_mapping,
5680
+ linear_page_index(vma, addr));
48595681 }
48605682
48615683 /**
....@@ -4875,10 +5697,10 @@
48755697 struct mem_cgroup *from,
48765698 struct mem_cgroup *to)
48775699 {
4878
- unsigned long flags;
4879
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5700
+ struct lruvec *from_vec, *to_vec;
5701
+ struct pglist_data *pgdat;
5702
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48805703 int ret;
4881
- bool anon;
48825704
48835705 VM_BUG_ON(from == to);
48845706 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4896,52 +5718,83 @@
48965718 if (page->mem_cgroup != from)
48975719 goto out_unlock;
48985720
4899
- anon = PageAnon(page);
5721
+ pgdat = page_pgdat(page);
5722
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5723
+ to_vec = mem_cgroup_lruvec(to, pgdat);
49005724
4901
- spin_lock_irqsave(&from->move_lock, flags);
5725
+ lock_page_memcg(page);
49025726
4903
- if (!anon && page_mapped(page)) {
4904
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4905
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4906
- }
5727
+ if (PageAnon(page)) {
5728
+ if (page_mapped(page)) {
5729
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5730
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5731
+ if (PageTransHuge(page)) {
5732
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5733
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5734
+ }
49075735
4908
- /*
4909
- * move_lock grabbed above and caller set from->moving_account, so
4910
- * mod_memcg_page_state will serialize updates to PageDirty.
4911
- * So mapping should be stable for dirty pages.
4912
- */
4913
- if (!anon && PageDirty(page)) {
4914
- struct address_space *mapping = page_mapping(page);
5736
+ }
5737
+ } else {
5738
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49155740
4916
- if (mapping_cap_account_dirty(mapping)) {
4917
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4918
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5741
+ if (PageSwapBacked(page)) {
5742
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5743
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5744
+ }
5745
+
5746
+ if (page_mapped(page)) {
5747
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5748
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5749
+ }
5750
+
5751
+ if (PageDirty(page)) {
5752
+ struct address_space *mapping = page_mapping(page);
5753
+
5754
+ if (mapping_can_writeback(mapping)) {
5755
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5756
+ -nr_pages);
5757
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5758
+ nr_pages);
5759
+ }
49195760 }
49205761 }
49215762
49225763 if (PageWriteback(page)) {
4923
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4924
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5764
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5765
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49255766 }
49265767
49275768 /*
5769
+ * All state has been migrated, let's switch to the new memcg.
5770
+ *
49285771 * It is safe to change page->mem_cgroup here because the page
4929
- * is referenced, charged, and isolated - we can't race with
4930
- * uncharging, charging, migration, or LRU putback.
5772
+ * is referenced, charged, isolated, and locked: we can't race
5773
+ * with (un)charging, migration, LRU putback, or anything else
5774
+ * that would rely on a stable page->mem_cgroup.
5775
+ *
5776
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5777
+ * to save space. As soon as we switch page->mem_cgroup to a
5778
+ * new memcg that isn't locked, the above state can change
5779
+ * concurrently again. Make sure we're truly done with it.
49315780 */
5781
+ smp_mb();
49325782
4933
- /* caller should have done css_get */
5783
+ css_get(&to->css);
5784
+ css_put(&from->css);
5785
+
49345786 page->mem_cgroup = to;
4935
- spin_unlock_irqrestore(&from->move_lock, flags);
5787
+
5788
+ __unlock_page_memcg(from);
49365789
49375790 ret = 0;
49385791
4939
- local_lock_irq(event_lock);
4940
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5792
+ local_lock_irq(&event_lock.l);
5793
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49415794 memcg_check_events(to, page);
4942
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5795
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49435796 memcg_check_events(from, page);
4944
- local_unlock_irq(event_lock);
5797
+ local_unlock_irq(&event_lock.l);
49455798 out_unlock:
49465799 unlock_page(page);
49475800 out:
....@@ -4963,8 +5816,8 @@
49635816 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49645817 * target for charge migration. if @target is not NULL, the entry is stored
49655818 * in target->ent.
4966
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4967
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5819
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5820
+ * (so ZONE_DEVICE page and thus not on the lru).
49685821 * For now we such page is charge like a regular page would be as for all
49695822 * intent and purposes it is just special memory taking the place of a
49705823 * regular page.
....@@ -4998,8 +5851,7 @@
49985851 */
49995852 if (page->mem_cgroup == mc.from) {
50005853 ret = MC_TARGET_PAGE;
5001
- if (is_device_private_page(page) ||
5002
- is_device_public_page(page))
5854
+ if (is_device_private_page(page))
50035855 ret = MC_TARGET_DEVICE;
50045856 if (target)
50055857 target->page = page;
....@@ -5070,8 +5922,8 @@
50705922 if (ptl) {
50715923 /*
50725924 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5073
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5074
- * MEMORY_DEVICE_PRIVATE but this might change.
5925
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5926
+ * this might change.
50755927 */
50765928 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50775929 mc.precharge += HPAGE_PMD_NR;
....@@ -5091,18 +5943,17 @@
50915943 return 0;
50925944 }
50935945
5946
+static const struct mm_walk_ops precharge_walk_ops = {
5947
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5948
+};
5949
+
50945950 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50955951 {
50965952 unsigned long precharge;
50975953
5098
- struct mm_walk mem_cgroup_count_precharge_walk = {
5099
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5100
- .mm = mm,
5101
- };
5102
- down_read(&mm->mmap_sem);
5103
- walk_page_range(0, mm->highest_vm_end,
5104
- &mem_cgroup_count_precharge_walk);
5105
- up_read(&mm->mmap_sem);
5954
+ mmap_read_lock(mm);
5955
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5956
+ mmap_read_unlock(mm);
51065957
51075958 precharge = mc.precharge;
51085959 mc.precharge = 0;
....@@ -5152,8 +6003,6 @@
51526003 */
51536004 if (!mem_cgroup_is_root(mc.to))
51546005 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5155
-
5156
- css_put_many(&mc.to->css, mc.moved_swap);
51576006
51586007 mc.moved_swap = 0;
51596008 }
....@@ -5315,7 +6164,7 @@
53156164 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53166165 case MC_TARGET_DEVICE:
53176166 device = true;
5318
- /* fall through */
6167
+ fallthrough;
53196168 case MC_TARGET_PAGE:
53206169 page = target.page;
53216170 /*
....@@ -5370,13 +6219,12 @@
53706219 return ret;
53716220 }
53726221
6222
+static const struct mm_walk_ops charge_walk_ops = {
6223
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6224
+};
6225
+
53736226 static void mem_cgroup_move_charge(void)
53746227 {
5375
- struct mm_walk mem_cgroup_move_charge_walk = {
5376
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5377
- .mm = mc.mm,
5378
- };
5379
-
53806228 lru_add_drain_all();
53816229 /*
53826230 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5386,9 +6234,9 @@
53866234 atomic_inc(&mc.from->moving_account);
53876235 synchronize_rcu();
53886236 retry:
5389
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6237
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53906238 /*
5391
- * Someone who are holding the mmap_sem might be waiting in
6239
+ * Someone who are holding the mmap_lock might be waiting in
53926240 * waitq. So we cancel all extra charges, wake up all waiters,
53936241 * and retry. Because we cancel precharges, we might not be able
53946242 * to move enough charges, but moving charge is a best-effort
....@@ -5402,9 +6250,10 @@
54026250 * When we have consumed all precharges and failed in doing
54036251 * additional charge, the page walk just aborts.
54046252 */
5405
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6253
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6254
+ NULL);
54066255
5407
- up_read(&mc.mm->mmap_sem);
6256
+ mmap_read_unlock(mc.mm);
54086257 atomic_dec(&mc.from->moving_account);
54096258 }
54106259
....@@ -5446,6 +6295,16 @@
54466295 root_mem_cgroup->use_hierarchy = false;
54476296 }
54486297
6298
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6299
+{
6300
+ if (value == PAGE_COUNTER_MAX)
6301
+ seq_puts(m, "max\n");
6302
+ else
6303
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6304
+
6305
+ return 0;
6306
+}
6307
+
54496308 static u64 memory_current_read(struct cgroup_subsys_state *css,
54506309 struct cftype *cft)
54516310 {
....@@ -5456,15 +6315,8 @@
54566315
54576316 static int memory_min_show(struct seq_file *m, void *v)
54586317 {
5459
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5460
- unsigned long min = READ_ONCE(memcg->memory.min);
5461
-
5462
- if (min == PAGE_COUNTER_MAX)
5463
- seq_puts(m, "max\n");
5464
- else
5465
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5466
-
5467
- return 0;
6318
+ return seq_puts_memcg_tunable(m,
6319
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54686320 }
54696321
54706322 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5486,15 +6338,8 @@
54866338
54876339 static int memory_low_show(struct seq_file *m, void *v)
54886340 {
5489
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5490
- unsigned long low = READ_ONCE(memcg->memory.low);
5491
-
5492
- if (low == PAGE_COUNTER_MAX)
5493
- seq_puts(m, "max\n");
5494
- else
5495
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5496
-
5497
- return 0;
6341
+ return seq_puts_memcg_tunable(m,
6342
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54986343 }
54996344
55006345 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5516,22 +6361,16 @@
55166361
55176362 static int memory_high_show(struct seq_file *m, void *v)
55186363 {
5519
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5520
- unsigned long high = READ_ONCE(memcg->high);
5521
-
5522
- if (high == PAGE_COUNTER_MAX)
5523
- seq_puts(m, "max\n");
5524
- else
5525
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5526
-
5527
- return 0;
6364
+ return seq_puts_memcg_tunable(m,
6365
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55286366 }
55296367
55306368 static ssize_t memory_high_write(struct kernfs_open_file *of,
55316369 char *buf, size_t nbytes, loff_t off)
55326370 {
55336371 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5534
- unsigned long nr_pages;
6372
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6373
+ bool drained = false;
55356374 unsigned long high;
55366375 int err;
55376376
....@@ -5540,12 +6379,30 @@
55406379 if (err)
55416380 return err;
55426381
5543
- memcg->high = high;
6382
+ page_counter_set_high(&memcg->memory, high);
55446383
5545
- nr_pages = page_counter_read(&memcg->memory);
5546
- if (nr_pages > high)
5547
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5548
- GFP_KERNEL, true);
6384
+ for (;;) {
6385
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6386
+ unsigned long reclaimed;
6387
+
6388
+ if (nr_pages <= high)
6389
+ break;
6390
+
6391
+ if (signal_pending(current))
6392
+ break;
6393
+
6394
+ if (!drained) {
6395
+ drain_all_stock(memcg);
6396
+ drained = true;
6397
+ continue;
6398
+ }
6399
+
6400
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6401
+ GFP_KERNEL, true);
6402
+
6403
+ if (!reclaimed && !nr_retries--)
6404
+ break;
6405
+ }
55496406
55506407 memcg_wb_domain_size_changed(memcg);
55516408 return nbytes;
....@@ -5553,22 +6410,15 @@
55536410
55546411 static int memory_max_show(struct seq_file *m, void *v)
55556412 {
5556
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5557
- unsigned long max = READ_ONCE(memcg->memory.max);
5558
-
5559
- if (max == PAGE_COUNTER_MAX)
5560
- seq_puts(m, "max\n");
5561
- else
5562
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5563
-
5564
- return 0;
6413
+ return seq_puts_memcg_tunable(m,
6414
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55656415 }
55666416
55676417 static ssize_t memory_max_write(struct kernfs_open_file *of,
55686418 char *buf, size_t nbytes, loff_t off)
55696419 {
55706420 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5571
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6421
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55726422 bool drained = false;
55736423 unsigned long max;
55746424 int err;
....@@ -5586,10 +6436,8 @@
55866436 if (nr_pages <= max)
55876437 break;
55886438
5589
- if (signal_pending(current)) {
5590
- err = -EINTR;
6439
+ if (signal_pending(current))
55916440 break;
5592
- }
55936441
55946442 if (!drained) {
55956443 drain_all_stock(memcg);
....@@ -5613,104 +6461,77 @@
56136461 return nbytes;
56146462 }
56156463
6464
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6465
+{
6466
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6467
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6468
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6469
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6470
+ seq_printf(m, "oom_kill %lu\n",
6471
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6472
+}
6473
+
56166474 static int memory_events_show(struct seq_file *m, void *v)
56176475 {
5618
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6476
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56196477
5620
- seq_printf(m, "low %lu\n",
5621
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5622
- seq_printf(m, "high %lu\n",
5623
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5624
- seq_printf(m, "max %lu\n",
5625
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5626
- seq_printf(m, "oom %lu\n",
5627
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5628
- seq_printf(m, "oom_kill %lu\n",
5629
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6478
+ __memory_events_show(m, memcg->memory_events);
6479
+ return 0;
6480
+}
56306481
6482
+static int memory_events_local_show(struct seq_file *m, void *v)
6483
+{
6484
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6485
+
6486
+ __memory_events_show(m, memcg->memory_events_local);
56316487 return 0;
56326488 }
56336489
56346490 static int memory_stat_show(struct seq_file *m, void *v)
56356491 {
5636
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5637
- struct accumulated_stats acc;
5638
- int i;
6492
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6493
+ char *buf;
56396494
5640
- /*
5641
- * Provide statistics on the state of the memory subsystem as
5642
- * well as cumulative event counters that show past behavior.
5643
- *
5644
- * This list is ordered following a combination of these gradients:
5645
- * 1) generic big picture -> specifics and details
5646
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5647
- *
5648
- * Current memory state:
5649
- */
5650
-
5651
- memset(&acc, 0, sizeof(acc));
5652
- acc.stats_size = MEMCG_NR_STAT;
5653
- acc.events_size = NR_VM_EVENT_ITEMS;
5654
- accumulate_memcg_tree(memcg, &acc);
5655
-
5656
- seq_printf(m, "anon %llu\n",
5657
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5658
- seq_printf(m, "file %llu\n",
5659
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5660
- seq_printf(m, "kernel_stack %llu\n",
5661
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5662
- seq_printf(m, "slab %llu\n",
5663
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5664
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5665
- seq_printf(m, "sock %llu\n",
5666
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5667
-
5668
- seq_printf(m, "shmem %llu\n",
5669
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5670
- seq_printf(m, "file_mapped %llu\n",
5671
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5672
- seq_printf(m, "file_dirty %llu\n",
5673
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5674
- seq_printf(m, "file_writeback %llu\n",
5675
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5676
-
5677
- for (i = 0; i < NR_LRU_LISTS; i++)
5678
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5679
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5680
-
5681
- seq_printf(m, "slab_reclaimable %llu\n",
5682
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5683
- seq_printf(m, "slab_unreclaimable %llu\n",
5684
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5685
-
5686
- /* Accumulated memory events */
5687
-
5688
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5689
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5690
-
5691
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5692
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5693
- acc.events[PGSCAN_DIRECT]);
5694
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5695
- acc.events[PGSTEAL_DIRECT]);
5696
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5697
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5698
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5699
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5700
-
5701
- seq_printf(m, "workingset_refault %lu\n",
5702
- acc.stat[WORKINGSET_REFAULT]);
5703
- seq_printf(m, "workingset_activate %lu\n",
5704
- acc.stat[WORKINGSET_ACTIVATE]);
5705
- seq_printf(m, "workingset_nodereclaim %lu\n",
5706
- acc.stat[WORKINGSET_NODERECLAIM]);
5707
-
6495
+ buf = memory_stat_format(memcg);
6496
+ if (!buf)
6497
+ return -ENOMEM;
6498
+ seq_puts(m, buf);
6499
+ kfree(buf);
57086500 return 0;
57096501 }
57106502
6503
+#ifdef CONFIG_NUMA
6504
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6505
+{
6506
+ int i;
6507
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6508
+
6509
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6510
+ int nid;
6511
+
6512
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6513
+ continue;
6514
+
6515
+ seq_printf(m, "%s", memory_stats[i].name);
6516
+ for_each_node_state(nid, N_MEMORY) {
6517
+ u64 size;
6518
+ struct lruvec *lruvec;
6519
+
6520
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6521
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6522
+ size *= memory_stats[i].ratio;
6523
+ seq_printf(m, " N%d=%llu", nid, size);
6524
+ }
6525
+ seq_putc(m, '\n');
6526
+ }
6527
+
6528
+ return 0;
6529
+}
6530
+#endif
6531
+
57116532 static int memory_oom_group_show(struct seq_file *m, void *v)
57126533 {
5713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6534
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57146535
57156536 seq_printf(m, "%d\n", memcg->oom_group);
57166537
....@@ -5776,10 +6597,21 @@
57766597 .seq_show = memory_events_show,
57776598 },
57786599 {
5779
- .name = "stat",
6600
+ .name = "events.local",
57806601 .flags = CFTYPE_NOT_ON_ROOT,
6602
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6603
+ .seq_show = memory_events_local_show,
6604
+ },
6605
+ {
6606
+ .name = "stat",
57816607 .seq_show = memory_stat_show,
57826608 },
6609
+#ifdef CONFIG_NUMA
6610
+ {
6611
+ .name = "numa_stat",
6612
+ .seq_show = memory_numa_stat_show,
6613
+ },
6614
+#endif
57836615 {
57846616 .name = "oom.group",
57856617 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5805,6 +6637,122 @@
58056637 .early_init = 0,
58066638 };
58076639
6640
+/*
6641
+ * This function calculates an individual cgroup's effective
6642
+ * protection which is derived from its own memory.min/low, its
6643
+ * parent's and siblings' settings, as well as the actual memory
6644
+ * distribution in the tree.
6645
+ *
6646
+ * The following rules apply to the effective protection values:
6647
+ *
6648
+ * 1. At the first level of reclaim, effective protection is equal to
6649
+ * the declared protection in memory.min and memory.low.
6650
+ *
6651
+ * 2. To enable safe delegation of the protection configuration, at
6652
+ * subsequent levels the effective protection is capped to the
6653
+ * parent's effective protection.
6654
+ *
6655
+ * 3. To make complex and dynamic subtrees easier to configure, the
6656
+ * user is allowed to overcommit the declared protection at a given
6657
+ * level. If that is the case, the parent's effective protection is
6658
+ * distributed to the children in proportion to how much protection
6659
+ * they have declared and how much of it they are utilizing.
6660
+ *
6661
+ * This makes distribution proportional, but also work-conserving:
6662
+ * if one cgroup claims much more protection than it uses memory,
6663
+ * the unused remainder is available to its siblings.
6664
+ *
6665
+ * 4. Conversely, when the declared protection is undercommitted at a
6666
+ * given level, the distribution of the larger parental protection
6667
+ * budget is NOT proportional. A cgroup's protection from a sibling
6668
+ * is capped to its own memory.min/low setting.
6669
+ *
6670
+ * 5. However, to allow protecting recursive subtrees from each other
6671
+ * without having to declare each individual cgroup's fixed share
6672
+ * of the ancestor's claim to protection, any unutilized -
6673
+ * "floating" - protection from up the tree is distributed in
6674
+ * proportion to each cgroup's *usage*. This makes the protection
6675
+ * neutral wrt sibling cgroups and lets them compete freely over
6676
+ * the shared parental protection budget, but it protects the
6677
+ * subtree as a whole from neighboring subtrees.
6678
+ *
6679
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6680
+ * against immediate siblings whereas 5. is about protecting against
6681
+ * neighboring subtrees.
6682
+ */
6683
+static unsigned long effective_protection(unsigned long usage,
6684
+ unsigned long parent_usage,
6685
+ unsigned long setting,
6686
+ unsigned long parent_effective,
6687
+ unsigned long siblings_protected)
6688
+{
6689
+ unsigned long protected;
6690
+ unsigned long ep;
6691
+
6692
+ protected = min(usage, setting);
6693
+ /*
6694
+ * If all cgroups at this level combined claim and use more
6695
+ * protection than what the parent affords them, distribute
6696
+ * shares in proportion to utilization.
6697
+ *
6698
+ * We are using actual utilization rather than the statically
6699
+ * claimed protection in order to be work-conserving: claimed
6700
+ * but unused protection is available to siblings that would
6701
+ * otherwise get a smaller chunk than what they claimed.
6702
+ */
6703
+ if (siblings_protected > parent_effective)
6704
+ return protected * parent_effective / siblings_protected;
6705
+
6706
+ /*
6707
+ * Ok, utilized protection of all children is within what the
6708
+ * parent affords them, so we know whatever this child claims
6709
+ * and utilizes is effectively protected.
6710
+ *
6711
+ * If there is unprotected usage beyond this value, reclaim
6712
+ * will apply pressure in proportion to that amount.
6713
+ *
6714
+ * If there is unutilized protection, the cgroup will be fully
6715
+ * shielded from reclaim, but we do return a smaller value for
6716
+ * protection than what the group could enjoy in theory. This
6717
+ * is okay. With the overcommit distribution above, effective
6718
+ * protection is always dependent on how memory is actually
6719
+ * consumed among the siblings anyway.
6720
+ */
6721
+ ep = protected;
6722
+
6723
+ /*
6724
+ * If the children aren't claiming (all of) the protection
6725
+ * afforded to them by the parent, distribute the remainder in
6726
+ * proportion to the (unprotected) memory of each cgroup. That
6727
+ * way, cgroups that aren't explicitly prioritized wrt each
6728
+ * other compete freely over the allowance, but they are
6729
+ * collectively protected from neighboring trees.
6730
+ *
6731
+ * We're using unprotected memory for the weight so that if
6732
+ * some cgroups DO claim explicit protection, we don't protect
6733
+ * the same bytes twice.
6734
+ *
6735
+ * Check both usage and parent_usage against the respective
6736
+ * protected values. One should imply the other, but they
6737
+ * aren't read atomically - make sure the division is sane.
6738
+ */
6739
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6740
+ return ep;
6741
+ if (parent_effective > siblings_protected &&
6742
+ parent_usage > siblings_protected &&
6743
+ usage > protected) {
6744
+ unsigned long unclaimed;
6745
+
6746
+ unclaimed = parent_effective - siblings_protected;
6747
+ unclaimed *= usage - protected;
6748
+ unclaimed /= parent_usage - siblings_protected;
6749
+
6750
+ ep += unclaimed;
6751
+ }
6752
+
6753
+ return ep;
6754
+}
6755
+
58086756 /**
58096757 * mem_cgroup_protected - check if memory consumption is in the normal range
58106758 * @root: the top ancestor of the sub-tree being checked
....@@ -5812,259 +6760,125 @@
58126760 *
58136761 * WARNING: This function is not stateless! It can only be used as part
58146762 * of a top-down tree iteration, not for isolated queries.
5815
- *
5816
- * Returns one of the following:
5817
- * MEMCG_PROT_NONE: cgroup memory is not protected
5818
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5819
- * an unprotected supply of reclaimable memory from other cgroups.
5820
- * MEMCG_PROT_MIN: cgroup memory is protected
5821
- *
5822
- * @root is exclusive; it is never protected when looked at directly
5823
- *
5824
- * To provide a proper hierarchical behavior, effective memory.min/low values
5825
- * are used. Below is the description of how effective memory.low is calculated.
5826
- * Effective memory.min values is calculated in the same way.
5827
- *
5828
- * Effective memory.low is always equal or less than the original memory.low.
5829
- * If there is no memory.low overcommittment (which is always true for
5830
- * top-level memory cgroups), these two values are equal.
5831
- * Otherwise, it's a part of parent's effective memory.low,
5832
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5833
- * memory.low usages, where memory.low usage is the size of actually
5834
- * protected memory.
5835
- *
5836
- * low_usage
5837
- * elow = min( memory.low, parent->elow * ------------------ ),
5838
- * siblings_low_usage
5839
- *
5840
- * | memory.current, if memory.current < memory.low
5841
- * low_usage = |
5842
- | 0, otherwise.
5843
- *
5844
- *
5845
- * Such definition of the effective memory.low provides the expected
5846
- * hierarchical behavior: parent's memory.low value is limiting
5847
- * children, unprotected memory is reclaimed first and cgroups,
5848
- * which are not using their guarantee do not affect actual memory
5849
- * distribution.
5850
- *
5851
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5852
- *
5853
- * A A/memory.low = 2G, A/memory.current = 6G
5854
- * //\\
5855
- * BC DE B/memory.low = 3G B/memory.current = 2G
5856
- * C/memory.low = 1G C/memory.current = 2G
5857
- * D/memory.low = 0 D/memory.current = 2G
5858
- * E/memory.low = 10G E/memory.current = 0
5859
- *
5860
- * and the memory pressure is applied, the following memory distribution
5861
- * is expected (approximately):
5862
- *
5863
- * A/memory.current = 2G
5864
- *
5865
- * B/memory.current = 1.3G
5866
- * C/memory.current = 0.6G
5867
- * D/memory.current = 0
5868
- * E/memory.current = 0
5869
- *
5870
- * These calculations require constant tracking of the actual low usages
5871
- * (see propagate_protected_usage()), as well as recursive calculation of
5872
- * effective memory.low values. But as we do call mem_cgroup_protected()
5873
- * path for each memory cgroup top-down from the reclaim,
5874
- * it's possible to optimize this part, and save calculated elow
5875
- * for next usage. This part is intentionally racy, but it's ok,
5876
- * as memory.low is a best-effort mechanism.
58776763 */
5878
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5879
- struct mem_cgroup *memcg)
6764
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6765
+ struct mem_cgroup *memcg)
58806766 {
6767
+ unsigned long usage, parent_usage;
58816768 struct mem_cgroup *parent;
5882
- unsigned long emin, parent_emin;
5883
- unsigned long elow, parent_elow;
5884
- unsigned long usage;
58856769
58866770 if (mem_cgroup_disabled())
5887
- return MEMCG_PROT_NONE;
6771
+ return;
58886772
58896773 if (!root)
58906774 root = root_mem_cgroup;
6775
+
6776
+ /*
6777
+ * Effective values of the reclaim targets are ignored so they
6778
+ * can be stale. Have a look at mem_cgroup_protection for more
6779
+ * details.
6780
+ * TODO: calculation should be more robust so that we do not need
6781
+ * that special casing.
6782
+ */
58916783 if (memcg == root)
5892
- return MEMCG_PROT_NONE;
6784
+ return;
58936785
58946786 usage = page_counter_read(&memcg->memory);
58956787 if (!usage)
5896
- return MEMCG_PROT_NONE;
5897
-
5898
- emin = memcg->memory.min;
5899
- elow = memcg->memory.low;
6788
+ return;
59006789
59016790 parent = parent_mem_cgroup(memcg);
59026791 /* No parent means a non-hierarchical mode on v1 memcg */
59036792 if (!parent)
5904
- return MEMCG_PROT_NONE;
6793
+ return;
59056794
5906
- if (parent == root)
5907
- goto exit;
5908
-
5909
- parent_emin = READ_ONCE(parent->memory.emin);
5910
- emin = min(emin, parent_emin);
5911
- if (emin && parent_emin) {
5912
- unsigned long min_usage, siblings_min_usage;
5913
-
5914
- min_usage = min(usage, memcg->memory.min);
5915
- siblings_min_usage = atomic_long_read(
5916
- &parent->memory.children_min_usage);
5917
-
5918
- if (min_usage && siblings_min_usage)
5919
- emin = min(emin, parent_emin * min_usage /
5920
- siblings_min_usage);
6795
+ if (parent == root) {
6796
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6797
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6798
+ return;
59216799 }
59226800
5923
- parent_elow = READ_ONCE(parent->memory.elow);
5924
- elow = min(elow, parent_elow);
5925
- if (elow && parent_elow) {
5926
- unsigned long low_usage, siblings_low_usage;
6801
+ parent_usage = page_counter_read(&parent->memory);
59276802
5928
- low_usage = min(usage, memcg->memory.low);
5929
- siblings_low_usage = atomic_long_read(
5930
- &parent->memory.children_low_usage);
6803
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6804
+ READ_ONCE(memcg->memory.min),
6805
+ READ_ONCE(parent->memory.emin),
6806
+ atomic_long_read(&parent->memory.children_min_usage)));
59316807
5932
- if (low_usage && siblings_low_usage)
5933
- elow = min(elow, parent_elow * low_usage /
5934
- siblings_low_usage);
5935
- }
5936
-
5937
-exit:
5938
- memcg->memory.emin = emin;
5939
- memcg->memory.elow = elow;
5940
-
5941
- if (usage <= emin)
5942
- return MEMCG_PROT_MIN;
5943
- else if (usage <= elow)
5944
- return MEMCG_PROT_LOW;
5945
- else
5946
- return MEMCG_PROT_NONE;
6808
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6809
+ READ_ONCE(memcg->memory.low),
6810
+ READ_ONCE(parent->memory.elow),
6811
+ atomic_long_read(&parent->memory.children_low_usage)));
59476812 }
59486813
59496814 /**
5950
- * mem_cgroup_try_charge - try charging a page
6815
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59516816 * @page: page to charge
59526817 * @mm: mm context of the victim
59536818 * @gfp_mask: reclaim mode
5954
- * @memcgp: charged memcg return
5955
- * @compound: charge the page as compound or small page
59566819 *
59576820 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59586821 * pages according to @gfp_mask if necessary.
59596822 *
5960
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5961
- * Otherwise, an error code is returned.
5962
- *
5963
- * After page->mapping has been set up, the caller must finalize the
5964
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5965
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6823
+ * Returns 0 on success. Otherwise, an error code is returned.
59666824 */
5967
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5968
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5969
- bool compound)
6825
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6826
+ gfp_t gfp_mask)
59706827 {
6828
+ unsigned int nr_pages = thp_nr_pages(page);
59716829 struct mem_cgroup *memcg = NULL;
5972
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59736830 int ret = 0;
59746831
5975
- if (mem_cgroup_disabled())
5976
- goto out;
5977
-
59786832 if (PageSwapCache(page)) {
6833
+ swp_entry_t ent = { .val = page_private(page), };
6834
+ unsigned short id;
6835
+
59796836 /*
59806837 * Every swap fault against a single page tries to charge the
59816838 * page, bail as early as possible. shmem_unuse() encounters
5982
- * already charged pages, too. The USED bit is protected by
5983
- * the page lock, which serializes swap cache removal, which
6839
+ * already charged pages, too. page->mem_cgroup is protected
6840
+ * by the page lock, which serializes swap cache removal, which
59846841 * in turn serializes uncharging.
59856842 */
59866843 VM_BUG_ON_PAGE(!PageLocked(page), page);
59876844 if (compound_head(page)->mem_cgroup)
59886845 goto out;
59896846
5990
- if (do_swap_account) {
5991
- swp_entry_t ent = { .val = page_private(page), };
5992
- unsigned short id = lookup_swap_cgroup_id(ent);
5993
-
5994
- rcu_read_lock();
5995
- memcg = mem_cgroup_from_id(id);
5996
- if (memcg && !css_tryget_online(&memcg->css))
5997
- memcg = NULL;
5998
- rcu_read_unlock();
5999
- }
6847
+ id = lookup_swap_cgroup_id(ent);
6848
+ rcu_read_lock();
6849
+ memcg = mem_cgroup_from_id(id);
6850
+ if (memcg && !css_tryget_online(&memcg->css))
6851
+ memcg = NULL;
6852
+ rcu_read_unlock();
60006853 }
60016854
60026855 if (!memcg)
60036856 memcg = get_mem_cgroup_from_mm(mm);
60046857
60056858 ret = try_charge(memcg, gfp_mask, nr_pages);
6859
+ if (ret)
6860
+ goto out_put;
60066861
6007
- css_put(&memcg->css);
6008
-out:
6009
- *memcgp = memcg;
6010
- return ret;
6011
-}
6862
+ css_get(&memcg->css);
6863
+ commit_charge(page, memcg);
60126864
6013
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6014
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6015
- bool compound)
6016
-{
6017
- struct mem_cgroup *memcg;
6018
- int ret;
6019
-
6020
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6021
- memcg = *memcgp;
6022
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6023
- return ret;
6024
-}
6025
-
6026
-/**
6027
- * mem_cgroup_commit_charge - commit a page charge
6028
- * @page: page to charge
6029
- * @memcg: memcg to charge the page to
6030
- * @lrucare: page might be on LRU already
6031
- * @compound: charge the page as compound or small page
6032
- *
6033
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6034
- * after page->mapping has been set up. This must happen atomically
6035
- * as part of the page instantiation, i.e. under the page table lock
6036
- * for anonymous pages, under the page lock for page and swap cache.
6037
- *
6038
- * In addition, the page must not be on the LRU during the commit, to
6039
- * prevent racing with task migration. If it might be, use @lrucare.
6040
- *
6041
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6042
- */
6043
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6044
- bool lrucare, bool compound)
6045
-{
6046
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6047
-
6048
- VM_BUG_ON_PAGE(!page->mapping, page);
6049
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6050
-
6051
- if (mem_cgroup_disabled())
6052
- return;
6053
- /*
6054
- * Swap faults will attempt to charge the same page multiple
6055
- * times. But reuse_swap_page() might have removed the page
6056
- * from swapcache already, so we can't check PageSwapCache().
6057
- */
6058
- if (!memcg)
6059
- return;
6060
-
6061
- commit_charge(page, memcg, lrucare);
6062
-
6063
- local_lock_irq(event_lock);
6064
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6865
+ local_lock_irq(&event_lock.l);
6866
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60656867 memcg_check_events(memcg, page);
6066
- local_unlock_irq(event_lock);
6868
+ local_unlock_irq(&event_lock.l);
60676869
6870
+ /*
6871
+ * Cgroup1's unified memory+swap counter has been charged with the
6872
+ * new swapcache page, finish the transfer by uncharging the swap
6873
+ * slot. The swap slot would also get uncharged when it dies, but
6874
+ * it can stick around indefinitely and we'd count the page twice
6875
+ * the entire time.
6876
+ *
6877
+ * Cgroup2 has separate resource counters for memory and swap,
6878
+ * so this is a non-issue here. Memory and swap charge lifetimes
6879
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6880
+ * page to memory here, and uncharge swap when the slot is freed.
6881
+ */
60686882 if (do_memsw_account() && PageSwapCache(page)) {
60696883 swp_entry_t entry = { .val = page_private(page) };
60706884 /*
....@@ -6074,42 +6888,18 @@
60746888 */
60756889 mem_cgroup_uncharge_swap(entry, nr_pages);
60766890 }
6077
-}
60786891
6079
-/**
6080
- * mem_cgroup_cancel_charge - cancel a page charge
6081
- * @page: page to charge
6082
- * @memcg: memcg to charge the page to
6083
- * @compound: charge the page as compound or small page
6084
- *
6085
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6086
- */
6087
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6088
- bool compound)
6089
-{
6090
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6091
-
6092
- if (mem_cgroup_disabled())
6093
- return;
6094
- /*
6095
- * Swap faults will attempt to charge the same page multiple
6096
- * times. But reuse_swap_page() might have removed the page
6097
- * from swapcache already, so we can't check PageSwapCache().
6098
- */
6099
- if (!memcg)
6100
- return;
6101
-
6102
- cancel_charge(memcg, nr_pages);
6892
+out_put:
6893
+ css_put(&memcg->css);
6894
+out:
6895
+ return ret;
61036896 }
61046897
61056898 struct uncharge_gather {
61066899 struct mem_cgroup *memcg;
6900
+ unsigned long nr_pages;
61076901 unsigned long pgpgout;
6108
- unsigned long nr_anon;
6109
- unsigned long nr_file;
61106902 unsigned long nr_kmem;
6111
- unsigned long nr_huge;
6112
- unsigned long nr_shmem;
61136903 struct page *dummy_page;
61146904 };
61156905
....@@ -6120,37 +6910,32 @@
61206910
61216911 static void uncharge_batch(const struct uncharge_gather *ug)
61226912 {
6123
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61246913 unsigned long flags;
61256914
61266915 if (!mem_cgroup_is_root(ug->memcg)) {
6127
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6916
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61286917 if (do_memsw_account())
6129
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6918
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61306919 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61316920 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61326921 memcg_oom_recover(ug->memcg);
61336922 }
61346923
6135
- local_lock_irqsave(event_lock, flags);
6136
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6137
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6138
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6139
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6924
+ local_lock_irqsave(&event_lock.l, flags);
61406925 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6141
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6926
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61426927 memcg_check_events(ug->memcg, ug->dummy_page);
6143
- local_unlock_irqrestore(event_lock, flags);
6928
+ local_unlock_irqrestore(&event_lock.l, flags);
61446929
6145
- if (!mem_cgroup_is_root(ug->memcg))
6146
- css_put_many(&ug->memcg->css, nr_pages);
6930
+ /* drop reference from uncharge_page */
6931
+ css_put(&ug->memcg->css);
61476932 }
61486933
61496934 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61506935 {
6936
+ unsigned long nr_pages;
6937
+
61516938 VM_BUG_ON_PAGE(PageLRU(page), page);
6152
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6153
- !PageHWPoison(page) , page);
61546939
61556940 if (!page->mem_cgroup)
61566941 return;
....@@ -6167,30 +6952,24 @@
61676952 uncharge_gather_clear(ug);
61686953 }
61696954 ug->memcg = page->mem_cgroup;
6955
+
6956
+ /* pairs with css_put in uncharge_batch */
6957
+ css_get(&ug->memcg->css);
61706958 }
61716959
6172
- if (!PageKmemcg(page)) {
6173
- unsigned int nr_pages = 1;
6960
+ nr_pages = compound_nr(page);
6961
+ ug->nr_pages += nr_pages;
61746962
6175
- if (PageTransHuge(page)) {
6176
- nr_pages <<= compound_order(page);
6177
- ug->nr_huge += nr_pages;
6178
- }
6179
- if (PageAnon(page))
6180
- ug->nr_anon += nr_pages;
6181
- else {
6182
- ug->nr_file += nr_pages;
6183
- if (PageSwapBacked(page))
6184
- ug->nr_shmem += nr_pages;
6185
- }
6963
+ if (!PageKmemcg(page)) {
61866964 ug->pgpgout++;
61876965 } else {
6188
- ug->nr_kmem += 1 << compound_order(page);
6966
+ ug->nr_kmem += nr_pages;
61896967 __ClearPageKmemcg(page);
61906968 }
61916969
61926970 ug->dummy_page = page;
61936971 page->mem_cgroup = NULL;
6972
+ css_put(&ug->memcg->css);
61946973 }
61956974
61966975 static void uncharge_list(struct list_head *page_list)
....@@ -6219,18 +6998,14 @@
62196998 }
62206999
62217000 /**
6222
- * mem_cgroup_uncharge - uncharge a page
7001
+ * __mem_cgroup_uncharge - uncharge a page
62237002 * @page: page to uncharge
62247003 *
6225
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6226
- * mem_cgroup_commit_charge().
7004
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62277005 */
6228
-void mem_cgroup_uncharge(struct page *page)
7006
+void __mem_cgroup_uncharge(struct page *page)
62297007 {
62307008 struct uncharge_gather ug;
6231
-
6232
- if (mem_cgroup_disabled())
6233
- return;
62347009
62357010 /* Don't touch page->lru of any random page, pre-check: */
62367011 if (!page->mem_cgroup)
....@@ -6242,17 +7017,14 @@
62427017 }
62437018
62447019 /**
6245
- * mem_cgroup_uncharge_list - uncharge a list of page
7020
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
62467021 * @page_list: list of pages to uncharge
62477022 *
62487023 * Uncharge a list of pages previously charged with
6249
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7024
+ * __mem_cgroup_charge().
62507025 */
6251
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7026
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62527027 {
6253
- if (mem_cgroup_disabled())
6254
- return;
6255
-
62567028 if (!list_empty(page_list))
62577029 uncharge_list(page_list);
62587030 }
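Note that the mem_cgroup_disabled() early returns are dropped from the renamed __mem_cgroup_charge(), __mem_cgroup_uncharge() and __mem_cgroup_uncharge_list() bodies above. The __-prefix convention suggests those checks move into thin inline wrappers outside this file; the sketch below shows that assumed pattern — the wrapper names and their placement are not taken from this diff:

/*
 * Hypothetical inline wrappers (assumed to live in the memcontrol
 * header, not shown in this hunk) that keep the mem_cgroup_disabled()
 * fast path in the callers instead of in the __ functions above.
 */
static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
				    gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	return __mem_cgroup_charge(page, mm, gfp_mask);
}

static inline void mem_cgroup_uncharge(struct page *page)
{
	if (mem_cgroup_disabled())
		return;
	__mem_cgroup_uncharge(page);
}

static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;
	__mem_cgroup_uncharge_list(page_list);
}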
....@@ -6271,7 +7043,6 @@
62717043 {
62727044 struct mem_cgroup *memcg;
62737045 unsigned int nr_pages;
6274
- bool compound;
62757046 unsigned long flags;
62767047
62777048 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6293,20 +7064,19 @@
62937064 return;
62947065
62957066 /* Force-charge the new page. The old one will be freed soon */
6296
- compound = PageTransHuge(newpage);
6297
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7067
+ nr_pages = thp_nr_pages(newpage);
62987068
62997069 page_counter_charge(&memcg->memory, nr_pages);
63007070 if (do_memsw_account())
63017071 page_counter_charge(&memcg->memsw, nr_pages);
6302
- css_get_many(&memcg->css, nr_pages);
63037072
6304
- commit_charge(newpage, memcg, false);
7073
+ css_get(&memcg->css);
7074
+ commit_charge(newpage, memcg);
63057075
6306
- local_lock_irqsave(event_lock, flags);
6307
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7076
+ local_lock_irqsave(&event_lock.l, flags);
7077
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63087078 memcg_check_events(memcg, newpage);
6309
- local_unlock_irqrestore(event_lock, flags);
7079
+ local_unlock_irqrestore(&event_lock.l, flags);
63107080 }
63117081
63127082 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6329,7 +7099,7 @@
63297099 goto out;
63307100 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63317101 goto out;
6332
- if (css_tryget_online(&memcg->css))
7102
+ if (css_tryget(&memcg->css))
63337103 sk->sk_memcg = memcg;
63347104 out:
63357105 rcu_read_unlock();
....@@ -6407,7 +7177,7 @@
64077177 if (!strcmp(token, "nokmem"))
64087178 cgroup_memory_nokmem = true;
64097179 }
6410
- return 0;
7180
+ return 1;
64117181 }
64127182 __setup("cgroup.memory=", cgroup_memory);
64137183
....@@ -6423,23 +7193,16 @@
64237193 {
64247194 int cpu, node;
64257195
6426
-#ifdef CONFIG_MEMCG_KMEM
6427
- /*
6428
- * Kmem cache creation is mostly done with the slab_mutex held,
6429
- * so use a workqueue with limited concurrency to avoid stalling
6430
- * all worker threads in case lots of cgroups are created and
6431
- * destroyed simultaneously.
6432
- */
6433
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6434
- BUG_ON(!memcg_kmem_cache_wq);
6435
-#endif
6436
-
64377196 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64387197 memcg_hotplug_cpu_dead);
64397198
6440
- for_each_possible_cpu(cpu)
6441
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6442
- drain_local_stock);
7199
+ for_each_possible_cpu(cpu) {
7200
+ struct memcg_stock_pcp *stock;
7201
+
7202
+ stock = per_cpu_ptr(&memcg_stock, cpu);
7203
+ INIT_WORK(&stock->work, drain_local_stock);
7204
+ local_lock_init(&stock->lock);
7205
+ }
64437206
64447207 for_each_node(node) {
64457208 struct mem_cgroup_tree_per_node *rtpn;
....@@ -6460,7 +7223,7 @@
64607223 #ifdef CONFIG_MEMCG_SWAP
64617224 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64627225 {
6463
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7226
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64647227 /*
64657228 * The root cgroup cannot be destroyed, so it's refcount must
64667229 * always be >= 1.
....@@ -6493,7 +7256,10 @@
64937256 VM_BUG_ON_PAGE(PageLRU(page), page);
64947257 VM_BUG_ON_PAGE(page_count(page), page);
64957258
6496
- if (!do_memsw_account())
7259
+ if (mem_cgroup_disabled())
7260
+ return;
7261
+
7262
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64977263 return;
64987264
64997265 memcg = page->mem_cgroup;
....@@ -6508,7 +7274,7 @@
65087274 * ancestor for the swap instead and transfer the memory+swap charge.
65097275 */
65107276 swap_memcg = mem_cgroup_id_get_online(memcg);
6511
- nr_entries = hpage_nr_pages(page);
7277
+ nr_entries = thp_nr_pages(page);
65127278 /* Get references for the tail pages, too */
65137279 if (nr_entries > 1)
65147280 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6522,7 +7288,7 @@
65227288 if (!mem_cgroup_is_root(memcg))
65237289 page_counter_uncharge(&memcg->memory, nr_entries);
65247290
6525
- if (memcg != swap_memcg) {
7291
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65267292 if (!mem_cgroup_is_root(swap_memcg))
65277293 page_counter_charge(&swap_memcg->memsw, nr_entries);
65287294 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6534,21 +7300,19 @@
65347300 * important here to have the interrupts disabled because it is the
65357301 * only synchronisation we have for updating the per-CPU variables.
65367302 */
6537
- local_lock_irqsave(event_lock, flags);
6538
-#ifndef CONFIG_PREEMPT_RT_BASE
7303
+ local_lock_irqsave(&event_lock.l, flags);
7304
+#ifndef CONFIG_PREEMPT_RT
65397305 VM_BUG_ON(!irqs_disabled());
65407306 #endif
6541
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6542
- -nr_entries);
7307
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65437308 memcg_check_events(memcg, page);
6544
- local_unlock_irqrestore(event_lock, flags);
7309
+ local_unlock_irqrestore(&event_lock.l, flags);
65457310
6546
- if (!mem_cgroup_is_root(memcg))
6547
- css_put_many(&memcg->css, nr_entries);
7311
+ css_put(&memcg->css);
65487312 }
65497313
65507314 /**
6551
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7315
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65527316 * @page: page being added to swap
65537317 * @entry: swap entry to charge
65547318 *
....@@ -6556,14 +7320,14 @@
65567320 *
65577321 * Returns 0 on success, -ENOMEM on failure.
65587322 */
6559
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7323
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
65607324 {
6561
- unsigned int nr_pages = hpage_nr_pages(page);
7325
+ unsigned int nr_pages = thp_nr_pages(page);
65627326 struct page_counter *counter;
65637327 struct mem_cgroup *memcg;
65647328 unsigned short oldid;
65657329
6566
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7330
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
65677331 return 0;
65687332
65697333 memcg = page->mem_cgroup;
....@@ -6579,7 +7343,7 @@
65797343
65807344 memcg = mem_cgroup_id_get_online(memcg);
65817345
6582
- if (!mem_cgroup_is_root(memcg) &&
7346
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
65837347 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
65847348 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
65857349 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
....@@ -6598,23 +7362,20 @@
65987362 }
65997363
66007364 /**
6601
- * mem_cgroup_uncharge_swap - uncharge swap space
7365
+ * __mem_cgroup_uncharge_swap - uncharge swap space
66027366 * @entry: swap entry to uncharge
66037367 * @nr_pages: the amount of swap space to uncharge
66047368 */
6605
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7369
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
66067370 {
66077371 struct mem_cgroup *memcg;
66087372 unsigned short id;
6609
-
6610
- if (!do_swap_account)
6611
- return;
66127373
66137374 id = swap_cgroup_record(entry, 0, nr_pages);
66147375 rcu_read_lock();
66157376 memcg = mem_cgroup_from_id(id);
66167377 if (memcg) {
6617
- if (!mem_cgroup_is_root(memcg)) {
7378
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
66187379 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
66197380 page_counter_uncharge(&memcg->swap, nr_pages);
66207381 else
....@@ -6630,7 +7391,7 @@
66307391 {
66317392 long nr_swap_pages = get_nr_swap_pages();
66327393
6633
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7394
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66347395 return nr_swap_pages;
66357396 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
66367397 nr_swap_pages = min_t(long, nr_swap_pages,
....@@ -6647,36 +7408,33 @@
66477408
66487409 if (vm_swap_full())
66497410 return true;
6650
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7411
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66517412 return false;
66527413
66537414 memcg = page->mem_cgroup;
66547415 if (!memcg)
66557416 return false;
66567417
6657
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6658
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7418
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7419
+ unsigned long usage = page_counter_read(&memcg->swap);
7420
+
7421
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7422
+ usage * 2 >= READ_ONCE(memcg->swap.max))
66597423 return true;
7424
+ }
66607425
66617426 return false;
66627427 }
66637428
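The rewritten loop above treats a cgroup's swap as effectively full once usage reaches half of either swap.high or swap.max, the per-cgroup counterpart of the global vm_swap_full() check at the top of the function. A standalone restatement of that half-of-limit test (hypothetical helper, values in pages as in the kernel):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical mirror of the "usage * 2 >= limit" test used above. */
static bool swap_half_full(unsigned long usage, unsigned long high,
			   unsigned long max)
{
	return usage * 2 >= high || usage * 2 >= max;
}

int main(void)
{
	/* With swap.max at 1024 pages and swap.high left very large,
	 * the check trips once 512 pages of swap are in use. */
	printf("%d\n", swap_half_full(512, ~0UL, 1024));	/* prints 1 */
	printf("%d\n", swap_half_full(511, ~0UL, 1024));	/* prints 0 */
	return 0;
}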
6664
-/* for remember boot option*/
6665
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
6666
-static int really_do_swap_account __initdata = 1;
6667
-#else
6668
-static int really_do_swap_account __initdata;
6669
-#endif
6670
-
6671
-static int __init enable_swap_account(char *s)
7429
+static int __init setup_swap_account(char *s)
66727430 {
66737431 if (!strcmp(s, "1"))
6674
- really_do_swap_account = 1;
7432
+ cgroup_memory_noswap = 0;
66757433 else if (!strcmp(s, "0"))
6676
- really_do_swap_account = 0;
7434
+ cgroup_memory_noswap = 1;
66777435 return 1;
66787436 }
6679
-__setup("swapaccount=", enable_swap_account);
7437
+__setup("swapaccount=", setup_swap_account);
66807438
66817439 static u64 swap_current_read(struct cgroup_subsys_state *css,
66827440 struct cftype *cft)
....@@ -6686,17 +7444,33 @@
66867444 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
66877445 }
66887446
7447
+static int swap_high_show(struct seq_file *m, void *v)
7448
+{
7449
+ return seq_puts_memcg_tunable(m,
7450
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7451
+}
7452
+
7453
+static ssize_t swap_high_write(struct kernfs_open_file *of,
7454
+ char *buf, size_t nbytes, loff_t off)
7455
+{
7456
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7457
+ unsigned long high;
7458
+ int err;
7459
+
7460
+ buf = strstrip(buf);
7461
+ err = page_counter_memparse(buf, "max", &high);
7462
+ if (err)
7463
+ return err;
7464
+
7465
+ page_counter_set_high(&memcg->swap, high);
7466
+
7467
+ return nbytes;
7468
+}
7469
+
66897470 static int swap_max_show(struct seq_file *m, void *v)
66907471 {
6691
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6692
- unsigned long max = READ_ONCE(memcg->swap.max);
6693
-
6694
- if (max == PAGE_COUNTER_MAX)
6695
- seq_puts(m, "max\n");
6696
- else
6697
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6698
-
6699
- return 0;
7472
+ return seq_puts_memcg_tunable(m,
7473
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
67007474 }
67017475
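swap_high_show() and swap_high_write() above back a new cgroup v2 interface file, memory.swap.high: page_counter_memparse() accepts a byte value (with an optional K/M/G suffix) or the literal "max", and CFTYPE_NOT_ON_ROOT keeps the file out of the root cgroup. A hedged userspace sketch that sets a 256 MiB threshold, where the cgroup path is an assumption:

/* Sketch only: adjust the path to your cgroup v2 mount point and group. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/memory.swap.high";
	const char *value = "256M\n";	/* writing "max" clears the threshold */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, value, strlen(value)) != (ssize_t)strlen(value))
		perror("write");
	close(fd);
	return 0;
}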
67027476 static ssize_t swap_max_write(struct kernfs_open_file *of,
....@@ -6718,8 +7492,10 @@
67187492
67197493 static int swap_events_show(struct seq_file *m, void *v)
67207494 {
6721
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
7495
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
67227496
7497
+ seq_printf(m, "high %lu\n",
7498
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
67237499 seq_printf(m, "max %lu\n",
67247500 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
67257501 seq_printf(m, "fail %lu\n",
....@@ -6733,6 +7509,12 @@
67337509 .name = "swap.current",
67347510 .flags = CFTYPE_NOT_ON_ROOT,
67357511 .read_u64 = swap_current_read,
7512
+ },
7513
+ {
7514
+ .name = "swap.high",
7515
+ .flags = CFTYPE_NOT_ON_ROOT,
7516
+ .seq_show = swap_high_show,
7517
+ .write = swap_high_write,
67367518 },
67377519 {
67387520 .name = "swap.max",
....@@ -6749,7 +7531,7 @@
67497531 { } /* terminate */
67507532 };
67517533
6752
-static struct cftype memsw_cgroup_files[] = {
7534
+static struct cftype memsw_files[] = {
67537535 {
67547536 .name = "memsw.usage_in_bytes",
67557537 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
....@@ -6776,17 +7558,27 @@
67767558 { }, /* terminate */
67777559 };
67787560
7561
+/*
7562
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7563
+ * instead of a core_initcall(), cgroup_memory_noswap would remain
7564
+ * false even when memcg is disabled via the "cgroup_disable=memory"
7565
+ * boot parameter. This may result in a premature OOPS inside
7566
+ * mem_cgroup_get_nr_swap_pages() in corner cases.
7567
+ */
67797568 static int __init mem_cgroup_swap_init(void)
67807569 {
6781
- if (!mem_cgroup_disabled() && really_do_swap_account) {
6782
- do_swap_account = 1;
6783
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6784
- swap_files));
6785
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6786
- memsw_cgroup_files));
6787
- }
7570
+ /* No memory control -> no swap control */
7571
+ if (mem_cgroup_disabled())
7572
+ cgroup_memory_noswap = true;
7573
+
7574
+ if (cgroup_memory_noswap)
7575
+ return 0;
7576
+
7577
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7578
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7579
+
67887580 return 0;
67897581 }
6790
-subsys_initcall(mem_cgroup_swap_init);
7582
+core_initcall(mem_cgroup_swap_init);
67917583
67927584 #endif /* CONFIG_MEMCG_SWAP */
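Because swap_events_show() now emits a "high" line next to "max" and "fail", memory.swap.events reports how many times a group's swap usage went over its memory.swap.high threshold. A hedged sketch that dumps the file for a hypothetical group:

/* Sketch only: the path assumes a cgroup v2 hierarchy with the memory
 * controller enabled; adjust to your mount point and group. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/demo/memory.swap.events";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Expect "high <n>", "max <n>" and "fail <n>" lines. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}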