2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
--- a/kernel/mm/memcontrol.c
+++ b/kernel/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
  * Lockless page tracking & accounting
  * Unified hierarchy configuration model
  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */
 
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -65,22 +57,26 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
 #include "slab.h"
-#include <linux/locallock.h>
 
 #include <linux/uaccess.h>
 
 #include <trace/events/vmscan.h>
+#include <trace/hooks/mm.h>
 
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL_GPL(root_mem_cgroup);
 
-#define MEM_CGROUP_RECLAIM_RETRIES	5
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
@@ -90,30 +86,23 @@
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
 #else
-#define do_swap_account		0
+#define cgroup_memory_noswap		1
 #endif
 
-static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
 
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
-
-static const char *const mem_cgroup_lru_names[] = {
-	"inactive_anon",
-	"active_anon",
-	"inactive_file",
-	"active_file",
-	"unevictable",
-};
 
 #define THRESHOLDS_EVENTS_TARGET	128
 #define SOFTLIMIT_EVENTS_TARGET		1024
-#define NUMAINFO_EVENTS_TARGET		1024
 
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -213,14 +202,6 @@
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
-enum charge_type {
-	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_ANON,
-	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
-	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
-	NR_CHARGE_TYPE,
-};
-
 /* for encoding cft->private value on file */
 enum res_type {
 	_MEM,
@@ -251,7 +232,7 @@
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -271,8 +252,100 @@
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+	struct mem_cgroup *memcg;
+	unsigned int nr_bytes;
+	unsigned int nr_pages;
+	unsigned long flags;
+
+	/*
+	 * At this point all allocated objects are freed, and
+	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
+	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+	 *
+	 * The following sequence can lead to it:
+	 * 1) CPU0: objcg == stock->cached_objcg
+	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+	 *          PAGE_SIZE bytes are charged
+	 * 3) CPU1: a process from another memcg is allocating something,
+	 *          the stock is flushed,
+	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+	 * 4) CPU0: we release this object,
+	 *          92 bytes are added to stock->nr_bytes
+	 * 5) CPU0: stock is flushed,
+	 *          92 bytes are added to objcg->nr_charged_bytes
+	 *
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
+	 * This page will be uncharged in obj_cgroup_release().
+	 */
+	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+	nr_pages = nr_bytes >> PAGE_SHIFT;
+
+	spin_lock_irqsave(&objcg_lock, flags);
+	memcg = obj_cgroup_memcg(objcg);
+	if (nr_pages)
+		__memcg_kmem_uncharge(memcg, nr_pages);
+	list_del(&objcg->list);
+	mem_cgroup_put(memcg);
+	spin_unlock_irqrestore(&objcg_lock, flags);
+
+	percpu_ref_exit(ref);
+	kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+	struct obj_cgroup *objcg;
+	int ret;
+
+	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+	if (!objcg)
+		return NULL;
+
+	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+			      GFP_KERNEL);
+	if (ret) {
+		kfree(objcg);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&objcg->list);
+	return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+				  struct mem_cgroup *parent)
+{
+	struct obj_cgroup *objcg, *iter;
+
+	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+	spin_lock_irq(&objcg_lock);
+
+	/* Move active objcg to the parent's list */
+	xchg(&objcg->memcg, parent);
+	css_get(&parent->css);
+	list_add(&objcg->list, &parent->objcg_list);
+
+	/* Move already reparented objcgs to the parent's list */
+	list_for_each_entry(iter, &memcg->objcg_list, list) {
+		css_get(&parent->css);
+		xchg(&iter->memcg, parent);
+		css_put(&memcg->css);
+	}
+	list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+	spin_unlock_irq(&objcg_lock);
+
+	percpu_ref_kill(&objcg->refcnt);
+}
+
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
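
For readers following the race described in the obj_cgroup_release() comment above, here is a minimal user-space sketch of the byte-to-page remainder bookkeeping it ends up relying on. The names (charge_small, release_small, nr_charged_bytes as a plain counter) are invented for illustration and are not kernel API; the point is only that the leftover credit plus the released bytes always add up to whole pages.

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE  4096u
    #define PAGE_SHIFT 12

    /* Hypothetical stand-in for objcg->nr_charged_bytes. */
    static unsigned int nr_charged_bytes;

    /* A 92-byte allocation charges a whole page; the unused credit is kept. */
    static void charge_small(unsigned int bytes)
    {
            nr_charged_bytes += PAGE_SIZE - bytes;  /* stock flushed on another CPU */
    }

    /* Releasing the object eventually returns its bytes to the same counter. */
    static void release_small(unsigned int bytes)
    {
            nr_charged_bytes += bytes;              /* local stock flushed later */
    }

    int main(void)
    {
            charge_small(92);
            release_small(92);
            /* The remainder is exactly what the WARN_ON_ONCE() above tolerates:
             * a multiple of PAGE_SIZE, ready to be uncharged as whole pages. */
            assert((nr_charged_bytes & (PAGE_SIZE - 1)) == 0);
            printf("%u page(s) to uncharge\n", nr_charged_bytes >> PAGE_SHIFT);
            return 0;
    }
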
@@ -315,14 +388,13 @@
 
 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that does
  * kmem_cache_alloc and the such to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -347,7 +419,7 @@
 	if (!old)
 		return 0;
 
-	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+	new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 	if (!new)
 		return -ENOMEM;
 
@@ -391,7 +463,7 @@
 	mutex_lock(&memcg_shrinker_map_mutex);
 	size = memcg_shrinker_map_size;
 	for_each_node(nid) {
-		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 		if (!map) {
 			memcg_free_shrinker_maps(memcg);
 			ret = -ENOMEM;
@@ -448,14 +520,6 @@
 	}
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -498,7 +562,17 @@
 	unsigned long ino = 0;
 
 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	memcg = page->mem_cgroup;
+
+	/*
+	 * The lowest bit set means that memcg isn't a valid
+	 * memcg pointer, but an obj_cgroups pointer.
+	 * In this case the page is shared and doesn't belong
+	 * to any specific memory cgroup.
+	 */
+	if ((unsigned long) memcg & 0x1UL)
+		memcg = NULL;
+
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -674,7 +748,7 @@
 		 */
 		__mem_cgroup_remove_exceeded(mz, mctz);
 		if (!soft_limit_excess(mz->memcg) ||
-		    !css_tryget_online(&mz->memcg->css))
+		    !css_tryget(&mz->memcg->css))
 			goto retry;
 done:
 	return mz;
@@ -691,33 +765,186 @@
 	return mz;
 }
 
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	return atomic_long_read(&memcg->events[event]);
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (memcg_stat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *parent;
+
+	parent = parent_mem_cgroup(pn->memcg);
+	if (!parent)
+		return NULL;
+	return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			      int val)
+{
+	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup *memcg;
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	memcg = pn->memcg;
+
+	/* Update memcg */
+	__mod_memcg_state(memcg, idx, val);
+
+	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+	if (vmstat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		pg_data_t *pgdat = lruvec_pgdat(lruvec);
+		struct mem_cgroup_per_node *pi;
+
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+			atomic_long_add(x, &pi->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val)
+{
+	/* Update node */
+	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+	/* Update memcg and lruvec */
+	if (!mem_cgroup_disabled())
+		__mod_memcg_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL_GPL(__mod_lruvec_state);
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+
+	/*
+	 * Untracked pages have no memcg, no lruvec. Update only the
+	 * node. If we reparent the slab objects to the root memcg,
+	 * when we free the slab object, we need to update the per-memcg
+	 * vmstats to keep it correct for the root memcg.
+	 */
+	if (!memcg) {
+		__mod_node_page_state(pgdat, idx, val);
+	} else {
+		lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		__mod_lruvec_state(lruvec, idx, val);
+	}
+	rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+	if (memcg)
+		mod_memcg_state(memcg, idx, val);
+	rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmevents[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+	return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool compound, int nr_pages)
+					 int nr_pages)
 {
-	/*
-	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
-	 * counted as CACHE even if it's on ANON LRU.
-	 */
-	if (PageAnon(page))
-		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
-	else {
-		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
-		if (PageSwapBacked(page))
-			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
-	}
-
-	if (compound) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
-	}
-
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -726,35 +953,7 @@
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-					   int nid, unsigned int lru_mask)
-{
-	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-	unsigned long nr = 0;
-	enum lru_list lru;
-
-	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
-	for_each_lru(lru) {
-		if (!(BIT(lru) & lru_mask))
-			continue;
-		nr += mem_cgroup_get_lru_size(lruvec, lru);
-	}
-	return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
-					     unsigned int lru_mask)
-{
-	unsigned long nr = 0;
-	int nid;
-
-	for_each_node_state(nid, N_MEMORY)
-		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
-	return nr;
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
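
The __mod_memcg_state() and __count_memcg_events() helpers added in the hunks above share one pattern: deltas accumulate in a per-CPU counter and are only folded into the shared hierarchical atomics once they exceed a batch threshold. A rough single-translation-unit sketch of that idea follows; the names (struct group, mod_state, CHARGE_BATCH) are invented stand-ins, and the real code uses this_cpu ops plus parent_mem_cgroup() rather than plain structs.

    #include <stdatomic.h>
    #include <stdlib.h>

    #define CHARGE_BATCH 64                 /* stand-in for MEMCG_CHARGE_BATCH */

    struct group {
            struct group *parent;
            _Atomic long vmstat;            /* hierarchical counter */
    };

    /* One of these would exist per CPU in the kernel. */
    struct pcpu_stat {
            long pending;
    };

    void mod_state(struct group *g, struct pcpu_stat *pcp, long val)
    {
            long x = pcp->pending + val;

            if (labs(x) > CHARGE_BATCH) {
                    /* Flush the batch up the hierarchy, like the for (mi...) loop. */
                    for (struct group *mi = g; mi; mi = mi->parent)
                            atomic_fetch_add(&mi->vmstat, x);
                    x = 0;
            }
            pcp->pending = x;               /* keep the remainder CPU-local */
    }

The design trade-off is the same one the kernel comment hints at: readers of the hierarchical counters may lag by up to one batch per CPU, in exchange for far fewer atomic operations on hot paths.
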
@@ -762,8 +961,8 @@
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
-	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -773,13 +972,10 @@
 		case MEM_CGROUP_TARGET_SOFTLIMIT:
 			next = val + SOFTLIMIT_EVENTS_TARGET;
 			break;
-		case MEM_CGROUP_TARGET_NUMAINFO:
-			next = val + NUMAINFO_EVENTS_TARGET;
-			break;
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat_cpu->targets[target], next);
+		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -795,21 +991,12 @@
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
 		bool do_softlimit;
-		bool do_numainfo __maybe_unused;
 
 		do_softlimit = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
-		do_numainfo = mem_cgroup_event_ratelimit(memcg,
-						MEM_CGROUP_TARGET_NUMAINFO);
-#endif
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
 			mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
-		if (unlikely(do_numainfo))
-			atomic_inc(&memcg->numainfo_events);
-#endif
 	}
 }
 
@@ -877,27 +1064,60 @@
 		return NULL;
 
 	rcu_read_lock();
-	if (!memcg || !css_tryget_online(&memcg->css))
+	/* Page should not get uncharged and freed memcg under us. */
+	if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
 		memcg = root_mem_cgroup;
 	rcu_read_unlock();
 	return memcg;
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_page);
 
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+	if (in_interrupt())
+		return this_cpu_read(int_active_memcg);
+	else
+		return current->active_memcg;
+}
+
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = active_memcg();
+	/* remote memcg must hold a ref. */
+	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+		memcg = root_mem_cgroup;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+	/* Allow remote memcg charging from any context. */
+	if (unlikely(active_memcg()))
+		return false;
+
+	/* Memcg to charge can't be determined. */
+	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+		return true;
+
+	return false;
+}
+
 /**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
+ * If active memcg is set, do not fall back to current->mm->memcg.
  */
 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
 {
-	if (unlikely(current->active_memcg)) {
-		struct mem_cgroup *memcg = root_mem_cgroup;
+	if (memcg_kmem_bypass())
+		return NULL;
 
-		rcu_read_lock();
-		if (css_tryget_online(&current->active_memcg->css))
-			memcg = current->active_memcg;
-		rcu_read_unlock();
-		return memcg;
-	}
+	if (unlikely(active_memcg()))
+		return get_active_memcg();
+
 	return get_mem_cgroup_from_mm(current->mm);
 }
 
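
The bypass logic introduced above reduces to a small decision tree: a remote (active) memcg set by the caller always wins, otherwise charging is skipped for contexts that have no mm to charge against, and only then does the caller's mm decide. A condensed restatement of that flow, with invented parameter names and no reference counting, purely as a reading aid rather than a drop-in replacement for the kernel helpers:

    #include <stdbool.h>
    #include <stddef.h>

    struct mem_cgroup;      /* opaque in this sketch */

    struct mem_cgroup *choose_charge_target(bool in_irq, bool has_mm, bool kthread,
                                            struct mem_cgroup *remote,
                                            struct mem_cgroup *mm_memcg)
    {
            if (remote)                       /* explicit remote charging always wins */
                    return remote;
            if (in_irq || !has_mm || kthread)
                    return NULL;              /* bypass: no memcg can be determined */
            return mm_memcg;                  /* default: charge the caller's mm */
    }
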
@@ -914,15 +1134,15 @@
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+	struct mem_cgroup_reclaim_iter *iter;
 	struct cgroup_subsys_state *css = NULL;
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *pos = NULL;
@@ -948,7 +1168,7 @@
 		struct mem_cgroup_per_node *mz;
 
 		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
-		iter = &mz->iter[reclaim->priority];
+		iter = &mz->iter;
 
 		if (prev && reclaim->generation != iter->generation)
 			goto out_unlock;
@@ -1048,15 +1268,11 @@
 	struct mem_cgroup_reclaim_iter *iter;
 	struct mem_cgroup_per_node *mz;
 	int nid;
-	int i;
 
 	for_each_node(nid) {
 		mz = mem_cgroup_nodeinfo(from, nid);
-		for (i = 0; i <= DEF_PRIORITY; i++) {
-			iter = &mz->iter[i];
-			cmpxchg(&iter->position,
-				dead_memcg, NULL);
-		}
+		iter = &mz->iter;
+		cmpxchg(&iter->position, dead_memcg, NULL);
 	}
 }
 
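
The cmpxchg() in the hunk above only clears iter->position while it still points at the dying memcg, so a concurrent walker that has already advanced the iterator is left untouched. A tiny illustration of that conditional-clear pattern using C11 atomics; these are user-space stand-ins, not the kernel cmpxchg macro:

    #include <stdatomic.h>
    #include <stddef.h>

    struct memcg;                                    /* opaque for the sketch */

    void invalidate_position(_Atomic(struct memcg *) *position, struct memcg *dead)
    {
            struct memcg *expected = dead;

            /* Same spirit as cmpxchg(&iter->position, dead_memcg, NULL):
             * the store only happens if nobody moved the iterator meanwhile. */
            atomic_compare_exchange_strong(position, &expected, (struct memcg *)NULL);
    }
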
@@ -1106,7 +1322,7 @@
 		struct css_task_iter it;
 		struct task_struct *task;
 
-		css_task_iter_start(&iter->css, 0, &it);
+		css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
 		while (!ret && (task = css_task_iter_next(&it)))
 			ret = fn(task, arg);
 		css_task_iter_end(&it);
@@ -1123,9 +1339,8 @@
  * @page: the page
  * @pgdat: pgdat of the page
  *
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
  */
 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 {
@@ -1134,7 +1349,7 @@
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
-		lruvec = &pgdat->lruvec;
+		lruvec = &pgdat->__lruvec;
 		goto out;
 	}
 
@@ -1158,6 +1373,38 @@
 		lruvec->pgdat = pgdat;
 	return lruvec;
 }
+
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
+{
+	struct lruvec *lruvec;
+
+	lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+	return lruvec;
+}
+EXPORT_SYMBOL_GPL(page_to_lruvec);
+
+void do_traversal_all_lruvec(void)
+{
+	pg_data_t *pgdat;
+
+	for_each_online_pgdat(pgdat) {
+		struct mem_cgroup *memcg = NULL;
+
+		spin_lock_irq(&pgdat->lru_lock);
+		memcg = mem_cgroup_iter(NULL, NULL, NULL);
+		do {
+			struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+			trace_android_vh_do_traversal_lruvec(lruvec);
+
+			memcg = mem_cgroup_iter(NULL, memcg, NULL);
+		} while (memcg);
+
+		spin_unlock_irq(&pgdat->lru_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
 
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
@@ -1197,32 +1444,7 @@
 	if (nr_pages > 0)
 		*lru_size += nr_pages;
 }
-
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *task_memcg;
-	struct task_struct *p;
-	bool ret;
-
-	p = find_lock_task_mm(task);
-	if (p) {
-		task_memcg = get_mem_cgroup_from_mm(p->mm);
-		task_unlock(p);
-	} else {
-		/*
-		 * All threads may have already detached their mm's, but the oom
-		 * killer still needs to detect if they have already been oom
-		 * killed to prevent needlessly killing additional tasks.
-		 */
-		rcu_read_lock();
-		task_memcg = mem_cgroup_from_task(task);
-		css_get(&task_memcg->css);
-		rcu_read_unlock();
-	}
-	ret = mem_cgroup_is_descendant(task_memcg, memcg);
-	css_put(&task_memcg->css);
-	return ret;
-}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_lru_size);
 
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
@@ -1245,7 +1467,7 @@
 	if (do_memsw_account()) {
 		count = page_counter_read(&memcg->memsw);
 		limit = READ_ONCE(memcg->memsw.max);
-		if (count <= limit)
+		if (count < limit)
 			margin = min(margin, limit - count);
 		else
 			margin = 0;
....@@ -1299,85 +1521,199 @@
12991521 return false;
13001522 }
13011523
1302
-static const unsigned int memcg1_stats[] = {
1303
- MEMCG_CACHE,
1304
- MEMCG_RSS,
1305
- MEMCG_RSS_HUGE,
1306
- NR_SHMEM,
1307
- NR_FILE_MAPPED,
1308
- NR_FILE_DIRTY,
1309
- NR_WRITEBACK,
1310
- MEMCG_SWAP,
1524
+struct memory_stat {
1525
+ const char *name;
1526
+ unsigned int ratio;
1527
+ unsigned int idx;
13111528 };
13121529
1313
-static const char *const memcg1_stat_names[] = {
1314
- "cache",
1315
- "rss",
1316
- "rss_huge",
1317
- "shmem",
1318
- "mapped_file",
1319
- "dirty",
1320
- "writeback",
1321
- "swap",
1530
+static struct memory_stat memory_stats[] = {
1531
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1532
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1533
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1534
+ { "percpu", 1, MEMCG_PERCPU_B },
1535
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1536
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1537
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1538
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1539
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1540
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1541
+ /*
1542
+ * The ratio will be initialized in memory_stats_init(). Because
1543
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1544
+ * constant(e.g. powerpc).
1545
+ */
1546
+ { "anon_thp", 0, NR_ANON_THPS },
1547
+#endif
1548
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1549
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1550
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1551
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1552
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1553
+
1554
+ /*
1555
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1556
+ * together and slab_reclaimable must be in front.
1557
+ */
1558
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1559
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1560
+
1561
+ /* The memory events */
1562
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1563
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1564
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1565
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1566
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1567
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1568
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13221569 };
1570
+
1571
+static int __init memory_stats_init(void)
1572
+{
1573
+ int i;
1574
+
1575
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1576
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1577
+ if (memory_stats[i].idx == NR_ANON_THPS)
1578
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1579
+#endif
1580
+ VM_BUG_ON(!memory_stats[i].ratio);
1581
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1582
+ }
1583
+
1584
+ return 0;
1585
+}
1586
+pure_initcall(memory_stats_init);
1587
+
1588
+static char *memory_stat_format(struct mem_cgroup *memcg)
1589
+{
1590
+ struct seq_buf s;
1591
+ int i;
1592
+
1593
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1594
+ if (!s.buffer)
1595
+ return NULL;
1596
+
1597
+ /*
1598
+ * Provide statistics on the state of the memory subsystem as
1599
+ * well as cumulative event counters that show past behavior.
1600
+ *
1601
+ * This list is ordered following a combination of these gradients:
1602
+ * 1) generic big picture -> specifics and details
1603
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1604
+ *
1605
+ * Current memory state:
1606
+ */
1607
+
1608
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1609
+ u64 size;
1610
+
1611
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1612
+ size *= memory_stats[i].ratio;
1613
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1614
+
1615
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1616
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1617
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1618
+ seq_buf_printf(&s, "slab %llu\n", size);
1619
+ }
1620
+ }
1621
+
1622
+ /* Accumulated memory events */
1623
+
1624
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1625
+ memcg_events(memcg, PGFAULT));
1626
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1627
+ memcg_events(memcg, PGMAJFAULT));
1628
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1629
+ memcg_events(memcg, PGREFILL));
1630
+ seq_buf_printf(&s, "pgscan %lu\n",
1631
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1632
+ memcg_events(memcg, PGSCAN_DIRECT));
1633
+ seq_buf_printf(&s, "pgsteal %lu\n",
1634
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1635
+ memcg_events(memcg, PGSTEAL_DIRECT));
1636
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1637
+ memcg_events(memcg, PGACTIVATE));
1638
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1639
+ memcg_events(memcg, PGDEACTIVATE));
1640
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1641
+ memcg_events(memcg, PGLAZYFREE));
1642
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1643
+ memcg_events(memcg, PGLAZYFREED));
1644
+
1645
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1646
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1647
+ memcg_events(memcg, THP_FAULT_ALLOC));
1648
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1649
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1650
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1651
+
1652
+ /* The above should easily fit into one page */
1653
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1654
+
1655
+ return s.buffer;
1656
+}
13231657
13241658 #define K(x) ((x) << (PAGE_SHIFT-10))
13251659 /**
1326
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1660
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1661
+ * memory controller.
13271662 * @memcg: The memory cgroup that went over limit
13281663 * @p: Task that is going to be killed
13291664 *
13301665 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13311666 * enabled
13321667 */
1333
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1668
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13341669 {
1335
- struct mem_cgroup *iter;
1336
- unsigned int i;
1337
-
13381670 rcu_read_lock();
13391671
1672
+ if (memcg) {
1673
+ pr_cont(",oom_memcg=");
1674
+ pr_cont_cgroup_path(memcg->css.cgroup);
1675
+ } else
1676
+ pr_cont(",global_oom");
13401677 if (p) {
1341
- pr_info("Task in ");
1678
+ pr_cont(",task_memcg=");
13421679 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1343
- pr_cont(" killed as a result of limit of ");
1344
- } else {
1345
- pr_info("Memory limit reached of cgroup ");
13461680 }
1347
-
1348
- pr_cont_cgroup_path(memcg->css.cgroup);
1349
- pr_cont("\n");
1350
-
13511681 rcu_read_unlock();
1682
+}
1683
+
1684
+/**
1685
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1686
+ * memory controller.
1687
+ * @memcg: The memory cgroup that went over limit
1688
+ */
1689
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1690
+{
1691
+ char *buf;
13521692
13531693 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13541694 K((u64)page_counter_read(&memcg->memory)),
1355
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1356
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->memsw)),
1358
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1359
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1360
- K((u64)page_counter_read(&memcg->kmem)),
1361
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1362
-
1363
- for_each_mem_cgroup_tree(iter, memcg) {
1364
- pr_info("Memory cgroup stats for ");
1365
- pr_cont_cgroup_path(iter->css.cgroup);
1366
- pr_cont(":");
1367
-
1368
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1369
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1370
- continue;
1371
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1372
- K(memcg_page_state(iter, memcg1_stats[i])));
1373
- }
1374
-
1375
- for (i = 0; i < NR_LRU_LISTS; i++)
1376
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1377
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1378
-
1379
- pr_cont("\n");
1695
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1696
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1697
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1698
+ K((u64)page_counter_read(&memcg->swap)),
1699
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1700
+ else {
1701
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1702
+ K((u64)page_counter_read(&memcg->memsw)),
1703
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1704
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->kmem)),
1706
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13801707 }
1708
+
1709
+ pr_info("Memory cgroup stats for ");
1710
+ pr_cont_cgroup_path(memcg->css.cgroup);
1711
+ pr_cont(":");
1712
+ buf = memory_stat_format(memcg);
1713
+ if (!buf)
1714
+ return;
1715
+ pr_info("%s", buf);
1716
+ kfree(buf);
13811717 }
13821718
13831719 /*
....@@ -1385,19 +1721,26 @@
13851721 */
13861722 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13871723 {
1388
- unsigned long max;
1724
+ unsigned long max = READ_ONCE(memcg->memory.max);
13891725
1390
- max = memcg->memory.max;
1391
- if (mem_cgroup_swappiness(memcg)) {
1392
- unsigned long memsw_max;
1393
- unsigned long swap_max;
1726
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1727
+ if (mem_cgroup_swappiness(memcg))
1728
+ max += min(READ_ONCE(memcg->swap.max),
1729
+ (unsigned long)total_swap_pages);
1730
+ } else { /* v1 */
1731
+ if (mem_cgroup_swappiness(memcg)) {
1732
+ /* Calculate swap excess capacity from memsw limit */
1733
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13941734
1395
- memsw_max = memcg->memsw.max;
1396
- swap_max = memcg->swap.max;
1397
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1398
- max = min(max + swap_max, memsw_max);
1735
+ max += min(swap, (unsigned long)total_swap_pages);
1736
+ }
13991737 }
14001738 return max;
1739
+}
1740
+
1741
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1742
+{
1743
+ return page_counter_read(&memcg->memory);
14011744 }
14021745
14031746 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
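
The reworked mem_cgroup_get_max() above differs between hierarchies: on cgroup v2 the swap limit is simply added on top of memory.max, while on v1 only the headroom of memsw.max above memory.max counts as swap. A worked example under assumed limits of memory.max = 1000 pages and swap.max / memsw.max = 1500 pages, with ample physical swap:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            unsigned long max = 1000, swap_max = 1500, memsw_max = 1500;
            unsigned long total_swap_pages = 1UL << 20;

            /* v2: memory and swap are independent limits. */
            unsigned long v2 = max + MIN(swap_max, total_swap_pages);        /* 2500 */

            /* v1: memsw already includes memory, so only the excess is swap. */
            unsigned long v1 = max + MIN(memsw_max - max, total_swap_pages); /* 1500 */

            printf("v2=%lu v1=%lu\n", v2, v1);
            return 0;
    }
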
....@@ -1410,112 +1753,24 @@
14101753 .gfp_mask = gfp_mask,
14111754 .order = order,
14121755 };
1413
- bool ret;
1756
+ bool ret = true;
14141757
14151758 if (mutex_lock_killable(&oom_lock))
14161759 return true;
1760
+
1761
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1762
+ goto unlock;
1763
+
14171764 /*
14181765 * A few threads which were not waiting at mutex_lock_killable() can
14191766 * fail to bail out. Therefore, check again after holding oom_lock.
14201767 */
1421
- ret = should_force_charge() || out_of_memory(&oc);
1768
+ ret = task_is_dying() || out_of_memory(&oc);
1769
+
1770
+unlock:
14221771 mutex_unlock(&oom_lock);
14231772 return ret;
14241773 }
1425
-
1426
-#if MAX_NUMNODES > 1
1427
-
1428
-/**
1429
- * test_mem_cgroup_node_reclaimable
1430
- * @memcg: the target memcg
1431
- * @nid: the node ID to be checked.
1432
- * @noswap : specify true here if the user wants flle only information.
1433
- *
1434
- * This function returns whether the specified memcg contains any
1435
- * reclaimable pages on a node. Returns true if there are any reclaimable
1436
- * pages in the node.
1437
- */
1438
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1439
- int nid, bool noswap)
1440
-{
1441
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1442
- return true;
1443
- if (noswap || !total_swap_pages)
1444
- return false;
1445
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1446
- return true;
1447
- return false;
1448
-
1449
-}
1450
-
1451
-/*
1452
- * Always updating the nodemask is not very good - even if we have an empty
1453
- * list or the wrong list here, we can start from some node and traverse all
1454
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1455
- *
1456
- */
1457
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1458
-{
1459
- int nid;
1460
- /*
1461
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1462
- * pagein/pageout changes since the last update.
1463
- */
1464
- if (!atomic_read(&memcg->numainfo_events))
1465
- return;
1466
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1467
- return;
1468
-
1469
- /* make a nodemask where this memcg uses memory from */
1470
- memcg->scan_nodes = node_states[N_MEMORY];
1471
-
1472
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1473
-
1474
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1475
- node_clear(nid, memcg->scan_nodes);
1476
- }
1477
-
1478
- atomic_set(&memcg->numainfo_events, 0);
1479
- atomic_set(&memcg->numainfo_updating, 0);
1480
-}
1481
-
1482
-/*
1483
- * Selecting a node where we start reclaim from. Because what we need is just
1484
- * reducing usage counter, start from anywhere is O,K. Considering
1485
- * memory reclaim from current node, there are pros. and cons.
1486
- *
1487
- * Freeing memory from current node means freeing memory from a node which
1488
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1489
- * hit limits, it will see a contention on a node. But freeing from remote
1490
- * node means more costs for memory reclaim because of memory latency.
1491
- *
1492
- * Now, we use round-robin. Better algorithm is welcomed.
1493
- */
1494
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1495
-{
1496
- int node;
1497
-
1498
- mem_cgroup_may_update_nodemask(memcg);
1499
- node = memcg->last_scanned_node;
1500
-
1501
- node = next_node_in(node, memcg->scan_nodes);
1502
- /*
1503
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1504
- * last time it really checked all the LRUs due to rate limiting.
1505
- * Fallback to the current node in that case for simplicity.
1506
- */
1507
- if (unlikely(node == MAX_NUMNODES))
1508
- node = numa_node_id();
1509
-
1510
- memcg->last_scanned_node = node;
1511
- return node;
1512
-}
1513
-#else
1514
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1515
-{
1516
- return 0;
1517
-}
1518
-#endif
15191774
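
The new early exit in mem_cgroup_out_of_memory() near the top of the hunk above is a classic re-check under the lock: several tasks may race into the OOM path, so after winning oom_lock the charge margin is re-evaluated before the expensive kill is attempted. A schematic restatement using pthreads and stubbed-out helpers (charge_margin() and kill_something() are invented placeholders, not kernel functions):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Illustrative stubs standing in for mem_cgroup_margin() and out_of_memory(). */
    static long charge_margin(void) { return 512; }
    static bool kill_something(void) { return true; }

    static bool out_of_memory_synchronized(long nr_pages)
    {
            bool ret = true;

            pthread_mutex_lock(&oom_lock);
            /* Someone else may have freed memory while we waited for the lock;
             * if the allocation now fits, there is nothing left to kill for. */
            if (charge_margin() >= nr_pages)
                    goto unlock;
            ret = kill_something();
    unlock:
            pthread_mutex_unlock(&oom_lock);
            return ret;
    }

    int main(void)
    {
            printf("%d\n", out_of_memory_synchronized(4));
            return 0;
    }
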
15201775 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15211776 pg_data_t *pgdat,
....@@ -1529,7 +1784,6 @@
15291784 unsigned long nr_scanned;
15301785 struct mem_cgroup_reclaim_cookie reclaim = {
15311786 .pgdat = pgdat,
1532
- .priority = 0,
15331787 };
15341788
15351789 excess = soft_limit_excess(root_memcg);
....@@ -1624,7 +1878,7 @@
16241878 struct mem_cgroup *iter;
16251879
16261880 spin_lock(&memcg_oom_lock);
1627
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1881
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16281882 for_each_mem_cgroup_tree(iter, memcg)
16291883 iter->oom_lock = false;
16301884 spin_unlock(&memcg_oom_lock);
@@ -1645,8 +1899,8 @@
 	struct mem_cgroup *iter;
 
 	/*
-	 * When a new child is created while the hierarchy is under oom,
-	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+	 * Be careful about under_oom underflows because a child memcg
+	 * could have been added after mem_cgroup_mark_under_oom.
 	 */
 	spin_lock(&memcg_oom_lock);
 	for_each_mem_cgroup_tree(iter, memcg)
....@@ -1706,6 +1960,8 @@
17061960
17071961 if (order > PAGE_ALLOC_COSTLY_ORDER)
17081962 return OOM_SKIPPED;
1963
+
1964
+ memcg_memory_event(memcg, MEMCG_OOM);
17091965
17101966 /*
17111967 * We are in the middle of the charge context here, so we
....@@ -1854,6 +2110,14 @@
18542110 goto out;
18552111
18562112 /*
2113
+ * If the victim task has been asynchronously moved to a different
2114
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2115
+ * In this case it's better to ignore memory.group.oom.
2116
+ */
2117
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2118
+ goto out;
2119
+
2120
+ /*
18572121 * Traverse the memory cgroup hierarchy from the victim task's
18582122 * cgroup up to the OOMing cgroup (or root) to find the
18592123 * highest-level memory cgroup with oom.group set.
....@@ -1894,6 +2158,7 @@
18942158 */
18952159 struct mem_cgroup *lock_page_memcg(struct page *page)
18962160 {
2161
+ struct page *head = compound_head(page); /* rmap on tail pages */
18972162 struct mem_cgroup *memcg;
18982163 unsigned long flags;
18992164
....@@ -1913,7 +2178,7 @@
19132178 if (mem_cgroup_disabled())
19142179 return NULL;
19152180 again:
1916
- memcg = page->mem_cgroup;
2181
+ memcg = head->mem_cgroup;
19172182 if (unlikely(!memcg))
19182183 return NULL;
19192184
....@@ -1921,7 +2186,7 @@
19212186 return memcg;
19222187
19232188 spin_lock_irqsave(&memcg->move_lock, flags);
1924
- if (memcg != page->mem_cgroup) {
2189
+ if (memcg != head->mem_cgroup) {
19252190 spin_unlock_irqrestore(&memcg->move_lock, flags);
19262191 goto again;
19272192 }
....@@ -1964,19 +2229,43 @@
19642229 */
19652230 void unlock_page_memcg(struct page *page)
19662231 {
1967
- __unlock_page_memcg(page->mem_cgroup);
2232
+ struct page *head = compound_head(page);
2233
+
2234
+ __unlock_page_memcg(head->mem_cgroup);
19682235 }
19692236 EXPORT_SYMBOL(unlock_page_memcg);
19702237
19712238 struct memcg_stock_pcp {
19722239 struct mem_cgroup *cached; /* this never be root cgroup */
19732240 unsigned int nr_pages;
2241
+
2242
+#ifdef CONFIG_MEMCG_KMEM
2243
+ struct obj_cgroup *cached_objcg;
2244
+ unsigned int nr_bytes;
2245
+#endif
2246
+
19742247 struct work_struct work;
19752248 unsigned long flags;
19762249 #define FLUSHING_CACHED_CHARGE 0
19772250 };
19782251 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19792252 static DEFINE_MUTEX(percpu_charge_mutex);
2253
+
2254
+#ifdef CONFIG_MEMCG_KMEM
2255
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2256
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2257
+ struct mem_cgroup *root_memcg);
2258
+
2259
+#else
2260
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2261
+{
2262
+}
2263
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2264
+ struct mem_cgroup *root_memcg)
2265
+{
2266
+ return false;
2267
+}
2268
+#endif
19802269
19812270 /**
19822271 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -2018,13 +2307,17 @@
20182307 {
20192308 struct mem_cgroup *old = stock->cached;
20202309
2310
+ if (!old)
2311
+ return;
2312
+
20212313 if (stock->nr_pages) {
20222314 page_counter_uncharge(&old->memory, stock->nr_pages);
20232315 if (do_memsw_account())
20242316 page_counter_uncharge(&old->memsw, stock->nr_pages);
2025
- css_put_many(&old->css, stock->nr_pages);
20262317 stock->nr_pages = 0;
20272318 }
2319
+
2320
+ css_put(&old->css);
20282321 stock->cached = NULL;
20292322 }
20302323
....@@ -2040,6 +2333,7 @@
20402333 local_irq_save(flags);
20412334
20422335 stock = this_cpu_ptr(&memcg_stock);
2336
+ drain_obj_stock(stock);
20432337 drain_stock(stock);
20442338 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20452339
....@@ -2060,6 +2354,7 @@
20602354 stock = this_cpu_ptr(&memcg_stock);
20612355 if (stock->cached != memcg) { /* reset if necessary */
20622356 drain_stock(stock);
2357
+ css_get(&memcg->css);
20632358 stock->cached = memcg;
20642359 }
20652360 stock->nr_pages += nr_pages;
....@@ -2087,34 +2382,37 @@
20872382 * as well as workers from this path always operate on the local
20882383 * per-cpu data. CPU up doesn't touch memcg_stock at all.
20892384 */
2090
- curcpu = get_cpu_light();
2385
+ curcpu = get_cpu();
20912386 for_each_online_cpu(cpu) {
20922387 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20932388 struct mem_cgroup *memcg;
2389
+ bool flush = false;
20942390
2391
+ rcu_read_lock();
20952392 memcg = stock->cached;
2096
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2097
- continue;
2098
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2099
- css_put(&memcg->css);
2100
- continue;
2101
- }
2102
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2393
+ if (memcg && stock->nr_pages &&
2394
+ mem_cgroup_is_descendant(memcg, root_memcg))
2395
+ flush = true;
2396
+ if (obj_stock_flush_required(stock, root_memcg))
2397
+ flush = true;
2398
+ rcu_read_unlock();
2399
+
2400
+ if (flush &&
2401
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21032402 if (cpu == curcpu)
21042403 drain_local_stock(&stock->work);
21052404 else
21062405 schedule_work_on(cpu, &stock->work);
21072406 }
2108
- css_put(&memcg->css);
21092407 }
2110
- put_cpu_light();
2408
+ put_cpu();
21112409 mutex_unlock(&percpu_charge_mutex);
21122410 }
21132411
21142412 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21152413 {
21162414 struct memcg_stock_pcp *stock;
2117
- struct mem_cgroup *memcg;
2415
+ struct mem_cgroup *memcg, *mi;
21182416
21192417 stock = &per_cpu(memcg_stock, cpu);
21202418 drain_stock(stock);
....@@ -2126,9 +2424,10 @@
21262424 int nid;
21272425 long x;
21282426
2129
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2427
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21302428 if (x)
2131
- atomic_long_add(x, &memcg->stat[i]);
2429
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2430
+ atomic_long_add(x, &memcg->vmstats[i]);
21322431
21332432 if (i >= NR_VM_NODE_STAT_ITEMS)
21342433 continue;
....@@ -2139,32 +2438,48 @@
21392438 pn = mem_cgroup_nodeinfo(memcg, nid);
21402439 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21412440 if (x)
2142
- atomic_long_add(x, &pn->lruvec_stat[i]);
2441
+ do {
2442
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2443
+ } while ((pn = parent_nodeinfo(pn, nid)));
21432444 }
21442445 }
21452446
21462447 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21472448 long x;
21482449
2149
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2450
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21502451 if (x)
2151
- atomic_long_add(x, &memcg->events[i]);
2452
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2453
+ atomic_long_add(x, &memcg->vmevents[i]);
21522454 }
21532455 }
21542456
21552457 return 0;
21562458 }
21572459
2158
-static void reclaim_high(struct mem_cgroup *memcg,
2159
- unsigned int nr_pages,
2160
- gfp_t gfp_mask)
2460
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2461
+ unsigned int nr_pages,
2462
+ gfp_t gfp_mask)
21612463 {
2464
+ unsigned long nr_reclaimed = 0;
2465
+
21622466 do {
2163
- if (page_counter_read(&memcg->memory) <= memcg->high)
2467
+ unsigned long pflags;
2468
+
2469
+ if (page_counter_read(&memcg->memory) <=
2470
+ READ_ONCE(memcg->memory.high))
21642471 continue;
2472
+
21652473 memcg_memory_event(memcg, MEMCG_HIGH);
2166
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2167
- } while ((memcg = parent_mem_cgroup(memcg)));
2474
+
2475
+ psi_memstall_enter(&pflags);
2476
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2477
+ gfp_mask, true);
2478
+ psi_memstall_leave(&pflags);
2479
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2480
+ !mem_cgroup_is_root(memcg));
2481
+
2482
+ return nr_reclaimed;
21682483 }
21692484
21702485 static void high_work_func(struct work_struct *work)
....@@ -2176,35 +2491,238 @@
21762491 }
21772492
21782493 /*
2494
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2495
+ * enough to still cause a significant slowdown in most cases, while still
2496
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2497
+ */
2498
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2499
+
2500
+/*
2501
+ * When calculating the delay, we use these either side of the exponentiation to
2502
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2503
+ * below.
2504
+ *
2505
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2506
+ * overage ratio to a delay.
2507
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2508
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2509
+ * to produce a reasonable delay curve.
2510
+ *
2511
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2512
+ * reasonable delay curve compared to precision-adjusted overage, not
2513
+ * penalising heavily at first, but still making sure that growth beyond the
2514
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2515
+ * example, with a high of 100 megabytes:
2516
+ *
2517
+ * +-------+------------------------+
2518
+ * | usage | time to allocate in ms |
2519
+ * +-------+------------------------+
2520
+ * | 100M | 0 |
2521
+ * | 101M | 6 |
2522
+ * | 102M | 25 |
2523
+ * | 103M | 57 |
2524
+ * | 104M | 102 |
2525
+ * | 105M | 159 |
2526
+ * | 106M | 230 |
2527
+ * | 107M | 313 |
2528
+ * | 108M | 409 |
2529
+ * | 109M | 518 |
2530
+ * | 110M | 639 |
2531
+ * | 111M | 774 |
2532
+ * | 112M | 921 |
2533
+ * | 113M | 1081 |
2534
+ * | 114M | 1254 |
2535
+ * | 115M | 1439 |
2536
+ * | 116M | 1638 |
2537
+ * | 117M | 1849 |
2538
+ * | 118M | 2000 |
2539
+ * | 119M | 2000 |
2540
+ * | 120M | 2000 |
2541
+ * +-------+------------------------+
2542
+ */
2543
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2544
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2545
+
2546
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2547
+{
2548
+ u64 overage;
2549
+
2550
+ if (usage <= high)
2551
+ return 0;
2552
+
2553
+ /*
2554
+ * Prevent division by 0 in overage calculation by acting as if
2555
+ * it was a threshold of 1 page
2556
+ */
2557
+ high = max(high, 1UL);
2558
+
2559
+ overage = usage - high;
2560
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2561
+ return div64_u64(overage, high);
2562
+}
2563
+
2564
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2565
+{
2566
+ u64 overage, max_overage = 0;
2567
+
2568
+ do {
2569
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2570
+ READ_ONCE(memcg->memory.high));
2571
+ max_overage = max(overage, max_overage);
2572
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2573
+ !mem_cgroup_is_root(memcg));
2574
+
2575
+ return max_overage;
2576
+}
2577
+
2578
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2579
+{
2580
+ u64 overage, max_overage = 0;
2581
+
2582
+ do {
2583
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2584
+ READ_ONCE(memcg->swap.high));
2585
+ if (overage)
2586
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2587
+ max_overage = max(overage, max_overage);
2588
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2589
+ !mem_cgroup_is_root(memcg));
2590
+
2591
+ return max_overage;
2592
+}
2593
+
2594
+/*
2595
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2596
+ * is exceeding its memory.high by checking both it and its ancestors.
2597
+ */
2598
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2599
+ unsigned int nr_pages,
2600
+ u64 max_overage)
2601
+{
2602
+ unsigned long penalty_jiffies;
2603
+
2604
+ if (!max_overage)
2605
+ return 0;
2606
+
2607
+ /*
2608
+ * We use overage compared to memory.high to calculate the number of
2609
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2610
+ * fairly lenient on small overages, and increasingly harsh when the
2611
+ * memcg in question makes it clear that it has no intention of stopping
2612
+ * its crazy behaviour, so we exponentially increase the delay based on
2613
+ * overage amount.
2614
+ */
2615
+ penalty_jiffies = max_overage * max_overage * HZ;
2616
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2617
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2618
+
2619
+ /*
2620
+ * Factor in the task's own contribution to the overage, such that four
2621
+ * N-sized allocations are throttled approximately the same as one
2622
+ * 4N-sized allocation.
2623
+ *
2624
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2625
+ * larger the current charge patch is than that.
2626
+ */
2627
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2628
+}
2629
+
2630
+/*
21792631 * Scheduled by try_charge() to be executed from the userland return path
21802632 * and reclaims memory over the high limit.
21812633 */
21822634 void mem_cgroup_handle_over_high(void)
21832635 {
2636
+ unsigned long penalty_jiffies;
2637
+ unsigned long pflags;
2638
+ unsigned long nr_reclaimed;
21842639 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2640
+ int nr_retries = MAX_RECLAIM_RETRIES;
21852641 struct mem_cgroup *memcg;
2642
+ bool in_retry = false;
21862643
21872644 if (likely(!nr_pages))
21882645 return;
21892646
21902647 memcg = get_mem_cgroup_from_mm(current->mm);
2191
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2192
- css_put(&memcg->css);
21932648 current->memcg_nr_pages_over_high = 0;
2649
+
2650
+retry_reclaim:
2651
+ /*
2652
+ * The allocating task should reclaim at least the batch size, but for
2653
+ * subsequent retries we only want to do what's necessary to prevent oom
2654
+ * or breaching resource isolation.
2655
+ *
2656
+ * This is distinct from memory.max or page allocator behaviour because
2657
+ * memory.high is currently batched, whereas memory.max and the page
2658
+ * allocator run every time an allocation is made.
2659
+ */
2660
+ nr_reclaimed = reclaim_high(memcg,
2661
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2662
+ GFP_KERNEL);
2663
+
2664
+ /*
2665
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2666
+ * allocators proactively to slow down excessive growth.
2667
+ */
2668
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2669
+ mem_find_max_overage(memcg));
2670
+
2671
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2672
+ swap_find_max_overage(memcg));
2673
+
2674
+ /*
2675
+ * Clamp the max delay per usermode return so as to still keep the
2676
+ * application moving forwards and also permit diagnostics, albeit
2677
+ * extremely slowly.
2678
+ */
2679
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2680
+
2681
+ /*
2682
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2683
+ * that it's not even worth doing, in an attempt to be nice to those who
2684
+ * go only a small amount over their memory.high value and maybe haven't
2685
+ * been aggressively reclaimed enough yet.
2686
+ */
2687
+ if (penalty_jiffies <= HZ / 100)
2688
+ goto out;
2689
+
2690
+ /*
2691
+ * If reclaim is making forward progress but we're still over
2692
+ * memory.high, we want to encourage that rather than doing allocator
2693
+ * throttling.
2694
+ */
2695
+ if (nr_reclaimed || nr_retries--) {
2696
+ in_retry = true;
2697
+ goto retry_reclaim;
2698
+ }
2699
+
2700
+ /*
2701
+ * If we exit early, we're guaranteed to die (since
2702
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2703
+ * need to account for any ill-begotten jiffies to pay them off later.
2704
+ */
2705
+ psi_memstall_enter(&pflags);
2706
+ schedule_timeout_killable(penalty_jiffies);
2707
+ psi_memstall_leave(&pflags);
2708
+
2709
+out:
2710
+ css_put(&memcg->css);
21942711 }
21952712
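From userspace, the effect of this path is easiest to see through the cgroup v2 files: breaching memory.high shows up as "high" entries in memory.events, and the schedule_timeout_killable() sleep above is accounted as memory pressure via psi_memstall_enter(). A minimal sketch, assuming a writable cgroup v2 child at /sys/fs/cgroup/demo that the calling task has already been moved into (both the path and that setup are assumptions, not something this patch provides):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Assumed path: a delegated cgroup v2 child this process already lives in. */
#define CG "/sys/fs/cgroup/demo/"

static void write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

static void dump_file(const char *path)
{
	char buf[4096];
	FILE *f = fopen(path, "r");
	size_t n;

	if (!f)
		return;
	printf("== %s ==\n", path);
	while ((n = fread(buf, 1, sizeof(buf) - 1, f)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	fclose(f);
}

int main(void)
{
	/* Cap memory.high well below what we are about to touch. */
	write_file(CG "memory.high", "67108864");	/* 64 MiB */

	size_t sz = 128UL << 20;			/* then touch 128 MiB */
	char *p = malloc(sz);

	if (p)
		memset(p, 0xa5, sz);			/* charging happens here */

	/* "high" breaches, plus the stall time spent in the sleeps above. */
	dump_file(CG "memory.events");
	dump_file(CG "memory.pressure");
	free(p);
	return 0;
}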
21962713 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21972714 unsigned int nr_pages)
21982715 {
21992716 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2200
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2717
+ int nr_retries = MAX_RECLAIM_RETRIES;
22012718 struct mem_cgroup *mem_over_limit;
22022719 struct page_counter *counter;
2720
+ enum oom_status oom_status;
22032721 unsigned long nr_reclaimed;
2722
+ bool passed_oom = false;
22042723 bool may_swap = true;
22052724 bool drained = false;
2206
- bool oomed = false;
2207
- enum oom_status oom_status;
2725
+ unsigned long pflags;
22082726
22092727 if (mem_cgroup_is_root(memcg))
22102728 return 0;
....@@ -2239,15 +2757,6 @@
22392757 goto force;
22402758
22412759 /*
2242
- * Unlike in global OOM situations, memcg is not in a physical
2243
- * memory shortage. Allow dying and OOM-killed tasks to
2244
- * bypass the last charges so that they can exit quickly and
2245
- * free their memory.
2246
- */
2247
- if (unlikely(should_force_charge()))
2248
- goto force;
2249
-
2250
- /*
22512760 * Prevent unbounded recursion when reclaim operations need to
22522761 * allocate memory. This might exceed the limits temporarily,
22532762 * but we prefer facilitating memory reclaim and getting back
....@@ -2264,8 +2773,10 @@
22642773
22652774 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22662775
2776
+ psi_memstall_enter(&pflags);
22672777 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22682778 gfp_mask, may_swap);
2779
+ psi_memstall_leave(&pflags);
22692780
22702781 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22712782 goto retry;
....@@ -2299,16 +2810,15 @@
22992810 if (nr_retries--)
23002811 goto retry;
23012812
2302
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2813
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23032814 goto nomem;
23042815
23052816 if (gfp_mask & __GFP_NOFAIL)
23062817 goto force;
23072818
2308
- if (fatal_signal_pending(current))
2309
- goto force;
2310
-
2311
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2819
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2820
+ if (passed_oom && task_is_dying())
2821
+ goto nomem;
23122822
23132823 /*
23142824 * keep retrying as long as the memcg oom killer is able to make
....@@ -2317,15 +2827,10 @@
23172827 */
23182828 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23192829 get_order(nr_pages * PAGE_SIZE));
2320
- switch (oom_status) {
2321
- case OOM_SUCCESS:
2322
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2323
- oomed = true;
2830
+ if (oom_status == OOM_SUCCESS) {
2831
+ passed_oom = true;
2832
+ nr_retries = MAX_RECLAIM_RETRIES;
23242833 goto retry;
2325
- case OOM_FAILED:
2326
- goto force;
2327
- default:
2328
- goto nomem;
23292834 }
23302835 nomem:
23312836 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2339,12 +2844,10 @@
23392844 page_counter_charge(&memcg->memory, nr_pages);
23402845 if (do_memsw_account())
23412846 page_counter_charge(&memcg->memsw, nr_pages);
2342
- css_get_many(&memcg->css, nr_pages);
23432847
23442848 return 0;
23452849
23462850 done_restock:
2347
- css_get_many(&memcg->css, batch);
23482851 if (batch > nr_pages)
23492852 refill_stock(memcg, batch - nr_pages);
23502853
....@@ -2358,12 +2861,32 @@
23582861 * reclaim, the cost of mismatch is negligible.
23592862 */
23602863 do {
2361
- if (page_counter_read(&memcg->memory) > memcg->high) {
2362
- /* Don't bother a random interrupted task */
2363
- if (in_interrupt()) {
2864
+ bool mem_high, swap_high;
2865
+
2866
+ mem_high = page_counter_read(&memcg->memory) >
2867
+ READ_ONCE(memcg->memory.high);
2868
+ swap_high = page_counter_read(&memcg->swap) >
2869
+ READ_ONCE(memcg->swap.high);
2870
+
2871
+ /* Don't bother a random interrupted task */
2872
+ if (in_interrupt()) {
2873
+ if (mem_high) {
23642874 schedule_work(&memcg->high_work);
23652875 break;
23662876 }
2877
+ continue;
2878
+ }
2879
+
2880
+ if (mem_high || swap_high) {
2881
+ /*
2882
+ * The allocating tasks in this cgroup will need to do
2883
+ * reclaim or be throttled to prevent further growth
2884
+ * of the memory or swap footprints.
2885
+ *
2886
+ * Target some best-effort fairness between the tasks,
2887
+ * and distribute reclaim work and delay penalties
2888
+ * based on how much each task is actually allocating.
2889
+ */
23672890 current->memcg_nr_pages_over_high += batch;
23682891 set_notify_resume(current);
23692892 break;
....@@ -2373,6 +2896,7 @@
23732896 return 0;
23742897 }
23752898
2899
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23762900 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23772901 {
23782902 if (mem_cgroup_is_root(memcg))
....@@ -2381,76 +2905,124 @@
23812905 page_counter_uncharge(&memcg->memory, nr_pages);
23822906 if (do_memsw_account())
23832907 page_counter_uncharge(&memcg->memsw, nr_pages);
2384
-
2385
- css_put_many(&memcg->css, nr_pages);
23862908 }
2909
+#endif
23872910
2388
-static void lock_page_lru(struct page *page, int *isolated)
2911
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23892912 {
2390
- struct zone *zone = page_zone(page);
2391
-
2392
- spin_lock_irq(zone_lru_lock(zone));
2393
- if (PageLRU(page)) {
2394
- struct lruvec *lruvec;
2395
-
2396
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2397
- ClearPageLRU(page);
2398
- del_page_from_lru_list(page, lruvec, page_lru(page));
2399
- *isolated = 1;
2400
- } else
2401
- *isolated = 0;
2402
-}
2403
-
2404
-static void unlock_page_lru(struct page *page, int isolated)
2405
-{
2406
- struct zone *zone = page_zone(page);
2407
-
2408
- if (isolated) {
2409
- struct lruvec *lruvec;
2410
-
2411
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2412
- VM_BUG_ON_PAGE(PageLRU(page), page);
2413
- SetPageLRU(page);
2414
- add_page_to_lru_list(page, lruvec, page_lru(page));
2415
- }
2416
- spin_unlock_irq(zone_lru_lock(zone));
2417
-}
2418
-
2419
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2420
- bool lrucare)
2421
-{
2422
- int isolated;
2423
-
24242913 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2425
-
24262914 /*
2427
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2428
- * may already be on some other mem_cgroup's LRU. Take care of it.
2429
- */
2430
- if (lrucare)
2431
- lock_page_lru(page, &isolated);
2432
-
2433
- /*
2434
- * Nobody should be changing or seriously looking at
2435
- * page->mem_cgroup at this point:
2915
+ * Any of the following ensures page->mem_cgroup stability:
24362916 *
2437
- * - the page is uncharged
2438
- *
2439
- * - the page is off-LRU
2440
- *
2441
- * - an anonymous fault has exclusive page access, except for
2442
- * a locked page table
2443
- *
2444
- * - a page cache insertion, a swapin fault, or a migration
2445
- * have the page locked
2917
+ * - the page lock
2918
+ * - LRU isolation
2919
+ * - lock_page_memcg()
2920
+ * - exclusive reference
24462921 */
24472922 page->mem_cgroup = memcg;
2448
-
2449
- if (lrucare)
2450
- unlock_page_lru(page, isolated);
24512923 }
24522924
24532925 #ifdef CONFIG_MEMCG_KMEM
2926
+/*
2927
+ * The allocated objcg pointers array is not accounted directly.
2928
+ * Moreover, it should not come from a DMA buffer and is not readily
2929
+ * reclaimable. So those GFP bits should be masked off.
2930
+ */
2931
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2932
+
2933
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2934
+ gfp_t gfp)
2935
+{
2936
+ unsigned int objects = objs_per_slab_page(s, page);
2937
+ void *vec;
2938
+
2939
+ gfp &= ~OBJCGS_CLEAR_MASK;
2940
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2941
+ page_to_nid(page));
2942
+ if (!vec)
2943
+ return -ENOMEM;
2944
+
2945
+ if (cmpxchg(&page->obj_cgroups, NULL,
2946
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2947
+ kfree(vec);
2948
+ else
2949
+ kmemleak_not_leak(vec);
2950
+
2951
+ return 0;
2952
+}
2953
+
2954
+/*
2955
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2956
+ *
2957
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2958
+ * cgroup_mutex, etc.
2959
+ */
2960
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2961
+{
2962
+ struct page *page;
2963
+
2964
+ if (mem_cgroup_disabled())
2965
+ return NULL;
2966
+
2967
+ page = virt_to_head_page(p);
2968
+
2969
+ /*
2970
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2971
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2972
+ * bit of the pointer is set.
2973
+ * The page->mem_cgroup pointer can be asynchronously changed
2974
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2975
+ * from a valid memcg pointer to objcg vector or back.
2976
+ */
2977
+ if (!page->mem_cgroup)
2978
+ return NULL;
2979
+
2980
+ /*
2981
+ * Slab objects are accounted individually, not per-page.
2982
+ * Memcg membership data for each individual object is saved in
2983
+ * the page->obj_cgroups.
2984
+ */
2985
+ if (page_has_obj_cgroups(page)) {
2986
+ struct obj_cgroup *objcg;
2987
+ unsigned int off;
2988
+
2989
+ off = obj_to_index(page->slab_cache, page, p);
2990
+ objcg = page_obj_cgroups(page)[off];
2991
+ if (objcg)
2992
+ return obj_cgroup_memcg(objcg);
2993
+
2994
+ return NULL;
2995
+ }
2996
+
2997
+ /* All other pages use page->mem_cgroup */
2998
+ return page->mem_cgroup;
2999
+}
3000
+
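Both memcg_alloc_page_obj_cgroups() and mem_cgroup_from_obj() lean on the same trick: the vector allocation is at least word aligned, so bit 0 of a valid pointer is always clear and can be used to mark "this is an obj_cgroup vector" rather than a plain memcg pointer. A standalone toy version of that tagging, with made-up names and purely for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Bit 0 distinguishes a tagged vector pointer from a plain object pointer. */
#define VECTOR_TAG 0x1UL

static void *tag_vector(void *vec)
{
	return (void *)((uintptr_t)vec | VECTOR_TAG);
}

static int is_vector(const void *p)
{
	return ((uintptr_t)p & VECTOR_TAG) != 0;
}

static void *untag(void *p)
{
	return (void *)((uintptr_t)p & ~VECTOR_TAG);
}

int main(void)
{
	long memcg = 42;				/* stands in for a struct mem_cgroup */
	void *objcg_vec = calloc(8, sizeof(void *));	/* stands in for obj_cgroup *[] */
	void *slot;

	/* A plain memcg pointer: bit 0 clear, used as-is. */
	slot = &memcg;
	assert(!is_vector(slot));

	/* A tagged vector pointer: bit 0 set, must be masked off before use. */
	slot = tag_vector(objcg_vec);
	assert(is_vector(slot));
	assert(untag(slot) == objcg_vec);

	printf("tagged %p -> untagged %p\n", slot, untag(slot));
	free(objcg_vec);
	return 0;
}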
3001
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3002
+{
3003
+ struct obj_cgroup *objcg = NULL;
3004
+ struct mem_cgroup *memcg;
3005
+
3006
+ if (memcg_kmem_bypass())
3007
+ return NULL;
3008
+
3009
+ rcu_read_lock();
3010
+ if (unlikely(active_memcg()))
3011
+ memcg = active_memcg();
3012
+ else
3013
+ memcg = mem_cgroup_from_task(current);
3014
+
3015
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3016
+ objcg = rcu_dereference(memcg->objcg);
3017
+ if (objcg && obj_cgroup_tryget(objcg))
3018
+ break;
3019
+ objcg = NULL;
3020
+ }
3021
+ rcu_read_unlock();
3022
+
3023
+ return objcg;
3024
+}
3025
+
24543026 static int memcg_alloc_cache_id(void)
24553027 {
24563028 int id, size;
....@@ -2476,9 +3048,7 @@
24763048 else if (size > MEMCG_CACHES_MAX_SIZE)
24773049 size = MEMCG_CACHES_MAX_SIZE;
24783050
2479
- err = memcg_update_all_caches(size);
2480
- if (!err)
2481
- err = memcg_update_all_list_lrus(size);
3051
+ err = memcg_update_all_list_lrus(size);
24823052 if (!err)
24833053 memcg_nr_cache_ids = size;
24843054
....@@ -2496,152 +3066,17 @@
24963066 ida_simple_remove(&memcg_cache_ida, id);
24973067 }
24983068
2499
-struct memcg_kmem_cache_create_work {
2500
- struct mem_cgroup *memcg;
2501
- struct kmem_cache *cachep;
2502
- struct work_struct work;
2503
-};
2504
-
2505
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2506
-{
2507
- struct memcg_kmem_cache_create_work *cw =
2508
- container_of(w, struct memcg_kmem_cache_create_work, work);
2509
- struct mem_cgroup *memcg = cw->memcg;
2510
- struct kmem_cache *cachep = cw->cachep;
2511
-
2512
- memcg_create_kmem_cache(memcg, cachep);
2513
-
2514
- css_put(&memcg->css);
2515
- kfree(cw);
2516
-}
2517
-
2518
-/*
2519
- * Enqueue the creation of a per-memcg kmem_cache.
2520
- */
2521
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2522
- struct kmem_cache *cachep)
2523
-{
2524
- struct memcg_kmem_cache_create_work *cw;
2525
-
2526
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2527
- if (!cw)
2528
- return;
2529
-
2530
- css_get(&memcg->css);
2531
-
2532
- cw->memcg = memcg;
2533
- cw->cachep = cachep;
2534
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2535
-
2536
- queue_work(memcg_kmem_cache_wq, &cw->work);
2537
-}
2538
-
2539
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2540
- struct kmem_cache *cachep)
2541
-{
2542
- /*
2543
- * We need to stop accounting when we kmalloc, because if the
2544
- * corresponding kmalloc cache is not yet created, the first allocation
2545
- * in __memcg_schedule_kmem_cache_create will recurse.
2546
- *
2547
- * However, it is better to enclose the whole function. Depending on
2548
- * the debugging options enabled, INIT_WORK(), for instance, can
2549
- * trigger an allocation. This too, will make us recurse. Because at
2550
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2551
- * the safest choice is to do it like this, wrapping the whole function.
2552
- */
2553
- current->memcg_kmem_skip_account = 1;
2554
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2555
- current->memcg_kmem_skip_account = 0;
2556
-}
2557
-
2558
-static inline bool memcg_kmem_bypass(void)
2559
-{
2560
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2561
- return true;
2562
- return false;
2563
-}
2564
-
25653069 /**
2566
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2567
- * @cachep: the original global kmem cache
2568
- *
2569
- * Return the kmem_cache we're supposed to use for a slab allocation.
2570
- * We try to use the current memcg's version of the cache.
2571
- *
2572
- * If the cache does not exist yet, if we are the first user of it, we
2573
- * create it asynchronously in a workqueue and let the current allocation
2574
- * go through with the original cache.
2575
- *
2576
- * This function takes a reference to the cache it returns to assure it
2577
- * won't get destroyed while we are working with it. Once the caller is
2578
- * done with it, memcg_kmem_put_cache() must be called to release the
2579
- * reference.
2580
- */
2581
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2582
-{
2583
- struct mem_cgroup *memcg;
2584
- struct kmem_cache *memcg_cachep;
2585
- int kmemcg_id;
2586
-
2587
- VM_BUG_ON(!is_root_cache(cachep));
2588
-
2589
- if (memcg_kmem_bypass())
2590
- return cachep;
2591
-
2592
- if (current->memcg_kmem_skip_account)
2593
- return cachep;
2594
-
2595
- memcg = get_mem_cgroup_from_current();
2596
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2597
- if (kmemcg_id < 0)
2598
- goto out;
2599
-
2600
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2601
- if (likely(memcg_cachep))
2602
- return memcg_cachep;
2603
-
2604
- /*
2605
- * If we are in a safe context (can wait, and not in interrupt
2606
- * context), we could be be predictable and return right away.
2607
- * This would guarantee that the allocation being performed
2608
- * already belongs in the new cache.
2609
- *
2610
- * However, there are some clashes that can arrive from locking.
2611
- * For instance, because we acquire the slab_mutex while doing
2612
- * memcg_create_kmem_cache, this means no further allocation
2613
- * could happen with the slab_mutex held. So it's better to
2614
- * defer everything.
2615
- */
2616
- memcg_schedule_kmem_cache_create(memcg, cachep);
2617
-out:
2618
- css_put(&memcg->css);
2619
- return cachep;
2620
-}
2621
-
2622
-/**
2623
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2624
- * @cachep: the cache returned by memcg_kmem_get_cache
2625
- */
2626
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2627
-{
2628
- if (!is_root_cache(cachep))
2629
- css_put(&cachep->memcg_params.memcg->css);
2630
-}
2631
-
2632
-/**
2633
- * memcg_kmem_charge_memcg: charge a kmem page
2634
- * @page: page to charge
2635
- * @gfp: reclaim mode
2636
- * @order: allocation order
3070
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26373071 * @memcg: memory cgroup to charge
3072
+ * @gfp: reclaim mode
3073
+ * @nr_pages: number of pages to charge
26383074 *
26393075 * Returns 0 on success, an error code on failure.
26403076 */
2641
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2642
- struct mem_cgroup *memcg)
3077
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3078
+ unsigned int nr_pages)
26433079 {
2644
- unsigned int nr_pages = 1 << order;
26453080 struct page_counter *counter;
26463081 int ret;
26473082
....@@ -2664,43 +3099,54 @@
26643099 cancel_charge(memcg, nr_pages);
26653100 return -ENOMEM;
26663101 }
2667
-
2668
- page->mem_cgroup = memcg;
2669
-
26703102 return 0;
26713103 }
26723104
26733105 /**
2674
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3106
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3107
+ * @memcg: memcg to uncharge
3108
+ * @nr_pages: number of pages to uncharge
3109
+ */
3110
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3111
+{
3112
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3113
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3114
+
3115
+ refill_stock(memcg, nr_pages);
3116
+}
3117
+
3118
+/**
3119
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26753120 * @page: page to charge
26763121 * @gfp: reclaim mode
26773122 * @order: allocation order
26783123 *
26793124 * Returns 0 on success, an error code on failure.
26803125 */
2681
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3126
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26823127 {
26833128 struct mem_cgroup *memcg;
26843129 int ret = 0;
26853130
2686
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2687
- return 0;
2688
-
26893131 memcg = get_mem_cgroup_from_current();
2690
- if (!mem_cgroup_is_root(memcg)) {
2691
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2692
- if (!ret)
3132
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3133
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3134
+ if (!ret) {
3135
+ page->mem_cgroup = memcg;
26933136 __SetPageKmemcg(page);
3137
+ return 0;
3138
+ }
3139
+ css_put(&memcg->css);
26943140 }
2695
- css_put(&memcg->css);
26963141 return ret;
26973142 }
3143
+
26983144 /**
2699
- * memcg_kmem_uncharge: uncharge a kmem page
3145
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
27003146 * @page: page to uncharge
27013147 * @order: allocation order
27023148 */
2703
-void memcg_kmem_uncharge(struct page *page, int order)
3149
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27043150 {
27053151 struct mem_cgroup *memcg = page->mem_cgroup;
27063152 unsigned int nr_pages = 1 << order;
....@@ -2709,43 +3155,179 @@
27093155 return;
27103156
27113157 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2712
-
2713
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2714
- page_counter_uncharge(&memcg->kmem, nr_pages);
2715
-
2716
- page_counter_uncharge(&memcg->memory, nr_pages);
2717
- if (do_memsw_account())
2718
- page_counter_uncharge(&memcg->memsw, nr_pages);
2719
-
3158
+ __memcg_kmem_uncharge(memcg, nr_pages);
27203159 page->mem_cgroup = NULL;
3160
+ css_put(&memcg->css);
27213161
27223162 /* slab pages do not have PageKmemcg flag set */
27233163 if (PageKmemcg(page))
27243164 __ClearPageKmemcg(page);
2725
-
2726
- css_put_many(&memcg->css, nr_pages);
27273165 }
2728
-#endif /* CONFIG_MEMCG_KMEM */
27293166
2730
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2731
-
2732
-/*
2733
- * Because tail pages are not marked as "used", set it. We're under
2734
- * zone_lru_lock and migration entries setup in all page mappings.
2735
- */
2736
-void mem_cgroup_split_huge_fixup(struct page *head)
3167
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27373168 {
2738
- int i;
3169
+ struct memcg_stock_pcp *stock;
3170
+ unsigned long flags;
3171
+ bool ret = false;
27393172
2740
- if (mem_cgroup_disabled())
3173
+ local_irq_save(flags);
3174
+
3175
+ stock = this_cpu_ptr(&memcg_stock);
3176
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3177
+ stock->nr_bytes -= nr_bytes;
3178
+ ret = true;
3179
+ }
3180
+
3181
+ local_irq_restore(flags);
3182
+
3183
+ return ret;
3184
+}
3185
+
3186
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3187
+{
3188
+ struct obj_cgroup *old = stock->cached_objcg;
3189
+
3190
+ if (!old)
27413191 return;
27423192
2743
- for (i = 1; i < HPAGE_PMD_NR; i++)
2744
- head[i].mem_cgroup = head->mem_cgroup;
3193
+ if (stock->nr_bytes) {
3194
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3195
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27453196
2746
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3197
+ if (nr_pages) {
3198
+ struct mem_cgroup *memcg;
3199
+
3200
+ rcu_read_lock();
3201
+retry:
3202
+ memcg = obj_cgroup_memcg(old);
3203
+ if (unlikely(!css_tryget(&memcg->css)))
3204
+ goto retry;
3205
+ rcu_read_unlock();
3206
+
3207
+ __memcg_kmem_uncharge(memcg, nr_pages);
3208
+ css_put(&memcg->css);
3209
+ }
3210
+
3211
+ /*
3212
+ * The leftover is flushed to the centralized per-memcg value.
3213
+ * On the next attempt to refill obj stock it will be moved
3214
+ * to a per-cpu stock (probably, on another CPU), see
3215
+ * refill_obj_stock().
3216
+ *
3217
+ * How often it's flushed is a trade-off between the memory
3218
+ * limit enforcement accuracy and potential CPU contention,
3219
+ * so it might be changed in the future.
3220
+ */
3221
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3222
+ stock->nr_bytes = 0;
3223
+ }
3224
+
3225
+ obj_cgroup_put(old);
3226
+ stock->cached_objcg = NULL;
27473227 }
2748
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3228
+
3229
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3230
+ struct mem_cgroup *root_memcg)
3231
+{
3232
+ struct mem_cgroup *memcg;
3233
+
3234
+ if (stock->cached_objcg) {
3235
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3236
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3237
+ return true;
3238
+ }
3239
+
3240
+ return false;
3241
+}
3242
+
3243
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3244
+{
3245
+ struct memcg_stock_pcp *stock;
3246
+ unsigned long flags;
3247
+
3248
+ local_irq_save(flags);
3249
+
3250
+ stock = this_cpu_ptr(&memcg_stock);
3251
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3252
+ drain_obj_stock(stock);
3253
+ obj_cgroup_get(objcg);
3254
+ stock->cached_objcg = objcg;
3255
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3256
+ }
3257
+ stock->nr_bytes += nr_bytes;
3258
+
3259
+ if (stock->nr_bytes > PAGE_SIZE)
3260
+ drain_obj_stock(stock);
3261
+
3262
+ local_irq_restore(flags);
3263
+}
3264
+
3265
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3266
+{
3267
+ struct mem_cgroup *memcg;
3268
+ unsigned int nr_pages, nr_bytes;
3269
+ int ret;
3270
+
3271
+ if (consume_obj_stock(objcg, size))
3272
+ return 0;
3273
+
3274
+ /*
3275
+ * In theory, memcg->nr_charged_bytes can have enough
3276
+ * pre-charged bytes to satisfy the allocation. However,
3277
+ * flushing memcg->nr_charged_bytes requires two atomic
3278
+ * operations, and memcg->nr_charged_bytes can't be big,
3279
+ * so it's better to ignore it and try to grab some new pages.
3280
+ * memcg->nr_charged_bytes will be flushed in
3281
+ * refill_obj_stock(), called from this function or
3282
+ * independently later.
3283
+ */
3284
+ rcu_read_lock();
3285
+retry:
3286
+ memcg = obj_cgroup_memcg(objcg);
3287
+ if (unlikely(!css_tryget(&memcg->css)))
3288
+ goto retry;
3289
+ rcu_read_unlock();
3290
+
3291
+ nr_pages = size >> PAGE_SHIFT;
3292
+ nr_bytes = size & (PAGE_SIZE - 1);
3293
+
3294
+ if (nr_bytes)
3295
+ nr_pages += 1;
3296
+
3297
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3298
+ if (!ret && nr_bytes)
3299
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3300
+
3301
+ css_put(&memcg->css);
3302
+ return ret;
3303
+}
3304
+
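In other words, the charge is rounded up to whole pages and the unused tail of the last page is immediately handed back to the per-cpu stock, so later sub-page allocations can be satisfied without touching the page counters. A small arithmetic sketch of that split (PAGE_SIZE assumed to be 4 KiB; not part of the patch):

#include <stdio.h>

#define PAGE_SIZE_ILLUSTRATIVE 4096UL	/* assumed 4 KiB pages */

/*
 * Mirror of the size split above: charge whole pages and hand the unused
 * remainder of the last page back to the per-cpu byte stock.
 */
static void split_charge(unsigned long size)
{
	unsigned long nr_pages = size / PAGE_SIZE_ILLUSTRATIVE;
	unsigned long nr_bytes = size % PAGE_SIZE_ILLUSTRATIVE;
	unsigned long refill = 0;

	if (nr_bytes) {
		nr_pages += 1;					/* round the charge up */
		refill = PAGE_SIZE_ILLUSTRATIVE - nr_bytes;	/* leftover goes to the stock */
	}
	printf("size %6lu -> charge %lu page(s), refill stock with %lu bytes\n",
	       size, nr_pages, refill);
}

int main(void)
{
	split_charge(100);	/* 1 page charged, 3996 bytes stocked */
	split_charge(4096);	/* exactly 1 page, nothing stocked */
	split_charge(5000);	/* 2 pages charged, 3192 bytes stocked */
	return 0;
}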
3305
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3306
+{
3307
+ refill_obj_stock(objcg, size);
3308
+}
3309
+
3310
+#endif /* CONFIG_MEMCG_KMEM */
3311
+
3312
+/*
3313
+ * Because head->mem_cgroup is not set on tails, set it now.
3314
+ */
3315
+void split_page_memcg(struct page *head, unsigned int nr)
3316
+{
3317
+ struct mem_cgroup *memcg = head->mem_cgroup;
3318
+ int kmemcg = PageKmemcg(head);
3319
+ int i;
3320
+
3321
+ if (mem_cgroup_disabled() || !memcg)
3322
+ return;
3323
+
3324
+ for (i = 1; i < nr; i++) {
3325
+ head[i].mem_cgroup = memcg;
3326
+ if (kmemcg)
3327
+ __SetPageKmemcg(head + i);
3328
+ }
3329
+ css_get_many(&memcg->css, nr - 1);
3330
+}
27493331
27503332 #ifdef CONFIG_MEMCG_SWAP
27513333 /**
....@@ -2807,7 +3389,7 @@
28073389 * Make sure that the new limit (memsw or memory limit) doesn't
28083390 * break our basic invariant rule memory.max <= memsw.max.
28093391 */
2810
- limits_invariant = memsw ? max >= memcg->memory.max :
3392
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28113393 max <= memcg->memsw.max;
28123394 if (!limits_invariant) {
28133395 mutex_unlock(&memcg_max_mutex);
....@@ -2928,7 +3510,7 @@
29283510 * Test whether @memcg has children, dead or alive. Note that this
29293511 * function doesn't care whether @memcg has use_hierarchy enabled and
29303512 * returns %true if there are child csses according to the cgroup
2931
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3513
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29323514 */
29333515 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29343516 {
....@@ -2947,7 +3529,7 @@
29473529 */
29483530 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29493531 {
2950
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3532
+ int nr_retries = MAX_RECLAIM_RETRIES;
29513533
29523534 /* we call try-to-free pages for make this cgroup empty */
29533535 lru_add_drain_all();
....@@ -3021,50 +3603,15 @@
30213603 return retval;
30223604 }
30233605
3024
-struct accumulated_stats {
3025
- unsigned long stat[MEMCG_NR_STAT];
3026
- unsigned long events[NR_VM_EVENT_ITEMS];
3027
- unsigned long lru_pages[NR_LRU_LISTS];
3028
- const unsigned int *stats_array;
3029
- const unsigned int *events_array;
3030
- int stats_size;
3031
- int events_size;
3032
-};
3033
-
3034
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3035
- struct accumulated_stats *acc)
3036
-{
3037
- struct mem_cgroup *mi;
3038
- int i;
3039
-
3040
- for_each_mem_cgroup_tree(mi, memcg) {
3041
- for (i = 0; i < acc->stats_size; i++)
3042
- acc->stat[i] += memcg_page_state(mi,
3043
- acc->stats_array ? acc->stats_array[i] : i);
3044
-
3045
- for (i = 0; i < acc->events_size; i++)
3046
- acc->events[i] += memcg_sum_events(mi,
3047
- acc->events_array ? acc->events_array[i] : i);
3048
-
3049
- for (i = 0; i < NR_LRU_LISTS; i++)
3050
- acc->lru_pages[i] +=
3051
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3052
- }
3053
-}
3054
-
30553606 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30563607 {
3057
- unsigned long val = 0;
3608
+ unsigned long val;
30583609
30593610 if (mem_cgroup_is_root(memcg)) {
3060
- struct mem_cgroup *iter;
3061
-
3062
- for_each_mem_cgroup_tree(iter, memcg) {
3063
- val += memcg_page_state(iter, MEMCG_CACHE);
3064
- val += memcg_page_state(iter, MEMCG_RSS);
3065
- if (swap)
3066
- val += memcg_page_state(iter, MEMCG_SWAP);
3067
- }
3611
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3612
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3613
+ if (swap)
3614
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30683615 } else {
30693616 if (!swap)
30703617 val = page_counter_read(&memcg->memory);
....@@ -3125,9 +3672,61 @@
31253672 }
31263673 }
31273674
3675
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3676
+{
3677
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3678
+ struct mem_cgroup *mi;
3679
+ int node, cpu, i;
3680
+
3681
+ for_each_online_cpu(cpu)
3682
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3683
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3684
+
3685
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3686
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3687
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3688
+
3689
+ for_each_node(node) {
3690
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3691
+ struct mem_cgroup_per_node *pi;
3692
+
3693
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3694
+ stat[i] = 0;
3695
+
3696
+ for_each_online_cpu(cpu)
3697
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3698
+ stat[i] += per_cpu(
3699
+ pn->lruvec_stat_cpu->count[i], cpu);
3700
+
3701
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3702
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3703
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3704
+ }
3705
+}
3706
+
3707
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3708
+{
3709
+ unsigned long events[NR_VM_EVENT_ITEMS];
3710
+ struct mem_cgroup *mi;
3711
+ int cpu, i;
3712
+
3713
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3714
+ events[i] = 0;
3715
+
3716
+ for_each_online_cpu(cpu)
3717
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3718
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3719
+ cpu);
3720
+
3721
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3722
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3723
+ atomic_long_add(events[i], &mi->vmevents[i]);
3724
+}
3725
+
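Both flush helpers follow the same shape: fold the per-CPU deltas into one local array, then walk from the cgroup up to the root adding that array into each level's aggregated counters, so ancestors stay consistent once the child's per-CPU state goes away. A miniature model of that shape, with toy types that are not the kernel's:

#include <stdio.h>

#define NCPUS	4
#define NSTATS	2

struct toy_memcg {
	struct toy_memcg *parent;
	long vmstats[NSTATS];		/* aggregated, hierarchical */
	long percpu[NCPUS][NSTATS];	/* per-CPU deltas */
	const char *name;
};

static void flush_percpu(struct toy_memcg *memcg)
{
	long stat[NSTATS] = { 0 };

	/* Sum the per-CPU deltas once... */
	for (int cpu = 0; cpu < NCPUS; cpu++)
		for (int i = 0; i < NSTATS; i++)
			stat[i] += memcg->percpu[cpu][i];

	/* ...then add the totals to this cgroup and every ancestor. */
	for (struct toy_memcg *mi = memcg; mi; mi = mi->parent)
		for (int i = 0; i < NSTATS; i++)
			mi->vmstats[i] += stat[i];
}

int main(void)
{
	struct toy_memcg root = { .name = "root" };
	struct toy_memcg child = { .parent = &root, .name = "child" };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	child.percpu[1][1] = -2;	/* deltas may be negative */

	flush_percpu(&child);
	for (int i = 0; i < NSTATS; i++)
		printf("stat[%d]: child=%ld root=%ld\n",
		       i, child.vmstats[i], root.vmstats[i]);
	return 0;
}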
31283726 #ifdef CONFIG_MEMCG_KMEM
31293727 static int memcg_online_kmem(struct mem_cgroup *memcg)
31303728 {
3729
+ struct obj_cgroup *objcg;
31313730 int memcg_id;
31323731
31333732 if (cgroup_memory_nokmem)
....@@ -3140,7 +3739,16 @@
31403739 if (memcg_id < 0)
31413740 return memcg_id;
31423741
3143
- static_branch_inc(&memcg_kmem_enabled_key);
3742
+ objcg = obj_cgroup_alloc();
3743
+ if (!objcg) {
3744
+ memcg_free_cache_id(memcg_id);
3745
+ return -ENOMEM;
3746
+ }
3747
+ objcg->memcg = memcg;
3748
+ rcu_assign_pointer(memcg->objcg, objcg);
3749
+
3750
+ static_branch_enable(&memcg_kmem_enabled_key);
3751
+
31443752 /*
31453753 * A memory cgroup is considered kmem-online as soon as it gets
31463754 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3149,7 +3757,6 @@
31493757 */
31503758 memcg->kmemcg_id = memcg_id;
31513759 memcg->kmem_state = KMEM_ONLINE;
3152
- INIT_LIST_HEAD(&memcg->kmem_caches);
31533760
31543761 return 0;
31553762 }
....@@ -3162,22 +3769,17 @@
31623769
31633770 if (memcg->kmem_state != KMEM_ONLINE)
31643771 return;
3165
- /*
3166
- * Clear the online state before clearing memcg_caches array
3167
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3168
- * guarantees that no cache will be created for this cgroup
3169
- * after we are done (see memcg_create_kmem_cache()).
3170
- */
3772
+
31713773 memcg->kmem_state = KMEM_ALLOCATED;
3172
-
3173
- memcg_deactivate_kmem_caches(memcg);
3174
-
3175
- kmemcg_id = memcg->kmemcg_id;
3176
- BUG_ON(kmemcg_id < 0);
31773774
31783775 parent = parent_mem_cgroup(memcg);
31793776 if (!parent)
31803777 parent = root_mem_cgroup;
3778
+
3779
+ memcg_reparent_objcgs(memcg, parent);
3780
+
3781
+ kmemcg_id = memcg->kmemcg_id;
3782
+ BUG_ON(kmemcg_id < 0);
31813783
31823784 /*
31833785 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3207,12 +3809,6 @@
32073809 /* css_alloc() failed, offlining didn't happen */
32083810 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32093811 memcg_offline_kmem(memcg);
3210
-
3211
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3212
- memcg_destroy_kmem_caches(memcg);
3213
- static_branch_dec(&memcg_kmem_enabled_key);
3214
- WARN_ON(page_counter_read(&memcg->kmem));
3215
- }
32163812 }
32173813 #else
32183814 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3303,6 +3899,9 @@
33033899 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33043900 break;
33053901 case _KMEM:
3902
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3903
+ "Please report your usecase to linux-mm@kvack.org if you "
3904
+ "depend on this functionality.\n");
33063905 ret = memcg_update_kmem_max(memcg, nr_pages);
33073906 break;
33083907 case _TCP:
....@@ -3367,6 +3966,10 @@
33673966 {
33683967 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
33693968
3969
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3970
+ "Please report your usecase to linux-mm@kvack.org if you "
3971
+ "depend on this functionality.\n");
3972
+
33703973 if (val & ~MOVE_MASK)
33713974 return -EINVAL;
33723975
....@@ -3388,6 +3991,49 @@
33883991 #endif
33893992
33903993 #ifdef CONFIG_NUMA
3994
+
3995
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3996
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3997
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3998
+
3999
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4000
+ int nid, unsigned int lru_mask, bool tree)
4001
+{
4002
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4003
+ unsigned long nr = 0;
4004
+ enum lru_list lru;
4005
+
4006
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4007
+
4008
+ for_each_lru(lru) {
4009
+ if (!(BIT(lru) & lru_mask))
4010
+ continue;
4011
+ if (tree)
4012
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4013
+ else
4014
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4015
+ }
4016
+ return nr;
4017
+}
4018
+
4019
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4020
+ unsigned int lru_mask,
4021
+ bool tree)
4022
+{
4023
+ unsigned long nr = 0;
4024
+ enum lru_list lru;
4025
+
4026
+ for_each_lru(lru) {
4027
+ if (!(BIT(lru) & lru_mask))
4028
+ continue;
4029
+ if (tree)
4030
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4031
+ else
4032
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4033
+ }
4034
+ return nr;
4035
+}
4036
+
33914037 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33924038 {
33934039 struct numa_stat {
....@@ -3403,40 +4049,60 @@
34034049 };
34044050 const struct numa_stat *stat;
34054051 int nid;
3406
- unsigned long nr;
3407
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4052
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34084053
34094054 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3410
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3411
- seq_printf(m, "%s=%lu", stat->name, nr);
3412
- for_each_node_state(nid, N_MEMORY) {
3413
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3414
- stat->lru_mask);
3415
- seq_printf(m, " N%d=%lu", nid, nr);
3416
- }
4055
+ seq_printf(m, "%s=%lu", stat->name,
4056
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4057
+ false));
4058
+ for_each_node_state(nid, N_MEMORY)
4059
+ seq_printf(m, " N%d=%lu", nid,
4060
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4061
+ stat->lru_mask, false));
34174062 seq_putc(m, '\n');
34184063 }
34194064
34204065 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3421
- struct mem_cgroup *iter;
34224066
3423
- nr = 0;
3424
- for_each_mem_cgroup_tree(iter, memcg)
3425
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3426
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3427
- for_each_node_state(nid, N_MEMORY) {
3428
- nr = 0;
3429
- for_each_mem_cgroup_tree(iter, memcg)
3430
- nr += mem_cgroup_node_nr_lru_pages(
3431
- iter, nid, stat->lru_mask);
3432
- seq_printf(m, " N%d=%lu", nid, nr);
3433
- }
4067
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4068
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4069
+ true));
4070
+ for_each_node_state(nid, N_MEMORY)
4071
+ seq_printf(m, " N%d=%lu", nid,
4072
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4073
+ stat->lru_mask, true));
34344074 seq_putc(m, '\n');
34354075 }
34364076
34374077 return 0;
34384078 }
34394079 #endif /* CONFIG_NUMA */
4080
+
4081
+static const unsigned int memcg1_stats[] = {
4082
+ NR_FILE_PAGES,
4083
+ NR_ANON_MAPPED,
4084
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4085
+ NR_ANON_THPS,
4086
+#endif
4087
+ NR_SHMEM,
4088
+ NR_FILE_MAPPED,
4089
+ NR_FILE_DIRTY,
4090
+ NR_WRITEBACK,
4091
+ MEMCG_SWAP,
4092
+};
4093
+
4094
+static const char *const memcg1_stat_names[] = {
4095
+ "cache",
4096
+ "rss",
4097
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4098
+ "rss_huge",
4099
+#endif
4100
+ "shmem",
4101
+ "mapped_file",
4102
+ "dirty",
4103
+ "writeback",
4104
+ "swap",
4105
+};
34404106
34414107 /* Universal VM events cgroup1 shows, original sort order */
34424108 static const unsigned int memcg1_events[] = {
....@@ -3446,45 +4112,42 @@
34464112 PGMAJFAULT,
34474113 };
34484114
3449
-static const char *const memcg1_event_names[] = {
3450
- "pgpgin",
3451
- "pgpgout",
3452
- "pgfault",
3453
- "pgmajfault",
3454
-};
3455
-
34564115 static int memcg_stat_show(struct seq_file *m, void *v)
34574116 {
3458
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4117
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34594118 unsigned long memory, memsw;
34604119 struct mem_cgroup *mi;
34614120 unsigned int i;
3462
- struct accumulated_stats acc;
34634121
34644122 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3465
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34664123
34674124 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4125
+ unsigned long nr;
4126
+
34684127 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34694128 continue;
3470
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3471
- memcg_page_state(memcg, memcg1_stats[i]) *
3472
- PAGE_SIZE);
4129
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4130
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4131
+ if (memcg1_stats[i] == NR_ANON_THPS)
4132
+ nr *= HPAGE_PMD_NR;
4133
+#endif
4134
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34734135 }
34744136
34754137 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3476
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3477
- memcg_sum_events(memcg, memcg1_events[i]));
4138
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4139
+ memcg_events_local(memcg, memcg1_events[i]));
34784140
34794141 for (i = 0; i < NR_LRU_LISTS; i++)
3480
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3481
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4142
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4143
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4144
+ PAGE_SIZE);
34824145
34834146 /* Hierarchical information */
34844147 memory = memsw = PAGE_COUNTER_MAX;
34854148 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3486
- memory = min(memory, mi->memory.max);
3487
- memsw = min(memsw, mi->memsw.max);
4149
+ memory = min(memory, READ_ONCE(mi->memory.max));
4150
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34884151 }
34894152 seq_printf(m, "hierarchical_memory_limit %llu\n",
34904153 (u64)memory * PAGE_SIZE);
....@@ -3492,49 +4155,45 @@
34924155 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34934156 (u64)memsw * PAGE_SIZE);
34944157
3495
- memset(&acc, 0, sizeof(acc));
3496
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3497
- acc.stats_array = memcg1_stats;
3498
- acc.events_size = ARRAY_SIZE(memcg1_events);
3499
- acc.events_array = memcg1_events;
3500
- accumulate_memcg_tree(memcg, &acc);
3501
-
35024158 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4159
+ unsigned long nr;
4160
+
35034161 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35044162 continue;
4163
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4164
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4165
+ if (memcg1_stats[i] == NR_ANON_THPS)
4166
+ nr *= HPAGE_PMD_NR;
4167
+#endif
35054168 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3506
- (u64)acc.stat[i] * PAGE_SIZE);
4169
+ (u64)nr * PAGE_SIZE);
35074170 }
35084171
35094172 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3510
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3511
- (u64)acc.events[i]);
4173
+ seq_printf(m, "total_%s %llu\n",
4174
+ vm_event_name(memcg1_events[i]),
4175
+ (u64)memcg_events(memcg, memcg1_events[i]));
35124176
35134177 for (i = 0; i < NR_LRU_LISTS; i++)
3514
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3515
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4178
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4179
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4180
+ PAGE_SIZE);
35164181
35174182 #ifdef CONFIG_DEBUG_VM
35184183 {
35194184 pg_data_t *pgdat;
35204185 struct mem_cgroup_per_node *mz;
3521
- struct zone_reclaim_stat *rstat;
3522
- unsigned long recent_rotated[2] = {0, 0};
3523
- unsigned long recent_scanned[2] = {0, 0};
4186
+ unsigned long anon_cost = 0;
4187
+ unsigned long file_cost = 0;
35244188
35254189 for_each_online_pgdat(pgdat) {
35264190 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3527
- rstat = &mz->lruvec.reclaim_stat;
35284191
3529
- recent_rotated[0] += rstat->recent_rotated[0];
3530
- recent_rotated[1] += rstat->recent_rotated[1];
3531
- recent_scanned[0] += rstat->recent_scanned[0];
3532
- recent_scanned[1] += rstat->recent_scanned[1];
4192
+ anon_cost += mz->lruvec.anon_cost;
4193
+ file_cost += mz->lruvec.file_cost;
35334194 }
3534
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3535
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3536
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3537
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4195
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4196
+ seq_printf(m, "file_cost %lu\n", file_cost);
35384197 }
35394198 #endif
35404199
....@@ -3554,7 +4213,7 @@
35544213 {
35554214 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
35564215
3557
- if (val > 100)
4216
+ if (val > 200)
35584217 return -EINVAL;
35594218
35604219 if (css->parent)
....@@ -3693,8 +4352,7 @@
36934352 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36944353
36954354 /* Allocate memory for new array of thresholds */
3696
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3697
- GFP_KERNEL);
4355
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36984356 if (!new) {
36994357 ret = -ENOMEM;
37004358 goto unlock;
....@@ -3702,17 +4360,16 @@
37024360 new->size = size;
37034361
37044362 /* Copy thresholds (if any) to new array */
3705
- if (thresholds->primary) {
3706
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3707
- sizeof(struct mem_cgroup_threshold));
3708
- }
4363
+ if (thresholds->primary)
4364
+ memcpy(new->entries, thresholds->primary->entries,
4365
+ flex_array_size(new, entries, size - 1));
37094366
37104367 /* Add new threshold */
37114368 new->entries[size - 1].eventfd = eventfd;
37124369 new->entries[size - 1].threshold = threshold;
37134370
37144371 /* Sort thresholds. Registering of new threshold isn't time-critical */
3715
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4372
+ sort(new->entries, size, sizeof(*new->entries),
37164373 compare_thresholds, NULL);
37174374
37184375 /* Find current threshold */
....@@ -3894,7 +4551,7 @@
38944551
38954552 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38964553 {
3897
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4554
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38984555
38994556 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
39004557 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3920,6 +4577,8 @@
39204577 }
39214578
39224579 #ifdef CONFIG_CGROUP_WRITEBACK
4580
+
4581
+#include <trace/events/writeback.h>
39234582
39244583 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39254584 {
....@@ -3952,11 +4611,11 @@
39524611 */
39534612 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39544613 {
3955
- long x = atomic_long_read(&memcg->stat[idx]);
4614
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39564615 int cpu;
39574616
39584617 for_each_online_cpu(cpu)
3959
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4618
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39604619 if (x < 0)
39614620 x = 0;
39624621 return x;
....@@ -3989,18 +4648,142 @@
39894648
39904649 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39914650
3992
- /* this should eventually include NR_UNSTABLE_NFS */
39934651 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3994
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3995
- (1 << LRU_ACTIVE_FILE));
4652
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4653
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39964654 *pheadroom = PAGE_COUNTER_MAX;
39974655
39984656 while ((parent = parent_mem_cgroup(memcg))) {
3999
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4657
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4658
+ READ_ONCE(memcg->memory.high));
40004659 unsigned long used = page_counter_read(&memcg->memory);
40014660
40024661 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40034662 memcg = parent;
4663
+ }
4664
+}
4665
+
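The headroom walk above can be read as: each ancestor contributes a ceiling of min(memory.max, memory.high), and writeback sees the smallest remaining gap anywhere on the path to the root, clamped at zero. A numeric sketch of that reduction (the values are made up, in pages):

#include <stdio.h>

struct level { unsigned long max, high, used; };

static unsigned long wb_headroom(const struct level *lv, int depth)
{
	unsigned long headroom = ~0UL;	/* PAGE_COUNTER_MAX stand-in */

	for (int i = 0; i < depth; i++) {
		unsigned long ceiling = lv[i].max < lv[i].high ? lv[i].max : lv[i].high;
		unsigned long used = lv[i].used < ceiling ? lv[i].used : ceiling;

		/* Keep the tightest "ceiling - used" seen on the path. */
		if (ceiling - used < headroom)
			headroom = ceiling - used;
	}
	return headroom;
}

int main(void)
{
	/* A tight parent limits the child even though the child has room. */
	struct level path[] = {
		{ .max = 1000, .high = 800, .used = 700 },	/* child: 100 left */
		{ .max =  750, .high = 750, .used = 740 },	/* parent: 10 left */
	};

	printf("headroom = %lu pages\n", wb_headroom(path, 2));
	return 0;
}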
4666
+/*
4667
+ * Foreign dirty flushing
4668
+ *
4669
+ * There's an inherent mismatch between memcg and writeback. The former
4670
+ * tracks ownership per-page while the latter per-inode. This was a
4671
+ * deliberate design decision because honoring per-page ownership in the
4672
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4673
+ * and deemed unnecessary given that write-sharing an inode across
4674
+ * different cgroups isn't a common use-case.
4675
+ *
4676
+ * Combined with inode majority-writer ownership switching, this works well
4677
+ * enough in most cases but there are some pathological cases. For
4678
+ * example, let's say there are two cgroups A and B which keep writing to
4679
+ * different but confined parts of the same inode. B owns the inode and
4680
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4681
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4682
+ * triggering background writeback. A will be slowed down without a way to
4683
+ * make writeback of the dirty pages happen.
4684
+ *
4685
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4686
+ * severely throttled after making some progress after each
4687
+ * dirty_expire_interval while the underlying IO device is almost
4688
+ * completely idle.
4689
+ *
4690
+ * Solving this problem completely requires matching the ownership tracking
4691
+ * granularities between memcg and writeback in either direction. However,
4692
+ * the more egregious behaviors can be avoided by simply remembering the
4693
+ * most recent foreign dirtying events and initiating remote flushes on
4694
+ * them when local writeback isn't enough to keep the memory clean enough.
4695
+ *
4696
+ * The following two functions implement such a mechanism. When a foreign
4697
+ * page - a page whose memcg and writeback ownerships don't match - is
4698
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4699
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4700
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4701
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4702
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4703
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4704
+ * limited to MEMCG_CGWB_FRN_CNT.
4705
+ *
4706
+ * The mechanism only remembers IDs and doesn't hold any object references.
4707
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4708
+ * records are lockless and racy.
4709
+ */
4710
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4711
+ struct bdi_writeback *wb)
4712
+{
4713
+ struct mem_cgroup *memcg = page->mem_cgroup;
4714
+ struct memcg_cgwb_frn *frn;
4715
+ u64 now = get_jiffies_64();
4716
+ u64 oldest_at = now;
4717
+ int oldest = -1;
4718
+ int i;
4719
+
4720
+ trace_track_foreign_dirty(page, wb);
4721
+
4722
+ /*
4723
+ * Pick the slot to use. If there is already a slot for @wb, keep
4724
+ * using it. If not, replace the oldest one which isn't being
4725
+ * written out.
4726
+ */
4727
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4728
+ frn = &memcg->cgwb_frn[i];
4729
+ if (frn->bdi_id == wb->bdi->id &&
4730
+ frn->memcg_id == wb->memcg_css->id)
4731
+ break;
4732
+ if (time_before64(frn->at, oldest_at) &&
4733
+ atomic_read(&frn->done.cnt) == 1) {
4734
+ oldest = i;
4735
+ oldest_at = frn->at;
4736
+ }
4737
+ }
4738
+
4739
+ if (i < MEMCG_CGWB_FRN_CNT) {
4740
+ /*
4741
+ * Re-using an existing one. Update timestamp lazily to
4742
+ * avoid making the cacheline hot. We want them to be
4743
+ * reasonably up-to-date and significantly shorter than
4744
+ * dirty_expire_interval as that's what expires the record.
4745
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4746
+ */
4747
+ unsigned long update_intv =
4748
+ min_t(unsigned long, HZ,
4749
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4750
+
4751
+ if (time_before64(frn->at, now - update_intv))
4752
+ frn->at = now;
4753
+ } else if (oldest >= 0) {
4754
+ /* replace the oldest free one */
4755
+ frn = &memcg->cgwb_frn[oldest];
4756
+ frn->bdi_id = wb->bdi->id;
4757
+ frn->memcg_id = wb->memcg_css->id;
4758
+ frn->at = now;
4759
+ }
4760
+}
4761
+
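The slot selection above amounts to a small fixed-size cache of recent foreign (bdi, memcg) pairs: reuse a matching slot, otherwise evict the oldest slot that has no flush in flight, and silently drop the event if every slot is busy, since being occasionally wrong is cheap here. A toy model of that policy, simplified and without the lazy timestamp update or jiffies wraparound handling:

#include <stdio.h>
#include <string.h>

#define NR_SLOTS 4

struct slot {
	int bdi_id, memcg_id;
	unsigned long at;	/* last time this pair dirtied a foreign page */
	int in_flight;		/* a flush is already running, don't reuse */
};

static void record(struct slot *s, int bdi_id, int memcg_id, unsigned long now)
{
	int oldest = -1, i;
	unsigned long oldest_at = now;

	for (i = 0; i < NR_SLOTS; i++) {
		if (s[i].bdi_id == bdi_id && s[i].memcg_id == memcg_id)
			break;				/* already tracked, reuse */
		if (s[i].at < oldest_at && !s[i].in_flight) {
			oldest = i;
			oldest_at = s[i].at;
		}
	}
	if (i < NR_SLOTS)
		s[i].at = now;				/* refresh the timestamp */
	else if (oldest >= 0)
		s[oldest] = (struct slot){ bdi_id, memcg_id, now, 0 };
	/* else: every slot is busy, drop the event */
}

int main(void)
{
	struct slot slots[NR_SLOTS];

	memset(slots, 0, sizeof(slots));
	record(slots, 8, 3, 100);	/* new pair -> takes an idle slot */
	record(slots, 8, 3, 150);	/* same pair -> timestamp refreshed */
	record(slots, 9, 5, 160);	/* different pair -> another slot */

	for (int i = 0; i < NR_SLOTS; i++)
		printf("slot %d: bdi=%d memcg=%d at=%lu\n",
		       i, slots[i].bdi_id, slots[i].memcg_id, slots[i].at);
	return 0;
}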
4762
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4763
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4764
+{
4765
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4766
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4767
+ u64 now = jiffies_64;
4768
+ int i;
4769
+
4770
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4771
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4772
+
4773
+ /*
4774
+ * If the record is older than dirty_expire_interval,
4775
+ * writeback on it has already started. No need to kick it
4776
+ * off again. Also, don't start a new one if there's
4777
+ * already one in flight.
4778
+ */
4779
+ if (time_after64(frn->at, now - intv) &&
4780
+ atomic_read(&frn->done.cnt) == 1) {
4781
+ frn->at = 0;
4782
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4783
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4784
+ WB_REASON_FOREIGN_FLUSH,
4785
+ &frn->done);
4786
+ }
40044787 }
40054788 }
40064789
....@@ -4123,6 +4906,7 @@
41234906 unsigned int efd, cfd;
41244907 struct fd efile;
41254908 struct fd cfile;
4909
+ struct dentry *cdentry;
41264910 const char *name;
41274911 char *endp;
41284912 int ret;
....@@ -4174,6 +4958,16 @@
41744958 goto out_put_cfile;
41754959
41764960 /*
4961
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4962
+ * file can't be renamed, it's safe to access its name afterwards.
4963
+ */
4964
+ cdentry = cfile.file->f_path.dentry;
4965
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4966
+ ret = -EINVAL;
4967
+ goto out_put_cfile;
4968
+ }
4969
+
4970
+ /*
41774971 * Determine the event callbacks and set them in @event. This used
41784972 * to be done via struct cftype but cgroup core no longer knows
41794973 * about these events. The following is crude but the whole thing
....@@ -4181,7 +4975,7 @@
41814975 *
41824976 * DO NOT ADD NEW FILES.
41834977 */
4184
- name = cfile.file->f_path.dentry->d_name.name;
4978
+ name = cdentry->d_name.name;
41854979
41864980 if (!strcmp(name, "memory.usage_in_bytes")) {
41874981 event->register_event = mem_cgroup_usage_register_event;
....@@ -4205,7 +4999,7 @@
42054999 * automatically removed on cgroup destruction but the removal is
42065000 * asynchronous, so take an extra ref on @css.
42075001 */
4208
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5002
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42095003 &memory_cgrp_subsys);
42105004 ret = -EINVAL;
42115005 if (IS_ERR(cfile_css))
....@@ -4340,12 +5134,10 @@
43405134 .write = mem_cgroup_reset,
43415135 .read_u64 = mem_cgroup_read_u64,
43425136 },
4343
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5137
+#if defined(CONFIG_MEMCG_KMEM) && \
5138
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43445139 {
43455140 .name = "kmem.slabinfo",
4346
- .seq_start = memcg_slab_start,
4347
- .seq_next = memcg_slab_next,
4348
- .seq_stop = memcg_slab_stop,
43495141 .seq_show = memcg_slab_show,
43505142 },
43515143 #endif
....@@ -4383,7 +5175,7 @@
43835175 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43845176 * memory-controlled cgroups to 64k.
43855177 *
4386
- * However, there usually are many references to the oflline CSS after
5178
+ * However, there usually are many references to the offline CSS after
43875179 * the cgroup has been destroyed, such as page cache or reclaimable
43885180 * slab objects, that don't need to hang on to the ID. We want to keep
43895181 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4404,31 +5196,26 @@
44045196 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44055197 {
44065198 if (memcg->id.id > 0) {
5199
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44075200 idr_remove(&mem_cgroup_idr, memcg->id.id);
44085201 memcg->id.id = 0;
44095202 }
44105203 }
44115204
4412
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5205
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5206
+ unsigned int n)
44135207 {
4414
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4415
- atomic_add(n, &memcg->id.ref);
5208
+ refcount_add(n, &memcg->id.ref);
44165209 }
44175210
44185211 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44195212 {
4420
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4421
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5213
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44225214 mem_cgroup_id_remove(memcg);
44235215
44245216 /* Memcg ID pins CSS */
44255217 css_put(&memcg->css);
44265218 }
4427
-}
4428
-
4429
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4430
-{
4431
- mem_cgroup_id_get_many(memcg, 1);
44325219 }
44335220
44345221 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4447,6 +5234,7 @@
44475234 WARN_ON_ONCE(!rcu_read_lock_held());
44485235 return idr_find(&mem_cgroup_idr, id);
44495236 }
5237
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44505238
44515239 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44525240 {
....@@ -4466,8 +5254,17 @@
44665254 if (!pn)
44675255 return 1;
44685256
4469
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5257
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5258
+ GFP_KERNEL_ACCOUNT);
5259
+ if (!pn->lruvec_stat_local) {
5260
+ kfree(pn);
5261
+ return 1;
5262
+ }
5263
+
5264
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5265
+ GFP_KERNEL_ACCOUNT);
44705266 if (!pn->lruvec_stat_cpu) {
5267
+ free_percpu(pn->lruvec_stat_local);
44715268 kfree(pn);
44725269 return 1;
44735270 }
....@@ -4489,6 +5286,7 @@
44895286 return;
44905287
44915288 free_percpu(pn->lruvec_stat_cpu);
5289
+ free_percpu(pn->lruvec_stat_local);
44925290 kfree(pn);
44935291 }
44945292
....@@ -4496,39 +5294,57 @@
44965294 {
44975295 int node;
44985296
5297
+ trace_android_vh_mem_cgroup_free(memcg);
44995298 for_each_node(node)
45005299 free_mem_cgroup_per_node_info(memcg, node);
4501
- free_percpu(memcg->stat_cpu);
5300
+ free_percpu(memcg->vmstats_percpu);
5301
+ free_percpu(memcg->vmstats_local);
45025302 kfree(memcg);
45035303 }
45045304
45055305 static void mem_cgroup_free(struct mem_cgroup *memcg)
45065306 {
45075307 memcg_wb_domain_exit(memcg);
5308
+ /*
5309
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5310
+ * on parent's and all ancestor levels.
5311
+ */
5312
+ memcg_flush_percpu_vmstats(memcg);
5313
+ memcg_flush_percpu_vmevents(memcg);
45085314 __mem_cgroup_free(memcg);
45095315 }
45105316
45115317 static struct mem_cgroup *mem_cgroup_alloc(void)
45125318 {
45135319 struct mem_cgroup *memcg;
4514
- size_t size;
5320
+ unsigned int size;
45155321 int node;
5322
+ int __maybe_unused i;
5323
+ long error = -ENOMEM;
45165324
45175325 size = sizeof(struct mem_cgroup);
45185326 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45195327
45205328 memcg = kzalloc(size, GFP_KERNEL);
45215329 if (!memcg)
4522
- return NULL;
5330
+ return ERR_PTR(error);
45235331
45245332 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45255333 1, MEM_CGROUP_ID_MAX,
45265334 GFP_KERNEL);
4527
- if (memcg->id.id < 0)
5335
+ if (memcg->id.id < 0) {
5336
+ error = memcg->id.id;
5337
+ goto fail;
5338
+ }
5339
+
5340
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5341
+ GFP_KERNEL_ACCOUNT);
5342
+ if (!memcg->vmstats_local)
45285343 goto fail;
45295344
4530
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4531
- if (!memcg->stat_cpu)
5345
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5346
+ GFP_KERNEL_ACCOUNT);
5347
+ if (!memcg->vmstats_percpu)
45325348 goto fail;
45335349
45345350 for_each_node(node)
....@@ -4539,7 +5355,6 @@
45395355 goto fail;
45405356
45415357 INIT_WORK(&memcg->high_work, high_work_func);
4542
- memcg->last_scanned_node = MAX_NUMNODES;
45435358 INIT_LIST_HEAD(&memcg->oom_notify);
45445359 mutex_init(&memcg->thresholds_lock);
45455360 spin_lock_init(&memcg->move_lock);
....@@ -4549,48 +5364,64 @@
45495364 memcg->socket_pressure = jiffies;
45505365 #ifdef CONFIG_MEMCG_KMEM
45515366 memcg->kmemcg_id = -1;
5367
+ INIT_LIST_HEAD(&memcg->objcg_list);
45525368 #endif
45535369 #ifdef CONFIG_CGROUP_WRITEBACK
45545370 INIT_LIST_HEAD(&memcg->cgwb_list);
5371
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5372
+ memcg->cgwb_frn[i].done =
5373
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5374
+#endif
5375
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5376
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5377
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5378
+ memcg->deferred_split_queue.split_queue_len = 0;
45555379 #endif
45565380 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5381
+ trace_android_vh_mem_cgroup_alloc(memcg);
45575382 return memcg;
45585383 fail:
45595384 mem_cgroup_id_remove(memcg);
45605385 __mem_cgroup_free(memcg);
4561
- return NULL;
5386
+ return ERR_PTR(error);
45625387 }
45635388
45645389 static struct cgroup_subsys_state * __ref
45655390 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45665391 {
45675392 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4568
- struct mem_cgroup *memcg;
5393
+ struct mem_cgroup *memcg, *old_memcg;
45695394 long error = -ENOMEM;
45705395
5396
+ old_memcg = set_active_memcg(parent);
45715397 memcg = mem_cgroup_alloc();
4572
- if (!memcg)
4573
- return ERR_PTR(error);
5398
+ set_active_memcg(old_memcg);
5399
+ if (IS_ERR(memcg))
5400
+ return ERR_CAST(memcg);
45745401
4575
- memcg->high = PAGE_COUNTER_MAX;
5402
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45765403 memcg->soft_limit = PAGE_COUNTER_MAX;
5404
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45775405 if (parent) {
45785406 memcg->swappiness = mem_cgroup_swappiness(parent);
45795407 memcg->oom_kill_disable = parent->oom_kill_disable;
45805408 }
4581
- if (parent && parent->use_hierarchy) {
5409
+ if (!parent) {
5410
+ page_counter_init(&memcg->memory, NULL);
5411
+ page_counter_init(&memcg->swap, NULL);
5412
+ page_counter_init(&memcg->kmem, NULL);
5413
+ page_counter_init(&memcg->tcpmem, NULL);
5414
+ } else if (parent->use_hierarchy) {
45825415 memcg->use_hierarchy = true;
45835416 page_counter_init(&memcg->memory, &parent->memory);
45845417 page_counter_init(&memcg->swap, &parent->swap);
4585
- page_counter_init(&memcg->memsw, &parent->memsw);
45865418 page_counter_init(&memcg->kmem, &parent->kmem);
45875419 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45885420 } else {
4589
- page_counter_init(&memcg->memory, NULL);
4590
- page_counter_init(&memcg->swap, NULL);
4591
- page_counter_init(&memcg->memsw, NULL);
4592
- page_counter_init(&memcg->kmem, NULL);
4593
- page_counter_init(&memcg->tcpmem, NULL);
5421
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5422
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5423
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5424
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45945425 /*
45955426 * Deeper hierarchy with use_hierarchy == false doesn't make
45965427 * much sense so let cgroup subsystem know about this
....@@ -4617,7 +5448,7 @@
46175448 fail:
46185449 mem_cgroup_id_remove(memcg);
46195450 mem_cgroup_free(memcg);
4620
- return ERR_PTR(-ENOMEM);
5451
+ return ERR_PTR(error);
46215452 }
46225453
46235454 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4635,8 +5466,9 @@
46355466 }
46365467
46375468 /* Online state pins memcg ID, memcg ID pins CSS */
4638
- atomic_set(&memcg->id.ref, 1);
5469
+ refcount_set(&memcg->id.ref, 1);
46395470 css_get(css);
5471
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46405472 return 0;
46415473 }
46425474
....@@ -4645,6 +5477,7 @@
46455477 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46465478 struct mem_cgroup_event *event, *tmp;
46475479
5480
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46485481 /*
46495482 * Unregister events and notify userspace.
46505483 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4663,6 +5496,8 @@
46635496 memcg_offline_kmem(memcg);
46645497 wb_memcg_offline(memcg);
46655498
5499
+ drain_all_stock(memcg);
5500
+
46665501 mem_cgroup_id_put(memcg);
46675502 }
46685503
....@@ -4676,7 +5511,12 @@
46765511 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46775512 {
46785513 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5514
+ int __maybe_unused i;
46795515
5516
+#ifdef CONFIG_CGROUP_WRITEBACK
5517
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5518
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5519
+#endif
46805520 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46815521 static_branch_dec(&memcg_sockets_enabled_key);
46825522
....@@ -4710,13 +5550,13 @@
47105550
47115551 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47125552 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4713
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47145553 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47155554 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47165555 page_counter_set_min(&memcg->memory, 0);
47175556 page_counter_set_low(&memcg->memory, 0);
4718
- memcg->high = PAGE_COUNTER_MAX;
5557
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47195558 memcg->soft_limit = PAGE_COUNTER_MAX;
5559
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47205560 memcg_wb_domain_size_changed(memcg);
47215561 }
47225562
....@@ -4759,7 +5599,7 @@
47595599 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47605600 unsigned long addr, pte_t ptent)
47615601 {
4762
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5602
+ struct page *page = vm_normal_page(vma, addr, ptent);
47635603
47645604 if (!page || !page_mapped(page))
47655605 return NULL;
....@@ -4810,8 +5650,7 @@
48105650 * we call find_get_page() with swapper_space directly.
48115651 */
48125652 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4813
- if (do_memsw_account())
4814
- entry->val = ent.val;
5653
+ entry->val = ent.val;
48155654
48165655 return page;
48175656 }
....@@ -4826,36 +5665,15 @@
48265665 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48275666 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48285667 {
4829
- struct page *page = NULL;
4830
- struct address_space *mapping;
4831
- pgoff_t pgoff;
4832
-
48335668 if (!vma->vm_file) /* anonymous vma */
48345669 return NULL;
48355670 if (!(mc.flags & MOVE_FILE))
48365671 return NULL;
48375672
4838
- mapping = vma->vm_file->f_mapping;
4839
- pgoff = linear_page_index(vma, addr);
4840
-
48415673 /* page is moved even if it's not RSS of this task(page-faulted). */
4842
-#ifdef CONFIG_SWAP
48435674 /* shmem/tmpfs may report page out on swap: account for that too. */
4844
- if (shmem_mapping(mapping)) {
4845
- page = find_get_entry(mapping, pgoff);
4846
- if (radix_tree_exceptional_entry(page)) {
4847
- swp_entry_t swp = radix_to_swp_entry(page);
4848
- if (do_memsw_account())
4849
- *entry = swp;
4850
- page = find_get_page(swap_address_space(swp),
4851
- swp_offset(swp));
4852
- }
4853
- } else
4854
- page = find_get_page(mapping, pgoff);
4855
-#else
4856
- page = find_get_page(mapping, pgoff);
4857
-#endif
4858
- return page;
5675
+ return find_get_incore_page(vma->vm_file->f_mapping,
5676
+ linear_page_index(vma, addr));
48595677 }
48605678
48615679 /**
....@@ -4875,10 +5693,10 @@
48755693 struct mem_cgroup *from,
48765694 struct mem_cgroup *to)
48775695 {
4878
- unsigned long flags;
4879
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5696
+ struct lruvec *from_vec, *to_vec;
5697
+ struct pglist_data *pgdat;
5698
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48805699 int ret;
4881
- bool anon;
48825700
48835701 VM_BUG_ON(from == to);
48845702 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4896,52 +5714,83 @@
48965714 if (page->mem_cgroup != from)
48975715 goto out_unlock;
48985716
4899
- anon = PageAnon(page);
5717
+ pgdat = page_pgdat(page);
5718
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5719
+ to_vec = mem_cgroup_lruvec(to, pgdat);
49005720
4901
- spin_lock_irqsave(&from->move_lock, flags);
5721
+ lock_page_memcg(page);
49025722
4903
- if (!anon && page_mapped(page)) {
4904
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4905
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4906
- }
5723
+ if (PageAnon(page)) {
5724
+ if (page_mapped(page)) {
5725
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5726
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5727
+ if (PageTransHuge(page)) {
5728
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5729
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5730
+ }
49075731
4908
- /*
4909
- * move_lock grabbed above and caller set from->moving_account, so
4910
- * mod_memcg_page_state will serialize updates to PageDirty.
4911
- * So mapping should be stable for dirty pages.
4912
- */
4913
- if (!anon && PageDirty(page)) {
4914
- struct address_space *mapping = page_mapping(page);
5732
+ }
5733
+ } else {
5734
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5735
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49155736
4916
- if (mapping_cap_account_dirty(mapping)) {
4917
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4918
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5737
+ if (PageSwapBacked(page)) {
5738
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5740
+ }
5741
+
5742
+ if (page_mapped(page)) {
5743
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5744
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5745
+ }
5746
+
5747
+ if (PageDirty(page)) {
5748
+ struct address_space *mapping = page_mapping(page);
5749
+
5750
+ if (mapping_can_writeback(mapping)) {
5751
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5752
+ -nr_pages);
5753
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5754
+ nr_pages);
5755
+ }
49195756 }
49205757 }
49215758
49225759 if (PageWriteback(page)) {
4923
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4924
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5760
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5761
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49255762 }
49265763
49275764 /*
5765
+ * All state has been migrated, let's switch to the new memcg.
5766
+ *
49285767 * It is safe to change page->mem_cgroup here because the page
4929
- * is referenced, charged, and isolated - we can't race with
4930
- * uncharging, charging, migration, or LRU putback.
5768
+ * is referenced, charged, isolated, and locked: we can't race
5769
+ * with (un)charging, migration, LRU putback, or anything else
5770
+ * that would rely on a stable page->mem_cgroup.
5771
+ *
5772
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5773
+ * to save space. As soon as we switch page->mem_cgroup to a
5774
+ * new memcg that isn't locked, the above state can change
5775
+ * concurrently again. Make sure we're truly done with it.
49315776 */
5777
+ smp_mb();
49325778
4933
- /* caller should have done css_get */
5779
+ css_get(&to->css);
5780
+ css_put(&from->css);
5781
+
49345782 page->mem_cgroup = to;
4935
- spin_unlock_irqrestore(&from->move_lock, flags);
5783
+
5784
+ __unlock_page_memcg(from);
49365785
49375786 ret = 0;
49385787
4939
- local_lock_irq(event_lock);
4940
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5788
+ local_irq_disable();
5789
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49415790 memcg_check_events(to, page);
4942
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5791
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49435792 memcg_check_events(from, page);
4944
- local_unlock_irq(event_lock);
5793
+ local_irq_enable();
49455794 out_unlock:
49465795 unlock_page(page);
49475796 out:
....@@ -4963,8 +5812,8 @@
49635812 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49645813 * target for charge migration. if @target is not NULL, the entry is stored
49655814 * in target->ent.
4966
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4967
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5815
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5816
+ * (so ZONE_DEVICE page and thus not on the lru).
49685817 * For now such a page is charged like a regular page would be, as for all
49695818 * intents and purposes it is just special memory taking the place of a
49705819 * regular page.
....@@ -4998,8 +5847,7 @@
49985847 */
49995848 if (page->mem_cgroup == mc.from) {
50005849 ret = MC_TARGET_PAGE;
5001
- if (is_device_private_page(page) ||
5002
- is_device_public_page(page))
5850
+ if (is_device_private_page(page))
50035851 ret = MC_TARGET_DEVICE;
50045852 if (target)
50055853 target->page = page;
....@@ -5070,8 +5918,8 @@
50705918 if (ptl) {
50715919 /*
50725920 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5073
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5074
- * MEMORY_DEVICE_PRIVATE but this might change.
5921
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5922
+ * this might change.
50755923 */
50765924 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50775925 mc.precharge += HPAGE_PMD_NR;
....@@ -5091,18 +5939,17 @@
50915939 return 0;
50925940 }
50935941
5942
+static const struct mm_walk_ops precharge_walk_ops = {
5943
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5944
+};
5945
+
50945946 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50955947 {
50965948 unsigned long precharge;
50975949
5098
- struct mm_walk mem_cgroup_count_precharge_walk = {
5099
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5100
- .mm = mm,
5101
- };
5102
- down_read(&mm->mmap_sem);
5103
- walk_page_range(0, mm->highest_vm_end,
5104
- &mem_cgroup_count_precharge_walk);
5105
- up_read(&mm->mmap_sem);
5950
+ mmap_read_lock(mm);
5951
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5952
+ mmap_read_unlock(mm);
51065953
51075954 precharge = mc.precharge;
51085955 mc.precharge = 0;
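This hunk converts from the old on-stack struct mm_walk to the mm_walk_ops interface from <linux/pagewalk.h>: the callbacks live in a const ops table, while the mm, the range and a private cookie are passed to walk_page_range() under mmap_lock. A stand-alone sketch of that pattern (the callback and data names are invented):

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	unsigned long *count = walk->private;	/* cookie handed to walk_page_range() */

	*count += (end - addr) >> PAGE_SHIFT;	/* count pages spanned by this pmd */
	return 0;				/* 0 means keep walking */
}

static const struct mm_walk_ops count_walk_ops = {
	.pmd_entry = count_pmd_entry,
};

static unsigned long count_address_space(struct mm_struct *mm)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, 0, mm->highest_vm_end, &count_walk_ops, &count);
	mmap_read_unlock(mm);
	return count;
}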
....@@ -5152,8 +5999,6 @@
51525999 */
51536000 if (!mem_cgroup_is_root(mc.to))
51546001 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5155
-
5156
- css_put_many(&mc.to->css, mc.moved_swap);
51576002
51586003 mc.moved_swap = 0;
51596004 }
....@@ -5315,7 +6160,7 @@
53156160 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53166161 case MC_TARGET_DEVICE:
53176162 device = true;
5318
- /* fall through */
6163
+ fallthrough;
53196164 case MC_TARGET_PAGE:
53206165 page = target.page;
53216166 /*
....@@ -5370,13 +6215,12 @@
53706215 return ret;
53716216 }
53726217
6218
+static const struct mm_walk_ops charge_walk_ops = {
6219
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6220
+};
6221
+
53736222 static void mem_cgroup_move_charge(void)
53746223 {
5375
- struct mm_walk mem_cgroup_move_charge_walk = {
5376
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5377
- .mm = mc.mm,
5378
- };
5379
-
53806224 lru_add_drain_all();
53816225 /*
53826226 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5386,9 +6230,9 @@
53866230 atomic_inc(&mc.from->moving_account);
53876231 synchronize_rcu();
53886232 retry:
5389
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6233
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53906234 /*
5391
- * Someone who are holding the mmap_sem might be waiting in
6235
+ * Someone who is holding the mmap_lock might be waiting in
53926236 * waitq. So we cancel all extra charges, wake up all waiters,
53936237 * and retry. Because we cancel precharges, we might not be able
53946238 * to move enough charges, but moving charge is a best-effort
....@@ -5402,9 +6246,10 @@
54026246 * When we have consumed all precharges and failed in doing
54036247 * additional charge, the page walk just aborts.
54046248 */
5405
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6249
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6250
+ NULL);
54066251
5407
- up_read(&mc.mm->mmap_sem);
6252
+ mmap_read_unlock(mc.mm);
54086253 atomic_dec(&mc.from->moving_account);
54096254 }
54106255
....@@ -5446,6 +6291,16 @@
54466291 root_mem_cgroup->use_hierarchy = false;
54476292 }
54486293
6294
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6295
+{
6296
+ if (value == PAGE_COUNTER_MAX)
6297
+ seq_puts(m, "max\n");
6298
+ else
6299
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6300
+
6301
+ return 0;
6302
+}
6303
+
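A worked example of the new helper: page_counter values are kept in pages, so with a 4 KiB PAGE_SIZE a memory.low of 25600 pages is shown as 25600 * 4096 = 104857600 (bytes), while a limit left at PAGE_COUNTER_MAX prints the literal string "max". The memory.min, memory.low, memory.high and memory.max show handlers below all funnel through this helper.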
54496304 static u64 memory_current_read(struct cgroup_subsys_state *css,
54506305 struct cftype *cft)
54516306 {
....@@ -5456,15 +6311,8 @@
54566311
54576312 static int memory_min_show(struct seq_file *m, void *v)
54586313 {
5459
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5460
- unsigned long min = READ_ONCE(memcg->memory.min);
5461
-
5462
- if (min == PAGE_COUNTER_MAX)
5463
- seq_puts(m, "max\n");
5464
- else
5465
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5466
-
5467
- return 0;
6314
+ return seq_puts_memcg_tunable(m,
6315
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54686316 }
54696317
54706318 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5486,15 +6334,8 @@
54866334
54876335 static int memory_low_show(struct seq_file *m, void *v)
54886336 {
5489
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5490
- unsigned long low = READ_ONCE(memcg->memory.low);
5491
-
5492
- if (low == PAGE_COUNTER_MAX)
5493
- seq_puts(m, "max\n");
5494
- else
5495
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5496
-
5497
- return 0;
6337
+ return seq_puts_memcg_tunable(m,
6338
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54986339 }
54996340
55006341 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5516,22 +6357,16 @@
55166357
55176358 static int memory_high_show(struct seq_file *m, void *v)
55186359 {
5519
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5520
- unsigned long high = READ_ONCE(memcg->high);
5521
-
5522
- if (high == PAGE_COUNTER_MAX)
5523
- seq_puts(m, "max\n");
5524
- else
5525
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5526
-
5527
- return 0;
6360
+ return seq_puts_memcg_tunable(m,
6361
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55286362 }
55296363
55306364 static ssize_t memory_high_write(struct kernfs_open_file *of,
55316365 char *buf, size_t nbytes, loff_t off)
55326366 {
55336367 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5534
- unsigned long nr_pages;
6368
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6369
+ bool drained = false;
55356370 unsigned long high;
55366371 int err;
55376372
....@@ -5540,12 +6375,30 @@
55406375 if (err)
55416376 return err;
55426377
5543
- memcg->high = high;
6378
+ page_counter_set_high(&memcg->memory, high);
55446379
5545
- nr_pages = page_counter_read(&memcg->memory);
5546
- if (nr_pages > high)
5547
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5548
- GFP_KERNEL, true);
6380
+ for (;;) {
6381
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6382
+ unsigned long reclaimed;
6383
+
6384
+ if (nr_pages <= high)
6385
+ break;
6386
+
6387
+ if (signal_pending(current))
6388
+ break;
6389
+
6390
+ if (!drained) {
6391
+ drain_all_stock(memcg);
6392
+ drained = true;
6393
+ continue;
6394
+ }
6395
+
6396
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6397
+ GFP_KERNEL, true);
6398
+
6399
+ if (!reclaimed && !nr_retries--)
6400
+ break;
6401
+ }
55496402
55506403 memcg_wb_domain_size_changed(memcg);
55516404 return nbytes;
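With this change, writing memory.high becomes synchronous best-effort reclaim: the kernel drains the per-CPU charge stocks once, then loops over try_to_free_mem_cgroup_pages() until usage drops below the new high, a signal arrives, or MAX_RECLAIM_RETRIES unproductive passes have elapsed. A userspace-side sketch of driving it (the cgroupfs path is illustrative only):

#include <stdio.h>

static int set_memory_high(const char *cgroup_path, unsigned long long bytes)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.high", cgroup_path);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the file also accepts "max" to remove the throttling threshold */
	fprintf(f, "%llu\n", bytes);
	return fclose(f) ? -1 : 0;	/* buffered write errors surface at fclose() */
}

/* e.g. set_memory_high("/sys/fs/cgroup/workload", 512ULL << 20); */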
....@@ -5553,22 +6406,15 @@
55536406
55546407 static int memory_max_show(struct seq_file *m, void *v)
55556408 {
5556
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5557
- unsigned long max = READ_ONCE(memcg->memory.max);
5558
-
5559
- if (max == PAGE_COUNTER_MAX)
5560
- seq_puts(m, "max\n");
5561
- else
5562
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5563
-
5564
- return 0;
6409
+ return seq_puts_memcg_tunable(m,
6410
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55656411 }
55666412
55676413 static ssize_t memory_max_write(struct kernfs_open_file *of,
55686414 char *buf, size_t nbytes, loff_t off)
55696415 {
55706416 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5571
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6417
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55726418 bool drained = false;
55736419 unsigned long max;
55746420 int err;
....@@ -5586,10 +6432,8 @@
55866432 if (nr_pages <= max)
55876433 break;
55886434
5589
- if (signal_pending(current)) {
5590
- err = -EINTR;
6435
+ if (signal_pending(current))
55916436 break;
5592
- }
55936437
55946438 if (!drained) {
55956439 drain_all_stock(memcg);
....@@ -5613,104 +6457,77 @@
56136457 return nbytes;
56146458 }
56156459
6460
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6461
+{
6462
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6463
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6464
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6465
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6466
+ seq_printf(m, "oom_kill %lu\n",
6467
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6468
+}
6469
+
56166470 static int memory_events_show(struct seq_file *m, void *v)
56176471 {
5618
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6472
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56196473
5620
- seq_printf(m, "low %lu\n",
5621
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5622
- seq_printf(m, "high %lu\n",
5623
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5624
- seq_printf(m, "max %lu\n",
5625
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5626
- seq_printf(m, "oom %lu\n",
5627
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5628
- seq_printf(m, "oom_kill %lu\n",
5629
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6474
+ __memory_events_show(m, memcg->memory_events);
6475
+ return 0;
6476
+}
56306477
6478
+static int memory_events_local_show(struct seq_file *m, void *v)
6479
+{
6480
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6481
+
6482
+ __memory_events_show(m, memcg->memory_events_local);
56316483 return 0;
56326484 }
56336485
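Both memory.events and the new memory.events.local print the same five counters, one "name value" pair per line; the .local file counts only events that hit this cgroup itself rather than anywhere in its subtree. Purely illustrative output:

low 0
high 1342
max 7
oom 1
oom_kill 0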
56346486 static int memory_stat_show(struct seq_file *m, void *v)
56356487 {
5636
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5637
- struct accumulated_stats acc;
5638
- int i;
6488
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6489
+ char *buf;
56396490
5640
- /*
5641
- * Provide statistics on the state of the memory subsystem as
5642
- * well as cumulative event counters that show past behavior.
5643
- *
5644
- * This list is ordered following a combination of these gradients:
5645
- * 1) generic big picture -> specifics and details
5646
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5647
- *
5648
- * Current memory state:
5649
- */
5650
-
5651
- memset(&acc, 0, sizeof(acc));
5652
- acc.stats_size = MEMCG_NR_STAT;
5653
- acc.events_size = NR_VM_EVENT_ITEMS;
5654
- accumulate_memcg_tree(memcg, &acc);
5655
-
5656
- seq_printf(m, "anon %llu\n",
5657
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5658
- seq_printf(m, "file %llu\n",
5659
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5660
- seq_printf(m, "kernel_stack %llu\n",
5661
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5662
- seq_printf(m, "slab %llu\n",
5663
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5664
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5665
- seq_printf(m, "sock %llu\n",
5666
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5667
-
5668
- seq_printf(m, "shmem %llu\n",
5669
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5670
- seq_printf(m, "file_mapped %llu\n",
5671
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5672
- seq_printf(m, "file_dirty %llu\n",
5673
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5674
- seq_printf(m, "file_writeback %llu\n",
5675
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5676
-
5677
- for (i = 0; i < NR_LRU_LISTS; i++)
5678
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5679
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5680
-
5681
- seq_printf(m, "slab_reclaimable %llu\n",
5682
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5683
- seq_printf(m, "slab_unreclaimable %llu\n",
5684
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5685
-
5686
- /* Accumulated memory events */
5687
-
5688
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5689
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5690
-
5691
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5692
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5693
- acc.events[PGSCAN_DIRECT]);
5694
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5695
- acc.events[PGSTEAL_DIRECT]);
5696
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5697
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5698
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5699
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5700
-
5701
- seq_printf(m, "workingset_refault %lu\n",
5702
- acc.stat[WORKINGSET_REFAULT]);
5703
- seq_printf(m, "workingset_activate %lu\n",
5704
- acc.stat[WORKINGSET_ACTIVATE]);
5705
- seq_printf(m, "workingset_nodereclaim %lu\n",
5706
- acc.stat[WORKINGSET_NODERECLAIM]);
5707
-
6491
+ buf = memory_stat_format(memcg);
6492
+ if (!buf)
6493
+ return -ENOMEM;
6494
+ seq_puts(m, buf);
6495
+ kfree(buf);
57086496 return 0;
57096497 }
57106498
6499
+#ifdef CONFIG_NUMA
6500
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6501
+{
6502
+ int i;
6503
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6504
+
6505
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6506
+ int nid;
6507
+
6508
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6509
+ continue;
6510
+
6511
+ seq_printf(m, "%s", memory_stats[i].name);
6512
+ for_each_node_state(nid, N_MEMORY) {
6513
+ u64 size;
6514
+ struct lruvec *lruvec;
6515
+
6516
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6517
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6518
+ size *= memory_stats[i].ratio;
6519
+ seq_printf(m, " N%d=%llu", nid, size);
6520
+ }
6521
+ seq_putc(m, '\n');
6522
+ }
6523
+
6524
+ return 0;
6525
+}
6526
+#endif
6527
+
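memory.numa_stat emits one line per node-level entry of the memory_stats[] table (defined elsewhere in this patch, not visible in this hunk), with a per-node byte count appended for every node that has memory. Illustrative output on a two-node machine, where "anon" and "file" are shown only as plausible entry names:

anon N0=1056768 N1=4096
file N0=22151168 N1=0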
57116528 static int memory_oom_group_show(struct seq_file *m, void *v)
57126529 {
5713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6530
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57146531
57156532 seq_printf(m, "%d\n", memcg->oom_group);
57166533
....@@ -5776,10 +6593,21 @@
57766593 .seq_show = memory_events_show,
57776594 },
57786595 {
5779
- .name = "stat",
6596
+ .name = "events.local",
57806597 .flags = CFTYPE_NOT_ON_ROOT,
6598
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6599
+ .seq_show = memory_events_local_show,
6600
+ },
6601
+ {
6602
+ .name = "stat",
57816603 .seq_show = memory_stat_show,
57826604 },
6605
+#ifdef CONFIG_NUMA
6606
+ {
6607
+ .name = "numa_stat",
6608
+ .seq_show = memory_numa_stat_show,
6609
+ },
6610
+#endif
57836611 {
57846612 .name = "oom.group",
57856613 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5805,6 +6633,122 @@
58056633 .early_init = 0,
58066634 };
58076635
6636
+/*
6637
+ * This function calculates an individual cgroup's effective
6638
+ * protection which is derived from its own memory.min/low, its
6639
+ * parent's and siblings' settings, as well as the actual memory
6640
+ * distribution in the tree.
6641
+ *
6642
+ * The following rules apply to the effective protection values:
6643
+ *
6644
+ * 1. At the first level of reclaim, effective protection is equal to
6645
+ * the declared protection in memory.min and memory.low.
6646
+ *
6647
+ * 2. To enable safe delegation of the protection configuration, at
6648
+ * subsequent levels the effective protection is capped to the
6649
+ * parent's effective protection.
6650
+ *
6651
+ * 3. To make complex and dynamic subtrees easier to configure, the
6652
+ * user is allowed to overcommit the declared protection at a given
6653
+ * level. If that is the case, the parent's effective protection is
6654
+ * distributed to the children in proportion to how much protection
6655
+ * they have declared and how much of it they are utilizing.
6656
+ *
6657
+ * This makes distribution proportional, but also work-conserving:
6658
+ * if one cgroup claims much more protection than it uses memory,
6659
+ * the unused remainder is available to its siblings.
6660
+ *
6661
+ * 4. Conversely, when the declared protection is undercommitted at a
6662
+ * given level, the distribution of the larger parental protection
6663
+ * budget is NOT proportional. A cgroup's protection from a sibling
6664
+ * is capped to its own memory.min/low setting.
6665
+ *
6666
+ * 5. However, to allow protecting recursive subtrees from each other
6667
+ * without having to declare each individual cgroup's fixed share
6668
+ * of the ancestor's claim to protection, any unutilized -
6669
+ * "floating" - protection from up the tree is distributed in
6670
+ * proportion to each cgroup's *usage*. This makes the protection
6671
+ * neutral wrt sibling cgroups and lets them compete freely over
6672
+ * the shared parental protection budget, but it protects the
6673
+ * subtree as a whole from neighboring subtrees.
6674
+ *
6675
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6676
+ * against immediate siblings whereas 5. is about protecting against
6677
+ * neighboring subtrees.
6678
+ */
6679
+static unsigned long effective_protection(unsigned long usage,
6680
+ unsigned long parent_usage,
6681
+ unsigned long setting,
6682
+ unsigned long parent_effective,
6683
+ unsigned long siblings_protected)
6684
+{
6685
+ unsigned long protected;
6686
+ unsigned long ep;
6687
+
6688
+ protected = min(usage, setting);
6689
+ /*
6690
+ * If all cgroups at this level combined claim and use more
6691
+ * protection than what the parent affords them, distribute
6692
+ * shares in proportion to utilization.
6693
+ *
6694
+ * We are using actual utilization rather than the statically
6695
+ * claimed protection in order to be work-conserving: claimed
6696
+ * but unused protection is available to siblings that would
6697
+ * otherwise get a smaller chunk than what they claimed.
6698
+ */
6699
+ if (siblings_protected > parent_effective)
6700
+ return protected * parent_effective / siblings_protected;
6701
+
6702
+ /*
6703
+ * Ok, utilized protection of all children is within what the
6704
+ * parent affords them, so we know whatever this child claims
6705
+ * and utilizes is effectively protected.
6706
+ *
6707
+ * If there is unprotected usage beyond this value, reclaim
6708
+ * will apply pressure in proportion to that amount.
6709
+ *
6710
+ * If there is unutilized protection, the cgroup will be fully
6711
+ * shielded from reclaim, but we do return a smaller value for
6712
+ * protection than what the group could enjoy in theory. This
6713
+ * is okay. With the overcommit distribution above, effective
6714
+ * protection is always dependent on how memory is actually
6715
+ * consumed among the siblings anyway.
6716
+ */
6717
+ ep = protected;
6718
+
6719
+ /*
6720
+ * If the children aren't claiming (all of) the protection
6721
+ * afforded to them by the parent, distribute the remainder in
6722
+ * proportion to the (unprotected) memory of each cgroup. That
6723
+ * way, cgroups that aren't explicitly prioritized wrt each
6724
+ * other compete freely over the allowance, but they are
6725
+ * collectively protected from neighboring trees.
6726
+ *
6727
+ * We're using unprotected memory for the weight so that if
6728
+ * some cgroups DO claim explicit protection, we don't protect
6729
+ * the same bytes twice.
6730
+ *
6731
+ * Check both usage and parent_usage against the respective
6732
+ * protected values. One should imply the other, but they
6733
+ * aren't read atomically - make sure the division is sane.
6734
+ */
6735
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6736
+ return ep;
6737
+ if (parent_effective > siblings_protected &&
6738
+ parent_usage > siblings_protected &&
6739
+ usage > protected) {
6740
+ unsigned long unclaimed;
6741
+
6742
+ unclaimed = parent_effective - siblings_protected;
6743
+ unclaimed *= usage - protected;
6744
+ unclaimed /= parent_usage - siblings_protected;
6745
+
6746
+ ep += unclaimed;
6747
+ }
6748
+
6749
+ return ep;
6750
+}
6751
+
58086752 /**
58096753 * mem_cgroup_protected - check if memory consumption is in the normal range
58106754 * @root: the top ancestor of the sub-tree being checked
....@@ -5812,259 +6756,125 @@
58126756 *
58136757 * WARNING: This function is not stateless! It can only be used as part
58146758 * of a top-down tree iteration, not for isolated queries.
5815
- *
5816
- * Returns one of the following:
5817
- * MEMCG_PROT_NONE: cgroup memory is not protected
5818
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5819
- * an unprotected supply of reclaimable memory from other cgroups.
5820
- * MEMCG_PROT_MIN: cgroup memory is protected
5821
- *
5822
- * @root is exclusive; it is never protected when looked at directly
5823
- *
5824
- * To provide a proper hierarchical behavior, effective memory.min/low values
5825
- * are used. Below is the description of how effective memory.low is calculated.
5826
- * Effective memory.min values is calculated in the same way.
5827
- *
5828
- * Effective memory.low is always equal or less than the original memory.low.
5829
- * If there is no memory.low overcommittment (which is always true for
5830
- * top-level memory cgroups), these two values are equal.
5831
- * Otherwise, it's a part of parent's effective memory.low,
5832
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5833
- * memory.low usages, where memory.low usage is the size of actually
5834
- * protected memory.
5835
- *
5836
- * low_usage
5837
- * elow = min( memory.low, parent->elow * ------------------ ),
5838
- * siblings_low_usage
5839
- *
5840
- * | memory.current, if memory.current < memory.low
5841
- * low_usage = |
5842
- | 0, otherwise.
5843
- *
5844
- *
5845
- * Such definition of the effective memory.low provides the expected
5846
- * hierarchical behavior: parent's memory.low value is limiting
5847
- * children, unprotected memory is reclaimed first and cgroups,
5848
- * which are not using their guarantee do not affect actual memory
5849
- * distribution.
5850
- *
5851
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5852
- *
5853
- * A A/memory.low = 2G, A/memory.current = 6G
5854
- * //\\
5855
- * BC DE B/memory.low = 3G B/memory.current = 2G
5856
- * C/memory.low = 1G C/memory.current = 2G
5857
- * D/memory.low = 0 D/memory.current = 2G
5858
- * E/memory.low = 10G E/memory.current = 0
5859
- *
5860
- * and the memory pressure is applied, the following memory distribution
5861
- * is expected (approximately):
5862
- *
5863
- * A/memory.current = 2G
5864
- *
5865
- * B/memory.current = 1.3G
5866
- * C/memory.current = 0.6G
5867
- * D/memory.current = 0
5868
- * E/memory.current = 0
5869
- *
5870
- * These calculations require constant tracking of the actual low usages
5871
- * (see propagate_protected_usage()), as well as recursive calculation of
5872
- * effective memory.low values. But as we do call mem_cgroup_protected()
5873
- * path for each memory cgroup top-down from the reclaim,
5874
- * it's possible to optimize this part, and save calculated elow
5875
- * for next usage. This part is intentionally racy, but it's ok,
5876
- * as memory.low is a best-effort mechanism.
58776759 */
5878
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5879
- struct mem_cgroup *memcg)
6760
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6761
+ struct mem_cgroup *memcg)
58806762 {
6763
+ unsigned long usage, parent_usage;
58816764 struct mem_cgroup *parent;
5882
- unsigned long emin, parent_emin;
5883
- unsigned long elow, parent_elow;
5884
- unsigned long usage;
58856765
58866766 if (mem_cgroup_disabled())
5887
- return MEMCG_PROT_NONE;
6767
+ return;
58886768
58896769 if (!root)
58906770 root = root_mem_cgroup;
6771
+
6772
+ /*
6773
+ * Effective values of the reclaim targets are ignored so they
6774
+ * can be stale. Have a look at mem_cgroup_protection for more
6775
+ * details.
6776
+ * TODO: calculation should be more robust so that we do not need
6777
+ * that special casing.
6778
+ */
58916779 if (memcg == root)
5892
- return MEMCG_PROT_NONE;
6780
+ return;
58936781
58946782 usage = page_counter_read(&memcg->memory);
58956783 if (!usage)
5896
- return MEMCG_PROT_NONE;
5897
-
5898
- emin = memcg->memory.min;
5899
- elow = memcg->memory.low;
6784
+ return;
59006785
59016786 parent = parent_mem_cgroup(memcg);
59026787 /* No parent means a non-hierarchical mode on v1 memcg */
59036788 if (!parent)
5904
- return MEMCG_PROT_NONE;
6789
+ return;
59056790
5906
- if (parent == root)
5907
- goto exit;
5908
-
5909
- parent_emin = READ_ONCE(parent->memory.emin);
5910
- emin = min(emin, parent_emin);
5911
- if (emin && parent_emin) {
5912
- unsigned long min_usage, siblings_min_usage;
5913
-
5914
- min_usage = min(usage, memcg->memory.min);
5915
- siblings_min_usage = atomic_long_read(
5916
- &parent->memory.children_min_usage);
5917
-
5918
- if (min_usage && siblings_min_usage)
5919
- emin = min(emin, parent_emin * min_usage /
5920
- siblings_min_usage);
6791
+ if (parent == root) {
6792
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6793
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6794
+ return;
59216795 }
59226796
5923
- parent_elow = READ_ONCE(parent->memory.elow);
5924
- elow = min(elow, parent_elow);
5925
- if (elow && parent_elow) {
5926
- unsigned long low_usage, siblings_low_usage;
6797
+ parent_usage = page_counter_read(&parent->memory);
59276798
5928
- low_usage = min(usage, memcg->memory.low);
5929
- siblings_low_usage = atomic_long_read(
5930
- &parent->memory.children_low_usage);
6799
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6800
+ READ_ONCE(memcg->memory.min),
6801
+ READ_ONCE(parent->memory.emin),
6802
+ atomic_long_read(&parent->memory.children_min_usage)));
59316803
5932
- if (low_usage && siblings_low_usage)
5933
- elow = min(elow, parent_elow * low_usage /
5934
- siblings_low_usage);
5935
- }
5936
-
5937
-exit:
5938
- memcg->memory.emin = emin;
5939
- memcg->memory.elow = elow;
5940
-
5941
- if (usage <= emin)
5942
- return MEMCG_PROT_MIN;
5943
- else if (usage <= elow)
5944
- return MEMCG_PROT_LOW;
5945
- else
5946
- return MEMCG_PROT_NONE;
6804
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6805
+ READ_ONCE(memcg->memory.low),
6806
+ READ_ONCE(parent->memory.elow),
6807
+ atomic_long_read(&parent->memory.children_low_usage)));
59476808 }
59486809
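The WARNING above is load-bearing: each child's emin/elow is derived from the parent's values computed just before it, so the function only makes sense inside a top-down tree walk. A minimal sketch of the expected consumer on the reclaim side (the helper usage here is an assumption; the real call sites live in mm/vmscan.c and are not part of this hunk):

static void example_reclaim_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, NULL);

	do {
		unsigned long usage;

		mem_cgroup_calculate_protection(root, memcg);

		usage = page_counter_read(&memcg->memory);
		if (usage <= READ_ONCE(memcg->memory.emin))
			continue;	/* hard-protected: skip this group entirely */

		/* ... scan this memcg's lruvecs, easing off below memory.elow ... */
	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
}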
59496810 /**
5950
- * mem_cgroup_try_charge - try charging a page
6811
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59516812 * @page: page to charge
59526813 * @mm: mm context of the victim
59536814 * @gfp_mask: reclaim mode
5954
- * @memcgp: charged memcg return
5955
- * @compound: charge the page as compound or small page
59566815 *
59576816 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59586817 * pages according to @gfp_mask if necessary.
59596818 *
5960
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5961
- * Otherwise, an error code is returned.
5962
- *
5963
- * After page->mapping has been set up, the caller must finalize the
5964
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5965
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6819
+ * Returns 0 on success. Otherwise, an error code is returned.
59666820 */
5967
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5968
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5969
- bool compound)
6821
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6822
+ gfp_t gfp_mask)
59706823 {
6824
+ unsigned int nr_pages = thp_nr_pages(page);
59716825 struct mem_cgroup *memcg = NULL;
5972
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59736826 int ret = 0;
59746827
5975
- if (mem_cgroup_disabled())
5976
- goto out;
5977
-
59786828 if (PageSwapCache(page)) {
6829
+ swp_entry_t ent = { .val = page_private(page), };
6830
+ unsigned short id;
6831
+
59796832 /*
59806833 * Every swap fault against a single page tries to charge the
59816834 * page, bail as early as possible. shmem_unuse() encounters
5982
- * already charged pages, too. The USED bit is protected by
5983
- * the page lock, which serializes swap cache removal, which
6835
+ * already charged pages, too. page->mem_cgroup is protected
6836
+ * by the page lock, which serializes swap cache removal, which
59846837 * in turn serializes uncharging.
59856838 */
59866839 VM_BUG_ON_PAGE(!PageLocked(page), page);
59876840 if (compound_head(page)->mem_cgroup)
59886841 goto out;
59896842
5990
- if (do_swap_account) {
5991
- swp_entry_t ent = { .val = page_private(page), };
5992
- unsigned short id = lookup_swap_cgroup_id(ent);
5993
-
5994
- rcu_read_lock();
5995
- memcg = mem_cgroup_from_id(id);
5996
- if (memcg && !css_tryget_online(&memcg->css))
5997
- memcg = NULL;
5998
- rcu_read_unlock();
5999
- }
6843
+ id = lookup_swap_cgroup_id(ent);
6844
+ rcu_read_lock();
6845
+ memcg = mem_cgroup_from_id(id);
6846
+ if (memcg && !css_tryget_online(&memcg->css))
6847
+ memcg = NULL;
6848
+ rcu_read_unlock();
60006849 }
60016850
60026851 if (!memcg)
60036852 memcg = get_mem_cgroup_from_mm(mm);
60046853
60056854 ret = try_charge(memcg, gfp_mask, nr_pages);
6855
+ if (ret)
6856
+ goto out_put;
60066857
6007
- css_put(&memcg->css);
6008
-out:
6009
- *memcgp = memcg;
6010
- return ret;
6011
-}
6858
+ css_get(&memcg->css);
6859
+ commit_charge(page, memcg);
60126860
6013
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6014
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6015
- bool compound)
6016
-{
6017
- struct mem_cgroup *memcg;
6018
- int ret;
6019
-
6020
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6021
- memcg = *memcgp;
6022
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6023
- return ret;
6024
-}
6025
-
6026
-/**
6027
- * mem_cgroup_commit_charge - commit a page charge
6028
- * @page: page to charge
6029
- * @memcg: memcg to charge the page to
6030
- * @lrucare: page might be on LRU already
6031
- * @compound: charge the page as compound or small page
6032
- *
6033
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6034
- * after page->mapping has been set up. This must happen atomically
6035
- * as part of the page instantiation, i.e. under the page table lock
6036
- * for anonymous pages, under the page lock for page and swap cache.
6037
- *
6038
- * In addition, the page must not be on the LRU during the commit, to
6039
- * prevent racing with task migration. If it might be, use @lrucare.
6040
- *
6041
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6042
- */
6043
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6044
- bool lrucare, bool compound)
6045
-{
6046
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6047
-
6048
- VM_BUG_ON_PAGE(!page->mapping, page);
6049
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6050
-
6051
- if (mem_cgroup_disabled())
6052
- return;
6053
- /*
6054
- * Swap faults will attempt to charge the same page multiple
6055
- * times. But reuse_swap_page() might have removed the page
6056
- * from swapcache already, so we can't check PageSwapCache().
6057
- */
6058
- if (!memcg)
6059
- return;
6060
-
6061
- commit_charge(page, memcg, lrucare);
6062
-
6063
- local_lock_irq(event_lock);
6064
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6861
+ local_irq_disable();
6862
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60656863 memcg_check_events(memcg, page);
6066
- local_unlock_irq(event_lock);
6864
+ local_irq_enable();
60676865
6866
+ /*
6867
+ * Cgroup1's unified memory+swap counter has been charged with the
6868
+ * new swapcache page, finish the transfer by uncharging the swap
6869
+ * slot. The swap slot would also get uncharged when it dies, but
6870
+ * it can stick around indefinitely and we'd count the page twice
6871
+ * the entire time.
6872
+ *
6873
+ * Cgroup2 has separate resource counters for memory and swap,
6874
+ * so this is a non-issue here. Memory and swap charge lifetimes
6875
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6876
+ * page to memory here, and uncharge swap when the slot is freed.
6877
+ */
60686878 if (do_memsw_account() && PageSwapCache(page)) {
60696879 swp_entry_t entry = { .val = page_private(page) };
60706880 /*
....@@ -6074,42 +6884,18 @@
60746884 */
60756885 mem_cgroup_uncharge_swap(entry, nr_pages);
60766886 }
6077
-}
60786887
6079
-/**
6080
- * mem_cgroup_cancel_charge - cancel a page charge
6081
- * @page: page to charge
6082
- * @memcg: memcg to charge the page to
6083
- * @compound: charge the page as compound or small page
6084
- *
6085
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6086
- */
6087
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6088
- bool compound)
6089
-{
6090
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6091
-
6092
- if (mem_cgroup_disabled())
6093
- return;
6094
- /*
6095
- * Swap faults will attempt to charge the same page multiple
6096
- * times. But reuse_swap_page() might have removed the page
6097
- * from swapcache already, so we can't check PageSwapCache().
6098
- */
6099
- if (!memcg)
6100
- return;
6101
-
6102
- cancel_charge(memcg, nr_pages);
6888
+out_put:
6889
+ css_put(&memcg->css);
6890
+out:
6891
+ return ret;
61036892 }
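The old try_charge/commit/cancel triplet collapses into this single entry point; callers are expected to go through a thin mem_cgroup_charge() wrapper (added to memcontrol.h in this series, not visible in this hunk) that returns 0 immediately when the controller is disabled. A caller-side sketch under that assumption:

#include <linux/memcontrol.h>
#include <linux/mm.h>

static vm_fault_t example_fault_charge(struct page *page,
				       struct vm_area_struct *vma, gfp_t gfp)
{
	if (mem_cgroup_charge(page, vma->vm_mm, gfp))
		return VM_FAULT_OOM;	/* charge failed even after reclaim retries */

	/*
	 * Map the page as usual; when it is finally freed,
	 * mem_cgroup_uncharge() drops the charge again.
	 */
	return 0;
}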
61046893
61056894 struct uncharge_gather {
61066895 struct mem_cgroup *memcg;
6896
+ unsigned long nr_pages;
61076897 unsigned long pgpgout;
6108
- unsigned long nr_anon;
6109
- unsigned long nr_file;
61106898 unsigned long nr_kmem;
6111
- unsigned long nr_huge;
6112
- unsigned long nr_shmem;
61136899 struct page *dummy_page;
61146900 };
61156901
....@@ -6120,37 +6906,32 @@
61206906
61216907 static void uncharge_batch(const struct uncharge_gather *ug)
61226908 {
6123
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61246909 unsigned long flags;
61256910
61266911 if (!mem_cgroup_is_root(ug->memcg)) {
6127
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6912
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61286913 if (do_memsw_account())
6129
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6914
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61306915 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61316916 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61326917 memcg_oom_recover(ug->memcg);
61336918 }
61346919
6135
- local_lock_irqsave(event_lock, flags);
6136
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6137
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6138
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6139
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6920
+ local_irq_save(flags);
61406921 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6141
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6922
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61426923 memcg_check_events(ug->memcg, ug->dummy_page);
6143
- local_unlock_irqrestore(event_lock, flags);
6924
+ local_irq_restore(flags);
61446925
6145
- if (!mem_cgroup_is_root(ug->memcg))
6146
- css_put_many(&ug->memcg->css, nr_pages);
6926
+ /* drop reference from uncharge_page */
6927
+ css_put(&ug->memcg->css);
61476928 }
61486929
61496930 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61506931 {
6932
+ unsigned long nr_pages;
6933
+
61516934 VM_BUG_ON_PAGE(PageLRU(page), page);
6152
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6153
- !PageHWPoison(page) , page);
61546935
61556936 if (!page->mem_cgroup)
61566937 return;
....@@ -6167,30 +6948,24 @@
61676948 uncharge_gather_clear(ug);
61686949 }
61696950 ug->memcg = page->mem_cgroup;
6951
+
6952
+ /* pairs with css_put in uncharge_batch */
6953
+ css_get(&ug->memcg->css);
61706954 }
61716955
6172
- if (!PageKmemcg(page)) {
6173
- unsigned int nr_pages = 1;
6956
+ nr_pages = compound_nr(page);
6957
+ ug->nr_pages += nr_pages;
61746958
6175
- if (PageTransHuge(page)) {
6176
- nr_pages <<= compound_order(page);
6177
- ug->nr_huge += nr_pages;
6178
- }
6179
- if (PageAnon(page))
6180
- ug->nr_anon += nr_pages;
6181
- else {
6182
- ug->nr_file += nr_pages;
6183
- if (PageSwapBacked(page))
6184
- ug->nr_shmem += nr_pages;
6185
- }
6959
+ if (!PageKmemcg(page)) {
61866960 ug->pgpgout++;
61876961 } else {
6188
- ug->nr_kmem += 1 << compound_order(page);
6962
+ ug->nr_kmem += nr_pages;
61896963 __ClearPageKmemcg(page);
61906964 }
61916965
61926966 ug->dummy_page = page;
61936967 page->mem_cgroup = NULL;
6968
+ css_put(&ug->memcg->css);
61946969 }
61956970
61966971 static void uncharge_list(struct list_head *page_list)
....@@ -6219,18 +6994,14 @@
62196994 }
62206995
62216996 /**
6222
- * mem_cgroup_uncharge - uncharge a page
6997
+ * __mem_cgroup_uncharge - uncharge a page
62236998 * @page: page to uncharge
62246999 *
6225
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6226
- * mem_cgroup_commit_charge().
7000
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62277001 */
6228
-void mem_cgroup_uncharge(struct page *page)
7002
+void __mem_cgroup_uncharge(struct page *page)
62297003 {
62307004 struct uncharge_gather ug;
6231
-
6232
- if (mem_cgroup_disabled())
6233
- return;
62347005
62357006 /* Don't touch page->lru of any random page, pre-check: */
62367007 if (!page->mem_cgroup)
....@@ -6242,17 +7013,14 @@
62427013 }
62437014
62447015 /**
6245
- * mem_cgroup_uncharge_list - uncharge a list of page
7016
+ * __mem_cgroup_uncharge_list - uncharge a list of page
62467017 * @page_list: list of pages to uncharge
62477018 *
62487019 * Uncharge a list of pages previously charged with
6249
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7020
+ * __mem_cgroup_charge().
62507021 */
6251
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7022
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62527023 {
6253
- if (mem_cgroup_disabled())
6254
- return;
6255
-
62567024 if (!list_empty(page_list))
62577025 uncharge_list(page_list);
62587026 }
....@@ -6271,7 +7039,6 @@
62717039 {
62727040 struct mem_cgroup *memcg;
62737041 unsigned int nr_pages;
6274
- bool compound;
62757042 unsigned long flags;
62767043
62777044 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6293,20 +7060,19 @@
62937060 return;
62947061
62957062 /* Force-charge the new page. The old one will be freed soon */
6296
- compound = PageTransHuge(newpage);
6297
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7063
+ nr_pages = thp_nr_pages(newpage);
62987064
62997065 page_counter_charge(&memcg->memory, nr_pages);
63007066 if (do_memsw_account())
63017067 page_counter_charge(&memcg->memsw, nr_pages);
6302
- css_get_many(&memcg->css, nr_pages);
63037068
6304
- commit_charge(newpage, memcg, false);
7069
+ css_get(&memcg->css);
7070
+ commit_charge(newpage, memcg);
63057071
6306
- local_lock_irqsave(event_lock, flags);
6307
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7072
+ local_irq_save(flags);
7073
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63087074 memcg_check_events(memcg, newpage);
6309
- local_unlock_irqrestore(event_lock, flags);
7075
+ local_irq_restore(flags);
63107076 }
63117077
63127078 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6329,7 +7095,7 @@
63297095 goto out;
63307096 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63317097 goto out;
6332
- if (css_tryget_online(&memcg->css))
7098
+ if (css_tryget(&memcg->css))
63337099 sk->sk_memcg = memcg;
63347100 out:
63357101 rcu_read_unlock();
....@@ -6407,7 +7173,7 @@
64077173 if (!strcmp(token, "nokmem"))
64087174 cgroup_memory_nokmem = true;
64097175 }
6410
- return 0;
7176
+ return 1;
64117177 }
64127178 __setup("cgroup.memory=", cgroup_memory);
64137179
....@@ -6422,17 +7188,6 @@
64227188 static int __init mem_cgroup_init(void)
64237189 {
64247190 int cpu, node;
6425
-
6426
-#ifdef CONFIG_MEMCG_KMEM
6427
- /*
6428
- * Kmem cache creation is mostly done with the slab_mutex held,
6429
- * so use a workqueue with limited concurrency to avoid stalling
6430
- * all worker threads in case lots of cgroups are created and
6431
- * destroyed simultaneously.
6432
- */
6433
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6434
- BUG_ON(!memcg_kmem_cache_wq);
6435
-#endif
64367191
64377192 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64387193 memcg_hotplug_cpu_dead);
....@@ -6460,7 +7215,7 @@
64607215 #ifdef CONFIG_MEMCG_SWAP
64617216 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64627217 {
6463
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7218
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64647219 /*
64657220 * The root cgroup cannot be destroyed, so its refcount must
64667221 * always be >= 1.
....@@ -6488,12 +7243,14 @@
64887243 struct mem_cgroup *memcg, *swap_memcg;
64897244 unsigned int nr_entries;
64907245 unsigned short oldid;
6491
- unsigned long flags;
64927246
64937247 VM_BUG_ON_PAGE(PageLRU(page), page);
64947248 VM_BUG_ON_PAGE(page_count(page), page);
64957249
6496
- if (!do_memsw_account())
7250
+ if (mem_cgroup_disabled())
7251
+ return;
7252
+
7253
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64977254 return;
64987255
64997256 memcg = page->mem_cgroup;
....@@ -6508,7 +7265,7 @@
65087265 * ancestor for the swap instead and transfer the memory+swap charge.
65097266 */
65107267 swap_memcg = mem_cgroup_id_get_online(memcg);
6511
- nr_entries = hpage_nr_pages(page);
7268
+ nr_entries = thp_nr_pages(page);
65127269 /* Get references for the tail pages, too */
65137270 if (nr_entries > 1)
65147271 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6522,7 +7279,7 @@
65227279 if (!mem_cgroup_is_root(memcg))
65237280 page_counter_uncharge(&memcg->memory, nr_entries);
65247281
6525
- if (memcg != swap_memcg) {
7282
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65267283 if (!mem_cgroup_is_root(swap_memcg))
65277284 page_counter_charge(&swap_memcg->memsw, nr_entries);
65287285 page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -6534,21 +7291,15 @@
 	 * important here to have the interrupts disabled because it is the
 	 * only synchronisation we have for updating the per-CPU variables.
 	 */
-	local_lock_irqsave(event_lock, flags);
-#ifndef CONFIG_PREEMPT_RT_BASE
 	VM_BUG_ON(!irqs_disabled());
-#endif
-	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
-				     -nr_entries);
+	mem_cgroup_charge_statistics(memcg, page, -nr_entries);
 	memcg_check_events(memcg, page);
-	local_unlock_irqrestore(event_lock, flags);

-	if (!mem_cgroup_is_root(memcg))
-		css_put_many(&memcg->css, nr_entries);
+	css_put(&memcg->css);
 }

 /**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
  * @page: page being added to swap
  * @entry: swap entry to charge
  *
@@ -6556,14 +7307,14 @@
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-	unsigned int nr_pages = hpage_nr_pages(page);
+	unsigned int nr_pages = thp_nr_pages(page);
 	struct page_counter *counter;
 	struct mem_cgroup *memcg;
 	unsigned short oldid;

-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;

 	memcg = page->mem_cgroup;
@@ -6579,7 +7330,7 @@

 	memcg = mem_cgroup_id_get_online(memcg);

-	if (!mem_cgroup_is_root(memcg) &&
+	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -6598,23 +7349,20 @@
 }

 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
-
-	if (!do_swap_account)
-		return;

 	id = swap_cgroup_record(entry, 0, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg)) {
+		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 				page_counter_uncharge(&memcg->swap, nr_pages);
 			else
@@ -6630,7 +7378,7 @@
 {
 	long nr_swap_pages = get_nr_swap_pages();

-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return nr_swap_pages;
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
 		nr_swap_pages = min_t(long, nr_swap_pages,
@@ -6647,36 +7395,33 @@

 	if (vm_swap_full())
 		return true;
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;

 	memcg = page->mem_cgroup;
 	if (!memcg)
 		return false;

-	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+		unsigned long usage = page_counter_read(&memcg->swap);
+
+		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+		    usage * 2 >= READ_ONCE(memcg->swap.max))
 			return true;
+	}

 	return false;
 }

-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
 {
 	if (!strcmp(s, "1"))
-		really_do_swap_account = 1;
+		cgroup_memory_noswap = 0;
 	else if (!strcmp(s, "0"))
-		really_do_swap_account = 0;
+		cgroup_memory_noswap = 1;
 	return 1;
 }
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);

 static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
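Worth spelling out from the mem_cgroup_swap_full() rework above: a cgroup's swap is treated as effectively full once usage reaches half of the tighter of swap.high and swap.max, so swap cache belonging to that cgroup is reclaimed more eagerly. A hedged, userspace-style restatement of the predicate (purely illustrative, not kernel code):

/*
 * Hedged sketch: mirrors the usage * 2 >= limit check above. For example,
 * with swap.max = 1 GiB and swap.high unlimited, 512 MiB of charged swap
 * already counts as "swap full" for that cgroup.
 */
#include <stdbool.h>
#include <stdint.h>

static bool cgroup_swap_looks_full(uint64_t usage, uint64_t high, uint64_t max)
{
	return usage * 2 >= high || usage * 2 >= max;
}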
@@ -6686,17 +7431,33 @@
 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }

+static int swap_high_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+			       char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long high;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &high);
+	if (err)
+		return err;
+
+	page_counter_set_high(&memcg->swap, high);
+
+	return nbytes;
+}
+
 static int swap_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->swap.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
 }

 static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6718,8 +7479,10 @@

 static int swap_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

+	seq_printf(m, "high %lu\n",
+		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
 	seq_printf(m, "max %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
 	seq_printf(m, "fail %lu\n",
@@ -6733,6 +7496,12 @@
 		.name = "swap.current",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_current_read,
+	},
+	{
+		.name = "swap.high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_high_show,
+		.write = swap_high_write,
 	},
 	{
 		.name = "swap.max",
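With the swap.high entry registered above, a memory.swap.high file appears next to memory.swap.max in each non-root cgroup v2 group. A hedged userspace example of exercising it, assuming cgroup2 is mounted at /sys/fs/cgroup and that a child group named "example" already exists (both are assumptions for illustration, not part of this patch):

/* Hedged usage sketch: throttle a hypothetical cgroup's swap at 512 MiB. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/example/memory.swap.high";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The file accepts a byte count (with K/M/G suffix) or "max" to clear. */
	fprintf(f, "536870912\n");
	fclose(f);
	return 0;
}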
@@ -6749,7 +7518,7 @@
 	{ }	/* terminate */
 };

-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -6776,17 +7545,27 @@
 	{ },	/* terminate */
 };

+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), cgroup_memory_noswap can remain false
+ * even when memcg is disabled via the "cgroup_disable=memory" boot
+ * parameter, which may result in a premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() in corner cases.
+ */
 static int __init mem_cgroup_swap_init(void)
 {
-	if (!mem_cgroup_disabled() && really_do_swap_account) {
-		do_swap_account = 1;
-		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
-					       swap_files));
-		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-						  memsw_cgroup_files));
-	}
+	/* No memory control -> no swap control */
+	if (mem_cgroup_disabled())
+		cgroup_memory_noswap = true;
+
+	if (cgroup_memory_noswap)
+		return 0;
+
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
 	return 0;
 }
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);

 #endif /* CONFIG_MEMCG_SWAP */
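On the subsys_initcall() to core_initcall() move: initcalls run in numbered levels, and core_initcall (level 1) fires well before subsys_initcall (level 4), so cgroup_memory_noswap is settled before later init code can consult it. A hedged sketch of the ordering, using hypothetical init functions; the level numbers reflect include/linux/init.h:

/*
 * Hedged sketch: two hypothetical initcalls showing relative ordering only.
 * core_initcall() registers at level 1 and subsys_initcall() at level 4,
 * so example_early_init() runs first during boot.
 */
static int __init example_early_init(void)
{
	pr_info("runs at core_initcall time (level 1)\n");
	return 0;
}
core_initcall(example_early_init);

static int __init example_later_init(void)
{
	pr_info("runs at subsys_initcall time (level 4)\n");
	return 0;
}
subsys_initcall(example_later_init);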