2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
  * Lockless page tracking & accounting
  * Unified hierarchy configuration model
  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  */

 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -65,6 +57,8 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -73,13 +67,16 @@
 #include <linux/uaccess.h>

 #include <trace/events/vmscan.h>
+#include <trace/hooks/mm.h>

 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);

 struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL_GPL(root_mem_cgroup);

-#define MEM_CGROUP_RECLAIM_RETRIES	5
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);

 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
@@ -89,28 +86,23 @@

 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
 #else
-#define do_swap_account		0
+#define cgroup_memory_noswap		1
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
 #endif

 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
-	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
-
-static const char *const mem_cgroup_lru_names[] = {
-	"inactive_anon",
-	"active_anon",
-	"inactive_file",
-	"active_file",
-	"unevictable",
-};

 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET	1024

 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -210,14 +202,6 @@
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

-enum charge_type {
-	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_ANON,
-	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
-	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
-	NR_CHARGE_TYPE,
-};
-
 /* for encoding cft->private value on file */
 enum res_type {
 	_MEM,
@@ -248,7 +232,7 @@
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))

-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
 	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 		(current->flags & PF_EXITING);
@@ -268,8 +252,100 @@
 }

 #ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+	struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+	struct mem_cgroup *memcg;
+	unsigned int nr_bytes;
+	unsigned int nr_pages;
+	unsigned long flags;
+
+	/*
+	 * At this point all allocated objects are freed, and
+	 * objcg->nr_charged_bytes can't have an arbitrary byte value.
+	 * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+	 *
+	 * The following sequence can lead to it:
+	 * 1) CPU0: objcg == stock->cached_objcg
+	 * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+	 *          PAGE_SIZE bytes are charged
+	 * 3) CPU1: a process from another memcg is allocating something,
+	 *          the stock is flushed,
+	 *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+	 * 5) CPU0: we do release this object,
+	 *          92 bytes are added to stock->nr_bytes
+	 * 6) CPU0: stock is flushed,
+	 *          92 bytes are added to objcg->nr_charged_bytes
+	 *
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
+	 * This page will be uncharged in obj_cgroup_release().
+	 */
+	nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+	WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+	nr_pages = nr_bytes >> PAGE_SHIFT;
+
+	spin_lock_irqsave(&objcg_lock, flags);
+	memcg = obj_cgroup_memcg(objcg);
+	if (nr_pages)
+		__memcg_kmem_uncharge(memcg, nr_pages);
+	list_del(&objcg->list);
+	mem_cgroup_put(memcg);
+	spin_unlock_irqrestore(&objcg_lock, flags);
+
+	percpu_ref_exit(ref);
+	kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+	struct obj_cgroup *objcg;
+	int ret;
+
+	objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+	if (!objcg)
+		return NULL;
+
+	ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+			      GFP_KERNEL);
+	if (ret) {
+		kfree(objcg);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&objcg->list);
+	return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+				  struct mem_cgroup *parent)
+{
+	struct obj_cgroup *objcg, *iter;
+
+	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+	spin_lock_irq(&objcg_lock);
+
+	/* Move active objcg to the parent's list */
+	xchg(&objcg->memcg, parent);
+	css_get(&parent->css);
+	list_add(&objcg->list, &parent->objcg_list);
+
+	/* Move already reparented objcgs to the parent's list */
+	list_for_each_entry(iter, &memcg->objcg_list, list) {
+		css_get(&parent->css);
+		xchg(&iter->memcg, parent);
+		css_put(&memcg->css);
+	}
+	list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+	spin_unlock_irq(&objcg_lock);
+
+	percpu_ref_kill(&objcg->refcnt);
+}
+
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  * this works better in sparse environments, where we have a lot of memcgs,
  * but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -312,14 +388,13 @@

 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that does
  * kmem_cache_alloc and the such to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
+#endif

 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -344,7 +419,7 @@
 	if (!old)
 		return 0;

-	new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+	new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
 	if (!new)
 		return -ENOMEM;

@@ -388,7 +463,7 @@
 	mutex_lock(&memcg_shrinker_map_mutex);
 	size = memcg_shrinker_map_size;
 	for_each_node(nid) {
-		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
 		if (!map) {
 			memcg_free_shrinker_maps(memcg);
 			ret = -ENOMEM;
@@ -445,14 +520,6 @@
 	}
 }

-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -495,7 +562,17 @@
 	unsigned long ino = 0;

 	rcu_read_lock();
-	memcg = READ_ONCE(page->mem_cgroup);
+	memcg = page->mem_cgroup;
+
+	/*
+	 * The lowest bit set means that memcg isn't a valid
+	 * memcg pointer, but an obj_cgroups pointer.
+	 * In this case the page is shared and doesn't belong
+	 * to any specific memory cgroup.
+	 */
+	if ((unsigned long) memcg & 0x1UL)
+		memcg = NULL;
+
 	while (memcg && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
@@ -671,7 +748,7 @@
 	 */
 	__mem_cgroup_remove_exceeded(mz, mctz);
 	if (!soft_limit_excess(mz->memcg) ||
-	    !css_tryget_online(&mz->memcg->css))
+	    !css_tryget(&mz->memcg->css))
 		goto retry;
 done:
 	return mz;
@@ -688,33 +765,186 @@
 	return mz;
 }

-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	return atomic_long_read(&memcg->events[event]);
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	if (memcg_stat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *parent;
+
+	parent = parent_mem_cgroup(pn->memcg);
+	if (!parent)
+		return NULL;
+	return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			      int val)
+{
+	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup *memcg;
+	long x, threshold = MEMCG_CHARGE_BATCH;
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	memcg = pn->memcg;
+
+	/* Update memcg */
+	__mod_memcg_state(memcg, idx, val);
+
+	/* Update lruvec */
+	__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+	if (vmstat_item_in_bytes(idx))
+		threshold <<= PAGE_SHIFT;
+
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > threshold)) {
+		pg_data_t *pgdat = lruvec_pgdat(lruvec);
+		struct mem_cgroup_per_node *pi;
+
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+			atomic_long_add(x, &pi->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val)
+{
+	/* Update node */
+	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+	/* Update memcg and lruvec */
+	if (!mem_cgroup_disabled())
+		__mod_memcg_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL_GPL(__mod_lruvec_state);
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+	pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+
+	/*
+	 * Untracked pages have no memcg, no lruvec. Update only the
+	 * node. If we reparent the slab objects to the root memcg,
+	 * when we free the slab object, we need to update the per-memcg
+	 * vmstats to keep it correct for the root memcg.
+	 */
+	if (!memcg) {
+		__mod_node_page_state(pgdat, idx, val);
+	} else {
+		lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		__mod_lruvec_state(lruvec, idx, val);
+	}
+	rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_obj(p);
+	if (memcg)
+		mod_memcg_state(memcg, idx, val);
+	rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		struct mem_cgroup *mi;
+
+		/*
+		 * Batch local counters to keep them in sync with
+		 * the hierarchical ones.
+		 */
+		__this_cpu_add(memcg->vmstats_local->events[idx], x);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmevents[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+	return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+	return x;
 }

 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool compound, int nr_pages)
+					 int nr_pages)
 {
-	/*
-	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
-	 * counted as CACHE even if it's on ANON LRU.
-	 */
-	if (PageAnon(page))
-		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
-	else {
-		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
-		if (PageSwapBacked(page))
-			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
-	}
-
-	if (compound) {
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
-	}
-
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
....@@ -723,35 +953,7 @@
723953 nr_pages = -nr_pages; /* for event */
724954 }
725955
726
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
727
-}
728
-
729
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
730
- int nid, unsigned int lru_mask)
731
-{
732
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
733
- unsigned long nr = 0;
734
- enum lru_list lru;
735
-
736
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
737
-
738
- for_each_lru(lru) {
739
- if (!(BIT(lru) & lru_mask))
740
- continue;
741
- nr += mem_cgroup_get_lru_size(lruvec, lru);
742
- }
743
- return nr;
744
-}
745
-
746
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
747
- unsigned int lru_mask)
748
-{
749
- unsigned long nr = 0;
750
- int nid;
751
-
752
- for_each_node_state(nid, N_MEMORY)
753
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
754
- return nr;
956
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
755957 }
756958
757959 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -759,8 +961,8 @@
759961 {
760962 unsigned long val, next;
761963
762
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
763
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
964
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
965
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
764966 /* from time_after() in jiffies.h */
765967 if ((long)(next - val) < 0) {
766968 switch (target) {
....@@ -770,13 +972,10 @@
770972 case MEM_CGROUP_TARGET_SOFTLIMIT:
771973 next = val + SOFTLIMIT_EVENTS_TARGET;
772974 break;
773
- case MEM_CGROUP_TARGET_NUMAINFO:
774
- next = val + NUMAINFO_EVENTS_TARGET;
775
- break;
776975 default:
777976 break;
778977 }
779
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
978
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
780979 return true;
781980 }
782981 return false;
....@@ -792,21 +991,12 @@
792991 if (unlikely(mem_cgroup_event_ratelimit(memcg,
793992 MEM_CGROUP_TARGET_THRESH))) {
794993 bool do_softlimit;
795
- bool do_numainfo __maybe_unused;
796994
797995 do_softlimit = mem_cgroup_event_ratelimit(memcg,
798996 MEM_CGROUP_TARGET_SOFTLIMIT);
799
-#if MAX_NUMNODES > 1
800
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
801
- MEM_CGROUP_TARGET_NUMAINFO);
802
-#endif
803997 mem_cgroup_threshold(memcg);
804998 if (unlikely(do_softlimit))
805999 mem_cgroup_update_tree(memcg, page);
806
-#if MAX_NUMNODES > 1
807
- if (unlikely(do_numainfo))
808
- atomic_inc(&memcg->numainfo_events);
809
-#endif
8101000 }
8111001 }
8121002
....@@ -874,27 +1064,60 @@
8741064 return NULL;
8751065
8761066 rcu_read_lock();
877
- if (!memcg || !css_tryget_online(&memcg->css))
1067
+ /* Page should not get uncharged and freed memcg under us. */
1068
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8781069 memcg = root_mem_cgroup;
8791070 rcu_read_unlock();
8801071 return memcg;
8811072 }
8821073 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8831074
1075
+static __always_inline struct mem_cgroup *active_memcg(void)
1076
+{
1077
+ if (in_interrupt())
1078
+ return this_cpu_read(int_active_memcg);
1079
+ else
1080
+ return current->active_memcg;
1081
+}
1082
+
1083
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1084
+{
1085
+ struct mem_cgroup *memcg;
1086
+
1087
+ rcu_read_lock();
1088
+ memcg = active_memcg();
1089
+ /* remote memcg must hold a ref. */
1090
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1091
+ memcg = root_mem_cgroup;
1092
+ rcu_read_unlock();
1093
+
1094
+ return memcg;
1095
+}
1096
+
1097
+static __always_inline bool memcg_kmem_bypass(void)
1098
+{
1099
+ /* Allow remote memcg charging from any context. */
1100
+ if (unlikely(active_memcg()))
1101
+ return false;
1102
+
1103
+ /* Memcg to charge can't be determined. */
1104
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1105
+ return true;
1106
+
1107
+ return false;
1108
+}
1109
+
8841110 /**
885
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1111
+ * If active memcg is set, do not fallback to current->mm->memcg.
8861112 */
8871113 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8881114 {
889
- if (unlikely(current->active_memcg)) {
890
- struct mem_cgroup *memcg = root_mem_cgroup;
1115
+ if (memcg_kmem_bypass())
1116
+ return NULL;
8911117
892
- rcu_read_lock();
893
- if (css_tryget_online(&current->active_memcg->css))
894
- memcg = current->active_memcg;
895
- rcu_read_unlock();
896
- return memcg;
897
- }
1118
+ if (unlikely(active_memcg()))
1119
+ return get_active_memcg();
1120
+
8981121 return get_mem_cgroup_from_mm(current->mm);
8991122 }
9001123
....@@ -911,15 +1134,15 @@
9111134 * invocations for reference counting, or use mem_cgroup_iter_break()
9121135 * to cancel a hierarchy walk before the round-trip is complete.
9131136 *
914
- * Reclaimers can specify a node and a priority level in @reclaim to
915
- * divide up the memcgs in the hierarchy among all concurrent
916
- * reclaimers operating on the same node and priority.
1137
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1138
+ * in the hierarchy among all concurrent reclaimers operating on the
1139
+ * same node.
9171140 */
9181141 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9191142 struct mem_cgroup *prev,
9201143 struct mem_cgroup_reclaim_cookie *reclaim)
9211144 {
922
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1145
+ struct mem_cgroup_reclaim_iter *iter;
9231146 struct cgroup_subsys_state *css = NULL;
9241147 struct mem_cgroup *memcg = NULL;
9251148 struct mem_cgroup *pos = NULL;
....@@ -945,7 +1168,7 @@
9451168 struct mem_cgroup_per_node *mz;
9461169
9471170 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
948
- iter = &mz->iter[reclaim->priority];
1171
+ iter = &mz->iter;
9491172
9501173 if (prev && reclaim->generation != iter->generation)
9511174 goto out_unlock;
....@@ -1045,15 +1268,11 @@
10451268 struct mem_cgroup_reclaim_iter *iter;
10461269 struct mem_cgroup_per_node *mz;
10471270 int nid;
1048
- int i;
10491271
10501272 for_each_node(nid) {
10511273 mz = mem_cgroup_nodeinfo(from, nid);
1052
- for (i = 0; i <= DEF_PRIORITY; i++) {
1053
- iter = &mz->iter[i];
1054
- cmpxchg(&iter->position,
1055
- dead_memcg, NULL);
1056
- }
1274
+ iter = &mz->iter;
1275
+ cmpxchg(&iter->position, dead_memcg, NULL);
10571276 }
10581277 }
10591278
....@@ -1103,7 +1322,7 @@
11031322 struct css_task_iter it;
11041323 struct task_struct *task;
11051324
1106
- css_task_iter_start(&iter->css, 0, &it);
1325
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11071326 while (!ret && (task = css_task_iter_next(&it)))
11081327 ret = fn(task, arg);
11091328 css_task_iter_end(&it);
....@@ -1120,9 +1339,8 @@
11201339 * @page: the page
11211340 * @pgdat: pgdat of the page
11221341 *
1123
- * This function is only safe when following the LRU page isolation
1124
- * and putback protocol: the LRU lock must be held, and the page must
1125
- * either be PageLRU() or the caller must have isolated/allocated it.
1342
+ * This function relies on page->mem_cgroup being stable - see the
1343
+ * access rules in commit_charge().
11261344 */
11271345 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11281346 {
....@@ -1131,7 +1349,7 @@
11311349 struct lruvec *lruvec;
11321350
11331351 if (mem_cgroup_disabled()) {
1134
- lruvec = &pgdat->lruvec;
1352
+ lruvec = &pgdat->__lruvec;
11351353 goto out;
11361354 }
11371355
....@@ -1155,6 +1373,38 @@
11551373 lruvec->pgdat = pgdat;
11561374 return lruvec;
11571375 }
1376
+
1377
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1378
+{
1379
+ struct lruvec *lruvec;
1380
+
1381
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1382
+
1383
+ return lruvec;
1384
+}
1385
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1386
+
1387
+void do_traversal_all_lruvec(void)
1388
+{
1389
+ pg_data_t *pgdat;
1390
+
1391
+ for_each_online_pgdat(pgdat) {
1392
+ struct mem_cgroup *memcg = NULL;
1393
+
1394
+ spin_lock_irq(&pgdat->lru_lock);
1395
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1396
+ do {
1397
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1398
+
1399
+ trace_android_vh_do_traversal_lruvec(lruvec);
1400
+
1401
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1402
+ } while (memcg);
1403
+
1404
+ spin_unlock_irq(&pgdat->lru_lock);
1405
+ }
1406
+}
1407
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11581408
11591409 /**
11601410 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1194,32 +1444,7 @@
11941444 if (nr_pages > 0)
11951445 *lru_size += nr_pages;
11961446 }
1197
-
1198
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1199
-{
1200
- struct mem_cgroup *task_memcg;
1201
- struct task_struct *p;
1202
- bool ret;
1203
-
1204
- p = find_lock_task_mm(task);
1205
- if (p) {
1206
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1207
- task_unlock(p);
1208
- } else {
1209
- /*
1210
- * All threads may have already detached their mm's, but the oom
1211
- * killer still needs to detect if they have already been oom
1212
- * killed to prevent needlessly killing additional tasks.
1213
- */
1214
- rcu_read_lock();
1215
- task_memcg = mem_cgroup_from_task(task);
1216
- css_get(&task_memcg->css);
1217
- rcu_read_unlock();
1218
- }
1219
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1220
- css_put(&task_memcg->css);
1221
- return ret;
1222
-}
1447
+EXPORT_SYMBOL_GPL(mem_cgroup_update_lru_size);
12231448
12241449 /**
12251450 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
....@@ -1242,7 +1467,7 @@
12421467 if (do_memsw_account()) {
12431468 count = page_counter_read(&memcg->memsw);
12441469 limit = READ_ONCE(memcg->memsw.max);
1245
- if (count <= limit)
1470
+ if (count < limit)
12461471 margin = min(margin, limit - count);
12471472 else
12481473 margin = 0;
....@@ -1296,85 +1521,199 @@
12961521 return false;
12971522 }
12981523
1299
-static const unsigned int memcg1_stats[] = {
1300
- MEMCG_CACHE,
1301
- MEMCG_RSS,
1302
- MEMCG_RSS_HUGE,
1303
- NR_SHMEM,
1304
- NR_FILE_MAPPED,
1305
- NR_FILE_DIRTY,
1306
- NR_WRITEBACK,
1307
- MEMCG_SWAP,
1524
+struct memory_stat {
1525
+ const char *name;
1526
+ unsigned int ratio;
1527
+ unsigned int idx;
13081528 };
13091529
1310
-static const char *const memcg1_stat_names[] = {
1311
- "cache",
1312
- "rss",
1313
- "rss_huge",
1314
- "shmem",
1315
- "mapped_file",
1316
- "dirty",
1317
- "writeback",
1318
- "swap",
1530
+static struct memory_stat memory_stats[] = {
1531
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1532
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1533
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1534
+ { "percpu", 1, MEMCG_PERCPU_B },
1535
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1536
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1537
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1538
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1539
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1540
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1541
+ /*
1542
+ * The ratio will be initialized in memory_stats_init(). Because
1543
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1544
+	 * constant (e.g. powerpc).
1545
+ */
1546
+ { "anon_thp", 0, NR_ANON_THPS },
1547
+#endif
1548
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1549
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1550
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1551
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1552
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1553
+
1554
+ /*
1555
+ * Note: The slab_reclaimable and slab_unreclaimable must be
1556
+ * together and slab_reclaimable must be in front.
1557
+ */
1558
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1559
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1560
+
1561
+ /* The memory events */
1562
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1563
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1564
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1565
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1566
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1567
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1568
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13191569 };
1570
+
1571
+static int __init memory_stats_init(void)
1572
+{
1573
+ int i;
1574
+
1575
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1576
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1577
+ if (memory_stats[i].idx == NR_ANON_THPS)
1578
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1579
+#endif
1580
+ VM_BUG_ON(!memory_stats[i].ratio);
1581
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1582
+ }
1583
+
1584
+ return 0;
1585
+}
1586
+pure_initcall(memory_stats_init);
1587
+
1588
+static char *memory_stat_format(struct mem_cgroup *memcg)
1589
+{
1590
+ struct seq_buf s;
1591
+ int i;
1592
+
1593
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1594
+ if (!s.buffer)
1595
+ return NULL;
1596
+
1597
+ /*
1598
+ * Provide statistics on the state of the memory subsystem as
1599
+ * well as cumulative event counters that show past behavior.
1600
+ *
1601
+ * This list is ordered following a combination of these gradients:
1602
+ * 1) generic big picture -> specifics and details
1603
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1604
+ *
1605
+ * Current memory state:
1606
+ */
1607
+
1608
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1609
+ u64 size;
1610
+
1611
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1612
+ size *= memory_stats[i].ratio;
1613
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1614
+
1615
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1616
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1617
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1618
+ seq_buf_printf(&s, "slab %llu\n", size);
1619
+ }
1620
+ }
1621
+
1622
+ /* Accumulated memory events */
1623
+
1624
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1625
+ memcg_events(memcg, PGFAULT));
1626
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1627
+ memcg_events(memcg, PGMAJFAULT));
1628
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1629
+ memcg_events(memcg, PGREFILL));
1630
+ seq_buf_printf(&s, "pgscan %lu\n",
1631
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1632
+ memcg_events(memcg, PGSCAN_DIRECT));
1633
+ seq_buf_printf(&s, "pgsteal %lu\n",
1634
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1635
+ memcg_events(memcg, PGSTEAL_DIRECT));
1636
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1637
+ memcg_events(memcg, PGACTIVATE));
1638
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1639
+ memcg_events(memcg, PGDEACTIVATE));
1640
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1641
+ memcg_events(memcg, PGLAZYFREE));
1642
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1643
+ memcg_events(memcg, PGLAZYFREED));
1644
+
1645
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1646
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1647
+ memcg_events(memcg, THP_FAULT_ALLOC));
1648
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1649
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1650
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1651
+
1652
+ /* The above should easily fit into one page */
1653
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1654
+
1655
+ return s.buffer;
1656
+}
13201657
13211658 #define K(x) ((x) << (PAGE_SHIFT-10))
13221659 /**
1323
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1660
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1661
+ * memory controller.
13241662 * @memcg: The memory cgroup that went over limit
13251663 * @p: Task that is going to be killed
13261664 *
13271665 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13281666 * enabled
13291667 */
1330
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1668
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13311669 {
1332
- struct mem_cgroup *iter;
1333
- unsigned int i;
1334
-
13351670 rcu_read_lock();
13361671
1672
+ if (memcg) {
1673
+ pr_cont(",oom_memcg=");
1674
+ pr_cont_cgroup_path(memcg->css.cgroup);
1675
+ } else
1676
+ pr_cont(",global_oom");
13371677 if (p) {
1338
- pr_info("Task in ");
1678
+ pr_cont(",task_memcg=");
13391679 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1340
- pr_cont(" killed as a result of limit of ");
1341
- } else {
1342
- pr_info("Memory limit reached of cgroup ");
13431680 }
1344
-
1345
- pr_cont_cgroup_path(memcg->css.cgroup);
1346
- pr_cont("\n");
1347
-
13481681 rcu_read_unlock();
1682
+}
1683
+
1684
+/**
1685
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1686
+ * memory controller.
1687
+ * @memcg: The memory cgroup that went over limit
1688
+ */
1689
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1690
+{
1691
+ char *buf;
13491692
13501693 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13511694 K((u64)page_counter_read(&memcg->memory)),
1352
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1353
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1354
- K((u64)page_counter_read(&memcg->memsw)),
1355
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1356
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->kmem)),
1358
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1359
-
1360
- for_each_mem_cgroup_tree(iter, memcg) {
1361
- pr_info("Memory cgroup stats for ");
1362
- pr_cont_cgroup_path(iter->css.cgroup);
1363
- pr_cont(":");
1364
-
1365
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1366
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1367
- continue;
1368
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1369
- K(memcg_page_state(iter, memcg1_stats[i])));
1370
- }
1371
-
1372
- for (i = 0; i < NR_LRU_LISTS; i++)
1373
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1374
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1375
-
1376
- pr_cont("\n");
1695
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1696
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1697
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1698
+ K((u64)page_counter_read(&memcg->swap)),
1699
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1700
+ else {
1701
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1702
+ K((u64)page_counter_read(&memcg->memsw)),
1703
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1704
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->kmem)),
1706
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13771707 }
1708
+
1709
+ pr_info("Memory cgroup stats for ");
1710
+ pr_cont_cgroup_path(memcg->css.cgroup);
1711
+ pr_cont(":");
1712
+ buf = memory_stat_format(memcg);
1713
+ if (!buf)
1714
+ return;
1715
+ pr_info("%s", buf);
1716
+ kfree(buf);
13781717 }
13791718
13801719 /*
....@@ -1382,19 +1721,26 @@
13821721 */
13831722 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13841723 {
1385
- unsigned long max;
1724
+ unsigned long max = READ_ONCE(memcg->memory.max);
13861725
1387
- max = memcg->memory.max;
1388
- if (mem_cgroup_swappiness(memcg)) {
1389
- unsigned long memsw_max;
1390
- unsigned long swap_max;
1726
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1727
+ if (mem_cgroup_swappiness(memcg))
1728
+ max += min(READ_ONCE(memcg->swap.max),
1729
+ (unsigned long)total_swap_pages);
1730
+ } else { /* v1 */
1731
+ if (mem_cgroup_swappiness(memcg)) {
1732
+ /* Calculate swap excess capacity from memsw limit */
1733
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13911734
1392
- memsw_max = memcg->memsw.max;
1393
- swap_max = memcg->swap.max;
1394
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1395
- max = min(max + swap_max, memsw_max);
1735
+ max += min(swap, (unsigned long)total_swap_pages);
1736
+ }
13961737 }
13971738 return max;
1739
+}
1740
+
1741
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1742
+{
1743
+ return page_counter_read(&memcg->memory);
13981744 }
13991745
14001746 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1407,112 +1753,24 @@
14071753 .gfp_mask = gfp_mask,
14081754 .order = order,
14091755 };
1410
- bool ret;
1756
+ bool ret = true;
14111757
14121758 if (mutex_lock_killable(&oom_lock))
14131759 return true;
1760
+
1761
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1762
+ goto unlock;
1763
+
14141764 /*
14151765 * A few threads which were not waiting at mutex_lock_killable() can
14161766 * fail to bail out. Therefore, check again after holding oom_lock.
14171767 */
1418
- ret = should_force_charge() || out_of_memory(&oc);
1768
+ ret = task_is_dying() || out_of_memory(&oc);
1769
+
1770
+unlock:
14191771 mutex_unlock(&oom_lock);
14201772 return ret;
14211773 }
1422
-
1423
-#if MAX_NUMNODES > 1
1424
-
1425
-/**
1426
- * test_mem_cgroup_node_reclaimable
1427
- * @memcg: the target memcg
1428
- * @nid: the node ID to be checked.
1429
- * @noswap : specify true here if the user wants flle only information.
1430
- *
1431
- * This function returns whether the specified memcg contains any
1432
- * reclaimable pages on a node. Returns true if there are any reclaimable
1433
- * pages in the node.
1434
- */
1435
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1436
- int nid, bool noswap)
1437
-{
1438
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1439
- return true;
1440
- if (noswap || !total_swap_pages)
1441
- return false;
1442
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1443
- return true;
1444
- return false;
1445
-
1446
-}
1447
-
1448
-/*
1449
- * Always updating the nodemask is not very good - even if we have an empty
1450
- * list or the wrong list here, we can start from some node and traverse all
1451
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1452
- *
1453
- */
1454
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1455
-{
1456
- int nid;
1457
- /*
1458
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1459
- * pagein/pageout changes since the last update.
1460
- */
1461
- if (!atomic_read(&memcg->numainfo_events))
1462
- return;
1463
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1464
- return;
1465
-
1466
- /* make a nodemask where this memcg uses memory from */
1467
- memcg->scan_nodes = node_states[N_MEMORY];
1468
-
1469
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1470
-
1471
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1472
- node_clear(nid, memcg->scan_nodes);
1473
- }
1474
-
1475
- atomic_set(&memcg->numainfo_events, 0);
1476
- atomic_set(&memcg->numainfo_updating, 0);
1477
-}
1478
-
1479
-/*
1480
- * Selecting a node where we start reclaim from. Because what we need is just
1481
- * reducing usage counter, start from anywhere is O,K. Considering
1482
- * memory reclaim from current node, there are pros. and cons.
1483
- *
1484
- * Freeing memory from current node means freeing memory from a node which
1485
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1486
- * hit limits, it will see a contention on a node. But freeing from remote
1487
- * node means more costs for memory reclaim because of memory latency.
1488
- *
1489
- * Now, we use round-robin. Better algorithm is welcomed.
1490
- */
1491
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1492
-{
1493
- int node;
1494
-
1495
- mem_cgroup_may_update_nodemask(memcg);
1496
- node = memcg->last_scanned_node;
1497
-
1498
- node = next_node_in(node, memcg->scan_nodes);
1499
- /*
1500
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1501
- * last time it really checked all the LRUs due to rate limiting.
1502
- * Fallback to the current node in that case for simplicity.
1503
- */
1504
- if (unlikely(node == MAX_NUMNODES))
1505
- node = numa_node_id();
1506
-
1507
- memcg->last_scanned_node = node;
1508
- return node;
1509
-}
1510
-#else
1511
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1512
-{
1513
- return 0;
1514
-}
1515
-#endif
15161774
15171775 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15181776 pg_data_t *pgdat,
....@@ -1526,7 +1784,6 @@
15261784 unsigned long nr_scanned;
15271785 struct mem_cgroup_reclaim_cookie reclaim = {
15281786 .pgdat = pgdat,
1529
- .priority = 0,
15301787 };
15311788
15321789 excess = soft_limit_excess(root_memcg);
....@@ -1621,7 +1878,7 @@
16211878 struct mem_cgroup *iter;
16221879
16231880 spin_lock(&memcg_oom_lock);
1624
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1881
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16251882 for_each_mem_cgroup_tree(iter, memcg)
16261883 iter->oom_lock = false;
16271884 spin_unlock(&memcg_oom_lock);
....@@ -1642,8 +1899,8 @@
16421899 struct mem_cgroup *iter;
16431900
16441901 /*
1645
- * When a new child is created while the hierarchy is under oom,
1646
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1902
+	 * Be careful about under_oom underflows because a child memcg
1903
+ * could have been added after mem_cgroup_mark_under_oom.
16471904 */
16481905 spin_lock(&memcg_oom_lock);
16491906 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1703,6 +1960,8 @@
17031960
17041961 if (order > PAGE_ALLOC_COSTLY_ORDER)
17051962 return OOM_SKIPPED;
1963
+
1964
+ memcg_memory_event(memcg, MEMCG_OOM);
17061965
17071966 /*
17081967 * We are in the middle of the charge context here, so we
....@@ -1851,6 +2110,14 @@
18512110 goto out;
18522111
18532112 /*
2113
+ * If the victim task has been asynchronously moved to a different
2114
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2115
+ * In this case it's better to ignore memory.group.oom.
2116
+ */
2117
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2118
+ goto out;
2119
+
2120
+ /*
18542121 * Traverse the memory cgroup hierarchy from the victim task's
18552122 * cgroup up to the OOMing cgroup (or root) to find the
18562123 * highest-level memory cgroup with oom.group set.
....@@ -1891,6 +2158,7 @@
18912158 */
18922159 struct mem_cgroup *lock_page_memcg(struct page *page)
18932160 {
2161
+ struct page *head = compound_head(page); /* rmap on tail pages */
18942162 struct mem_cgroup *memcg;
18952163 unsigned long flags;
18962164
....@@ -1910,7 +2178,7 @@
19102178 if (mem_cgroup_disabled())
19112179 return NULL;
19122180 again:
1913
- memcg = page->mem_cgroup;
2181
+ memcg = head->mem_cgroup;
19142182 if (unlikely(!memcg))
19152183 return NULL;
19162184
....@@ -1918,7 +2186,7 @@
19182186 return memcg;
19192187
19202188 spin_lock_irqsave(&memcg->move_lock, flags);
1921
- if (memcg != page->mem_cgroup) {
2189
+ if (memcg != head->mem_cgroup) {
19222190 spin_unlock_irqrestore(&memcg->move_lock, flags);
19232191 goto again;
19242192 }
....@@ -1961,19 +2229,43 @@
19612229 */
19622230 void unlock_page_memcg(struct page *page)
19632231 {
1964
- __unlock_page_memcg(page->mem_cgroup);
2232
+ struct page *head = compound_head(page);
2233
+
2234
+ __unlock_page_memcg(head->mem_cgroup);
19652235 }
19662236 EXPORT_SYMBOL(unlock_page_memcg);
19672237
19682238 struct memcg_stock_pcp {
19692239 struct mem_cgroup *cached; /* this never be root cgroup */
19702240 unsigned int nr_pages;
2241
+
2242
+#ifdef CONFIG_MEMCG_KMEM
2243
+ struct obj_cgroup *cached_objcg;
2244
+ unsigned int nr_bytes;
2245
+#endif
2246
+
19712247 struct work_struct work;
19722248 unsigned long flags;
19732249 #define FLUSHING_CACHED_CHARGE 0
19742250 };
19752251 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19762252 static DEFINE_MUTEX(percpu_charge_mutex);
2253
+
2254
+#ifdef CONFIG_MEMCG_KMEM
2255
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2256
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2257
+ struct mem_cgroup *root_memcg);
2258
+
2259
+#else
2260
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2261
+{
2262
+}
2263
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2264
+ struct mem_cgroup *root_memcg)
2265
+{
2266
+ return false;
2267
+}
2268
+#endif
19772269
19782270 /**
19792271 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -2015,13 +2307,17 @@
20152307 {
20162308 struct mem_cgroup *old = stock->cached;
20172309
2310
+ if (!old)
2311
+ return;
2312
+
20182313 if (stock->nr_pages) {
20192314 page_counter_uncharge(&old->memory, stock->nr_pages);
20202315 if (do_memsw_account())
20212316 page_counter_uncharge(&old->memsw, stock->nr_pages);
2022
- css_put_many(&old->css, stock->nr_pages);
20232317 stock->nr_pages = 0;
20242318 }
2319
+
2320
+ css_put(&old->css);
20252321 stock->cached = NULL;
20262322 }
20272323
....@@ -2037,6 +2333,7 @@
20372333 local_irq_save(flags);
20382334
20392335 stock = this_cpu_ptr(&memcg_stock);
2336
+ drain_obj_stock(stock);
20402337 drain_stock(stock);
20412338 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20422339
....@@ -2057,6 +2354,7 @@
20572354 stock = this_cpu_ptr(&memcg_stock);
20582355 if (stock->cached != memcg) { /* reset if necessary */
20592356 drain_stock(stock);
2357
+ css_get(&memcg->css);
20602358 stock->cached = memcg;
20612359 }
20622360 stock->nr_pages += nr_pages;
....@@ -2088,21 +2386,24 @@
20882386 for_each_online_cpu(cpu) {
20892387 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20902388 struct mem_cgroup *memcg;
2389
+ bool flush = false;
20912390
2391
+ rcu_read_lock();
20922392 memcg = stock->cached;
2093
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2094
- continue;
2095
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2096
- css_put(&memcg->css);
2097
- continue;
2098
- }
2099
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2393
+ if (memcg && stock->nr_pages &&
2394
+ mem_cgroup_is_descendant(memcg, root_memcg))
2395
+ flush = true;
2396
+ if (obj_stock_flush_required(stock, root_memcg))
2397
+ flush = true;
2398
+ rcu_read_unlock();
2399
+
2400
+ if (flush &&
2401
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21002402 if (cpu == curcpu)
21012403 drain_local_stock(&stock->work);
21022404 else
21032405 schedule_work_on(cpu, &stock->work);
21042406 }
2105
- css_put(&memcg->css);
21062407 }
21072408 put_cpu();
21082409 mutex_unlock(&percpu_charge_mutex);
....@@ -2111,7 +2412,7 @@
21112412 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21122413 {
21132414 struct memcg_stock_pcp *stock;
2114
- struct mem_cgroup *memcg;
2415
+ struct mem_cgroup *memcg, *mi;
21152416
21162417 stock = &per_cpu(memcg_stock, cpu);
21172418 drain_stock(stock);
....@@ -2123,9 +2424,10 @@
21232424 int nid;
21242425 long x;
21252426
2126
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2427
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
21272428 if (x)
2128
- atomic_long_add(x, &memcg->stat[i]);
2429
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2430
+ atomic_long_add(x, &memcg->vmstats[i]);
21292431
21302432 if (i >= NR_VM_NODE_STAT_ITEMS)
21312433 continue;
....@@ -2136,32 +2438,48 @@
21362438 pn = mem_cgroup_nodeinfo(memcg, nid);
21372439 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21382440 if (x)
2139
- atomic_long_add(x, &pn->lruvec_stat[i]);
2441
+ do {
2442
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2443
+ } while ((pn = parent_nodeinfo(pn, nid)));
21402444 }
21412445 }
21422446
21432447 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21442448 long x;
21452449
2146
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2450
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21472451 if (x)
2148
- atomic_long_add(x, &memcg->events[i]);
2452
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2453
+ atomic_long_add(x, &memcg->vmevents[i]);
21492454 }
21502455 }
21512456
21522457 return 0;
21532458 }
21542459
2155
-static void reclaim_high(struct mem_cgroup *memcg,
2156
- unsigned int nr_pages,
2157
- gfp_t gfp_mask)
2460
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2461
+ unsigned int nr_pages,
2462
+ gfp_t gfp_mask)
21582463 {
2464
+ unsigned long nr_reclaimed = 0;
2465
+
21592466 do {
2160
- if (page_counter_read(&memcg->memory) <= memcg->high)
2467
+ unsigned long pflags;
2468
+
2469
+ if (page_counter_read(&memcg->memory) <=
2470
+ READ_ONCE(memcg->memory.high))
21612471 continue;
2472
+
21622473 memcg_memory_event(memcg, MEMCG_HIGH);
2163
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2164
- } while ((memcg = parent_mem_cgroup(memcg)));
2474
+
2475
+ psi_memstall_enter(&pflags);
2476
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2477
+ gfp_mask, true);
2478
+ psi_memstall_leave(&pflags);
2479
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2480
+ !mem_cgroup_is_root(memcg));
2481
+
2482
+ return nr_reclaimed;
21652483 }
21662484
21672485 static void high_work_func(struct work_struct *work)
....@@ -2173,35 +2491,238 @@
21732491 }
21742492
21752493 /*
2494
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2495
+ * enough to still cause a significant slowdown in most cases, while still
2496
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2497
+ */
2498
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2499
+
2500
+/*
2501
+ * When calculating the delay, we use these either side of the exponentiation to
2502
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2503
+ * below.
2504
+ *
2505
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2506
+ * overage ratio to a delay.
2507
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2508
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2509
+ * to produce a reasonable delay curve.
2510
+ *
2511
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2512
+ * reasonable delay curve compared to precision-adjusted overage, not
2513
+ * penalising heavily at first, but still making sure that growth beyond the
2514
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2515
+ * example, with a high of 100 megabytes:
2516
+ *
2517
+ * +-------+------------------------+
2518
+ * | usage | time to allocate in ms |
2519
+ * +-------+------------------------+
2520
+ * | 100M | 0 |
2521
+ * | 101M | 6 |
2522
+ * | 102M | 25 |
2523
+ * | 103M | 57 |
2524
+ * | 104M | 102 |
2525
+ * | 105M | 159 |
2526
+ * | 106M | 230 |
2527
+ * | 107M | 313 |
2528
+ * | 108M | 409 |
2529
+ * | 109M | 518 |
2530
+ * | 110M | 639 |
2531
+ * | 111M | 774 |
2532
+ * | 112M | 921 |
2533
+ * | 113M | 1081 |
2534
+ * | 114M | 1254 |
2535
+ * | 115M | 1439 |
2536
+ * | 116M | 1638 |
2537
+ * | 117M | 1849 |
2538
+ * | 118M | 2000 |
2539
+ * | 119M | 2000 |
2540
+ * | 120M | 2000 |
2541
+ * +-------+------------------------+
2542
+ */
2543
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2544
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2545
+
2546
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2547
+{
2548
+ u64 overage;
2549
+
2550
+ if (usage <= high)
2551
+ return 0;
2552
+
2553
+ /*
2554
+ * Prevent division by 0 in overage calculation by acting as if
2555
+ * it was a threshold of 1 page
2556
+ */
2557
+ high = max(high, 1UL);
2558
+
2559
+ overage = usage - high;
2560
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2561
+ return div64_u64(overage, high);
2562
+}
2563
+
2564
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2565
+{
2566
+ u64 overage, max_overage = 0;
2567
+
2568
+ do {
2569
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2570
+ READ_ONCE(memcg->memory.high));
2571
+ max_overage = max(overage, max_overage);
2572
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2573
+ !mem_cgroup_is_root(memcg));
2574
+
2575
+ return max_overage;
2576
+}
2577
+
2578
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2579
+{
2580
+ u64 overage, max_overage = 0;
2581
+
2582
+ do {
2583
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2584
+ READ_ONCE(memcg->swap.high));
2585
+ if (overage)
2586
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2587
+ max_overage = max(overage, max_overage);
2588
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2589
+ !mem_cgroup_is_root(memcg));
2590
+
2591
+ return max_overage;
2592
+}
2593
+
2594
+/*
2595
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2596
+ * is exceeding its memory.high by checking both it and its ancestors.
2597
+ */
2598
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2599
+ unsigned int nr_pages,
2600
+ u64 max_overage)
2601
+{
2602
+ unsigned long penalty_jiffies;
2603
+
2604
+ if (!max_overage)
2605
+ return 0;
2606
+
2607
+ /*
2608
+ * We use overage compared to memory.high to calculate the number of
2609
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2610
+ * fairly lenient on small overages, and increasingly harsh when the
2611
+ * memcg in question makes it clear that it has no intention of stopping
2612
+ * its crazy behaviour, so we exponentially increase the delay based on
2613
+ * overage amount.
2614
+ */
2615
+ penalty_jiffies = max_overage * max_overage * HZ;
2616
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2617
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2618
+
2619
+ /*
2620
+ * Factor in the task's own contribution to the overage, such that four
2621
+ * N-sized allocations are throttled approximately the same as one
2622
+ * 4N-sized allocation.
2623
+ *
2624
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2625
+ * larger the current charge patch is than that.
2626
+ */
2627
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2628
+}
2629
+
2630
+/*
21762631 * Scheduled by try_charge() to be executed from the userland return path
21772632 * and reclaims memory over the high limit.
21782633 */
21792634 void mem_cgroup_handle_over_high(void)
21802635 {
2636
+ unsigned long penalty_jiffies;
2637
+ unsigned long pflags;
2638
+ unsigned long nr_reclaimed;
21812639 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2640
+ int nr_retries = MAX_RECLAIM_RETRIES;
21822641 struct mem_cgroup *memcg;
2642
+ bool in_retry = false;
21832643
21842644 if (likely(!nr_pages))
21852645 return;
21862646
21872647 memcg = get_mem_cgroup_from_mm(current->mm);
2188
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2189
- css_put(&memcg->css);
21902648 current->memcg_nr_pages_over_high = 0;
2649
+
2650
+retry_reclaim:
2651
+ /*
2652
+ * The allocating task should reclaim at least the batch size, but for
2653
+ * subsequent retries we only want to do what's necessary to prevent oom
2654
+ * or breaching resource isolation.
2655
+ *
2656
+ * This is distinct from memory.max or page allocator behaviour because
2657
+ * memory.high is currently batched, whereas memory.max and the page
2658
+ * allocator run every time an allocation is made.
2659
+ */
2660
+ nr_reclaimed = reclaim_high(memcg,
2661
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2662
+ GFP_KERNEL);
2663
+
2664
+ /*
2665
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2666
+ * allocators proactively to slow down excessive growth.
2667
+ */
2668
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2669
+ mem_find_max_overage(memcg));
2670
+
2671
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2672
+ swap_find_max_overage(memcg));
2673
+
2674
+ /*
2675
+ * Clamp the max delay per usermode return so as to still keep the
2676
+ * application moving forwards and also permit diagnostics, albeit
2677
+ * extremely slowly.
2678
+ */
2679
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2680
+
2681
+ /*
2682
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2683
+ * that it's not even worth doing, in an attempt to be nice to those who
2684
+ * go only a small amount over their memory.high value and maybe haven't
2685
+ * been aggressively reclaimed enough yet.
2686
+ */
2687
+ if (penalty_jiffies <= HZ / 100)
2688
+ goto out;
2689
+
2690
+ /*
2691
+ * If reclaim is making forward progress but we're still over
2692
+ * memory.high, we want to encourage that rather than doing allocator
2693
+ * throttling.
2694
+ */
2695
+ if (nr_reclaimed || nr_retries--) {
2696
+ in_retry = true;
2697
+ goto retry_reclaim;
2698
+ }
2699
+
2700
+ /*
2701
+ * If we exit early, we're guaranteed to die (since
2702
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2703
+ * need to account for any ill-begotten jiffies to pay them off later.
2704
+ */
2705
+ psi_memstall_enter(&pflags);
2706
+ schedule_timeout_killable(penalty_jiffies);
2707
+ psi_memstall_leave(&pflags);
2708
+
2709
+out:
2710
+ css_put(&memcg->css);
21912711 }
21922712
21932713 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21942714 unsigned int nr_pages)
21952715 {
21962716 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2197
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2717
+ int nr_retries = MAX_RECLAIM_RETRIES;
21982718 struct mem_cgroup *mem_over_limit;
21992719 struct page_counter *counter;
2720
+ enum oom_status oom_status;
22002721 unsigned long nr_reclaimed;
2722
+ bool passed_oom = false;
22012723 bool may_swap = true;
22022724 bool drained = false;
2203
- bool oomed = false;
2204
- enum oom_status oom_status;
2725
+ unsigned long pflags;
22052726
22062727 if (mem_cgroup_is_root(memcg))
22072728 return 0;
....@@ -2236,15 +2757,6 @@
22362757 goto force;
22372758
22382759 /*
2239
- * Unlike in global OOM situations, memcg is not in a physical
2240
- * memory shortage. Allow dying and OOM-killed tasks to
2241
- * bypass the last charges so that they can exit quickly and
2242
- * free their memory.
2243
- */
2244
- if (unlikely(should_force_charge()))
2245
- goto force;
2246
-
2247
- /*
22482760 * Prevent unbounded recursion when reclaim operations need to
22492761 * allocate memory. This might exceed the limits temporarily,
22502762 * but we prefer facilitating memory reclaim and getting back
....@@ -2261,8 +2773,10 @@
22612773
22622774 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22632775
2776
+ psi_memstall_enter(&pflags);
22642777 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22652778 gfp_mask, may_swap);
2779
+ psi_memstall_leave(&pflags);
22662780
22672781 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22682782 goto retry;
....@@ -2296,16 +2810,15 @@
22962810 if (nr_retries--)
22972811 goto retry;
22982812
2299
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2813
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23002814 goto nomem;
23012815
23022816 if (gfp_mask & __GFP_NOFAIL)
23032817 goto force;
23042818
2305
- if (fatal_signal_pending(current))
2306
- goto force;
2307
-
2308
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2819
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2820
+ if (passed_oom && task_is_dying())
2821
+ goto nomem;
23092822
23102823 /*
23112824 * keep retrying as long as the memcg oom killer is able to make
....@@ -2314,15 +2827,10 @@
23142827 */
23152828 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23162829 get_order(nr_pages * PAGE_SIZE));
2317
- switch (oom_status) {
2318
- case OOM_SUCCESS:
2319
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2320
- oomed = true;
2830
+ if (oom_status == OOM_SUCCESS) {
2831
+ passed_oom = true;
2832
+ nr_retries = MAX_RECLAIM_RETRIES;
23212833 goto retry;
2322
- case OOM_FAILED:
2323
- goto force;
2324
- default:
2325
- goto nomem;
23262834 }
23272835 nomem:
23282836 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2336,12 +2844,10 @@
23362844 page_counter_charge(&memcg->memory, nr_pages);
23372845 if (do_memsw_account())
23382846 page_counter_charge(&memcg->memsw, nr_pages);
2339
- css_get_many(&memcg->css, nr_pages);
23402847
23412848 return 0;
23422849
23432850 done_restock:
2344
- css_get_many(&memcg->css, batch);
23452851 if (batch > nr_pages)
23462852 refill_stock(memcg, batch - nr_pages);
23472853
....@@ -2355,12 +2861,32 @@
23552861 * reclaim, the cost of mismatch is negligible.
23562862 */
23572863 do {
2358
- if (page_counter_read(&memcg->memory) > memcg->high) {
2359
- /* Don't bother a random interrupted task */
2360
- if (in_interrupt()) {
2864
+ bool mem_high, swap_high;
2865
+
2866
+ mem_high = page_counter_read(&memcg->memory) >
2867
+ READ_ONCE(memcg->memory.high);
2868
+ swap_high = page_counter_read(&memcg->swap) >
2869
+ READ_ONCE(memcg->swap.high);
2870
+
2871
+ /* Don't bother a random interrupted task */
2872
+ if (in_interrupt()) {
2873
+ if (mem_high) {
23612874 schedule_work(&memcg->high_work);
23622875 break;
23632876 }
2877
+ continue;
2878
+ }
2879
+
2880
+ if (mem_high || swap_high) {
2881
+ /*
2882
+ * The allocating tasks in this cgroup will need to do
2883
+ * reclaim or be throttled to prevent further growth
2884
+ * of the memory or swap footprints.
2885
+ *
2886
+ * Target some best-effort fairness between the tasks,
2887
+ * and distribute reclaim work and delay penalties
2888
+ * based on how much each task is actually allocating.
2889
+ */
23642890 current->memcg_nr_pages_over_high += batch;
23652891 set_notify_resume(current);
23662892 break;
....@@ -2370,6 +2896,7 @@
23702896 return 0;
23712897 }
23722898
2899
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23732900 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23742901 {
23752902 if (mem_cgroup_is_root(memcg))
....@@ -2378,76 +2905,124 @@
23782905 page_counter_uncharge(&memcg->memory, nr_pages);
23792906 if (do_memsw_account())
23802907 page_counter_uncharge(&memcg->memsw, nr_pages);
2381
-
2382
- css_put_many(&memcg->css, nr_pages);
23832908 }
2909
+#endif
23842910
2385
-static void lock_page_lru(struct page *page, int *isolated)
2911
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23862912 {
2387
- struct zone *zone = page_zone(page);
2388
-
2389
- spin_lock_irq(zone_lru_lock(zone));
2390
- if (PageLRU(page)) {
2391
- struct lruvec *lruvec;
2392
-
2393
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2394
- ClearPageLRU(page);
2395
- del_page_from_lru_list(page, lruvec, page_lru(page));
2396
- *isolated = 1;
2397
- } else
2398
- *isolated = 0;
2399
-}
2400
-
2401
-static void unlock_page_lru(struct page *page, int isolated)
2402
-{
2403
- struct zone *zone = page_zone(page);
2404
-
2405
- if (isolated) {
2406
- struct lruvec *lruvec;
2407
-
2408
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2409
- VM_BUG_ON_PAGE(PageLRU(page), page);
2410
- SetPageLRU(page);
2411
- add_page_to_lru_list(page, lruvec, page_lru(page));
2412
- }
2413
- spin_unlock_irq(zone_lru_lock(zone));
2414
-}
2415
-
2416
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2417
- bool lrucare)
2418
-{
2419
- int isolated;
2420
-
24212913 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2422
-
24232914 /*
2424
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2425
- * may already be on some other mem_cgroup's LRU. Take care of it.
2426
- */
2427
- if (lrucare)
2428
- lock_page_lru(page, &isolated);
2429
-
2430
- /*
2431
- * Nobody should be changing or seriously looking at
2432
- * page->mem_cgroup at this point:
2915
+ * Any of the following ensures page->mem_cgroup stability:
24332916 *
2434
- * - the page is uncharged
2435
- *
2436
- * - the page is off-LRU
2437
- *
2438
- * - an anonymous fault has exclusive page access, except for
2439
- * a locked page table
2440
- *
2441
- * - a page cache insertion, a swapin fault, or a migration
2442
- * have the page locked
2917
+ * - the page lock
2918
+ * - LRU isolation
2919
+ * - lock_page_memcg()
2920
+ * - exclusive reference
24432921 */
24442922 page->mem_cgroup = memcg;
2445
-
2446
- if (lrucare)
2447
- unlock_page_lru(page, isolated);
24482923 }
24492924
24502925 #ifdef CONFIG_MEMCG_KMEM
2926
+/*
2927
+ * The allocated objcg pointers array is not accounted directly.
2928
+ * Moreover, it should not come from a DMA buffer and is not readily
2929
+ * reclaimable. So those GFP bits should be masked off.
2930
+ */
2931
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2932
+
2933
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2934
+ gfp_t gfp)
2935
+{
2936
+ unsigned int objects = objs_per_slab_page(s, page);
2937
+ void *vec;
2938
+
2939
+ gfp &= ~OBJCGS_CLEAR_MASK;
2940
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2941
+ page_to_nid(page));
2942
+ if (!vec)
2943
+ return -ENOMEM;
2944
+
2945
+ if (cmpxchg(&page->obj_cgroups, NULL,
2946
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2947
+ kfree(vec);
2948
+ else
2949
+ kmemleak_not_leak(vec);
2950
+
2951
+ return 0;
2952
+}
2953
+
2954
+/*
2955
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2956
+ *
2957
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2958
+ * cgroup_mutex, etc.
2959
+ */
2960
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2961
+{
2962
+ struct page *page;
2963
+
2964
+ if (mem_cgroup_disabled())
2965
+ return NULL;
2966
+
2967
+ page = virt_to_head_page(p);
2968
+
2969
+ /*
2970
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2971
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2972
+ * bit of the pointer is set.
2973
+ * The page->mem_cgroup pointer can be asynchronously changed
2974
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2975
+ * from a valid memcg pointer to objcg vector or back.
2976
+ */
2977
+ if (!page->mem_cgroup)
2978
+ return NULL;
2979
+
2980
+ /*
2981
+ * Slab objects are accounted individually, not per-page.
2982
+ * Memcg membership data for each individual object is saved in
2983
+ * the page->obj_cgroups.
2984
+ */
2985
+ if (page_has_obj_cgroups(page)) {
2986
+ struct obj_cgroup *objcg;
2987
+ unsigned int off;
2988
+
2989
+ off = obj_to_index(page->slab_cache, page, p);
2990
+ objcg = page_obj_cgroups(page)[off];
2991
+ if (objcg)
2992
+ return obj_cgroup_memcg(objcg);
2993
+
2994
+ return NULL;
2995
+ }
2996
+
2997
+ /* All other pages use page->mem_cgroup */
2998
+ return page->mem_cgroup;
2999
+}
3000
+
3001
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3002
+{
3003
+ struct obj_cgroup *objcg = NULL;
3004
+ struct mem_cgroup *memcg;
3005
+
3006
+ if (memcg_kmem_bypass())
3007
+ return NULL;
3008
+
3009
+ rcu_read_lock();
3010
+ if (unlikely(active_memcg()))
3011
+ memcg = active_memcg();
3012
+ else
3013
+ memcg = mem_cgroup_from_task(current);
3014
+
3015
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3016
+ objcg = rcu_dereference(memcg->objcg);
3017
+ if (objcg && obj_cgroup_tryget(objcg))
3018
+ break;
3019
+ objcg = NULL;
3020
+ }
3021
+ rcu_read_unlock();
3022
+
3023
+ return objcg;
3024
+}
3025
+
24513026 static int memcg_alloc_cache_id(void)
24523027 {
24533028 int id, size;
....@@ -2473,9 +3048,7 @@
24733048 else if (size > MEMCG_CACHES_MAX_SIZE)
24743049 size = MEMCG_CACHES_MAX_SIZE;
24753050
2476
- err = memcg_update_all_caches(size);
2477
- if (!err)
2478
- err = memcg_update_all_list_lrus(size);
3051
+ err = memcg_update_all_list_lrus(size);
24793052 if (!err)
24803053 memcg_nr_cache_ids = size;
24813054
....@@ -2493,152 +3066,17 @@
24933066 ida_simple_remove(&memcg_cache_ida, id);
24943067 }
24953068
2496
-struct memcg_kmem_cache_create_work {
2497
- struct mem_cgroup *memcg;
2498
- struct kmem_cache *cachep;
2499
- struct work_struct work;
2500
-};
2501
-
2502
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2503
-{
2504
- struct memcg_kmem_cache_create_work *cw =
2505
- container_of(w, struct memcg_kmem_cache_create_work, work);
2506
- struct mem_cgroup *memcg = cw->memcg;
2507
- struct kmem_cache *cachep = cw->cachep;
2508
-
2509
- memcg_create_kmem_cache(memcg, cachep);
2510
-
2511
- css_put(&memcg->css);
2512
- kfree(cw);
2513
-}
2514
-
2515
-/*
2516
- * Enqueue the creation of a per-memcg kmem_cache.
2517
- */
2518
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2519
- struct kmem_cache *cachep)
2520
-{
2521
- struct memcg_kmem_cache_create_work *cw;
2522
-
2523
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2524
- if (!cw)
2525
- return;
2526
-
2527
- css_get(&memcg->css);
2528
-
2529
- cw->memcg = memcg;
2530
- cw->cachep = cachep;
2531
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2532
-
2533
- queue_work(memcg_kmem_cache_wq, &cw->work);
2534
-}
2535
-
2536
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2537
- struct kmem_cache *cachep)
2538
-{
2539
- /*
2540
- * We need to stop accounting when we kmalloc, because if the
2541
- * corresponding kmalloc cache is not yet created, the first allocation
2542
- * in __memcg_schedule_kmem_cache_create will recurse.
2543
- *
2544
- * However, it is better to enclose the whole function. Depending on
2545
- * the debugging options enabled, INIT_WORK(), for instance, can
2546
- * trigger an allocation. This too, will make us recurse. Because at
2547
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2548
- * the safest choice is to do it like this, wrapping the whole function.
2549
- */
2550
- current->memcg_kmem_skip_account = 1;
2551
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2552
- current->memcg_kmem_skip_account = 0;
2553
-}
2554
-
2555
-static inline bool memcg_kmem_bypass(void)
2556
-{
2557
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2558
- return true;
2559
- return false;
2560
-}
2561
-
25623069 /**
2563
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2564
- * @cachep: the original global kmem cache
2565
- *
2566
- * Return the kmem_cache we're supposed to use for a slab allocation.
2567
- * We try to use the current memcg's version of the cache.
2568
- *
2569
- * If the cache does not exist yet, if we are the first user of it, we
2570
- * create it asynchronously in a workqueue and let the current allocation
2571
- * go through with the original cache.
2572
- *
2573
- * This function takes a reference to the cache it returns to assure it
2574
- * won't get destroyed while we are working with it. Once the caller is
2575
- * done with it, memcg_kmem_put_cache() must be called to release the
2576
- * reference.
2577
- */
2578
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2579
-{
2580
- struct mem_cgroup *memcg;
2581
- struct kmem_cache *memcg_cachep;
2582
- int kmemcg_id;
2583
-
2584
- VM_BUG_ON(!is_root_cache(cachep));
2585
-
2586
- if (memcg_kmem_bypass())
2587
- return cachep;
2588
-
2589
- if (current->memcg_kmem_skip_account)
2590
- return cachep;
2591
-
2592
- memcg = get_mem_cgroup_from_current();
2593
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2594
- if (kmemcg_id < 0)
2595
- goto out;
2596
-
2597
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2598
- if (likely(memcg_cachep))
2599
- return memcg_cachep;
2600
-
2601
- /*
2602
- * If we are in a safe context (can wait, and not in interrupt
2603
- * context), we could be be predictable and return right away.
2604
- * This would guarantee that the allocation being performed
2605
- * already belongs in the new cache.
2606
- *
2607
- * However, there are some clashes that can arrive from locking.
2608
- * For instance, because we acquire the slab_mutex while doing
2609
- * memcg_create_kmem_cache, this means no further allocation
2610
- * could happen with the slab_mutex held. So it's better to
2611
- * defer everything.
2612
- */
2613
- memcg_schedule_kmem_cache_create(memcg, cachep);
2614
-out:
2615
- css_put(&memcg->css);
2616
- return cachep;
2617
-}
2618
-
2619
-/**
2620
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2621
- * @cachep: the cache returned by memcg_kmem_get_cache
2622
- */
2623
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2624
-{
2625
- if (!is_root_cache(cachep))
2626
- css_put(&cachep->memcg_params.memcg->css);
2627
-}
2628
-
2629
-/**
2630
- * memcg_kmem_charge_memcg: charge a kmem page
2631
- * @page: page to charge
2632
- * @gfp: reclaim mode
2633
- * @order: allocation order
3070
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26343071 * @memcg: memory cgroup to charge
3072
+ * @gfp: reclaim mode
3073
+ * @nr_pages: number of pages to charge
26353074 *
26363075 * Returns 0 on success, an error code on failure.
26373076 */
2638
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2639
- struct mem_cgroup *memcg)
3077
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3078
+ unsigned int nr_pages)
26403079 {
2641
- unsigned int nr_pages = 1 << order;
26423080 struct page_counter *counter;
26433081 int ret;
26443082
....@@ -2661,43 +3099,54 @@
26613099 cancel_charge(memcg, nr_pages);
26623100 return -ENOMEM;
26633101 }
2664
-
2665
- page->mem_cgroup = memcg;
2666
-
26673102 return 0;
26683103 }
26693104
26703105 /**
2671
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3106
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3107
+ * @memcg: memcg to uncharge
3108
+ * @nr_pages: number of pages to uncharge
3109
+ */
3110
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3111
+{
3112
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3113
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3114
+
3115
+ refill_stock(memcg, nr_pages);
3116
+}
3117
+
3118
+/**
3119
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26723120 * @page: page to charge
26733121 * @gfp: reclaim mode
26743122 * @order: allocation order
26753123 *
26763124 * Returns 0 on success, an error code on failure.
26773125 */
2678
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3126
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26793127 {
26803128 struct mem_cgroup *memcg;
26813129 int ret = 0;
26823130
2683
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2684
- return 0;
2685
-
26863131 memcg = get_mem_cgroup_from_current();
2687
- if (!mem_cgroup_is_root(memcg)) {
2688
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2689
- if (!ret)
3132
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3133
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3134
+ if (!ret) {
3135
+ page->mem_cgroup = memcg;
26903136 __SetPageKmemcg(page);
3137
+ return 0;
3138
+ }
3139
+ css_put(&memcg->css);
26913140 }
2692
- css_put(&memcg->css);
26933141 return ret;
26943142 }
3143
+
26953144 /**
2696
- * memcg_kmem_uncharge: uncharge a kmem page
3145
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
26973146 * @page: page to uncharge
26983147 * @order: allocation order
26993148 */
2700
-void memcg_kmem_uncharge(struct page *page, int order)
3149
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27013150 {
27023151 struct mem_cgroup *memcg = page->mem_cgroup;
27033152 unsigned int nr_pages = 1 << order;
....@@ -2706,43 +3155,179 @@
27063155 return;
27073156
27083157 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2709
-
2710
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2711
- page_counter_uncharge(&memcg->kmem, nr_pages);
2712
-
2713
- page_counter_uncharge(&memcg->memory, nr_pages);
2714
- if (do_memsw_account())
2715
- page_counter_uncharge(&memcg->memsw, nr_pages);
2716
-
3158
+ __memcg_kmem_uncharge(memcg, nr_pages);
27173159 page->mem_cgroup = NULL;
3160
+ css_put(&memcg->css);
27183161
27193162 /* slab pages do not have PageKmemcg flag set */
27203163 if (PageKmemcg(page))
27213164 __ClearPageKmemcg(page);
2722
-
2723
- css_put_many(&memcg->css, nr_pages);
27243165 }
2725
-#endif /* CONFIG_MEMCG_KMEM */
27263166
2727
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2728
-
2729
-/*
2730
- * Because tail pages are not marked as "used", set it. We're under
2731
- * zone_lru_lock and migration entries setup in all page mappings.
2732
- */
2733
-void mem_cgroup_split_huge_fixup(struct page *head)
3167
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27343168 {
2735
- int i;
3169
+ struct memcg_stock_pcp *stock;
3170
+ unsigned long flags;
3171
+ bool ret = false;
27363172
2737
- if (mem_cgroup_disabled())
3173
+ local_irq_save(flags);
3174
+
3175
+ stock = this_cpu_ptr(&memcg_stock);
3176
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3177
+ stock->nr_bytes -= nr_bytes;
3178
+ ret = true;
3179
+ }
3180
+
3181
+ local_irq_restore(flags);
3182
+
3183
+ return ret;
3184
+}
3185
+
3186
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3187
+{
3188
+ struct obj_cgroup *old = stock->cached_objcg;
3189
+
3190
+ if (!old)
27383191 return;
27393192
2740
- for (i = 1; i < HPAGE_PMD_NR; i++)
2741
- head[i].mem_cgroup = head->mem_cgroup;
3193
+ if (stock->nr_bytes) {
3194
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3195
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27423196
2743
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3197
+ if (nr_pages) {
3198
+ struct mem_cgroup *memcg;
3199
+
3200
+ rcu_read_lock();
3201
+retry:
3202
+ memcg = obj_cgroup_memcg(old);
3203
+ if (unlikely(!css_tryget(&memcg->css)))
3204
+ goto retry;
3205
+ rcu_read_unlock();
3206
+
3207
+ __memcg_kmem_uncharge(memcg, nr_pages);
3208
+ css_put(&memcg->css);
3209
+ }
3210
+
3211
+ /*
3212
+ * The leftover is flushed to the centralized per-memcg value.
3213
+ * On the next attempt to refill obj stock it will be moved
3214
+	 * to a per-cpu stock (probably, on another CPU), see
3215
+ * refill_obj_stock().
3216
+ *
3217
+ * How often it's flushed is a trade-off between the memory
3218
+ * limit enforcement accuracy and potential CPU contention,
3219
+ * so it might be changed in the future.
3220
+ */
3221
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3222
+ stock->nr_bytes = 0;
3223
+ }
3224
+
3225
+ obj_cgroup_put(old);
3226
+ stock->cached_objcg = NULL;
27443227 }
2745
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3228
+
3229
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3230
+ struct mem_cgroup *root_memcg)
3231
+{
3232
+ struct mem_cgroup *memcg;
3233
+
3234
+ if (stock->cached_objcg) {
3235
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3236
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3237
+ return true;
3238
+ }
3239
+
3240
+ return false;
3241
+}
3242
+
3243
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3244
+{
3245
+ struct memcg_stock_pcp *stock;
3246
+ unsigned long flags;
3247
+
3248
+ local_irq_save(flags);
3249
+
3250
+ stock = this_cpu_ptr(&memcg_stock);
3251
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3252
+ drain_obj_stock(stock);
3253
+ obj_cgroup_get(objcg);
3254
+ stock->cached_objcg = objcg;
3255
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3256
+ }
3257
+ stock->nr_bytes += nr_bytes;
3258
+
3259
+ if (stock->nr_bytes > PAGE_SIZE)
3260
+ drain_obj_stock(stock);
3261
+
3262
+ local_irq_restore(flags);
3263
+}
3264
+
3265
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3266
+{
3267
+ struct mem_cgroup *memcg;
3268
+ unsigned int nr_pages, nr_bytes;
3269
+ int ret;
3270
+
3271
+ if (consume_obj_stock(objcg, size))
3272
+ return 0;
3273
+
3274
+ /*
3275
+ * In theory, memcg->nr_charged_bytes can have enough
3276
+ * pre-charged bytes to satisfy the allocation. However,
3277
+ * flushing memcg->nr_charged_bytes requires two atomic
3278
+ * operations, and memcg->nr_charged_bytes can't be big,
3279
+	 * so it's better to ignore it and try to grab some new pages.
3280
+ * memcg->nr_charged_bytes will be flushed in
3281
+ * refill_obj_stock(), called from this function or
3282
+ * independently later.
3283
+ */
3284
+ rcu_read_lock();
3285
+retry:
3286
+ memcg = obj_cgroup_memcg(objcg);
3287
+ if (unlikely(!css_tryget(&memcg->css)))
3288
+ goto retry;
3289
+ rcu_read_unlock();
3290
+
3291
+ nr_pages = size >> PAGE_SHIFT;
3292
+ nr_bytes = size & (PAGE_SIZE - 1);
3293
+
3294
+ if (nr_bytes)
3295
+ nr_pages += 1;
3296
+
3297
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3298
+ if (!ret && nr_bytes)
3299
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3300
+
3301
+ css_put(&memcg->css);
3302
+ return ret;
3303
+}
3304
+
3305
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3306
+{
3307
+ refill_obj_stock(objcg, size);
3308
+}
3309
+
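The byte-granular charging above rounds each request up to whole pages for the page counters and parks the unused remainder in the per-cpu object stock, so later sub-page charges can bypass the counters entirely. A single-threaded sketch of that bookkeeping follows; locking, stock flushing and error handling are omitted and the names are illustrative.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static unsigned long stock_bytes;	/* bytes pre-charged to the cached objcg */
static unsigned long pages_charged;	/* what would hit the page counters */

static void charge(unsigned long size)
{
	unsigned long nr_pages, nr_bytes;

	if (stock_bytes >= size) {		/* consume_obj_stock() fast path */
		stock_bytes -= size;
		return;
	}
	nr_pages = size >> PAGE_SHIFT;
	nr_bytes = size & (PAGE_SIZE - 1);
	if (nr_bytes)
		nr_pages++;			/* round the charge up to whole pages */
	pages_charged += nr_pages;
	if (nr_bytes)				/* keep the remainder for later sub-page charges */
		stock_bytes += PAGE_SIZE - nr_bytes;
}

int main(void)
{
	charge(700);	/* charges 1 page, leaves 3396 bytes in the stock */
	charge(700);	/* served entirely from the stock */
	printf("pages=%lu stock=%lu\n", pages_charged, stock_bytes);	/* pages=1 stock=2696 */
	return 0;
}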
3310
+#endif /* CONFIG_MEMCG_KMEM */
3311
+
3312
+/*
3313
+ * Because head->mem_cgroup is not set on tails, set it now.
3314
+ */
3315
+void split_page_memcg(struct page *head, unsigned int nr)
3316
+{
3317
+ struct mem_cgroup *memcg = head->mem_cgroup;
3318
+ int kmemcg = PageKmemcg(head);
3319
+ int i;
3320
+
3321
+ if (mem_cgroup_disabled() || !memcg)
3322
+ return;
3323
+
3324
+ for (i = 1; i < nr; i++) {
3325
+ head[i].mem_cgroup = memcg;
3326
+ if (kmemcg)
3327
+ __SetPageKmemcg(head + i);
3328
+ }
3329
+ css_get_many(&memcg->css, nr - 1);
3330
+}
27463331
27473332 #ifdef CONFIG_MEMCG_SWAP
27483333 /**
....@@ -2804,7 +3389,7 @@
28043389 * Make sure that the new limit (memsw or memory limit) doesn't
28053390 * break our basic invariant rule memory.max <= memsw.max.
28063391 */
2807
- limits_invariant = memsw ? max >= memcg->memory.max :
3392
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28083393 max <= memcg->memsw.max;
28093394 if (!limits_invariant) {
28103395 mutex_unlock(&memcg_max_mutex);
....@@ -2925,7 +3510,7 @@
29253510 * Test whether @memcg has children, dead or alive. Note that this
29263511 * function doesn't care whether @memcg has use_hierarchy enabled and
29273512 * returns %true if there are child csses according to the cgroup
2928
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3513
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29293514 */
29303515 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29313516 {
....@@ -2944,7 +3529,7 @@
29443529 */
29453530 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29463531 {
2947
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3532
+ int nr_retries = MAX_RECLAIM_RETRIES;
29483533
29493534 /* we call try-to-free pages for make this cgroup empty */
29503535 lru_add_drain_all();
....@@ -3018,50 +3603,15 @@
30183603 return retval;
30193604 }
30203605
3021
-struct accumulated_stats {
3022
- unsigned long stat[MEMCG_NR_STAT];
3023
- unsigned long events[NR_VM_EVENT_ITEMS];
3024
- unsigned long lru_pages[NR_LRU_LISTS];
3025
- const unsigned int *stats_array;
3026
- const unsigned int *events_array;
3027
- int stats_size;
3028
- int events_size;
3029
-};
3030
-
3031
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3032
- struct accumulated_stats *acc)
3033
-{
3034
- struct mem_cgroup *mi;
3035
- int i;
3036
-
3037
- for_each_mem_cgroup_tree(mi, memcg) {
3038
- for (i = 0; i < acc->stats_size; i++)
3039
- acc->stat[i] += memcg_page_state(mi,
3040
- acc->stats_array ? acc->stats_array[i] : i);
3041
-
3042
- for (i = 0; i < acc->events_size; i++)
3043
- acc->events[i] += memcg_sum_events(mi,
3044
- acc->events_array ? acc->events_array[i] : i);
3045
-
3046
- for (i = 0; i < NR_LRU_LISTS; i++)
3047
- acc->lru_pages[i] +=
3048
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3049
- }
3050
-}
3051
-
30523606 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30533607 {
3054
- unsigned long val = 0;
3608
+ unsigned long val;
30553609
30563610 if (mem_cgroup_is_root(memcg)) {
3057
- struct mem_cgroup *iter;
3058
-
3059
- for_each_mem_cgroup_tree(iter, memcg) {
3060
- val += memcg_page_state(iter, MEMCG_CACHE);
3061
- val += memcg_page_state(iter, MEMCG_RSS);
3062
- if (swap)
3063
- val += memcg_page_state(iter, MEMCG_SWAP);
3064
- }
3611
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3612
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3613
+ if (swap)
3614
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30653615 } else {
30663616 if (!swap)
30673617 val = page_counter_read(&memcg->memory);
....@@ -3122,9 +3672,61 @@
31223672 }
31233673 }
31243674
3675
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3676
+{
3677
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3678
+ struct mem_cgroup *mi;
3679
+ int node, cpu, i;
3680
+
3681
+ for_each_online_cpu(cpu)
3682
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3683
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3684
+
3685
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3686
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3687
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3688
+
3689
+ for_each_node(node) {
3690
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3691
+ struct mem_cgroup_per_node *pi;
3692
+
3693
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3694
+ stat[i] = 0;
3695
+
3696
+ for_each_online_cpu(cpu)
3697
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3698
+ stat[i] += per_cpu(
3699
+ pn->lruvec_stat_cpu->count[i], cpu);
3700
+
3701
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3702
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3703
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3704
+ }
3705
+}
3706
+
3707
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3708
+{
3709
+ unsigned long events[NR_VM_EVENT_ITEMS];
3710
+ struct mem_cgroup *mi;
3711
+ int cpu, i;
3712
+
3713
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3714
+ events[i] = 0;
3715
+
3716
+ for_each_online_cpu(cpu)
3717
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3718
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3719
+ cpu);
3720
+
3721
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3722
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3723
+ atomic_long_add(events[i], &mi->vmevents[i]);
3724
+}
3725
+
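The two flush helpers above fold a dying memcg's per-cpu deltas into its own totals and into those of every ancestor, so nothing is lost once the per-cpu structures are freed. A toy model of that upward propagation, using made-up structures and no atomics:

#include <stdio.h>

#define NR_CPUS		4
#define NR_EVENTS	3

struct toy_memcg {
	struct toy_memcg *parent;
	long percpu[NR_CPUS][NR_EVENTS];	/* per-cpu deltas not yet folded in */
	long vmevents[NR_EVENTS];		/* hierarchical totals */
};

/* Sum the per-cpu deltas once, then add them at every level up the tree. */
static void flush_percpu_vmevents(struct toy_memcg *memcg)
{
	long events[NR_EVENTS] = {0};
	struct toy_memcg *mi;
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_EVENTS; i++)
			events[i] += memcg->percpu[cpu][i];

	for (mi = memcg; mi; mi = mi->parent)
		for (i = 0; i < NR_EVENTS; i++)
			mi->vmevents[i] += events[i];
}

int main(void)
{
	struct toy_memcg root = {0}, child = { .parent = &root };

	child.percpu[0][0] = 5;
	child.percpu[3][0] = 7;
	flush_percpu_vmevents(&child);
	printf("child=%ld root=%ld\n", child.vmevents[0], root.vmevents[0]);	/* 12 12 */
	return 0;
}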
31253726 #ifdef CONFIG_MEMCG_KMEM
31263727 static int memcg_online_kmem(struct mem_cgroup *memcg)
31273728 {
3729
+ struct obj_cgroup *objcg;
31283730 int memcg_id;
31293731
31303732 if (cgroup_memory_nokmem)
....@@ -3137,7 +3739,16 @@
31373739 if (memcg_id < 0)
31383740 return memcg_id;
31393741
3140
- static_branch_inc(&memcg_kmem_enabled_key);
3742
+ objcg = obj_cgroup_alloc();
3743
+ if (!objcg) {
3744
+ memcg_free_cache_id(memcg_id);
3745
+ return -ENOMEM;
3746
+ }
3747
+ objcg->memcg = memcg;
3748
+ rcu_assign_pointer(memcg->objcg, objcg);
3749
+
3750
+ static_branch_enable(&memcg_kmem_enabled_key);
3751
+
31413752 /*
31423753 * A memory cgroup is considered kmem-online as soon as it gets
31433754 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3146,7 +3757,6 @@
31463757 */
31473758 memcg->kmemcg_id = memcg_id;
31483759 memcg->kmem_state = KMEM_ONLINE;
3149
- INIT_LIST_HEAD(&memcg->kmem_caches);
31503760
31513761 return 0;
31523762 }
....@@ -3159,22 +3769,17 @@
31593769
31603770 if (memcg->kmem_state != KMEM_ONLINE)
31613771 return;
3162
- /*
3163
- * Clear the online state before clearing memcg_caches array
3164
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3165
- * guarantees that no cache will be created for this cgroup
3166
- * after we are done (see memcg_create_kmem_cache()).
3167
- */
3772
+
31683773 memcg->kmem_state = KMEM_ALLOCATED;
3169
-
3170
- memcg_deactivate_kmem_caches(memcg);
3171
-
3172
- kmemcg_id = memcg->kmemcg_id;
3173
- BUG_ON(kmemcg_id < 0);
31743774
31753775 parent = parent_mem_cgroup(memcg);
31763776 if (!parent)
31773777 parent = root_mem_cgroup;
3778
+
3779
+ memcg_reparent_objcgs(memcg, parent);
3780
+
3781
+ kmemcg_id = memcg->kmemcg_id;
3782
+ BUG_ON(kmemcg_id < 0);
31783783
31793784 /*
31803785 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3204,12 +3809,6 @@
32043809 /* css_alloc() failed, offlining didn't happen */
32053810 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32063811 memcg_offline_kmem(memcg);
3207
-
3208
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3209
- memcg_destroy_kmem_caches(memcg);
3210
- static_branch_dec(&memcg_kmem_enabled_key);
3211
- WARN_ON(page_counter_read(&memcg->kmem));
3212
- }
32133812 }
32143813 #else
32153814 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3300,6 +3899,9 @@
33003899 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33013900 break;
33023901 case _KMEM:
3902
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3903
+ "Please report your usecase to linux-mm@kvack.org if you "
3904
+ "depend on this functionality.\n");
33033905 ret = memcg_update_kmem_max(memcg, nr_pages);
33043906 break;
33053907 case _TCP:
....@@ -3364,6 +3966,10 @@
33643966 {
33653967 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
33663968
3969
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
3970
+ "Please report your usecase to linux-mm@kvack.org if you "
3971
+ "depend on this functionality.\n");
3972
+
33673973 if (val & ~MOVE_MASK)
33683974 return -EINVAL;
33693975
....@@ -3385,6 +3991,49 @@
33853991 #endif
33863992
33873993 #ifdef CONFIG_NUMA
3994
+
3995
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
3996
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
3997
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
3998
+
3999
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4000
+ int nid, unsigned int lru_mask, bool tree)
4001
+{
4002
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4003
+ unsigned long nr = 0;
4004
+ enum lru_list lru;
4005
+
4006
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4007
+
4008
+ for_each_lru(lru) {
4009
+ if (!(BIT(lru) & lru_mask))
4010
+ continue;
4011
+ if (tree)
4012
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4013
+ else
4014
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4015
+ }
4016
+ return nr;
4017
+}
4018
+
4019
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4020
+ unsigned int lru_mask,
4021
+ bool tree)
4022
+{
4023
+ unsigned long nr = 0;
4024
+ enum lru_list lru;
4025
+
4026
+ for_each_lru(lru) {
4027
+ if (!(BIT(lru) & lru_mask))
4028
+ continue;
4029
+ if (tree)
4030
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4031
+ else
4032
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4033
+ }
4034
+ return nr;
4035
+}
4036
+
33884037 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33894038 {
33904039 struct numa_stat {
....@@ -3400,40 +4049,60 @@
34004049 };
34014050 const struct numa_stat *stat;
34024051 int nid;
3403
- unsigned long nr;
3404
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4052
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34054053
34064054 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3407
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3408
- seq_printf(m, "%s=%lu", stat->name, nr);
3409
- for_each_node_state(nid, N_MEMORY) {
3410
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3411
- stat->lru_mask);
3412
- seq_printf(m, " N%d=%lu", nid, nr);
3413
- }
4055
+ seq_printf(m, "%s=%lu", stat->name,
4056
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4057
+ false));
4058
+ for_each_node_state(nid, N_MEMORY)
4059
+ seq_printf(m, " N%d=%lu", nid,
4060
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4061
+ stat->lru_mask, false));
34144062 seq_putc(m, '\n');
34154063 }
34164064
34174065 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3418
- struct mem_cgroup *iter;
34194066
3420
- nr = 0;
3421
- for_each_mem_cgroup_tree(iter, memcg)
3422
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3423
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3424
- for_each_node_state(nid, N_MEMORY) {
3425
- nr = 0;
3426
- for_each_mem_cgroup_tree(iter, memcg)
3427
- nr += mem_cgroup_node_nr_lru_pages(
3428
- iter, nid, stat->lru_mask);
3429
- seq_printf(m, " N%d=%lu", nid, nr);
3430
- }
4067
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4068
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4069
+ true));
4070
+ for_each_node_state(nid, N_MEMORY)
4071
+ seq_printf(m, " N%d=%lu", nid,
4072
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4073
+ stat->lru_mask, true));
34314074 seq_putc(m, '\n');
34324075 }
34334076
34344077 return 0;
34354078 }
34364079 #endif /* CONFIG_NUMA */
4080
+
4081
+static const unsigned int memcg1_stats[] = {
4082
+ NR_FILE_PAGES,
4083
+ NR_ANON_MAPPED,
4084
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4085
+ NR_ANON_THPS,
4086
+#endif
4087
+ NR_SHMEM,
4088
+ NR_FILE_MAPPED,
4089
+ NR_FILE_DIRTY,
4090
+ NR_WRITEBACK,
4091
+ MEMCG_SWAP,
4092
+};
4093
+
4094
+static const char *const memcg1_stat_names[] = {
4095
+ "cache",
4096
+ "rss",
4097
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4098
+ "rss_huge",
4099
+#endif
4100
+ "shmem",
4101
+ "mapped_file",
4102
+ "dirty",
4103
+ "writeback",
4104
+ "swap",
4105
+};
34374106
34384107 /* Universal VM events cgroup1 shows, original sort order */
34394108 static const unsigned int memcg1_events[] = {
....@@ -3443,45 +4112,42 @@
34434112 PGMAJFAULT,
34444113 };
34454114
3446
-static const char *const memcg1_event_names[] = {
3447
- "pgpgin",
3448
- "pgpgout",
3449
- "pgfault",
3450
- "pgmajfault",
3451
-};
3452
-
34534115 static int memcg_stat_show(struct seq_file *m, void *v)
34544116 {
3455
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4117
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34564118 unsigned long memory, memsw;
34574119 struct mem_cgroup *mi;
34584120 unsigned int i;
3459
- struct accumulated_stats acc;
34604121
34614122 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3462
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34634123
34644124 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4125
+ unsigned long nr;
4126
+
34654127 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34664128 continue;
3467
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3468
- memcg_page_state(memcg, memcg1_stats[i]) *
3469
- PAGE_SIZE);
4129
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4130
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4131
+ if (memcg1_stats[i] == NR_ANON_THPS)
4132
+ nr *= HPAGE_PMD_NR;
4133
+#endif
4134
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34704135 }
34714136
34724137 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3473
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3474
- memcg_sum_events(memcg, memcg1_events[i]));
4138
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4139
+ memcg_events_local(memcg, memcg1_events[i]));
34754140
34764141 for (i = 0; i < NR_LRU_LISTS; i++)
3477
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3478
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4142
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4143
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4144
+ PAGE_SIZE);
34794145
34804146 /* Hierarchical information */
34814147 memory = memsw = PAGE_COUNTER_MAX;
34824148 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3483
- memory = min(memory, mi->memory.max);
3484
- memsw = min(memsw, mi->memsw.max);
4149
+ memory = min(memory, READ_ONCE(mi->memory.max));
4150
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34854151 }
34864152 seq_printf(m, "hierarchical_memory_limit %llu\n",
34874153 (u64)memory * PAGE_SIZE);
....@@ -3489,49 +4155,45 @@
34894155 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34904156 (u64)memsw * PAGE_SIZE);
34914157
3492
- memset(&acc, 0, sizeof(acc));
3493
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3494
- acc.stats_array = memcg1_stats;
3495
- acc.events_size = ARRAY_SIZE(memcg1_events);
3496
- acc.events_array = memcg1_events;
3497
- accumulate_memcg_tree(memcg, &acc);
3498
-
34994158 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4159
+ unsigned long nr;
4160
+
35004161 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35014162 continue;
4163
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4164
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4165
+ if (memcg1_stats[i] == NR_ANON_THPS)
4166
+ nr *= HPAGE_PMD_NR;
4167
+#endif
35024168 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3503
- (u64)acc.stat[i] * PAGE_SIZE);
4169
+ (u64)nr * PAGE_SIZE);
35044170 }
35054171
35064172 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3507
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3508
- (u64)acc.events[i]);
4173
+ seq_printf(m, "total_%s %llu\n",
4174
+ vm_event_name(memcg1_events[i]),
4175
+ (u64)memcg_events(memcg, memcg1_events[i]));
35094176
35104177 for (i = 0; i < NR_LRU_LISTS; i++)
3511
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3512
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4178
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4179
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4180
+ PAGE_SIZE);
35134181
35144182 #ifdef CONFIG_DEBUG_VM
35154183 {
35164184 pg_data_t *pgdat;
35174185 struct mem_cgroup_per_node *mz;
3518
- struct zone_reclaim_stat *rstat;
3519
- unsigned long recent_rotated[2] = {0, 0};
3520
- unsigned long recent_scanned[2] = {0, 0};
4186
+ unsigned long anon_cost = 0;
4187
+ unsigned long file_cost = 0;
35214188
35224189 for_each_online_pgdat(pgdat) {
35234190 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3524
- rstat = &mz->lruvec.reclaim_stat;
35254191
3526
- recent_rotated[0] += rstat->recent_rotated[0];
3527
- recent_rotated[1] += rstat->recent_rotated[1];
3528
- recent_scanned[0] += rstat->recent_scanned[0];
3529
- recent_scanned[1] += rstat->recent_scanned[1];
4192
+ anon_cost += mz->lruvec.anon_cost;
4193
+ file_cost += mz->lruvec.file_cost;
35304194 }
3531
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3532
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3533
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3534
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4195
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4196
+ seq_printf(m, "file_cost %lu\n", file_cost);
35354197 }
35364198 #endif
35374199
....@@ -3551,7 +4213,7 @@
35514213 {
35524214 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
35534215
3554
- if (val > 100)
4216
+ if (val > 200)
35554217 return -EINVAL;
35564218
35574219 if (css->parent)
....@@ -3690,8 +4352,7 @@
36904352 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36914353
36924354 /* Allocate memory for new array of thresholds */
3693
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3694
- GFP_KERNEL);
4355
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36954356 if (!new) {
36964357 ret = -ENOMEM;
36974358 goto unlock;
....@@ -3699,17 +4360,16 @@
36994360 new->size = size;
37004361
37014362 /* Copy thresholds (if any) to new array */
3702
- if (thresholds->primary) {
3703
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3704
- sizeof(struct mem_cgroup_threshold));
3705
- }
4363
+ if (thresholds->primary)
4364
+ memcpy(new->entries, thresholds->primary->entries,
4365
+ flex_array_size(new, entries, size - 1));
37064366
37074367 /* Add new threshold */
37084368 new->entries[size - 1].eventfd = eventfd;
37094369 new->entries[size - 1].threshold = threshold;
37104370
37114371 /* Sort thresholds. Registering of new threshold isn't time-critical */
3712
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4372
+ sort(new->entries, size, sizeof(*new->entries),
37134373 compare_thresholds, NULL);
37144374
37154375 /* Find current threshold */
....@@ -3891,7 +4551,7 @@
38914551
38924552 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38934553 {
3894
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4554
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38954555
38964556 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
38974557 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3917,6 +4577,8 @@
39174577 }
39184578
39194579 #ifdef CONFIG_CGROUP_WRITEBACK
4580
+
4581
+#include <trace/events/writeback.h>
39204582
39214583 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39224584 {
....@@ -3949,11 +4611,11 @@
39494611 */
39504612 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39514613 {
3952
- long x = atomic_long_read(&memcg->stat[idx]);
4614
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39534615 int cpu;
39544616
39554617 for_each_online_cpu(cpu)
3956
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4618
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39574619 if (x < 0)
39584620 x = 0;
39594621 return x;
....@@ -3986,18 +4648,142 @@
39864648
39874649 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39884650
3989
- /* this should eventually include NR_UNSTABLE_NFS */
39904651 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3991
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3992
- (1 << LRU_ACTIVE_FILE));
4652
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4653
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39934654 *pheadroom = PAGE_COUNTER_MAX;
39944655
39954656 while ((parent = parent_mem_cgroup(memcg))) {
3996
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4657
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4658
+ READ_ONCE(memcg->memory.high));
39974659 unsigned long used = page_counter_read(&memcg->memory);
39984660
39994661 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40004662 memcg = parent;
4663
+ }
4664
+}
4665
+
4666
+/*
4667
+ * Foreign dirty flushing
4668
+ *
4669
+ * There's an inherent mismatch between memcg and writeback. The former
4670
+ * trackes ownership per-page while the latter per-inode. This was a
4671
+ * deliberate design decision because honoring per-page ownership in the
4672
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4673
+ * and deemed unnecessary given that write-sharing an inode across
4674
+ * different cgroups isn't a common use-case.
4675
+ *
4676
+ * Combined with inode majority-writer ownership switching, this works well
4677
+ * enough in most cases but there are some pathological cases. For
4678
+ * example, let's say there are two cgroups A and B which keep writing to
4679
+ * different but confined parts of the same inode. B owns the inode and
4680
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4681
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4682
+ * triggering background writeback. A will be slowed down without a way to
4683
+ * make writeback of the dirty pages happen.
4684
+ *
4685
+ * Conditions like the above can lead to a cgroup getting repatedly and
4686
+ * severely throttled after making some progress after each
4687
+ * dirty_expire_interval while the underyling IO device is almost
4688
+ * completely idle.
4689
+ *
4690
+ * Solving this problem completely requires matching the ownership tracking
4691
+ * granularities between memcg and writeback in either direction. However,
4692
+ * the more egregious behaviors can be avoided by simply remembering the
4693
+ * most recent foreign dirtying events and initiating remote flushes on
4694
+ * them when local writeback isn't enough to keep the memory clean enough.
4695
+ *
4696
+ * The following two functions implement such mechanism. When a foreign
4697
+ * page - a page whose memcg and writeback ownerships don't match - is
4698
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4699
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4700
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4701
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4702
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4703
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4704
+ * limited to MEMCG_CGWB_FRN_CNT.
4705
+ *
4706
+ * The mechanism only remembers IDs and doesn't hold any object references.
4707
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4708
+ * records are lockless and racy.
4709
+ */
4710
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4711
+ struct bdi_writeback *wb)
4712
+{
4713
+ struct mem_cgroup *memcg = page->mem_cgroup;
4714
+ struct memcg_cgwb_frn *frn;
4715
+ u64 now = get_jiffies_64();
4716
+ u64 oldest_at = now;
4717
+ int oldest = -1;
4718
+ int i;
4719
+
4720
+ trace_track_foreign_dirty(page, wb);
4721
+
4722
+ /*
4723
+ * Pick the slot to use. If there is already a slot for @wb, keep
4724
+ * using it. If not replace the oldest one which isn't being
4725
+ * written out.
4726
+ */
4727
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4728
+ frn = &memcg->cgwb_frn[i];
4729
+ if (frn->bdi_id == wb->bdi->id &&
4730
+ frn->memcg_id == wb->memcg_css->id)
4731
+ break;
4732
+ if (time_before64(frn->at, oldest_at) &&
4733
+ atomic_read(&frn->done.cnt) == 1) {
4734
+ oldest = i;
4735
+ oldest_at = frn->at;
4736
+ }
4737
+ }
4738
+
4739
+ if (i < MEMCG_CGWB_FRN_CNT) {
4740
+ /*
4741
+ * Re-using an existing one. Update timestamp lazily to
4742
+ * avoid making the cacheline hot. We want them to be
4743
+ * reasonably up-to-date and significantly shorter than
4744
+ * dirty_expire_interval as that's what expires the record.
4745
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4746
+ */
4747
+ unsigned long update_intv =
4748
+ min_t(unsigned long, HZ,
4749
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4750
+
4751
+ if (time_before64(frn->at, now - update_intv))
4752
+ frn->at = now;
4753
+ } else if (oldest >= 0) {
4754
+ /* replace the oldest free one */
4755
+ frn = &memcg->cgwb_frn[oldest];
4756
+ frn->bdi_id = wb->bdi->id;
4757
+ frn->memcg_id = wb->memcg_css->id;
4758
+ frn->at = now;
4759
+ }
4760
+}
4761
+
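At common defaults the lazy refresh above works out to roughly once a second: dirty_expire_interval is kept in centiseconds, so the usual 30-second default gives msecs_to_jiffies(30000) / 8, about 3.75 s, and the min_t() picks HZ instead. A small sketch of that arithmetic; HZ and the default interval are assumptions here.

#include <stdio.h>

#define HZ	250UL	/* assumed; any common CONFIG_HZ gives the same 1 s result */

static unsigned long msecs_to_jiffies_model(unsigned long ms)
{
	return ms * HZ / 1000;	/* close enough for this illustration */
}

int main(void)
{
	unsigned long dirty_expire_interval = 3000;	/* centisecs, i.e. the usual 30 s */
	unsigned long update_intv;

	/* mirrors the min_t() above: shorter of 1 s and dirty_expire_interval / 8 */
	update_intv = msecs_to_jiffies_model(dirty_expire_interval * 10) / 8;
	if (update_intv > HZ)
		update_intv = HZ;

	printf("refresh at most every %lu jiffies (%lu ms)\n",
	       update_intv, update_intv * 1000 / HZ);	/* 250 jiffies, 1000 ms */
	return 0;
}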
4762
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4763
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4764
+{
4765
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4766
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4767
+ u64 now = jiffies_64;
4768
+ int i;
4769
+
4770
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4771
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4772
+
4773
+ /*
4774
+ * If the record is older than dirty_expire_interval,
4775
+ * writeback on it has already started. No need to kick it
4776
+ * off again. Also, don't start a new one if there's
4777
+ * already one in flight.
4778
+ */
4779
+ if (time_after64(frn->at, now - intv) &&
4780
+ atomic_read(&frn->done.cnt) == 1) {
4781
+ frn->at = 0;
4782
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4783
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4784
+ WB_REASON_FOREIGN_FLUSH,
4785
+ &frn->done);
4786
+ }
40014787 }
40024788 }
40034789
....@@ -4120,6 +4906,7 @@
41204906 unsigned int efd, cfd;
41214907 struct fd efile;
41224908 struct fd cfile;
4909
+ struct dentry *cdentry;
41234910 const char *name;
41244911 char *endp;
41254912 int ret;
....@@ -4171,6 +4958,16 @@
41714958 goto out_put_cfile;
41724959
41734960 /*
4961
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4962
+ * file can't be renamed, it's safe to access its name afterwards.
4963
+ */
4964
+ cdentry = cfile.file->f_path.dentry;
4965
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4966
+ ret = -EINVAL;
4967
+ goto out_put_cfile;
4968
+ }
4969
+
4970
+ /*
41744971 * Determine the event callbacks and set them in @event. This used
41754972 * to be done via struct cftype but cgroup core no longer knows
41764973 * about these events. The following is crude but the whole thing
....@@ -4178,7 +4975,7 @@
41784975 *
41794976 * DO NOT ADD NEW FILES.
41804977 */
4181
- name = cfile.file->f_path.dentry->d_name.name;
4978
+ name = cdentry->d_name.name;
41824979
41834980 if (!strcmp(name, "memory.usage_in_bytes")) {
41844981 event->register_event = mem_cgroup_usage_register_event;
....@@ -4202,7 +4999,7 @@
42024999 * automatically removed on cgroup destruction but the removal is
42035000 * asynchronous, so take an extra ref on @css.
42045001 */
4205
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5002
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42065003 &memory_cgrp_subsys);
42075004 ret = -EINVAL;
42085005 if (IS_ERR(cfile_css))
....@@ -4337,12 +5134,10 @@
43375134 .write = mem_cgroup_reset,
43385135 .read_u64 = mem_cgroup_read_u64,
43395136 },
4340
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5137
+#if defined(CONFIG_MEMCG_KMEM) && \
5138
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43415139 {
43425140 .name = "kmem.slabinfo",
4343
- .seq_start = memcg_slab_start,
4344
- .seq_next = memcg_slab_next,
4345
- .seq_stop = memcg_slab_stop,
43465141 .seq_show = memcg_slab_show,
43475142 },
43485143 #endif
....@@ -4380,7 +5175,7 @@
43805175 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43815176 * memory-controlled cgroups to 64k.
43825177 *
4383
- * However, there usually are many references to the oflline CSS after
5178
+ * However, there usually are many references to the offline CSS after
43845179 * the cgroup has been destroyed, such as page cache or reclaimable
43855180 * slab objects, that don't need to hang on to the ID. We want to keep
43865181 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4401,31 +5196,26 @@
44015196 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44025197 {
44035198 if (memcg->id.id > 0) {
5199
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44045200 idr_remove(&mem_cgroup_idr, memcg->id.id);
44055201 memcg->id.id = 0;
44065202 }
44075203 }
44085204
4409
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5205
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5206
+ unsigned int n)
44105207 {
4411
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4412
- atomic_add(n, &memcg->id.ref);
5208
+ refcount_add(n, &memcg->id.ref);
44135209 }
44145210
44155211 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44165212 {
4417
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4418
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5213
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44195214 mem_cgroup_id_remove(memcg);
44205215
44215216 /* Memcg ID pins CSS */
44225217 css_put(&memcg->css);
44235218 }
4424
-}
4425
-
4426
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4427
-{
4428
- mem_cgroup_id_get_many(memcg, 1);
44295219 }
44305220
44315221 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4444,6 +5234,7 @@
44445234 WARN_ON_ONCE(!rcu_read_lock_held());
44455235 return idr_find(&mem_cgroup_idr, id);
44465236 }
5237
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44475238
44485239 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44495240 {
....@@ -4463,8 +5254,17 @@
44635254 if (!pn)
44645255 return 1;
44655256
4466
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5257
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5258
+ GFP_KERNEL_ACCOUNT);
5259
+ if (!pn->lruvec_stat_local) {
5260
+ kfree(pn);
5261
+ return 1;
5262
+ }
5263
+
5264
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5265
+ GFP_KERNEL_ACCOUNT);
44675266 if (!pn->lruvec_stat_cpu) {
5267
+ free_percpu(pn->lruvec_stat_local);
44685268 kfree(pn);
44695269 return 1;
44705270 }
....@@ -4486,6 +5286,7 @@
44865286 return;
44875287
44885288 free_percpu(pn->lruvec_stat_cpu);
5289
+ free_percpu(pn->lruvec_stat_local);
44895290 kfree(pn);
44905291 }
44915292
....@@ -4493,39 +5294,57 @@
44935294 {
44945295 int node;
44955296
5297
+ trace_android_vh_mem_cgroup_free(memcg);
44965298 for_each_node(node)
44975299 free_mem_cgroup_per_node_info(memcg, node);
4498
- free_percpu(memcg->stat_cpu);
5300
+ free_percpu(memcg->vmstats_percpu);
5301
+ free_percpu(memcg->vmstats_local);
44995302 kfree(memcg);
45005303 }
45015304
45025305 static void mem_cgroup_free(struct mem_cgroup *memcg)
45035306 {
45045307 memcg_wb_domain_exit(memcg);
5308
+ /*
5309
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5310
+ * on parent's and all ancestor levels.
5311
+ */
5312
+ memcg_flush_percpu_vmstats(memcg);
5313
+ memcg_flush_percpu_vmevents(memcg);
45055314 __mem_cgroup_free(memcg);
45065315 }
45075316
45085317 static struct mem_cgroup *mem_cgroup_alloc(void)
45095318 {
45105319 struct mem_cgroup *memcg;
4511
- size_t size;
5320
+ unsigned int size;
45125321 int node;
5322
+ int __maybe_unused i;
5323
+ long error = -ENOMEM;
45135324
45145325 size = sizeof(struct mem_cgroup);
45155326 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45165327
45175328 memcg = kzalloc(size, GFP_KERNEL);
45185329 if (!memcg)
4519
- return NULL;
5330
+ return ERR_PTR(error);
45205331
45215332 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45225333 1, MEM_CGROUP_ID_MAX,
45235334 GFP_KERNEL);
4524
- if (memcg->id.id < 0)
5335
+ if (memcg->id.id < 0) {
5336
+ error = memcg->id.id;
5337
+ goto fail;
5338
+ }
5339
+
5340
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5341
+ GFP_KERNEL_ACCOUNT);
5342
+ if (!memcg->vmstats_local)
45255343 goto fail;
45265344
4527
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4528
- if (!memcg->stat_cpu)
5345
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5346
+ GFP_KERNEL_ACCOUNT);
5347
+ if (!memcg->vmstats_percpu)
45295348 goto fail;
45305349
45315350 for_each_node(node)
....@@ -4536,7 +5355,6 @@
45365355 goto fail;
45375356
45385357 INIT_WORK(&memcg->high_work, high_work_func);
4539
- memcg->last_scanned_node = MAX_NUMNODES;
45405358 INIT_LIST_HEAD(&memcg->oom_notify);
45415359 mutex_init(&memcg->thresholds_lock);
45425360 spin_lock_init(&memcg->move_lock);
....@@ -4546,48 +5364,64 @@
45465364 memcg->socket_pressure = jiffies;
45475365 #ifdef CONFIG_MEMCG_KMEM
45485366 memcg->kmemcg_id = -1;
5367
+ INIT_LIST_HEAD(&memcg->objcg_list);
45495368 #endif
45505369 #ifdef CONFIG_CGROUP_WRITEBACK
45515370 INIT_LIST_HEAD(&memcg->cgwb_list);
5371
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5372
+ memcg->cgwb_frn[i].done =
5373
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5374
+#endif
5375
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5376
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5377
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5378
+ memcg->deferred_split_queue.split_queue_len = 0;
45525379 #endif
45535380 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5381
+ trace_android_vh_mem_cgroup_alloc(memcg);
45545382 return memcg;
45555383 fail:
45565384 mem_cgroup_id_remove(memcg);
45575385 __mem_cgroup_free(memcg);
4558
- return NULL;
5386
+ return ERR_PTR(error);
45595387 }
45605388
45615389 static struct cgroup_subsys_state * __ref
45625390 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45635391 {
45645392 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4565
- struct mem_cgroup *memcg;
5393
+ struct mem_cgroup *memcg, *old_memcg;
45665394 long error = -ENOMEM;
45675395
5396
+ old_memcg = set_active_memcg(parent);
45685397 memcg = mem_cgroup_alloc();
4569
- if (!memcg)
4570
- return ERR_PTR(error);
5398
+ set_active_memcg(old_memcg);
5399
+ if (IS_ERR(memcg))
5400
+ return ERR_CAST(memcg);
45715401
4572
- memcg->high = PAGE_COUNTER_MAX;
5402
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45735403 memcg->soft_limit = PAGE_COUNTER_MAX;
5404
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45745405 if (parent) {
45755406 memcg->swappiness = mem_cgroup_swappiness(parent);
45765407 memcg->oom_kill_disable = parent->oom_kill_disable;
45775408 }
4578
- if (parent && parent->use_hierarchy) {
5409
+ if (!parent) {
5410
+ page_counter_init(&memcg->memory, NULL);
5411
+ page_counter_init(&memcg->swap, NULL);
5412
+ page_counter_init(&memcg->kmem, NULL);
5413
+ page_counter_init(&memcg->tcpmem, NULL);
5414
+ } else if (parent->use_hierarchy) {
45795415 memcg->use_hierarchy = true;
45805416 page_counter_init(&memcg->memory, &parent->memory);
45815417 page_counter_init(&memcg->swap, &parent->swap);
4582
- page_counter_init(&memcg->memsw, &parent->memsw);
45835418 page_counter_init(&memcg->kmem, &parent->kmem);
45845419 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45855420 } else {
4586
- page_counter_init(&memcg->memory, NULL);
4587
- page_counter_init(&memcg->swap, NULL);
4588
- page_counter_init(&memcg->memsw, NULL);
4589
- page_counter_init(&memcg->kmem, NULL);
4590
- page_counter_init(&memcg->tcpmem, NULL);
5421
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5422
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5423
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5424
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45915425 /*
45925426 	 * Deeper hierarchy with use_hierarchy == false doesn't make
45935427 	 * much sense, so let the cgroup subsystem know about this
....@@ -4614,7 +5448,7 @@
46145448 fail:
46155449 mem_cgroup_id_remove(memcg);
46165450 mem_cgroup_free(memcg);
4617
- return ERR_PTR(-ENOMEM);
5451
+ return ERR_PTR(error);
46185452 }
46195453
46205454 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4632,8 +5466,9 @@
46325466 }
46335467
46345468 /* Online state pins memcg ID, memcg ID pins CSS */
4635
- atomic_set(&memcg->id.ref, 1);
5469
+ refcount_set(&memcg->id.ref, 1);
46365470 css_get(css);
5471
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46375472 return 0;
46385473 }
46395474
....@@ -4642,6 +5477,7 @@
46425477 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46435478 struct mem_cgroup_event *event, *tmp;
46445479
5480
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46455481 /*
46465482 * Unregister events and notify userspace.
46475483 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4660,6 +5496,8 @@
46605496 memcg_offline_kmem(memcg);
46615497 wb_memcg_offline(memcg);
46625498
5499
+ drain_all_stock(memcg);
5500
+
46635501 mem_cgroup_id_put(memcg);
46645502 }
46655503
....@@ -4673,7 +5511,12 @@
46735511 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46745512 {
46755513 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5514
+ int __maybe_unused i;
46765515
5516
+#ifdef CONFIG_CGROUP_WRITEBACK
5517
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5518
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5519
+#endif
46775520 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46785521 static_branch_dec(&memcg_sockets_enabled_key);
46795522
....@@ -4707,13 +5550,13 @@
47075550
47085551 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47095552 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4710
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47115553 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47125554 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47135555 page_counter_set_min(&memcg->memory, 0);
47145556 page_counter_set_low(&memcg->memory, 0);
4715
- memcg->high = PAGE_COUNTER_MAX;
5557
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47165558 memcg->soft_limit = PAGE_COUNTER_MAX;
5559
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47175560 memcg_wb_domain_size_changed(memcg);
47185561 }
47195562
....@@ -4756,7 +5599,7 @@
47565599 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47575600 unsigned long addr, pte_t ptent)
47585601 {
4759
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5602
+ struct page *page = vm_normal_page(vma, addr, ptent);
47605603
47615604 if (!page || !page_mapped(page))
47625605 return NULL;
....@@ -4807,8 +5650,7 @@
48075650 * we call find_get_page() with swapper_space directly.
48085651 */
48095652 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4810
- if (do_memsw_account())
4811
- entry->val = ent.val;
5653
+ entry->val = ent.val;
48125654
48135655 return page;
48145656 }
....@@ -4823,36 +5665,15 @@
48235665 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48245666 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48255667 {
4826
- struct page *page = NULL;
4827
- struct address_space *mapping;
4828
- pgoff_t pgoff;
4829
-
48305668 if (!vma->vm_file) /* anonymous vma */
48315669 return NULL;
48325670 if (!(mc.flags & MOVE_FILE))
48335671 return NULL;
48345672
4835
- mapping = vma->vm_file->f_mapping;
4836
- pgoff = linear_page_index(vma, addr);
4837
-
48385673 /* page is moved even if it's not RSS of this task(page-faulted). */
4839
-#ifdef CONFIG_SWAP
48405674 /* shmem/tmpfs may report page out on swap: account for that too. */
4841
- if (shmem_mapping(mapping)) {
4842
- page = find_get_entry(mapping, pgoff);
4843
- if (radix_tree_exceptional_entry(page)) {
4844
- swp_entry_t swp = radix_to_swp_entry(page);
4845
- if (do_memsw_account())
4846
- *entry = swp;
4847
- page = find_get_page(swap_address_space(swp),
4848
- swp_offset(swp));
4849
- }
4850
- } else
4851
- page = find_get_page(mapping, pgoff);
4852
-#else
4853
- page = find_get_page(mapping, pgoff);
4854
-#endif
4855
- return page;
5675
+ return find_get_incore_page(vma->vm_file->f_mapping,
5676
+ linear_page_index(vma, addr));
48565677 }
48575678
48585679 /**
....@@ -4872,10 +5693,10 @@
48725693 struct mem_cgroup *from,
48735694 struct mem_cgroup *to)
48745695 {
4875
- unsigned long flags;
4876
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5696
+ struct lruvec *from_vec, *to_vec;
5697
+ struct pglist_data *pgdat;
5698
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48775699 int ret;
4878
- bool anon;
48795700
48805701 VM_BUG_ON(from == to);
48815702 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4893,50 +5714,81 @@
48935714 if (page->mem_cgroup != from)
48945715 goto out_unlock;
48955716
4896
- anon = PageAnon(page);
5717
+ pgdat = page_pgdat(page);
5718
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5719
+ to_vec = mem_cgroup_lruvec(to, pgdat);
48975720
4898
- spin_lock_irqsave(&from->move_lock, flags);
5721
+ lock_page_memcg(page);
48995722
4900
- if (!anon && page_mapped(page)) {
4901
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4902
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4903
- }
5723
+ if (PageAnon(page)) {
5724
+ if (page_mapped(page)) {
5725
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5726
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5727
+ if (PageTransHuge(page)) {
5728
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5729
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5730
+ }
49045731
4905
- /*
4906
- * move_lock grabbed above and caller set from->moving_account, so
4907
- * mod_memcg_page_state will serialize updates to PageDirty.
4908
- * So mapping should be stable for dirty pages.
4909
- */
4910
- if (!anon && PageDirty(page)) {
4911
- struct address_space *mapping = page_mapping(page);
5732
+ }
5733
+ } else {
5734
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5735
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49125736
4913
- if (mapping_cap_account_dirty(mapping)) {
4914
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4915
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5737
+ if (PageSwapBacked(page)) {
5738
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5740
+ }
5741
+
5742
+ if (page_mapped(page)) {
5743
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5744
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5745
+ }
5746
+
5747
+ if (PageDirty(page)) {
5748
+ struct address_space *mapping = page_mapping(page);
5749
+
5750
+ if (mapping_can_writeback(mapping)) {
5751
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5752
+ -nr_pages);
5753
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5754
+ nr_pages);
5755
+ }
49165756 }
49175757 }
49185758
49195759 if (PageWriteback(page)) {
4920
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4921
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5760
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5761
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49225762 }
49235763
49245764 /*
5765
+ * All state has been migrated, let's switch to the new memcg.
5766
+ *
49255767 * It is safe to change page->mem_cgroup here because the page
4926
- * is referenced, charged, and isolated - we can't race with
4927
- * uncharging, charging, migration, or LRU putback.
5768
+ * is referenced, charged, isolated, and locked: we can't race
5769
+ * with (un)charging, migration, LRU putback, or anything else
5770
+ * that would rely on a stable page->mem_cgroup.
5771
+ *
5772
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5773
+ * to save space. As soon as we switch page->mem_cgroup to a
5774
+ * new memcg that isn't locked, the above state can change
5775
+ * concurrently again. Make sure we're truly done with it.
49285776 */
5777
+ smp_mb();
49295778
4930
- /* caller should have done css_get */
5779
+ css_get(&to->css);
5780
+ css_put(&from->css);
5781
+
49315782 page->mem_cgroup = to;
4932
- spin_unlock_irqrestore(&from->move_lock, flags);
5783
+
5784
+ __unlock_page_memcg(from);
49335785
49345786 ret = 0;
49355787
49365788 local_irq_disable();
4937
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5789
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49385790 memcg_check_events(to, page);
4939
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5791
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49405792 memcg_check_events(from, page);
49415793 local_irq_enable();
49425794 out_unlock:
....@@ -4960,8 +5812,8 @@
49605812 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49615813 * target for charge migration. if @target is not NULL, the entry is stored
49625814 * in target->ent.
4963
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4964
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5815
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5816
+ * (so ZONE_DEVICE page and thus not on the lru).
49655817 	 * For now such a page is charged like a regular page would be, as for all
49665818 	 * intents and purposes it is just special memory taking the place of a
49675819 * regular page.
....@@ -4995,8 +5847,7 @@
49955847 */
49965848 if (page->mem_cgroup == mc.from) {
49975849 ret = MC_TARGET_PAGE;
4998
- if (is_device_private_page(page) ||
4999
- is_device_public_page(page))
5850
+ if (is_device_private_page(page))
50005851 ret = MC_TARGET_DEVICE;
50015852 if (target)
50025853 target->page = page;
....@@ -5067,8 +5918,8 @@
50675918 if (ptl) {
50685919 /*
50695920 		 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5070
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5071
- * MEMORY_DEVICE_PRIVATE but this might change.
5921
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5922
+ * this might change.
50725923 */
50735924 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50745925 mc.precharge += HPAGE_PMD_NR;
....@@ -5088,18 +5939,17 @@
50885939 return 0;
50895940 }
50905941
5942
+static const struct mm_walk_ops precharge_walk_ops = {
5943
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5944
+};
5945
+
50915946 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50925947 {
50935948 unsigned long precharge;
50945949
5095
- struct mm_walk mem_cgroup_count_precharge_walk = {
5096
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5097
- .mm = mm,
5098
- };
5099
- down_read(&mm->mmap_sem);
5100
- walk_page_range(0, mm->highest_vm_end,
5101
- &mem_cgroup_count_precharge_walk);
5102
- up_read(&mm->mmap_sem);
5950
+ mmap_read_lock(mm);
5951
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5952
+ mmap_read_unlock(mm);
51035953
51045954 precharge = mc.precharge;
51055955 mc.precharge = 0;
....@@ -5149,8 +5999,6 @@
51495999 */
51506000 if (!mem_cgroup_is_root(mc.to))
51516001 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5152
-
5153
- css_put_many(&mc.to->css, mc.moved_swap);
51546002
51556003 mc.moved_swap = 0;
51566004 }
....@@ -5312,7 +6160,7 @@
53126160 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53136161 case MC_TARGET_DEVICE:
53146162 device = true;
5315
- /* fall through */
6163
+ fallthrough;
53166164 case MC_TARGET_PAGE:
53176165 page = target.page;
53186166 /*
....@@ -5367,13 +6215,12 @@
53676215 return ret;
53686216 }
53696217
6218
+static const struct mm_walk_ops charge_walk_ops = {
6219
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6220
+};
6221
+
53706222 static void mem_cgroup_move_charge(void)
53716223 {
5372
- struct mm_walk mem_cgroup_move_charge_walk = {
5373
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5374
- .mm = mc.mm,
5375
- };
5376
-
53776224 lru_add_drain_all();
53786225 /*
53796226 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5383,9 +6230,9 @@
53836230 atomic_inc(&mc.from->moving_account);
53846231 synchronize_rcu();
53856232 retry:
5386
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6233
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53876234 /*
5388
- * Someone who are holding the mmap_sem might be waiting in
6235
+		 * Someone who is holding the mmap_lock might be waiting in
53896236 * waitq. So we cancel all extra charges, wake up all waiters,
53906237 * and retry. Because we cancel precharges, we might not be able
53916238 * to move enough charges, but moving charge is a best-effort
....@@ -5399,9 +6246,10 @@
53996246 * When we have consumed all precharges and failed in doing
54006247 * additional charge, the page walk just aborts.
54016248 */
5402
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6249
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6250
+ NULL);
54036251
5404
- up_read(&mc.mm->mmap_sem);
6252
+ mmap_read_unlock(mc.mm);
54056253 atomic_dec(&mc.from->moving_account);
54066254 }
54076255
....@@ -5443,6 +6291,16 @@
54436291 root_mem_cgroup->use_hierarchy = false;
54446292 }
54456293
6294
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6295
+{
6296
+ if (value == PAGE_COUNTER_MAX)
6297
+ seq_puts(m, "max\n");
6298
+ else
6299
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6300
+
6301
+ return 0;
6302
+}
6303
+
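For reference, the new seq_puts_memcg_tunable() helper reports page counts in bytes: assuming the usual 4K page size, a tunable of 262144 pages is shown as 262144 * 4096 = 1073741824 in the interface file, while PAGE_COUNTER_MAX is rendered as the literal string "max". The memory.min/low/high/max show handlers below all reduce to this one helper.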
54466304 static u64 memory_current_read(struct cgroup_subsys_state *css,
54476305 struct cftype *cft)
54486306 {
....@@ -5453,15 +6311,8 @@
54536311
54546312 static int memory_min_show(struct seq_file *m, void *v)
54556313 {
5456
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5457
- unsigned long min = READ_ONCE(memcg->memory.min);
5458
-
5459
- if (min == PAGE_COUNTER_MAX)
5460
- seq_puts(m, "max\n");
5461
- else
5462
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5463
-
5464
- return 0;
6314
+ return seq_puts_memcg_tunable(m,
6315
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54656316 }
54666317
54676318 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5483,15 +6334,8 @@
54836334
54846335 static int memory_low_show(struct seq_file *m, void *v)
54856336 {
5486
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5487
- unsigned long low = READ_ONCE(memcg->memory.low);
5488
-
5489
- if (low == PAGE_COUNTER_MAX)
5490
- seq_puts(m, "max\n");
5491
- else
5492
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5493
-
5494
- return 0;
6337
+ return seq_puts_memcg_tunable(m,
6338
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54956339 }
54966340
54976341 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5513,22 +6357,16 @@
55136357
55146358 static int memory_high_show(struct seq_file *m, void *v)
55156359 {
5516
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5517
- unsigned long high = READ_ONCE(memcg->high);
5518
-
5519
- if (high == PAGE_COUNTER_MAX)
5520
- seq_puts(m, "max\n");
5521
- else
5522
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5523
-
5524
- return 0;
6360
+ return seq_puts_memcg_tunable(m,
6361
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55256362 }
55266363
55276364 static ssize_t memory_high_write(struct kernfs_open_file *of,
55286365 char *buf, size_t nbytes, loff_t off)
55296366 {
55306367 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5531
- unsigned long nr_pages;
6368
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6369
+ bool drained = false;
55326370 unsigned long high;
55336371 int err;
55346372
....@@ -5537,12 +6375,30 @@
55376375 if (err)
55386376 return err;
55396377
5540
- memcg->high = high;
6378
+ page_counter_set_high(&memcg->memory, high);
55416379
5542
- nr_pages = page_counter_read(&memcg->memory);
5543
- if (nr_pages > high)
5544
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5545
- GFP_KERNEL, true);
6380
+ for (;;) {
6381
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6382
+ unsigned long reclaimed;
6383
+
6384
+ if (nr_pages <= high)
6385
+ break;
6386
+
6387
+ if (signal_pending(current))
6388
+ break;
6389
+
6390
+ if (!drained) {
6391
+ drain_all_stock(memcg);
6392
+ drained = true;
6393
+ continue;
6394
+ }
6395
+
6396
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6397
+ GFP_KERNEL, true);
6398
+
6399
+ if (!reclaimed && !nr_retries--)
6400
+ break;
6401
+ }
55466402
55476403 memcg_wb_domain_size_changed(memcg);
55486404 return nbytes;
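For context, this handler backs the cgroup v2 memory.high interface file. A minimal userspace sketch of exercising it (the cgroup path here is hypothetical):

#include <stdio.h>

int main(void)
{
	/* hypothetical cgroup path; memory.high accepts a byte count or "max" */
	FILE *f = fopen("/sys/fs/cgroup/example/memory.high", "w");

	if (!f) {
		perror("memory.high");
		return 1;
	}
	/*
	 * Lower the throttle limit to 512M; the handler above then reclaims
	 * until usage fits under the new high, draining per-cpu stock once
	 * and giving up after MAX_RECLAIM_RETRIES failed rounds or a
	 * pending signal.
	 */
	fprintf(f, "%llu\n", 512ULL << 20);
	fclose(f);
	return 0;
}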
....@@ -5550,22 +6406,15 @@
55506406
55516407 static int memory_max_show(struct seq_file *m, void *v)
55526408 {
5553
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5554
- unsigned long max = READ_ONCE(memcg->memory.max);
5555
-
5556
- if (max == PAGE_COUNTER_MAX)
5557
- seq_puts(m, "max\n");
5558
- else
5559
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5560
-
5561
- return 0;
6409
+ return seq_puts_memcg_tunable(m,
6410
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55626411 }
55636412
55646413 static ssize_t memory_max_write(struct kernfs_open_file *of,
55656414 char *buf, size_t nbytes, loff_t off)
55666415 {
55676416 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5568
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6417
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55696418 bool drained = false;
55706419 unsigned long max;
55716420 int err;
....@@ -5583,10 +6432,8 @@
55836432 if (nr_pages <= max)
55846433 break;
55856434
5586
- if (signal_pending(current)) {
5587
- err = -EINTR;
6435
+ if (signal_pending(current))
55886436 break;
5589
- }
55906437
55916438 if (!drained) {
55926439 drain_all_stock(memcg);
....@@ -5610,104 +6457,77 @@
56106457 return nbytes;
56116458 }
56126459
6460
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6461
+{
6462
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6463
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6464
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6465
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6466
+ seq_printf(m, "oom_kill %lu\n",
6467
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6468
+}
6469
+
56136470 static int memory_events_show(struct seq_file *m, void *v)
56146471 {
5615
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6472
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56166473
5617
- seq_printf(m, "low %lu\n",
5618
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5619
- seq_printf(m, "high %lu\n",
5620
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5621
- seq_printf(m, "max %lu\n",
5622
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5623
- seq_printf(m, "oom %lu\n",
5624
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5625
- seq_printf(m, "oom_kill %lu\n",
5626
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6474
+ __memory_events_show(m, memcg->memory_events);
6475
+ return 0;
6476
+}
56276477
6478
+static int memory_events_local_show(struct seq_file *m, void *v)
6479
+{
6480
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6481
+
6482
+ __memory_events_show(m, memcg->memory_events_local);
56286483 return 0;
56296484 }
56306485
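With the shared __memory_events_show() helper, memory.events and the new memory.events.local both render as the same five counters, one per line; illustrative contents:

low 0
high 742
max 13
oom 0
oom_kill 1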
56316486 static int memory_stat_show(struct seq_file *m, void *v)
56326487 {
5633
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5634
- struct accumulated_stats acc;
5635
- int i;
6488
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6489
+ char *buf;
56366490
5637
- /*
5638
- * Provide statistics on the state of the memory subsystem as
5639
- * well as cumulative event counters that show past behavior.
5640
- *
5641
- * This list is ordered following a combination of these gradients:
5642
- * 1) generic big picture -> specifics and details
5643
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5644
- *
5645
- * Current memory state:
5646
- */
5647
-
5648
- memset(&acc, 0, sizeof(acc));
5649
- acc.stats_size = MEMCG_NR_STAT;
5650
- acc.events_size = NR_VM_EVENT_ITEMS;
5651
- accumulate_memcg_tree(memcg, &acc);
5652
-
5653
- seq_printf(m, "anon %llu\n",
5654
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5655
- seq_printf(m, "file %llu\n",
5656
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5657
- seq_printf(m, "kernel_stack %llu\n",
5658
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5659
- seq_printf(m, "slab %llu\n",
5660
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5661
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5662
- seq_printf(m, "sock %llu\n",
5663
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5664
-
5665
- seq_printf(m, "shmem %llu\n",
5666
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5667
- seq_printf(m, "file_mapped %llu\n",
5668
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5669
- seq_printf(m, "file_dirty %llu\n",
5670
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5671
- seq_printf(m, "file_writeback %llu\n",
5672
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5673
-
5674
- for (i = 0; i < NR_LRU_LISTS; i++)
5675
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5676
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5677
-
5678
- seq_printf(m, "slab_reclaimable %llu\n",
5679
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5680
- seq_printf(m, "slab_unreclaimable %llu\n",
5681
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5682
-
5683
- /* Accumulated memory events */
5684
-
5685
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5686
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5687
-
5688
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5689
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5690
- acc.events[PGSCAN_DIRECT]);
5691
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5692
- acc.events[PGSTEAL_DIRECT]);
5693
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5694
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5695
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5696
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5697
-
5698
- seq_printf(m, "workingset_refault %lu\n",
5699
- acc.stat[WORKINGSET_REFAULT]);
5700
- seq_printf(m, "workingset_activate %lu\n",
5701
- acc.stat[WORKINGSET_ACTIVATE]);
5702
- seq_printf(m, "workingset_nodereclaim %lu\n",
5703
- acc.stat[WORKINGSET_NODERECLAIM]);
5704
-
6491
+ buf = memory_stat_format(memcg);
6492
+ if (!buf)
6493
+ return -ENOMEM;
6494
+ seq_puts(m, buf);
6495
+ kfree(buf);
57056496 return 0;
57066497 }
57076498
6499
+#ifdef CONFIG_NUMA
6500
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6501
+{
6502
+ int i;
6503
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6504
+
6505
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6506
+ int nid;
6507
+
6508
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6509
+ continue;
6510
+
6511
+ seq_printf(m, "%s", memory_stats[i].name);
6512
+ for_each_node_state(nid, N_MEMORY) {
6513
+ u64 size;
6514
+ struct lruvec *lruvec;
6515
+
6516
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6517
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6518
+ size *= memory_stats[i].ratio;
6519
+ seq_printf(m, " N%d=%llu", nid, size);
6520
+ }
6521
+ seq_putc(m, '\n');
6522
+ }
6523
+
6524
+ return 0;
6525
+}
6526
+#endif
6527
+
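On NUMA kernels the new memory.numa_stat file emits one line per memory_stats[] entry (defined elsewhere in this file) followed by a per-node breakdown in bytes. Illustrative output on a two-node machine, with assumed entry names:

anon N0=1572864 N1=0
file N0=6291456 N1=1048576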
57086528 static int memory_oom_group_show(struct seq_file *m, void *v)
57096529 {
5710
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6530
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57116531
57126532 seq_printf(m, "%d\n", memcg->oom_group);
57136533
....@@ -5773,10 +6593,21 @@
57736593 .seq_show = memory_events_show,
57746594 },
57756595 {
5776
- .name = "stat",
6596
+ .name = "events.local",
57776597 .flags = CFTYPE_NOT_ON_ROOT,
6598
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6599
+ .seq_show = memory_events_local_show,
6600
+ },
6601
+ {
6602
+ .name = "stat",
57786603 .seq_show = memory_stat_show,
57796604 },
6605
+#ifdef CONFIG_NUMA
6606
+ {
6607
+ .name = "numa_stat",
6608
+ .seq_show = memory_numa_stat_show,
6609
+ },
6610
+#endif
57806611 {
57816612 .name = "oom.group",
57826613 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5802,6 +6633,122 @@
58026633 .early_init = 0,
58036634 };
58046635
6636
+/*
6637
+ * This function calculates an individual cgroup's effective
6638
+ * protection which is derived from its own memory.min/low, its
6639
+ * parent's and siblings' settings, as well as the actual memory
6640
+ * distribution in the tree.
6641
+ *
6642
+ * The following rules apply to the effective protection values:
6643
+ *
6644
+ * 1. At the first level of reclaim, effective protection is equal to
6645
+ * the declared protection in memory.min and memory.low.
6646
+ *
6647
+ * 2. To enable safe delegation of the protection configuration, at
6648
+ * subsequent levels the effective protection is capped to the
6649
+ * parent's effective protection.
6650
+ *
6651
+ * 3. To make complex and dynamic subtrees easier to configure, the
6652
+ * user is allowed to overcommit the declared protection at a given
6653
+ * level. If that is the case, the parent's effective protection is
6654
+ * distributed to the children in proportion to how much protection
6655
+ * they have declared and how much of it they are utilizing.
6656
+ *
6657
+ * This makes distribution proportional, but also work-conserving:
6658
+ * if one cgroup claims much more protection than it uses memory,
6659
+ * the unused remainder is available to its siblings.
6660
+ *
6661
+ * 4. Conversely, when the declared protection is undercommitted at a
6662
+ * given level, the distribution of the larger parental protection
6663
+ * budget is NOT proportional. A cgroup's protection from a sibling
6664
+ * is capped to its own memory.min/low setting.
6665
+ *
6666
+ * 5. However, to allow protecting recursive subtrees from each other
6667
+ * without having to declare each individual cgroup's fixed share
6668
+ * of the ancestor's claim to protection, any unutilized -
6669
+ * "floating" - protection from up the tree is distributed in
6670
+ * proportion to each cgroup's *usage*. This makes the protection
6671
+ * neutral wrt sibling cgroups and lets them compete freely over
6672
+ * the shared parental protection budget, but it protects the
6673
+ * subtree as a whole from neighboring subtrees.
6674
+ *
6675
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6676
+ * against immediate siblings whereas 5. is about protecting against
6677
+ * neighboring subtrees.
6678
+ */
6679
+static unsigned long effective_protection(unsigned long usage,
6680
+ unsigned long parent_usage,
6681
+ unsigned long setting,
6682
+ unsigned long parent_effective,
6683
+ unsigned long siblings_protected)
6684
+{
6685
+ unsigned long protected;
6686
+ unsigned long ep;
6687
+
6688
+ protected = min(usage, setting);
6689
+ /*
6690
+ * If all cgroups at this level combined claim and use more
6691
+	 * protection than what the parent affords them, distribute
6692
+ * shares in proportion to utilization.
6693
+ *
6694
+ * We are using actual utilization rather than the statically
6695
+ * claimed protection in order to be work-conserving: claimed
6696
+ * but unused protection is available to siblings that would
6697
+ * otherwise get a smaller chunk than what they claimed.
6698
+ */
6699
+ if (siblings_protected > parent_effective)
6700
+ return protected * parent_effective / siblings_protected;
6701
+
6702
+ /*
6703
+ * Ok, utilized protection of all children is within what the
6704
+ * parent affords them, so we know whatever this child claims
6705
+ * and utilizes is effectively protected.
6706
+ *
6707
+ * If there is unprotected usage beyond this value, reclaim
6708
+ * will apply pressure in proportion to that amount.
6709
+ *
6710
+ * If there is unutilized protection, the cgroup will be fully
6711
+ * shielded from reclaim, but we do return a smaller value for
6712
+ * protection than what the group could enjoy in theory. This
6713
+ * is okay. With the overcommit distribution above, effective
6714
+ * protection is always dependent on how memory is actually
6715
+ * consumed among the siblings anyway.
6716
+ */
6717
+ ep = protected;
6718
+
6719
+ /*
6720
+ * If the children aren't claiming (all of) the protection
6721
+ * afforded to them by the parent, distribute the remainder in
6722
+ * proportion to the (unprotected) memory of each cgroup. That
6723
+ * way, cgroups that aren't explicitly prioritized wrt each
6724
+ * other compete freely over the allowance, but they are
6725
+ * collectively protected from neighboring trees.
6726
+ *
6727
+ * We're using unprotected memory for the weight so that if
6728
+ * some cgroups DO claim explicit protection, we don't protect
6729
+ * the same bytes twice.
6730
+ *
6731
+ * Check both usage and parent_usage against the respective
6732
+ * protected values. One should imply the other, but they
6733
+ * aren't read atomically - make sure the division is sane.
6734
+ */
6735
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6736
+ return ep;
6737
+ if (parent_effective > siblings_protected &&
6738
+ parent_usage > siblings_protected &&
6739
+ usage > protected) {
6740
+ unsigned long unclaimed;
6741
+
6742
+ unclaimed = parent_effective - siblings_protected;
6743
+ unclaimed *= usage - protected;
6744
+ unclaimed /= parent_usage - siblings_protected;
6745
+
6746
+ ep += unclaimed;
6747
+ }
6748
+
6749
+ return ep;
6750
+}
6751
+
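A minimal userspace sketch (illustrative numbers only, not kernel code) of the overcommit rule implemented above: when the children's combined claims exceed what the parent affords, each claim is scaled by parent_effective / siblings_protected.

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers, all in megabytes */
	unsigned long parent_effective   = 2048; /* parent affords 2G       */
	unsigned long siblings_protected = 4096; /* children claim 4G total */
	unsigned long setting            = 3072; /* this child's memory.low */
	unsigned long usage              = 2048; /* this child's usage      */

	unsigned long protected = usage < setting ? usage : setting;
	unsigned long ep = protected;

	/* overcommitted: scale the claim by parent_effective/siblings_protected */
	if (siblings_protected > parent_effective)
		ep = protected * parent_effective / siblings_protected;

	/* 2048 * 2048 / 4096 = 1024, i.e. 1G of effective protection */
	printf("effective protection: %luM\n", ep);
	return 0;
}

In the undercommitted case the claim itself is returned, optionally topped up with a proportional share of the parent's unclaimed budget when CGRP_ROOT_MEMORY_RECURSIVE_PROT is set, as the remainder of the function shows.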
58056752 /**
58066753 * mem_cgroup_protected - check if memory consumption is in the normal range
58076754 * @root: the top ancestor of the sub-tree being checked
....@@ -5809,259 +6756,125 @@
58096756 *
58106757 * WARNING: This function is not stateless! It can only be used as part
58116758 * of a top-down tree iteration, not for isolated queries.
5812
- *
5813
- * Returns one of the following:
5814
- * MEMCG_PROT_NONE: cgroup memory is not protected
5815
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5816
- * an unprotected supply of reclaimable memory from other cgroups.
5817
- * MEMCG_PROT_MIN: cgroup memory is protected
5818
- *
5819
- * @root is exclusive; it is never protected when looked at directly
5820
- *
5821
- * To provide a proper hierarchical behavior, effective memory.min/low values
5822
- * are used. Below is the description of how effective memory.low is calculated.
5823
- * Effective memory.min values is calculated in the same way.
5824
- *
5825
- * Effective memory.low is always equal or less than the original memory.low.
5826
- * If there is no memory.low overcommittment (which is always true for
5827
- * top-level memory cgroups), these two values are equal.
5828
- * Otherwise, it's a part of parent's effective memory.low,
5829
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5830
- * memory.low usages, where memory.low usage is the size of actually
5831
- * protected memory.
5832
- *
5833
- * low_usage
5834
- * elow = min( memory.low, parent->elow * ------------------ ),
5835
- * siblings_low_usage
5836
- *
5837
- * | memory.current, if memory.current < memory.low
5838
- * low_usage = |
5839
- | 0, otherwise.
5840
- *
5841
- *
5842
- * Such definition of the effective memory.low provides the expected
5843
- * hierarchical behavior: parent's memory.low value is limiting
5844
- * children, unprotected memory is reclaimed first and cgroups,
5845
- * which are not using their guarantee do not affect actual memory
5846
- * distribution.
5847
- *
5848
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5849
- *
5850
- * A A/memory.low = 2G, A/memory.current = 6G
5851
- * //\\
5852
- * BC DE B/memory.low = 3G B/memory.current = 2G
5853
- * C/memory.low = 1G C/memory.current = 2G
5854
- * D/memory.low = 0 D/memory.current = 2G
5855
- * E/memory.low = 10G E/memory.current = 0
5856
- *
5857
- * and the memory pressure is applied, the following memory distribution
5858
- * is expected (approximately):
5859
- *
5860
- * A/memory.current = 2G
5861
- *
5862
- * B/memory.current = 1.3G
5863
- * C/memory.current = 0.6G
5864
- * D/memory.current = 0
5865
- * E/memory.current = 0
5866
- *
5867
- * These calculations require constant tracking of the actual low usages
5868
- * (see propagate_protected_usage()), as well as recursive calculation of
5869
- * effective memory.low values. But as we do call mem_cgroup_protected()
5870
- * path for each memory cgroup top-down from the reclaim,
5871
- * it's possible to optimize this part, and save calculated elow
5872
- * for next usage. This part is intentionally racy, but it's ok,
5873
- * as memory.low is a best-effort mechanism.
58746759 */
5875
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5876
- struct mem_cgroup *memcg)
6760
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6761
+ struct mem_cgroup *memcg)
58776762 {
6763
+ unsigned long usage, parent_usage;
58786764 struct mem_cgroup *parent;
5879
- unsigned long emin, parent_emin;
5880
- unsigned long elow, parent_elow;
5881
- unsigned long usage;
58826765
58836766 if (mem_cgroup_disabled())
5884
- return MEMCG_PROT_NONE;
6767
+ return;
58856768
58866769 if (!root)
58876770 root = root_mem_cgroup;
6771
+
6772
+ /*
6773
+ * Effective values of the reclaim targets are ignored so they
6774
+ * can be stale. Have a look at mem_cgroup_protection for more
6775
+ * details.
6776
+ * TODO: calculation should be more robust so that we do not need
6777
+ * that special casing.
6778
+ */
58886779 if (memcg == root)
5889
- return MEMCG_PROT_NONE;
6780
+ return;
58906781
58916782 usage = page_counter_read(&memcg->memory);
58926783 if (!usage)
5893
- return MEMCG_PROT_NONE;
5894
-
5895
- emin = memcg->memory.min;
5896
- elow = memcg->memory.low;
6784
+ return;
58976785
58986786 parent = parent_mem_cgroup(memcg);
58996787 /* No parent means a non-hierarchical mode on v1 memcg */
59006788 if (!parent)
5901
- return MEMCG_PROT_NONE;
6789
+ return;
59026790
5903
- if (parent == root)
5904
- goto exit;
5905
-
5906
- parent_emin = READ_ONCE(parent->memory.emin);
5907
- emin = min(emin, parent_emin);
5908
- if (emin && parent_emin) {
5909
- unsigned long min_usage, siblings_min_usage;
5910
-
5911
- min_usage = min(usage, memcg->memory.min);
5912
- siblings_min_usage = atomic_long_read(
5913
- &parent->memory.children_min_usage);
5914
-
5915
- if (min_usage && siblings_min_usage)
5916
- emin = min(emin, parent_emin * min_usage /
5917
- siblings_min_usage);
6791
+ if (parent == root) {
6792
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6793
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6794
+ return;
59186795 }
59196796
5920
- parent_elow = READ_ONCE(parent->memory.elow);
5921
- elow = min(elow, parent_elow);
5922
- if (elow && parent_elow) {
5923
- unsigned long low_usage, siblings_low_usage;
6797
+ parent_usage = page_counter_read(&parent->memory);
59246798
5925
- low_usage = min(usage, memcg->memory.low);
5926
- siblings_low_usage = atomic_long_read(
5927
- &parent->memory.children_low_usage);
6799
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6800
+ READ_ONCE(memcg->memory.min),
6801
+ READ_ONCE(parent->memory.emin),
6802
+ atomic_long_read(&parent->memory.children_min_usage)));
59286803
5929
- if (low_usage && siblings_low_usage)
5930
- elow = min(elow, parent_elow * low_usage /
5931
- siblings_low_usage);
5932
- }
5933
-
5934
-exit:
5935
- memcg->memory.emin = emin;
5936
- memcg->memory.elow = elow;
5937
-
5938
- if (usage <= emin)
5939
- return MEMCG_PROT_MIN;
5940
- else if (usage <= elow)
5941
- return MEMCG_PROT_LOW;
5942
- else
5943
- return MEMCG_PROT_NONE;
6804
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6805
+ READ_ONCE(memcg->memory.low),
6806
+ READ_ONCE(parent->memory.elow),
6807
+ atomic_long_read(&parent->memory.children_low_usage)));
59446808 }
59456809
59466810 /**
5947
- * mem_cgroup_try_charge - try charging a page
6811
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59486812 * @page: page to charge
59496813 * @mm: mm context of the victim
59506814 * @gfp_mask: reclaim mode
5951
- * @memcgp: charged memcg return
5952
- * @compound: charge the page as compound or small page
59536815 *
59546816 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59556817 * pages according to @gfp_mask if necessary.
59566818 *
5957
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5958
- * Otherwise, an error code is returned.
5959
- *
5960
- * After page->mapping has been set up, the caller must finalize the
5961
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5962
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6819
+ * Returns 0 on success. Otherwise, an error code is returned.
59636820 */
5964
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5965
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5966
- bool compound)
6821
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6822
+ gfp_t gfp_mask)
59676823 {
6824
+ unsigned int nr_pages = thp_nr_pages(page);
59686825 struct mem_cgroup *memcg = NULL;
5969
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59706826 int ret = 0;
59716827
5972
- if (mem_cgroup_disabled())
5973
- goto out;
5974
-
59756828 if (PageSwapCache(page)) {
6829
+ swp_entry_t ent = { .val = page_private(page), };
6830
+ unsigned short id;
6831
+
59766832 /*
59776833 * Every swap fault against a single page tries to charge the
59786834 * page, bail as early as possible. shmem_unuse() encounters
5979
- * already charged pages, too. The USED bit is protected by
5980
- * the page lock, which serializes swap cache removal, which
6835
+ * already charged pages, too. page->mem_cgroup is protected
6836
+ * by the page lock, which serializes swap cache removal, which
59816837 * in turn serializes uncharging.
59826838 */
59836839 VM_BUG_ON_PAGE(!PageLocked(page), page);
59846840 if (compound_head(page)->mem_cgroup)
59856841 goto out;
59866842
5987
- if (do_swap_account) {
5988
- swp_entry_t ent = { .val = page_private(page), };
5989
- unsigned short id = lookup_swap_cgroup_id(ent);
5990
-
5991
- rcu_read_lock();
5992
- memcg = mem_cgroup_from_id(id);
5993
- if (memcg && !css_tryget_online(&memcg->css))
5994
- memcg = NULL;
5995
- rcu_read_unlock();
5996
- }
6843
+ id = lookup_swap_cgroup_id(ent);
6844
+ rcu_read_lock();
6845
+ memcg = mem_cgroup_from_id(id);
6846
+ if (memcg && !css_tryget_online(&memcg->css))
6847
+ memcg = NULL;
6848
+ rcu_read_unlock();
59976849 }
59986850
59996851 if (!memcg)
60006852 memcg = get_mem_cgroup_from_mm(mm);
60016853
60026854 ret = try_charge(memcg, gfp_mask, nr_pages);
6855
+ if (ret)
6856
+ goto out_put;
60036857
6004
- css_put(&memcg->css);
6005
-out:
6006
- *memcgp = memcg;
6007
- return ret;
6008
-}
6009
-
6010
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6011
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6012
- bool compound)
6013
-{
6014
- struct mem_cgroup *memcg;
6015
- int ret;
6016
-
6017
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6018
- memcg = *memcgp;
6019
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6020
- return ret;
6021
-}
6022
-
6023
-/**
6024
- * mem_cgroup_commit_charge - commit a page charge
6025
- * @page: page to charge
6026
- * @memcg: memcg to charge the page to
6027
- * @lrucare: page might be on LRU already
6028
- * @compound: charge the page as compound or small page
6029
- *
6030
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6031
- * after page->mapping has been set up. This must happen atomically
6032
- * as part of the page instantiation, i.e. under the page table lock
6033
- * for anonymous pages, under the page lock for page and swap cache.
6034
- *
6035
- * In addition, the page must not be on the LRU during the commit, to
6036
- * prevent racing with task migration. If it might be, use @lrucare.
6037
- *
6038
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6039
- */
6040
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6041
- bool lrucare, bool compound)
6042
-{
6043
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6044
-
6045
- VM_BUG_ON_PAGE(!page->mapping, page);
6046
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6047
-
6048
- if (mem_cgroup_disabled())
6049
- return;
6050
- /*
6051
- * Swap faults will attempt to charge the same page multiple
6052
- * times. But reuse_swap_page() might have removed the page
6053
- * from swapcache already, so we can't check PageSwapCache().
6054
- */
6055
- if (!memcg)
6056
- return;
6057
-
6058
- commit_charge(page, memcg, lrucare);
6858
+ css_get(&memcg->css);
6859
+ commit_charge(page, memcg);
60596860
60606861 local_irq_disable();
6061
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6862
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60626863 memcg_check_events(memcg, page);
60636864 local_irq_enable();
60646865
6866
+ /*
6867
+ * Cgroup1's unified memory+swap counter has been charged with the
6868
+ * new swapcache page, finish the transfer by uncharging the swap
6869
+ * slot. The swap slot would also get uncharged when it dies, but
6870
+ * it can stick around indefinitely and we'd count the page twice
6871
+ * the entire time.
6872
+ *
6873
+ * Cgroup2 has separate resource counters for memory and swap,
6874
+ * so this is a non-issue here. Memory and swap charge lifetimes
6875
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6876
+ * page to memory here, and uncharge swap when the slot is freed.
6877
+ */
60656878 if (do_memsw_account() && PageSwapCache(page)) {
60666879 swp_entry_t entry = { .val = page_private(page) };
60676880 /*
....@@ -6071,42 +6884,18 @@
60716884 */
60726885 mem_cgroup_uncharge_swap(entry, nr_pages);
60736886 }
6074
-}
60756887
6076
-/**
6077
- * mem_cgroup_cancel_charge - cancel a page charge
6078
- * @page: page to charge
6079
- * @memcg: memcg to charge the page to
6080
- * @compound: charge the page as compound or small page
6081
- *
6082
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6083
- */
6084
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6085
- bool compound)
6086
-{
6087
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6088
-
6089
- if (mem_cgroup_disabled())
6090
- return;
6091
- /*
6092
- * Swap faults will attempt to charge the same page multiple
6093
- * times. But reuse_swap_page() might have removed the page
6094
- * from swapcache already, so we can't check PageSwapCache().
6095
- */
6096
- if (!memcg)
6097
- return;
6098
-
6099
- cancel_charge(memcg, nr_pages);
6888
+out_put:
6889
+ css_put(&memcg->css);
6890
+out:
6891
+ return ret;
61006892 }
61016893
61026894 struct uncharge_gather {
61036895 struct mem_cgroup *memcg;
6896
+ unsigned long nr_pages;
61046897 unsigned long pgpgout;
6105
- unsigned long nr_anon;
6106
- unsigned long nr_file;
61076898 unsigned long nr_kmem;
6108
- unsigned long nr_huge;
6109
- unsigned long nr_shmem;
61106899 struct page *dummy_page;
61116900 };
61126901
....@@ -6117,37 +6906,32 @@
61176906
61186907 static void uncharge_batch(const struct uncharge_gather *ug)
61196908 {
6120
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61216909 unsigned long flags;
61226910
61236911 if (!mem_cgroup_is_root(ug->memcg)) {
6124
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6912
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61256913 if (do_memsw_account())
6126
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6914
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61276915 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61286916 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61296917 memcg_oom_recover(ug->memcg);
61306918 }
61316919
61326920 local_irq_save(flags);
6133
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6134
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6135
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6136
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
61376921 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6138
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6922
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61396923 memcg_check_events(ug->memcg, ug->dummy_page);
61406924 local_irq_restore(flags);
61416925
6142
- if (!mem_cgroup_is_root(ug->memcg))
6143
- css_put_many(&ug->memcg->css, nr_pages);
6926
+ /* drop reference from uncharge_page */
6927
+ css_put(&ug->memcg->css);
61446928 }
61456929
61466930 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61476931 {
6932
+ unsigned long nr_pages;
6933
+
61486934 VM_BUG_ON_PAGE(PageLRU(page), page);
6149
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6150
- !PageHWPoison(page) , page);
61516935
61526936 if (!page->mem_cgroup)
61536937 return;
....@@ -6164,30 +6948,24 @@
61646948 uncharge_gather_clear(ug);
61656949 }
61666950 ug->memcg = page->mem_cgroup;
6951
+
6952
+ /* pairs with css_put in uncharge_batch */
6953
+ css_get(&ug->memcg->css);
61676954 }
61686955
6169
- if (!PageKmemcg(page)) {
6170
- unsigned int nr_pages = 1;
6956
+ nr_pages = compound_nr(page);
6957
+ ug->nr_pages += nr_pages;
61716958
6172
- if (PageTransHuge(page)) {
6173
- nr_pages <<= compound_order(page);
6174
- ug->nr_huge += nr_pages;
6175
- }
6176
- if (PageAnon(page))
6177
- ug->nr_anon += nr_pages;
6178
- else {
6179
- ug->nr_file += nr_pages;
6180
- if (PageSwapBacked(page))
6181
- ug->nr_shmem += nr_pages;
6182
- }
6959
+ if (!PageKmemcg(page)) {
61836960 ug->pgpgout++;
61846961 } else {
6185
- ug->nr_kmem += 1 << compound_order(page);
6962
+ ug->nr_kmem += nr_pages;
61866963 __ClearPageKmemcg(page);
61876964 }
61886965
61896966 ug->dummy_page = page;
61906967 page->mem_cgroup = NULL;
6968
+ css_put(&ug->memcg->css);
61916969 }
61926970
61936971 static void uncharge_list(struct list_head *page_list)
....@@ -6216,18 +6994,14 @@
62166994 }
62176995
62186996 /**
6219
- * mem_cgroup_uncharge - uncharge a page
6997
+ * __mem_cgroup_uncharge - uncharge a page
62206998 * @page: page to uncharge
62216999 *
6222
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6223
- * mem_cgroup_commit_charge().
7000
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62247001 */
6225
-void mem_cgroup_uncharge(struct page *page)
7002
+void __mem_cgroup_uncharge(struct page *page)
62267003 {
62277004 struct uncharge_gather ug;
6228
-
6229
- if (mem_cgroup_disabled())
6230
- return;
62317005
62327006 /* Don't touch page->lru of any random page, pre-check: */
62337007 if (!page->mem_cgroup)
....@@ -6239,17 +7013,14 @@
62397013 }
62407014
62417015 /**
6242
- * mem_cgroup_uncharge_list - uncharge a list of page
7016
+ * __mem_cgroup_uncharge_list - uncharge a list of page
62437017 * @page_list: list of pages to uncharge
62447018 *
62457019 * Uncharge a list of pages previously charged with
6246
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7020
+ * __mem_cgroup_charge().
62477021 */
6248
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7022
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62497023 {
6250
- if (mem_cgroup_disabled())
6251
- return;
6252
-
62537024 if (!list_empty(page_list))
62547025 uncharge_list(page_list);
62557026 }
....@@ -6268,7 +7039,6 @@
62687039 {
62697040 struct mem_cgroup *memcg;
62707041 unsigned int nr_pages;
6271
- bool compound;
62727042 unsigned long flags;
62737043
62747044 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6290,18 +7060,17 @@
62907060 return;
62917061
62927062 /* Force-charge the new page. The old one will be freed soon */
6293
- compound = PageTransHuge(newpage);
6294
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7063
+ nr_pages = thp_nr_pages(newpage);
62957064
62967065 page_counter_charge(&memcg->memory, nr_pages);
62977066 if (do_memsw_account())
62987067 page_counter_charge(&memcg->memsw, nr_pages);
6299
- css_get_many(&memcg->css, nr_pages);
63007068
6301
- commit_charge(newpage, memcg, false);
7069
+ css_get(&memcg->css);
7070
+ commit_charge(newpage, memcg);
63027071
63037072 local_irq_save(flags);
6304
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7073
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63057074 memcg_check_events(memcg, newpage);
63067075 local_irq_restore(flags);
63077076 }
....@@ -6326,7 +7095,7 @@
63267095 goto out;
63277096 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63287097 goto out;
6329
- if (css_tryget_online(&memcg->css))
7098
+ if (css_tryget(&memcg->css))
63307099 sk->sk_memcg = memcg;
63317100 out:
63327101 rcu_read_unlock();
....@@ -6404,7 +7173,7 @@
64047173 if (!strcmp(token, "nokmem"))
64057174 cgroup_memory_nokmem = true;
64067175 }
6407
- return 0;
7176
+ return 1;
64087177 }
64097178 __setup("cgroup.memory=", cgroup_memory);
64107179
....@@ -6419,17 +7188,6 @@
64197188 static int __init mem_cgroup_init(void)
64207189 {
64217190 int cpu, node;
6422
-
6423
-#ifdef CONFIG_MEMCG_KMEM
6424
- /*
6425
- * Kmem cache creation is mostly done with the slab_mutex held,
6426
- * so use a workqueue with limited concurrency to avoid stalling
6427
- * all worker threads in case lots of cgroups are created and
6428
- * destroyed simultaneously.
6429
- */
6430
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6431
- BUG_ON(!memcg_kmem_cache_wq);
6432
-#endif
64337191
64347192 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64357193 memcg_hotplug_cpu_dead);
....@@ -6457,7 +7215,7 @@
64577215 #ifdef CONFIG_MEMCG_SWAP
64587216 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64597217 {
6460
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7218
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64617219 /*
64627220 		 * The root cgroup cannot be destroyed, so its refcount must
64637221 * always be >= 1.
....@@ -6489,7 +7247,10 @@
64897247 VM_BUG_ON_PAGE(PageLRU(page), page);
64907248 VM_BUG_ON_PAGE(page_count(page), page);
64917249
6492
- if (!do_memsw_account())
7250
+ if (mem_cgroup_disabled())
7251
+ return;
7252
+
7253
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64937254 return;
64947255
64957256 memcg = page->mem_cgroup;
....@@ -6504,7 +7265,7 @@
65047265 * ancestor for the swap instead and transfer the memory+swap charge.
65057266 */
65067267 swap_memcg = mem_cgroup_id_get_online(memcg);
6507
- nr_entries = hpage_nr_pages(page);
7268
+ nr_entries = thp_nr_pages(page);
65087269 /* Get references for the tail pages, too */
65097270 if (nr_entries > 1)
65107271 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6518,7 +7279,7 @@
65187279 if (!mem_cgroup_is_root(memcg))
65197280 page_counter_uncharge(&memcg->memory, nr_entries);
65207281
6521
- if (memcg != swap_memcg) {
7282
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65227283 if (!mem_cgroup_is_root(swap_memcg))
65237284 page_counter_charge(&swap_memcg->memsw, nr_entries);
65247285 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6531,16 +7292,14 @@
65317292 * only synchronisation we have for updating the per-CPU variables.
65327293 */
65337294 VM_BUG_ON(!irqs_disabled());
6534
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6535
- -nr_entries);
7295
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65367296 memcg_check_events(memcg, page);
65377297
6538
- if (!mem_cgroup_is_root(memcg))
6539
- css_put_many(&memcg->css, nr_entries);
7298
+ css_put(&memcg->css);
65407299 }
65417300
65427301 /**
6543
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7302
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65447303 * @page: page being added to swap
65457304 * @entry: swap entry to charge
65467305 *
@@ -6548,14 +7307,14 @@
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-	unsigned int nr_pages = hpage_nr_pages(page);
+	unsigned int nr_pages = thp_nr_pages(page);
 	struct page_counter *counter;
 	struct mem_cgroup *memcg;
 	unsigned short oldid;
 
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;
 
 	memcg = page->mem_cgroup;
@@ -6571,7 +7330,7 @@
 
 	memcg = mem_cgroup_id_get_online(memcg);
 
-	if (!mem_cgroup_is_root(memcg) &&
+	if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
 		memcg_memory_event(memcg, MEMCG_SWAP_MAX);
 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -6590,23 +7349,20 @@
 }
 
 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
-
-	if (!do_swap_account)
-		return;
 
 	id = swap_cgroup_record(entry, 0, nr_pages);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (memcg) {
-		if (!mem_cgroup_is_root(memcg)) {
+		if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 				page_counter_uncharge(&memcg->swap, nr_pages);
 			else
@@ -6622,7 +7378,7 @@
 {
 	long nr_swap_pages = get_nr_swap_pages();
 
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return nr_swap_pages;
 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
 		nr_swap_pages = min_t(long, nr_swap_pages,
@@ -6639,36 +7395,33 @@
 
 	if (vm_swap_full())
 		return true;
-	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+	if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;
 
 	memcg = page->mem_cgroup;
 	if (!memcg)
 		return false;
 
-	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+		unsigned long usage = page_counter_read(&memcg->swap);
+
+		if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+		    usage * 2 >= READ_ONCE(memcg->swap.max))
 			return true;
+	}
 
 	return false;
 }
 
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
 {
 	if (!strcmp(s, "1"))
-		really_do_swap_account = 1;
+		cgroup_memory_noswap = 0;
 	else if (!strcmp(s, "0"))
-		really_do_swap_account = 0;
+		cgroup_memory_noswap = 1;
 	return 1;
 }
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
 
 static u64 swap_current_read(struct cgroup_subsys_state *css,
 			     struct cftype *cft)
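Two behavioural points in the hunk above are easy to miss: mem_cgroup_swap_full() now treats a cgroup's swap as effectively full once usage reaches half of the tighter of swap.high and swap.max (both read with READ_ONCE()), and the renamed setup_swap_account() handler flips cgroup_memory_noswap directly instead of going through really_do_swap_account. A standalone sketch of the half-full arithmetic follows; the function and values are illustrative only, not kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the check above: "full" once usage * 2 reaches high or max. */
static bool swap_looks_full(unsigned long usage,
			    unsigned long high, unsigned long max)
{
	return usage * 2 >= high || usage * 2 >= max;
}

int main(void)
{
	/* swap.max of 131072 pages (512 MiB with 4 KiB pages), swap.high unset */
	printf("%d\n", swap_looks_full(65536, ~0UL, 131072));	/* 1: half of max reached */
	printf("%d\n", swap_looks_full(65535, ~0UL, 131072));	/* 0: still below half */
	return 0;
}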
@@ -6678,17 +7431,33 @@
 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }
 
+static int swap_high_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+			       char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long high;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &high);
+	if (err)
+		return err;
+
+	page_counter_set_high(&memcg->swap, high);
+
+	return nbytes;
+}
+
 static int swap_max_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long max = READ_ONCE(memcg->swap.max);
-
-	if (max == PAGE_COUNTER_MAX)
-		seq_puts(m, "max\n");
-	else
-		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
-	return 0;
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
 }
 
 static ssize_t swap_max_write(struct kernfs_open_file *of,
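In swap_high_write() above, page_counter_memparse() accepts either the literal "max" or a byte count (with the usual K/M/G suffixes), converts it to pages, and page_counter_set_high() records the new high watermark; returning nbytes reports a successful kernfs write. A simplified standalone sketch of that parse step, with no suffix handling and illustrative names only:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EX_PAGE_SIZE 4096UL	/* stand-in for PAGE_SIZE */

/* "max" means no limit; anything else is a byte count rounded down to pages. */
static int parse_high(const char *buf, unsigned long *nr_pages)
{
	char *end;
	unsigned long long bytes;

	if (!strcmp(buf, "max")) {
		*nr_pages = ULONG_MAX;	/* the kernel uses PAGE_COUNTER_MAX here */
		return 0;
	}
	bytes = strtoull(buf, &end, 10);
	if (*end != '\0')
		return -1;
	*nr_pages = bytes / EX_PAGE_SIZE;
	return 0;
}

int main(void)
{
	unsigned long high;

	if (!parse_high("1073741824", &high))	/* 1 GiB */
		printf("swap.high = %lu pages\n", high);	/* 262144 */
	return 0;
}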
@@ -6710,8 +7479,10 @@
 
 static int swap_events_show(struct seq_file *m, void *v)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+	seq_printf(m, "high %lu\n",
+		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
 	seq_printf(m, "max %lu\n",
 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
 	seq_printf(m, "fail %lu\n",
@@ -6725,6 +7496,12 @@
 		.name = "swap.current",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_current_read,
+	},
+	{
+		.name = "swap.high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_high_show,
+		.write = swap_high_write,
 	},
 	{
 		.name = "swap.max",
@@ -6741,7 +7518,7 @@
 	{ }	/* terminate */
 };
 
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -6768,17 +7545,27 @@
 	{ },	/* terminate */
 };
 
+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
+ * remains set to false even when memcg is disabled via "cgroup_disable=memory"
+ * boot parameter. This may result in premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() function in corner cases.
+ */
 static int __init mem_cgroup_swap_init(void)
 {
-	if (!mem_cgroup_disabled() && really_do_swap_account) {
-		do_swap_account = 1;
-		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
-					       swap_files));
-		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-						  memsw_cgroup_files));
-	}
+	/* No memory control -> no swap control */
+	if (mem_cgroup_disabled())
+		cgroup_memory_noswap = true;
+
+	if (cgroup_memory_noswap)
+		return 0;
+
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
 	return 0;
 }
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_MEMCG_SWAP */
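For reference, and not part of this diff: initcall levels run in ascending order, so moving the registration from subsys_initcall() to core_initcall() makes mem_cgroup_swap_init() run several boot stages earlier, which is what lets it force cgroup_memory_noswap on before anything can reach mem_cgroup_get_nr_swap_pages(), as the comment added above the function explains. The relevant level definitions from include/linux/init.h:

/* A lower initcall level runs earlier during boot. */
#define core_initcall(fn)		__define_initcall(fn, 1)
#define postcore_initcall(fn)		__define_initcall(fn, 2)
#define arch_initcall(fn)		__define_initcall(fn, 3)
#define subsys_initcall(fn)		__define_initcall(fn, 4)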