hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/mm/memcontrol.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /* memcontrol.c - Memory Controller
23 *
34 * Copyright IBM Corporation, 2007
....@@ -19,26 +20,17 @@
1920 * Lockless page tracking & accounting
2021 * Unified hierarchy configuration model
2122 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22
- *
23
- * This program is free software; you can redistribute it and/or modify
24
- * it under the terms of the GNU General Public License as published by
25
- * the Free Software Foundation; either version 2 of the License, or
26
- * (at your option) any later version.
27
- *
28
- * This program is distributed in the hope that it will be useful,
29
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
30
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31
- * GNU General Public License for more details.
3223 */
3324
3425 #include <linux/page_counter.h>
3526 #include <linux/memcontrol.h>
3627 #include <linux/cgroup.h>
37
-#include <linux/mm.h>
28
+#include <linux/pagewalk.h>
3829 #include <linux/sched/mm.h>
3930 #include <linux/shmem_fs.h>
4031 #include <linux/hugetlb.h>
4132 #include <linux/pagemap.h>
33
+#include <linux/vm_event_item.h>
4234 #include <linux/smp.h>
4335 #include <linux/page-flags.h>
4436 #include <linux/backing-dev.h>
....@@ -65,21 +57,26 @@
6557 #include <linux/lockdep.h>
6658 #include <linux/file.h>
6759 #include <linux/tracehook.h>
60
+#include <linux/psi.h>
61
+#include <linux/seq_buf.h>
6862 #include "internal.h"
6963 #include <net/sock.h>
7064 #include <net/ip.h>
7165 #include "slab.h"
66
+#include <linux/local_lock.h>
7267
7368 #include <linux/uaccess.h>
7469
7570 #include <trace/events/vmscan.h>
71
+#include <trace/hooks/mm.h>
7672
7773 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
7874 EXPORT_SYMBOL(memory_cgrp_subsys);
7975
8076 struct mem_cgroup *root_mem_cgroup __read_mostly;
8177
82
-#define MEM_CGROUP_RECLAIM_RETRIES 5
78
+/* Active memory cgroup to use from an interrupt context */
79
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
8380
8481 /* Socket memory accounting disabled? */
8582 static bool cgroup_memory_nosocket;
....@@ -89,28 +86,30 @@
8986
9087 /* Whether the swap controller is active */
9188 #ifdef CONFIG_MEMCG_SWAP
92
-int do_swap_account __read_mostly;
89
+bool cgroup_memory_noswap __read_mostly;
9390 #else
94
-#define do_swap_account 0
91
+#define cgroup_memory_noswap 1
9592 #endif
93
+
94
+#ifdef CONFIG_CGROUP_WRITEBACK
95
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
96
+#endif
97
+
98
+struct event_lock {
99
+ local_lock_t l;
100
+};
101
+static DEFINE_PER_CPU(struct event_lock, event_lock) = {
102
+ .l = INIT_LOCAL_LOCK(l),
103
+};
96104
97105 /* Whether legacy memory+swap accounting is active */
98106 static bool do_memsw_account(void)
99107 {
100
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
108
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
101109 }
102
-
103
-static const char *const mem_cgroup_lru_names[] = {
104
- "inactive_anon",
105
- "active_anon",
106
- "inactive_file",
107
- "active_file",
108
- "unevictable",
109
-};
110110
111111 #define THRESHOLDS_EVENTS_TARGET 128
112112 #define SOFTLIMIT_EVENTS_TARGET 1024
113
-#define NUMAINFO_EVENTS_TARGET 1024
114113
115114 /*
116115 * Cgroups above their limits are maintained in a RB-Tree, independent of
....@@ -210,14 +209,6 @@
210209 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
211210 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
212211
213
-enum charge_type {
214
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
215
- MEM_CGROUP_CHARGE_TYPE_ANON,
216
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
217
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
218
- NR_CHARGE_TYPE,
219
-};
220
-
221212 /* for encoding cft->private value on file */
222213 enum res_type {
223214 _MEM,
....@@ -248,7 +239,7 @@
248239 iter != NULL; \
249240 iter = mem_cgroup_iter(NULL, iter, NULL))
250241
251
-static inline bool should_force_charge(void)
242
+static inline bool task_is_dying(void)
252243 {
253244 return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
254245 (current->flags & PF_EXITING);
....@@ -268,8 +259,100 @@
268259 }
269260
270261 #ifdef CONFIG_MEMCG_KMEM
262
+static DEFINE_SPINLOCK(objcg_lock);
263
+
264
+static void obj_cgroup_release(struct percpu_ref *ref)
265
+{
266
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
267
+ struct mem_cgroup *memcg;
268
+ unsigned int nr_bytes;
269
+ unsigned int nr_pages;
270
+ unsigned long flags;
271
+
272
+ /*
273
+ * At this point all allocated objects are freed, and
274
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
275
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
276
+ *
277
+ * The following sequence can lead to it:
278
+ * 1) CPU0: objcg == stock->cached_objcg
279
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
280
+ * PAGE_SIZE bytes are charged
281
+ * 3) CPU1: a process from another memcg is allocating something,
282
+	 *    the stock is flushed,
283
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
284
+	 * 4) CPU0: we release this object,
285
+	 *    92 bytes are added to stock->nr_bytes
286
+	 * 5) CPU0: the stock is flushed,
287
+ * 92 bytes are added to objcg->nr_charged_bytes
288
+ *
289
+	 * As a result, nr_charged_bytes == PAGE_SIZE.
290
+ * This page will be uncharged in obj_cgroup_release().
291
+ */
292
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
293
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
294
+ nr_pages = nr_bytes >> PAGE_SHIFT;
295
+
296
+ spin_lock_irqsave(&objcg_lock, flags);
297
+ memcg = obj_cgroup_memcg(objcg);
298
+ if (nr_pages)
299
+ __memcg_kmem_uncharge(memcg, nr_pages);
300
+ list_del(&objcg->list);
301
+ mem_cgroup_put(memcg);
302
+ spin_unlock_irqrestore(&objcg_lock, flags);
303
+
304
+ percpu_ref_exit(ref);
305
+ kfree_rcu(objcg, rcu);
306
+}
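To make the race described in the comment above concrete, here is the same sequence worked through with numbers (assuming PAGE_SIZE is 4096; the 92-byte object is the comment's own example):

  - CPU1 charges a whole page for the 92-byte allocation: the objcg is charged
    4096 bytes, and 4096 - 92 = 4004 bytes remain in CPU1's per-cpu stock.
  - CPU1's stock is flushed on behalf of another memcg, so the leftover lands
    in objcg->nr_charged_bytes = 4004.
  - CPU0 frees the 92-byte object; those 92 bytes go into CPU0's stock.
  - CPU0's stock is flushed: objcg->nr_charged_bytes = 4004 + 92 = 4096, i.e.
    exactly PAGE_SIZE.
  - obj_cgroup_release() therefore sees a whole number of pages (nr_pages = 1),
    the WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)) stays quiet, and the page is
    uncharged via __memcg_kmem_uncharge().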
307
+
308
+static struct obj_cgroup *obj_cgroup_alloc(void)
309
+{
310
+ struct obj_cgroup *objcg;
311
+ int ret;
312
+
313
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
314
+ if (!objcg)
315
+ return NULL;
316
+
317
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
318
+ GFP_KERNEL);
319
+ if (ret) {
320
+ kfree(objcg);
321
+ return NULL;
322
+ }
323
+ INIT_LIST_HEAD(&objcg->list);
324
+ return objcg;
325
+}
326
+
327
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
328
+ struct mem_cgroup *parent)
329
+{
330
+ struct obj_cgroup *objcg, *iter;
331
+
332
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
333
+
334
+ spin_lock_irq(&objcg_lock);
335
+
336
+ /* Move active objcg to the parent's list */
337
+ xchg(&objcg->memcg, parent);
338
+ css_get(&parent->css);
339
+ list_add(&objcg->list, &parent->objcg_list);
340
+
341
+ /* Move already reparented objcgs to the parent's list */
342
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
343
+ css_get(&parent->css);
344
+ xchg(&iter->memcg, parent);
345
+ css_put(&memcg->css);
346
+ }
347
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
348
+
349
+ spin_unlock_irq(&objcg_lock);
350
+
351
+ percpu_ref_kill(&objcg->refcnt);
352
+}
353
+
271354 /*
272
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
355
+ * This will be used as a shrinker list's index.
273356 * The main reason for not using cgroup id for this:
274357 * this works better in sparse environments, where we have a lot of memcgs,
275358 * but only a few kmem-limited. Or also, if we have, for instance, 200
....@@ -312,14 +395,13 @@
312395
313396 /*
314397 * A lot of the calls to the cache allocation functions are expected to be
315
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
398
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
316399 * conditional to this static branch, we'll have to allow modules that does
317400 * kmem_cache_alloc and the such to see this symbol as well
318401 */
319402 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
320403 EXPORT_SYMBOL(memcg_kmem_enabled_key);
321
-
322
-struct workqueue_struct *memcg_kmem_cache_wq;
404
+#endif
323405
324406 static int memcg_shrinker_map_size;
325407 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
....@@ -344,7 +426,7 @@
344426 if (!old)
345427 return 0;
346428
347
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
429
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
348430 if (!new)
349431 return -ENOMEM;
350432
....@@ -388,7 +470,7 @@
388470 mutex_lock(&memcg_shrinker_map_mutex);
389471 size = memcg_shrinker_map_size;
390472 for_each_node(nid) {
391
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
473
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
392474 if (!map) {
393475 memcg_free_shrinker_maps(memcg);
394476 ret = -ENOMEM;
....@@ -445,14 +527,6 @@
445527 }
446528 }
447529
448
-#else /* CONFIG_MEMCG_KMEM */
449
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
450
-{
451
- return 0;
452
-}
453
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
454
-#endif /* CONFIG_MEMCG_KMEM */
455
-
456530 /**
457531 * mem_cgroup_css_from_page - css of the memcg associated with a page
458532 * @page: page of interest
....@@ -495,7 +569,17 @@
495569 unsigned long ino = 0;
496570
497571 rcu_read_lock();
498
- memcg = READ_ONCE(page->mem_cgroup);
572
+ memcg = page->mem_cgroup;
573
+
574
+ /*
575
+ * The lowest bit set means that memcg isn't a valid
576
+ * memcg pointer, but a obj_cgroups pointer.
577
+ * In this case the page is shared and doesn't belong
578
+ * to any specific memory cgroup.
579
+ */
580
+ if ((unsigned long) memcg & 0x1UL)
581
+ memcg = NULL;
582
+
499583 while (memcg && !(memcg->css.flags & CSS_ONLINE))
500584 memcg = parent_mem_cgroup(memcg);
501585 if (memcg)
....@@ -671,7 +755,7 @@
671755 */
672756 __mem_cgroup_remove_exceeded(mz, mctz);
673757 if (!soft_limit_excess(mz->memcg) ||
674
- !css_tryget_online(&mz->memcg->css))
758
+ !css_tryget(&mz->memcg->css))
675759 goto retry;
676760 done:
677761 return mz;
....@@ -688,33 +772,187 @@
688772 return mz;
689773 }
690774
691
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
692
- int event)
775
+/**
776
+ * __mod_memcg_state - update cgroup memory statistics
777
+ * @memcg: the memory cgroup
778
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
779
+ * @val: delta to add to the counter, can be negative
780
+ */
781
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
693782 {
694
- return atomic_long_read(&memcg->events[event]);
783
+ long x, threshold = MEMCG_CHARGE_BATCH;
784
+
785
+ if (mem_cgroup_disabled())
786
+ return;
787
+
788
+ if (memcg_stat_item_in_bytes(idx))
789
+ threshold <<= PAGE_SHIFT;
790
+
791
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
792
+ if (unlikely(abs(x) > threshold)) {
793
+ struct mem_cgroup *mi;
794
+
795
+ /*
796
+ * Batch local counters to keep them in sync with
797
+ * the hierarchical ones.
798
+ */
799
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
800
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
801
+ atomic_long_add(x, &mi->vmstats[idx]);
802
+ x = 0;
803
+ }
804
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
805
+}
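The batching above is easier to see outside the kernel. Below is a minimal userspace C sketch of the same idea (not kernel code: NR_CPUS, THRESHOLD and the percpu_delta[] array are illustrative stand-ins for the real per-cpu machinery and MEMCG_CHARGE_BATCH): updates accumulate in a per-CPU delta and are folded into the shared counter only once their absolute value crosses the threshold, so the shared counter can lag by at most NR_CPUS * THRESHOLD.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS   4
#define THRESHOLD 64			/* stands in for MEMCG_CHARGE_BATCH */

static atomic_long shared;		/* stands in for memcg->vmstats[idx] */
static long percpu_delta[NR_CPUS];	/* stands in for vmstats_percpu->stat[idx] */

static void mod_state(int cpu, long val)
{
	long x = percpu_delta[cpu] + val;

	if (labs(x) > THRESHOLD) {
		/* flush the batched delta into the shared counter */
		atomic_fetch_add(&shared, x);
		x = 0;
	}
	percpu_delta[cpu] = x;
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		mod_state(i % NR_CPUS, 1);

	/* prints a value within NR_CPUS * THRESHOLD of 1000 */
	printf("shared = %ld\n", atomic_load(&shared));
	return 0;
}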
806
+
807
+static struct mem_cgroup_per_node *
808
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
809
+{
810
+ struct mem_cgroup *parent;
811
+
812
+ parent = parent_mem_cgroup(pn->memcg);
813
+ if (!parent)
814
+ return NULL;
815
+ return mem_cgroup_nodeinfo(parent, nid);
816
+}
817
+
818
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
819
+ int val)
820
+{
821
+ struct mem_cgroup_per_node *pn;
822
+ struct mem_cgroup *memcg;
823
+ long x, threshold = MEMCG_CHARGE_BATCH;
824
+
825
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
826
+ memcg = pn->memcg;
827
+
828
+ preempt_disable_rt();
829
+ /* Update memcg */
830
+ __mod_memcg_state(memcg, idx, val);
831
+
832
+ /* Update lruvec */
833
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
834
+
835
+ if (vmstat_item_in_bytes(idx))
836
+ threshold <<= PAGE_SHIFT;
837
+
838
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
839
+ if (unlikely(abs(x) > threshold)) {
840
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
841
+ struct mem_cgroup_per_node *pi;
842
+
843
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
844
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
845
+ x = 0;
846
+ }
847
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
848
+ preempt_enable_rt();
849
+}
850
+
851
+/**
852
+ * __mod_lruvec_state - update lruvec memory statistics
853
+ * @lruvec: the lruvec
854
+ * @idx: the stat item
855
+ * @val: delta to add to the counter, can be negative
856
+ *
857
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
858
+ * function updates all three counters that are affected by a
859
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
860
+ */
861
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
862
+ int val)
863
+{
864
+ /* Update node */
865
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
866
+
867
+ /* Update memcg and lruvec */
868
+ if (!mem_cgroup_disabled())
869
+ __mod_memcg_lruvec_state(lruvec, idx, val);
870
+}
871
+
872
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
873
+{
874
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
875
+ struct mem_cgroup *memcg;
876
+ struct lruvec *lruvec;
877
+
878
+ rcu_read_lock();
879
+ memcg = mem_cgroup_from_obj(p);
880
+
881
+ /*
882
+ * Untracked pages have no memcg, no lruvec. Update only the
883
+ * node. If we reparent the slab objects to the root memcg,
884
+ * when we free the slab object, we need to update the per-memcg
885
+ * vmstats to keep it correct for the root memcg.
886
+ */
887
+ if (!memcg) {
888
+ __mod_node_page_state(pgdat, idx, val);
889
+ } else {
890
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
891
+ __mod_lruvec_state(lruvec, idx, val);
892
+ }
893
+ rcu_read_unlock();
894
+}
895
+
896
+void mod_memcg_obj_state(void *p, int idx, int val)
897
+{
898
+ struct mem_cgroup *memcg;
899
+
900
+ rcu_read_lock();
901
+ memcg = mem_cgroup_from_obj(p);
902
+ if (memcg)
903
+ mod_memcg_state(memcg, idx, val);
904
+ rcu_read_unlock();
905
+}
906
+
907
+/**
908
+ * __count_memcg_events - account VM events in a cgroup
909
+ * @memcg: the memory cgroup
910
+ * @idx: the event item
911
+ * @count: the number of events that occurred
912
+ */
913
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
914
+ unsigned long count)
915
+{
916
+ unsigned long x;
917
+
918
+ if (mem_cgroup_disabled())
919
+ return;
920
+
921
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
922
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
923
+ struct mem_cgroup *mi;
924
+
925
+ /*
926
+ * Batch local counters to keep them in sync with
927
+ * the hierarchical ones.
928
+ */
929
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
930
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
931
+ atomic_long_add(x, &mi->vmevents[idx]);
932
+ x = 0;
933
+ }
934
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
935
+}
936
+
937
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
938
+{
939
+ return atomic_long_read(&memcg->vmevents[event]);
940
+}
941
+
942
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
943
+{
944
+ long x = 0;
945
+ int cpu;
946
+
947
+ for_each_possible_cpu(cpu)
948
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
949
+ return x;
695950 }
696951
697952 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
698953 struct page *page,
699
- bool compound, int nr_pages)
954
+ int nr_pages)
700955 {
701
- /*
702
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
703
- * counted as CACHE even if it's on ANON LRU.
704
- */
705
- if (PageAnon(page))
706
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
707
- else {
708
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
709
- if (PageSwapBacked(page))
710
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
711
- }
712
-
713
- if (compound) {
714
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
715
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
716
- }
717
-
718956 /* pagein of a big page is an event. So, ignore page size */
719957 if (nr_pages > 0)
720958 __count_memcg_events(memcg, PGPGIN, 1);
....@@ -723,35 +961,7 @@
723961 nr_pages = -nr_pages; /* for event */
724962 }
725963
726
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
727
-}
728
-
729
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
730
- int nid, unsigned int lru_mask)
731
-{
732
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
733
- unsigned long nr = 0;
734
- enum lru_list lru;
735
-
736
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
737
-
738
- for_each_lru(lru) {
739
- if (!(BIT(lru) & lru_mask))
740
- continue;
741
- nr += mem_cgroup_get_lru_size(lruvec, lru);
742
- }
743
- return nr;
744
-}
745
-
746
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
747
- unsigned int lru_mask)
748
-{
749
- unsigned long nr = 0;
750
- int nid;
751
-
752
- for_each_node_state(nid, N_MEMORY)
753
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
754
- return nr;
964
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
755965 }
756966
757967 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
....@@ -759,8 +969,8 @@
759969 {
760970 unsigned long val, next;
761971
762
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
763
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
972
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
973
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
764974 /* from time_after() in jiffies.h */
765975 if ((long)(next - val) < 0) {
766976 switch (target) {
....@@ -770,13 +980,10 @@
770980 case MEM_CGROUP_TARGET_SOFTLIMIT:
771981 next = val + SOFTLIMIT_EVENTS_TARGET;
772982 break;
773
- case MEM_CGROUP_TARGET_NUMAINFO:
774
- next = val + NUMAINFO_EVENTS_TARGET;
775
- break;
776983 default:
777984 break;
778985 }
779
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
986
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
780987 return true;
781988 }
782989 return false;
....@@ -792,21 +999,12 @@
792999 if (unlikely(mem_cgroup_event_ratelimit(memcg,
7931000 MEM_CGROUP_TARGET_THRESH))) {
7941001 bool do_softlimit;
795
- bool do_numainfo __maybe_unused;
7961002
7971003 do_softlimit = mem_cgroup_event_ratelimit(memcg,
7981004 MEM_CGROUP_TARGET_SOFTLIMIT);
799
-#if MAX_NUMNODES > 1
800
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
801
- MEM_CGROUP_TARGET_NUMAINFO);
802
-#endif
8031005 mem_cgroup_threshold(memcg);
8041006 if (unlikely(do_softlimit))
8051007 mem_cgroup_update_tree(memcg, page);
806
-#if MAX_NUMNODES > 1
807
- if (unlikely(do_numainfo))
808
- atomic_inc(&memcg->numainfo_events);
809
-#endif
8101008 }
8111009 }
8121010
....@@ -874,27 +1072,60 @@
8741072 return NULL;
8751073
8761074 rcu_read_lock();
877
- if (!memcg || !css_tryget_online(&memcg->css))
1075
+	/* The page should not get uncharged and its memcg freed under us. */
1076
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
8781077 memcg = root_mem_cgroup;
8791078 rcu_read_unlock();
8801079 return memcg;
8811080 }
8821081 EXPORT_SYMBOL(get_mem_cgroup_from_page);
8831082
1083
+static __always_inline struct mem_cgroup *active_memcg(void)
1084
+{
1085
+ if (in_interrupt())
1086
+ return this_cpu_read(int_active_memcg);
1087
+ else
1088
+ return current->active_memcg;
1089
+}
1090
+
1091
+static __always_inline struct mem_cgroup *get_active_memcg(void)
1092
+{
1093
+ struct mem_cgroup *memcg;
1094
+
1095
+ rcu_read_lock();
1096
+ memcg = active_memcg();
1097
+ /* remote memcg must hold a ref. */
1098
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
1099
+ memcg = root_mem_cgroup;
1100
+ rcu_read_unlock();
1101
+
1102
+ return memcg;
1103
+}
1104
+
1105
+static __always_inline bool memcg_kmem_bypass(void)
1106
+{
1107
+ /* Allow remote memcg charging from any context. */
1108
+ if (unlikely(active_memcg()))
1109
+ return false;
1110
+
1111
+ /* Memcg to charge can't be determined. */
1112
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
1113
+ return true;
1114
+
1115
+ return false;
1116
+}
1117
+
8841118 /**
885
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
1119
+ * If active memcg is set, do not fall back to current->mm->memcg.
8861120 */
8871121 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
8881122 {
889
- if (unlikely(current->active_memcg)) {
890
- struct mem_cgroup *memcg = root_mem_cgroup;
1123
+ if (memcg_kmem_bypass())
1124
+ return NULL;
8911125
892
- rcu_read_lock();
893
- if (css_tryget_online(&current->active_memcg->css))
894
- memcg = current->active_memcg;
895
- rcu_read_unlock();
896
- return memcg;
897
- }
1126
+ if (unlikely(active_memcg()))
1127
+ return get_active_memcg();
1128
+
8981129 return get_mem_cgroup_from_mm(current->mm);
8991130 }
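Taken together, the helpers above give get_mem_cgroup_from_current() a simple decision order. An active memcg always wins: in interrupt context that is the per-cpu int_active_memcg, otherwise current->active_memcg, and get_active_memcg() takes a css reference on it. With no active memcg set, charging is bypassed entirely (NULL is returned) in interrupt context, for PF_KTHREAD tasks, and for tasks without an mm, since no memcg can be determined there. Only then does the charge fall back to current->mm's memcg via get_mem_cgroup_from_mm().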
9001131
....@@ -911,15 +1142,15 @@
9111142 * invocations for reference counting, or use mem_cgroup_iter_break()
9121143 * to cancel a hierarchy walk before the round-trip is complete.
9131144 *
914
- * Reclaimers can specify a node and a priority level in @reclaim to
915
- * divide up the memcgs in the hierarchy among all concurrent
916
- * reclaimers operating on the same node and priority.
1145
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
1146
+ * in the hierarchy among all concurrent reclaimers operating on the
1147
+ * same node.
9171148 */
9181149 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
9191150 struct mem_cgroup *prev,
9201151 struct mem_cgroup_reclaim_cookie *reclaim)
9211152 {
922
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1153
+ struct mem_cgroup_reclaim_iter *iter;
9231154 struct cgroup_subsys_state *css = NULL;
9241155 struct mem_cgroup *memcg = NULL;
9251156 struct mem_cgroup *pos = NULL;
....@@ -945,7 +1176,7 @@
9451176 struct mem_cgroup_per_node *mz;
9461177
9471178 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
948
- iter = &mz->iter[reclaim->priority];
1179
+ iter = &mz->iter;
9491180
9501181 if (prev && reclaim->generation != iter->generation)
9511182 goto out_unlock;
....@@ -1045,15 +1276,11 @@
10451276 struct mem_cgroup_reclaim_iter *iter;
10461277 struct mem_cgroup_per_node *mz;
10471278 int nid;
1048
- int i;
10491279
10501280 for_each_node(nid) {
10511281 mz = mem_cgroup_nodeinfo(from, nid);
1052
- for (i = 0; i <= DEF_PRIORITY; i++) {
1053
- iter = &mz->iter[i];
1054
- cmpxchg(&iter->position,
1055
- dead_memcg, NULL);
1056
- }
1282
+ iter = &mz->iter;
1283
+ cmpxchg(&iter->position, dead_memcg, NULL);
10571284 }
10581285 }
10591286
....@@ -1103,7 +1330,7 @@
11031330 struct css_task_iter it;
11041331 struct task_struct *task;
11051332
1106
- css_task_iter_start(&iter->css, 0, &it);
1333
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
11071334 while (!ret && (task = css_task_iter_next(&it)))
11081335 ret = fn(task, arg);
11091336 css_task_iter_end(&it);
....@@ -1120,9 +1347,8 @@
11201347 * @page: the page
11211348 * @pgdat: pgdat of the page
11221349 *
1123
- * This function is only safe when following the LRU page isolation
1124
- * and putback protocol: the LRU lock must be held, and the page must
1125
- * either be PageLRU() or the caller must have isolated/allocated it.
1350
+ * This function relies on page->mem_cgroup being stable - see the
1351
+ * access rules in commit_charge().
11261352 */
11271353 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
11281354 {
....@@ -1131,7 +1357,7 @@
11311357 struct lruvec *lruvec;
11321358
11331359 if (mem_cgroup_disabled()) {
1134
- lruvec = &pgdat->lruvec;
1360
+ lruvec = &pgdat->__lruvec;
11351361 goto out;
11361362 }
11371363
....@@ -1155,6 +1381,38 @@
11551381 lruvec->pgdat = pgdat;
11561382 return lruvec;
11571383 }
1384
+
1385
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
1386
+{
1387
+ struct lruvec *lruvec;
1388
+
1389
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
1390
+
1391
+ return lruvec;
1392
+}
1393
+EXPORT_SYMBOL_GPL(page_to_lruvec);
1394
+
1395
+void do_traversal_all_lruvec(void)
1396
+{
1397
+ pg_data_t *pgdat;
1398
+
1399
+ for_each_online_pgdat(pgdat) {
1400
+ struct mem_cgroup *memcg = NULL;
1401
+
1402
+ spin_lock_irq(&pgdat->lru_lock);
1403
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
1404
+ do {
1405
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
1406
+
1407
+ trace_android_vh_do_traversal_lruvec(lruvec);
1408
+
1409
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
1410
+ } while (memcg);
1411
+
1412
+ spin_unlock_irq(&pgdat->lru_lock);
1413
+ }
1414
+}
1415
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
11581416
11591417 /**
11601418 * mem_cgroup_update_lru_size - account for adding or removing an lru page
....@@ -1195,32 +1453,6 @@
11951453 *lru_size += nr_pages;
11961454 }
11971455
1198
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1199
-{
1200
- struct mem_cgroup *task_memcg;
1201
- struct task_struct *p;
1202
- bool ret;
1203
-
1204
- p = find_lock_task_mm(task);
1205
- if (p) {
1206
- task_memcg = get_mem_cgroup_from_mm(p->mm);
1207
- task_unlock(p);
1208
- } else {
1209
- /*
1210
- * All threads may have already detached their mm's, but the oom
1211
- * killer still needs to detect if they have already been oom
1212
- * killed to prevent needlessly killing additional tasks.
1213
- */
1214
- rcu_read_lock();
1215
- task_memcg = mem_cgroup_from_task(task);
1216
- css_get(&task_memcg->css);
1217
- rcu_read_unlock();
1218
- }
1219
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
1220
- css_put(&task_memcg->css);
1221
- return ret;
1222
-}
1223
-
12241456 /**
12251457 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
12261458 * @memcg: the memory cgroup
....@@ -1242,7 +1474,7 @@
12421474 if (do_memsw_account()) {
12431475 count = page_counter_read(&memcg->memsw);
12441476 limit = READ_ONCE(memcg->memsw.max);
1245
- if (count <= limit)
1477
+ if (count < limit)
12461478 margin = min(margin, limit - count);
12471479 else
12481480 margin = 0;
....@@ -1296,85 +1528,199 @@
12961528 return false;
12971529 }
12981530
1299
-static const unsigned int memcg1_stats[] = {
1300
- MEMCG_CACHE,
1301
- MEMCG_RSS,
1302
- MEMCG_RSS_HUGE,
1303
- NR_SHMEM,
1304
- NR_FILE_MAPPED,
1305
- NR_FILE_DIRTY,
1306
- NR_WRITEBACK,
1307
- MEMCG_SWAP,
1531
+struct memory_stat {
1532
+ const char *name;
1533
+ unsigned int ratio;
1534
+ unsigned int idx;
13081535 };
13091536
1310
-static const char *const memcg1_stat_names[] = {
1311
- "cache",
1312
- "rss",
1313
- "rss_huge",
1314
- "shmem",
1315
- "mapped_file",
1316
- "dirty",
1317
- "writeback",
1318
- "swap",
1537
+static struct memory_stat memory_stats[] = {
1538
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
1539
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
1540
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
1541
+ { "percpu", 1, MEMCG_PERCPU_B },
1542
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
1543
+ { "shmem", PAGE_SIZE, NR_SHMEM },
1544
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
1545
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
1546
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
1547
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1548
+ /*
1549
+ * The ratio will be initialized in memory_stats_init(). Because
1550
+ * on some architectures, the macro of HPAGE_PMD_SIZE is not
1551
+ * constant(e.g. powerpc).
1552
+ */
1553
+ { "anon_thp", 0, NR_ANON_THPS },
1554
+#endif
1555
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
1556
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
1557
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
1558
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
1559
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
1560
+
1561
+ /*
1562
+	 * Note: slab_reclaimable and slab_unreclaimable must be listed
1563
+	 * together, with slab_reclaimable first.
1564
+ */
1565
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
1566
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
1567
+
1568
+ /* The memory events */
1569
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
1570
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
1571
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
1572
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
1573
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
1574
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
1575
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
13191576 };
1577
+
1578
+static int __init memory_stats_init(void)
1579
+{
1580
+ int i;
1581
+
1582
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1583
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1584
+ if (memory_stats[i].idx == NR_ANON_THPS)
1585
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
1586
+#endif
1587
+ VM_BUG_ON(!memory_stats[i].ratio);
1588
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
1589
+ }
1590
+
1591
+ return 0;
1592
+}
1593
+pure_initcall(memory_stats_init);
1594
+
1595
+static char *memory_stat_format(struct mem_cgroup *memcg)
1596
+{
1597
+ struct seq_buf s;
1598
+ int i;
1599
+
1600
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1601
+ if (!s.buffer)
1602
+ return NULL;
1603
+
1604
+ /*
1605
+ * Provide statistics on the state of the memory subsystem as
1606
+ * well as cumulative event counters that show past behavior.
1607
+ *
1608
+ * This list is ordered following a combination of these gradients:
1609
+ * 1) generic big picture -> specifics and details
1610
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
1611
+ *
1612
+ * Current memory state:
1613
+ */
1614
+
1615
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1616
+ u64 size;
1617
+
1618
+ size = memcg_page_state(memcg, memory_stats[i].idx);
1619
+ size *= memory_stats[i].ratio;
1620
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1621
+
1622
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1623
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
1624
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
1625
+ seq_buf_printf(&s, "slab %llu\n", size);
1626
+ }
1627
+ }
1628
+
1629
+ /* Accumulated memory events */
1630
+
1631
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1632
+ memcg_events(memcg, PGFAULT));
1633
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1634
+ memcg_events(memcg, PGMAJFAULT));
1635
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
1636
+ memcg_events(memcg, PGREFILL));
1637
+ seq_buf_printf(&s, "pgscan %lu\n",
1638
+ memcg_events(memcg, PGSCAN_KSWAPD) +
1639
+ memcg_events(memcg, PGSCAN_DIRECT));
1640
+ seq_buf_printf(&s, "pgsteal %lu\n",
1641
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
1642
+ memcg_events(memcg, PGSTEAL_DIRECT));
1643
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1644
+ memcg_events(memcg, PGACTIVATE));
1645
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1646
+ memcg_events(memcg, PGDEACTIVATE));
1647
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1648
+ memcg_events(memcg, PGLAZYFREE));
1649
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1650
+ memcg_events(memcg, PGLAZYFREED));
1651
+
1652
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1653
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1654
+ memcg_events(memcg, THP_FAULT_ALLOC));
1655
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1656
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
1657
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1658
+
1659
+ /* The above should easily fit into one page */
1660
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1661
+
1662
+ return s.buffer;
1663
+}
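The name/ratio/idx table drives the loop above: every counter is converted to bytes before it is printed, whatever unit it is maintained in (pages, KB for kernel stacks, or bytes for slab and percpu). A small userspace C sketch of that idea, with made-up values and page_size standing in for PAGE_SIZE:

#include <stdio.h>

struct memory_stat {
	const char *name;
	unsigned int ratio;	/* multiplier that yields bytes */
	unsigned long value;	/* stands in for memcg_page_state(memcg, idx) */
};

int main(void)
{
	const unsigned int page_size = 4096;	/* assumed 4 KiB pages */
	const struct memory_stat stats[] = {
		{ "anon",	  page_size, 12000 },	/* counted in pages */
		{ "kernel_stack", 1024,	     256   },	/* counted in KB */
		{ "percpu",	  1,	     81920 },	/* already in bytes */
	};

	for (size_t i = 0; i < sizeof(stats) / sizeof(stats[0]); i++)
		printf("%s %llu\n", stats[i].name,
		       (unsigned long long)stats[i].value * stats[i].ratio);
	return 0;
}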
13201664
13211665 #define K(x) ((x) << (PAGE_SHIFT-10))
13221666 /**
1323
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1667
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
1668
+ * memory controller.
13241669 * @memcg: The memory cgroup that went over limit
13251670 * @p: Task that is going to be killed
13261671 *
13271672 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
13281673 * enabled
13291674 */
1330
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1675
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
13311676 {
1332
- struct mem_cgroup *iter;
1333
- unsigned int i;
1334
-
13351677 rcu_read_lock();
13361678
1679
+ if (memcg) {
1680
+ pr_cont(",oom_memcg=");
1681
+ pr_cont_cgroup_path(memcg->css.cgroup);
1682
+ } else
1683
+ pr_cont(",global_oom");
13371684 if (p) {
1338
- pr_info("Task in ");
1685
+ pr_cont(",task_memcg=");
13391686 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1340
- pr_cont(" killed as a result of limit of ");
1341
- } else {
1342
- pr_info("Memory limit reached of cgroup ");
13431687 }
1344
-
1345
- pr_cont_cgroup_path(memcg->css.cgroup);
1346
- pr_cont("\n");
1347
-
13481688 rcu_read_unlock();
1689
+}
1690
+
1691
+/**
1692
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1693
+ * memory controller.
1694
+ * @memcg: The memory cgroup that went over limit
1695
+ */
1696
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1697
+{
1698
+ char *buf;
13491699
13501700 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
13511701 K((u64)page_counter_read(&memcg->memory)),
1352
- K((u64)memcg->memory.max), memcg->memory.failcnt);
1353
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1354
- K((u64)page_counter_read(&memcg->memsw)),
1355
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1356
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1357
- K((u64)page_counter_read(&memcg->kmem)),
1358
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1359
-
1360
- for_each_mem_cgroup_tree(iter, memcg) {
1361
- pr_info("Memory cgroup stats for ");
1362
- pr_cont_cgroup_path(iter->css.cgroup);
1363
- pr_cont(":");
1364
-
1365
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1366
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1367
- continue;
1368
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
1369
- K(memcg_page_state(iter, memcg1_stats[i])));
1370
- }
1371
-
1372
- for (i = 0; i < NR_LRU_LISTS; i++)
1373
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1374
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1375
-
1376
- pr_cont("\n");
1702
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1703
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1704
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1705
+ K((u64)page_counter_read(&memcg->swap)),
1706
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1707
+ else {
1708
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1709
+ K((u64)page_counter_read(&memcg->memsw)),
1710
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1711
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1712
+ K((u64)page_counter_read(&memcg->kmem)),
1713
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
13771714 }
1715
+
1716
+ pr_info("Memory cgroup stats for ");
1717
+ pr_cont_cgroup_path(memcg->css.cgroup);
1718
+ pr_cont(":");
1719
+ buf = memory_stat_format(memcg);
1720
+ if (!buf)
1721
+ return;
1722
+ pr_info("%s", buf);
1723
+ kfree(buf);
13781724 }
13791725
13801726 /*
....@@ -1382,19 +1728,26 @@
13821728 */
13831729 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
13841730 {
1385
- unsigned long max;
1731
+ unsigned long max = READ_ONCE(memcg->memory.max);
13861732
1387
- max = memcg->memory.max;
1388
- if (mem_cgroup_swappiness(memcg)) {
1389
- unsigned long memsw_max;
1390
- unsigned long swap_max;
1733
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1734
+ if (mem_cgroup_swappiness(memcg))
1735
+ max += min(READ_ONCE(memcg->swap.max),
1736
+ (unsigned long)total_swap_pages);
1737
+ } else { /* v1 */
1738
+ if (mem_cgroup_swappiness(memcg)) {
1739
+ /* Calculate swap excess capacity from memsw limit */
1740
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
13911741
1392
- memsw_max = memcg->memsw.max;
1393
- swap_max = memcg->swap.max;
1394
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
1395
- max = min(max + swap_max, memsw_max);
1742
+ max += min(swap, (unsigned long)total_swap_pages);
1743
+ }
13961744 }
13971745 return max;
1746
+}
1747
+
1748
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1749
+{
1750
+ return page_counter_read(&memcg->memory);
13981751 }
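A worked example of the rewritten limit calculation, with hypothetical limits and assuming enough physical swap that total_swap_pages is not the limiting term (swap only counts when mem_cgroup_swappiness() is non-zero):

  - cgroup v2: memory.max = 1 GiB, swap.max = 512 MiB
      max = 1 GiB + min(512 MiB, total swap) = 1.5 GiB
  - cgroup v1: memory.max = 1 GiB, memsw.max = 1.25 GiB
      swap headroom = memsw.max - memory.max = 256 MiB
      max = 1 GiB + min(256 MiB, total swap) = 1.25 GiB

The v1 result equals the memsw limit itself, which is what the old min(max + swap_max, memsw_max) expression produced; the difference is that the swap headroom is now derived from the memsw limit, which is what actually constrains v1, rather than from swap.max.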
13991752
14001753 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
....@@ -1407,112 +1760,24 @@
14071760 .gfp_mask = gfp_mask,
14081761 .order = order,
14091762 };
1410
- bool ret;
1763
+ bool ret = true;
14111764
14121765 if (mutex_lock_killable(&oom_lock))
14131766 return true;
1767
+
1768
+ if (mem_cgroup_margin(memcg) >= (1 << order))
1769
+ goto unlock;
1770
+
14141771 /*
14151772 * A few threads which were not waiting at mutex_lock_killable() can
14161773 * fail to bail out. Therefore, check again after holding oom_lock.
14171774 */
1418
- ret = should_force_charge() || out_of_memory(&oc);
1775
+ ret = task_is_dying() || out_of_memory(&oc);
1776
+
1777
+unlock:
14191778 mutex_unlock(&oom_lock);
14201779 return ret;
14211780 }
1422
-
1423
-#if MAX_NUMNODES > 1
1424
-
1425
-/**
1426
- * test_mem_cgroup_node_reclaimable
1427
- * @memcg: the target memcg
1428
- * @nid: the node ID to be checked.
1429
- * @noswap : specify true here if the user wants flle only information.
1430
- *
1431
- * This function returns whether the specified memcg contains any
1432
- * reclaimable pages on a node. Returns true if there are any reclaimable
1433
- * pages in the node.
1434
- */
1435
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1436
- int nid, bool noswap)
1437
-{
1438
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1439
- return true;
1440
- if (noswap || !total_swap_pages)
1441
- return false;
1442
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1443
- return true;
1444
- return false;
1445
-
1446
-}
1447
-
1448
-/*
1449
- * Always updating the nodemask is not very good - even if we have an empty
1450
- * list or the wrong list here, we can start from some node and traverse all
1451
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
1452
- *
1453
- */
1454
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1455
-{
1456
- int nid;
1457
- /*
1458
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1459
- * pagein/pageout changes since the last update.
1460
- */
1461
- if (!atomic_read(&memcg->numainfo_events))
1462
- return;
1463
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1464
- return;
1465
-
1466
- /* make a nodemask where this memcg uses memory from */
1467
- memcg->scan_nodes = node_states[N_MEMORY];
1468
-
1469
- for_each_node_mask(nid, node_states[N_MEMORY]) {
1470
-
1471
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1472
- node_clear(nid, memcg->scan_nodes);
1473
- }
1474
-
1475
- atomic_set(&memcg->numainfo_events, 0);
1476
- atomic_set(&memcg->numainfo_updating, 0);
1477
-}
1478
-
1479
-/*
1480
- * Selecting a node where we start reclaim from. Because what we need is just
1481
- * reducing usage counter, start from anywhere is O,K. Considering
1482
- * memory reclaim from current node, there are pros. and cons.
1483
- *
1484
- * Freeing memory from current node means freeing memory from a node which
1485
- * we'll use or we've used. So, it may make LRU bad. And if several threads
1486
- * hit limits, it will see a contention on a node. But freeing from remote
1487
- * node means more costs for memory reclaim because of memory latency.
1488
- *
1489
- * Now, we use round-robin. Better algorithm is welcomed.
1490
- */
1491
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1492
-{
1493
- int node;
1494
-
1495
- mem_cgroup_may_update_nodemask(memcg);
1496
- node = memcg->last_scanned_node;
1497
-
1498
- node = next_node_in(node, memcg->scan_nodes);
1499
- /*
1500
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1501
- * last time it really checked all the LRUs due to rate limiting.
1502
- * Fallback to the current node in that case for simplicity.
1503
- */
1504
- if (unlikely(node == MAX_NUMNODES))
1505
- node = numa_node_id();
1506
-
1507
- memcg->last_scanned_node = node;
1508
- return node;
1509
-}
1510
-#else
1511
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1512
-{
1513
- return 0;
1514
-}
1515
-#endif
15161781
15171782 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
15181783 pg_data_t *pgdat,
....@@ -1526,7 +1791,6 @@
15261791 unsigned long nr_scanned;
15271792 struct mem_cgroup_reclaim_cookie reclaim = {
15281793 .pgdat = pgdat,
1529
- .priority = 0,
15301794 };
15311795
15321796 excess = soft_limit_excess(root_memcg);
....@@ -1621,7 +1885,7 @@
16211885 struct mem_cgroup *iter;
16221886
16231887 spin_lock(&memcg_oom_lock);
1624
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1888
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
16251889 for_each_mem_cgroup_tree(iter, memcg)
16261890 iter->oom_lock = false;
16271891 spin_unlock(&memcg_oom_lock);
....@@ -1642,8 +1906,8 @@
16421906 struct mem_cgroup *iter;
16431907
16441908 /*
1645
- * When a new child is created while the hierarchy is under oom,
1646
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1909
+	 * Be careful about under_oom underflows because a child memcg
1910
+ * could have been added after mem_cgroup_mark_under_oom.
16471911 */
16481912 spin_lock(&memcg_oom_lock);
16491913 for_each_mem_cgroup_tree(iter, memcg)
....@@ -1703,6 +1967,8 @@
17031967
17041968 if (order > PAGE_ALLOC_COSTLY_ORDER)
17051969 return OOM_SKIPPED;
1970
+
1971
+ memcg_memory_event(memcg, MEMCG_OOM);
17061972
17071973 /*
17081974 * We are in the middle of the charge context here, so we
....@@ -1851,6 +2117,14 @@
18512117 goto out;
18522118
18532119 /*
2120
+ * If the victim task has been asynchronously moved to a different
2121
+ * memory cgroup, we might end up killing tasks outside oom_domain.
2122
+ * In this case it's better to ignore memory.group.oom.
2123
+ */
2124
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2125
+ goto out;
2126
+
2127
+ /*
18542128 * Traverse the memory cgroup hierarchy from the victim task's
18552129 * cgroup up to the OOMing cgroup (or root) to find the
18562130 * highest-level memory cgroup with oom.group set.
....@@ -1891,6 +2165,7 @@
18912165 */
18922166 struct mem_cgroup *lock_page_memcg(struct page *page)
18932167 {
2168
+ struct page *head = compound_head(page); /* rmap on tail pages */
18942169 struct mem_cgroup *memcg;
18952170 unsigned long flags;
18962171
....@@ -1910,7 +2185,7 @@
19102185 if (mem_cgroup_disabled())
19112186 return NULL;
19122187 again:
1913
- memcg = page->mem_cgroup;
2188
+ memcg = head->mem_cgroup;
19142189 if (unlikely(!memcg))
19152190 return NULL;
19162191
....@@ -1918,7 +2193,7 @@
19182193 return memcg;
19192194
19202195 spin_lock_irqsave(&memcg->move_lock, flags);
1921
- if (memcg != page->mem_cgroup) {
2196
+ if (memcg != head->mem_cgroup) {
19222197 spin_unlock_irqrestore(&memcg->move_lock, flags);
19232198 goto again;
19242199 }
....@@ -1961,19 +2236,44 @@
19612236 */
19622237 void unlock_page_memcg(struct page *page)
19632238 {
1964
- __unlock_page_memcg(page->mem_cgroup);
2239
+ struct page *head = compound_head(page);
2240
+
2241
+ __unlock_page_memcg(head->mem_cgroup);
19652242 }
19662243 EXPORT_SYMBOL(unlock_page_memcg);
19672244
19682245 struct memcg_stock_pcp {
2246
+ local_lock_t lock;
19692247 struct mem_cgroup *cached; /* this never be root cgroup */
19702248 unsigned int nr_pages;
2249
+
2250
+#ifdef CONFIG_MEMCG_KMEM
2251
+ struct obj_cgroup *cached_objcg;
2252
+ unsigned int nr_bytes;
2253
+#endif
2254
+
19712255 struct work_struct work;
19722256 unsigned long flags;
19732257 #define FLUSHING_CACHED_CHARGE 0
19742258 };
19752259 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
19762260 static DEFINE_MUTEX(percpu_charge_mutex);
2261
+
2262
+#ifdef CONFIG_MEMCG_KMEM
2263
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
2264
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2265
+ struct mem_cgroup *root_memcg);
2266
+
2267
+#else
2268
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
2269
+{
2270
+}
2271
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
2272
+ struct mem_cgroup *root_memcg)
2273
+{
2274
+ return false;
2275
+}
2276
+#endif
19772277
19782278 /**
19792279 * consume_stock: Try to consume stocked charge on this cpu.
....@@ -1995,7 +2295,7 @@
19952295 if (nr_pages > MEMCG_CHARGE_BATCH)
19962296 return ret;
19972297
1998
- local_irq_save(flags);
2298
+ local_lock_irqsave(&memcg_stock.lock, flags);
19992299
20002300 stock = this_cpu_ptr(&memcg_stock);
20012301 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
....@@ -2003,7 +2303,7 @@
20032303 ret = true;
20042304 }
20052305
2006
- local_irq_restore(flags);
2306
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20072307
20082308 return ret;
20092309 }
....@@ -2015,13 +2315,17 @@
20152315 {
20162316 struct mem_cgroup *old = stock->cached;
20172317
2318
+ if (!old)
2319
+ return;
2320
+
20182321 if (stock->nr_pages) {
20192322 page_counter_uncharge(&old->memory, stock->nr_pages);
20202323 if (do_memsw_account())
20212324 page_counter_uncharge(&old->memsw, stock->nr_pages);
2022
- css_put_many(&old->css, stock->nr_pages);
20232325 stock->nr_pages = 0;
20242326 }
2327
+
2328
+ css_put(&old->css);
20252329 stock->cached = NULL;
20262330 }
20272331
....@@ -2034,13 +2338,14 @@
20342338 * The only protection from memory hotplug vs. drain_stock races is
20352339 * that we always operate on local CPU stock here with IRQ disabled
20362340 */
2037
- local_irq_save(flags);
2341
+ local_lock_irqsave(&memcg_stock.lock, flags);
20382342
20392343 stock = this_cpu_ptr(&memcg_stock);
2344
+ drain_obj_stock(stock);
20402345 drain_stock(stock);
20412346 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
20422347
2043
- local_irq_restore(flags);
2348
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20442349 }
20452350
20462351 /*
....@@ -2052,11 +2357,12 @@
20522357 struct memcg_stock_pcp *stock;
20532358 unsigned long flags;
20542359
2055
- local_irq_save(flags);
2360
+ local_lock_irqsave(&memcg_stock.lock, flags);
20562361
20572362 stock = this_cpu_ptr(&memcg_stock);
20582363 if (stock->cached != memcg) { /* reset if necessary */
20592364 drain_stock(stock);
2365
+ css_get(&memcg->css);
20602366 stock->cached = memcg;
20612367 }
20622368 stock->nr_pages += nr_pages;
....@@ -2064,7 +2370,7 @@
20642370 if (stock->nr_pages > MEMCG_CHARGE_BATCH)
20652371 drain_stock(stock);
20662372
2067
- local_irq_restore(flags);
2373
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
20682374 }
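The conversion in this and the surrounding hunks (and the event_lock added near the top of the file) follows one pattern: the per-cpu data gains a local_lock_t member, and the local_irq_save()/local_irq_restore() pairs that protected it become local_lock_irqsave()/local_unlock_irqrestore() on that member. On !PREEMPT_RT builds this still compiles down to disabling interrupts; on PREEMPT_RT it becomes a per-cpu sleeping lock, which is the point of the change. A kernel-style sketch of the pattern (the struct and function names below are made up for illustration, not part of this patch):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcp_cache {			/* hypothetical per-cpu cache */
	local_lock_t lock;
	unsigned long nr;
};

static DEFINE_PER_CPU(struct pcp_cache, pcp_cache) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void pcp_cache_add(unsigned long n)
{
	unsigned long flags;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(&pcp_cache.lock, flags);
	this_cpu_add(pcp_cache.nr, n);
	/* was: local_irq_restore(flags); */
	local_unlock_irqrestore(&pcp_cache.lock, flags);
}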
20692375
20702376 /*
....@@ -2084,34 +2390,37 @@
20842390 * as well as workers from this path always operate on the local
20852391 * per-cpu data. CPU up doesn't touch memcg_stock at all.
20862392 */
2087
- curcpu = get_cpu();
2393
+ curcpu = get_cpu_light();
20882394 for_each_online_cpu(cpu) {
20892395 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
20902396 struct mem_cgroup *memcg;
2397
+ bool flush = false;
20912398
2399
+ rcu_read_lock();
20922400 memcg = stock->cached;
2093
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2094
- continue;
2095
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2096
- css_put(&memcg->css);
2097
- continue;
2098
- }
2099
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2401
+ if (memcg && stock->nr_pages &&
2402
+ mem_cgroup_is_descendant(memcg, root_memcg))
2403
+ flush = true;
2404
+ if (obj_stock_flush_required(stock, root_memcg))
2405
+ flush = true;
2406
+ rcu_read_unlock();
2407
+
2408
+ if (flush &&
2409
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
21002410 if (cpu == curcpu)
21012411 drain_local_stock(&stock->work);
21022412 else
21032413 schedule_work_on(cpu, &stock->work);
21042414 }
2105
- css_put(&memcg->css);
21062415 }
2107
- put_cpu();
2416
+ put_cpu_light();
21082417 mutex_unlock(&percpu_charge_mutex);
21092418 }
21102419
21112420 static int memcg_hotplug_cpu_dead(unsigned int cpu)
21122421 {
21132422 struct memcg_stock_pcp *stock;
2114
- struct mem_cgroup *memcg;
2423
+ struct mem_cgroup *memcg, *mi;
21152424
21162425 stock = &per_cpu(memcg_stock, cpu);
21172426 drain_stock(stock);
....@@ -2123,9 +2432,10 @@
21232432 int nid;
21242433 long x;
21252434
2126
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2435
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
21272436 if (x)
2128
- atomic_long_add(x, &memcg->stat[i]);
2437
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2438
+ atomic_long_add(x, &memcg->vmstats[i]);
21292439
21302440 if (i >= NR_VM_NODE_STAT_ITEMS)
21312441 continue;
....@@ -2136,32 +2446,48 @@
21362446 pn = mem_cgroup_nodeinfo(memcg, nid);
21372447 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
21382448 if (x)
2139
- atomic_long_add(x, &pn->lruvec_stat[i]);
2449
+ do {
2450
+ atomic_long_add(x, &pn->lruvec_stat[i]);
2451
+ } while ((pn = parent_nodeinfo(pn, nid)));
21402452 }
21412453 }
21422454
21432455 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
21442456 long x;
21452457
2146
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2458
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
21472459 if (x)
2148
- atomic_long_add(x, &memcg->events[i]);
2460
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
2461
+ atomic_long_add(x, &memcg->vmevents[i]);
21492462 }
21502463 }
21512464
21522465 return 0;
21532466 }
21542467
2155
-static void reclaim_high(struct mem_cgroup *memcg,
2156
- unsigned int nr_pages,
2157
- gfp_t gfp_mask)
2468
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
2469
+ unsigned int nr_pages,
2470
+ gfp_t gfp_mask)
21582471 {
2472
+ unsigned long nr_reclaimed = 0;
2473
+
21592474 do {
2160
- if (page_counter_read(&memcg->memory) <= memcg->high)
2475
+ unsigned long pflags;
2476
+
2477
+ if (page_counter_read(&memcg->memory) <=
2478
+ READ_ONCE(memcg->memory.high))
21612479 continue;
2480
+
21622481 memcg_memory_event(memcg, MEMCG_HIGH);
2163
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2164
- } while ((memcg = parent_mem_cgroup(memcg)));
2482
+
2483
+ psi_memstall_enter(&pflags);
2484
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2485
+ gfp_mask, true);
2486
+ psi_memstall_leave(&pflags);
2487
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2488
+ !mem_cgroup_is_root(memcg));
2489
+
2490
+ return nr_reclaimed;
21652491 }
21662492
21672493 static void high_work_func(struct work_struct *work)
....@@ -2173,35 +2499,238 @@
21732499 }
21742500
21752501 /*
2502
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2503
+ * enough to still cause a significant slowdown in most cases, while still
2504
+ * allowing diagnostics and tracing to proceed without becoming stuck.
2505
+ */
2506
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2507
+
2508
+/*
2509
+ * When calculating the delay, we use these either side of the exponentiation to
2510
+ * maintain precision and scale to a reasonable number of jiffies (see the table
2511
+ * below).
2512
+ *
2513
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2514
+ * overage ratio to a delay.
2515
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2516
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
2517
+ * to produce a reasonable delay curve.
2518
+ *
2519
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2520
+ * reasonable delay curve compared to precision-adjusted overage, not
2521
+ * penalising heavily at first, but still making sure that growth beyond the
2522
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2523
+ * example, with a high of 100 megabytes:
2524
+ *
2525
+ * +-------+------------------------+
2526
+ * | usage | time to allocate in ms |
2527
+ * +-------+------------------------+
2528
+ * | 100M | 0 |
2529
+ * | 101M | 6 |
2530
+ * | 102M | 25 |
2531
+ * | 103M | 57 |
2532
+ * | 104M | 102 |
2533
+ * | 105M | 159 |
2534
+ * | 106M | 230 |
2535
+ * | 107M | 313 |
2536
+ * | 108M | 409 |
2537
+ * | 109M | 518 |
2538
+ * | 110M | 639 |
2539
+ * | 111M | 774 |
2540
+ * | 112M | 921 |
2541
+ * | 113M | 1081 |
2542
+ * | 114M | 1254 |
2543
+ * | 115M | 1439 |
2544
+ * | 116M | 1638 |
2545
+ * | 117M | 1849 |
2546
+ * | 118M | 2000 |
2547
+ * | 119M | 2000 |
2548
+ * | 120M | 2000 |
2549
+ * +-------+------------------------+
2550
+ */
2551
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
2552
+ #define MEMCG_DELAY_SCALING_SHIFT 14
2553
+
2554
+static u64 calculate_overage(unsigned long usage, unsigned long high)
2555
+{
2556
+ u64 overage;
2557
+
2558
+ if (usage <= high)
2559
+ return 0;
2560
+
2561
+ /*
2562
+ * Prevent division by 0 in overage calculation by acting as if
2563
+ * it was a threshold of 1 page
2564
+ */
2565
+ high = max(high, 1UL);
2566
+
2567
+ overage = usage - high;
2568
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2569
+ return div64_u64(overage, high);
2570
+}
2571
+
2572
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2573
+{
2574
+ u64 overage, max_overage = 0;
2575
+
2576
+ do {
2577
+ overage = calculate_overage(page_counter_read(&memcg->memory),
2578
+ READ_ONCE(memcg->memory.high));
2579
+ max_overage = max(overage, max_overage);
2580
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2581
+ !mem_cgroup_is_root(memcg));
2582
+
2583
+ return max_overage;
2584
+}
2585
+
2586
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2587
+{
2588
+ u64 overage, max_overage = 0;
2589
+
2590
+ do {
2591
+ overage = calculate_overage(page_counter_read(&memcg->swap),
2592
+ READ_ONCE(memcg->swap.high));
2593
+ if (overage)
2594
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2595
+ max_overage = max(overage, max_overage);
2596
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
2597
+ !mem_cgroup_is_root(memcg));
2598
+
2599
+ return max_overage;
2600
+}
2601
+
2602
+/*
2603
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
2604
+ * is exceeding its memory.high by checking both it and its ancestors.
2605
+ */
2606
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2607
+ unsigned int nr_pages,
2608
+ u64 max_overage)
2609
+{
2610
+ unsigned long penalty_jiffies;
2611
+
2612
+ if (!max_overage)
2613
+ return 0;
2614
+
2615
+ /*
2616
+ * We use overage compared to memory.high to calculate the number of
2617
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
2618
+ * fairly lenient on small overages, and increasingly harsh when the
2619
+ * memcg in question makes it clear that it has no intention of stopping
2620
+ * its crazy behaviour, so we exponentially increase the delay based on
2621
+ * overage amount.
2622
+ */
2623
+ penalty_jiffies = max_overage * max_overage * HZ;
2624
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2625
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2626
+
2627
+ /*
2628
+ * Factor in the task's own contribution to the overage, such that four
2629
+ * N-sized allocations are throttled approximately the same as one
2630
+ * 4N-sized allocation.
2631
+ *
2632
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2633
+	 * larger the current charge batch is than that.
2634
+ */
2635
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2636
+}
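As a cross-check of the table above, the 110M row worked through (assuming 4 KiB pages, HZ = 1000, and nr_pages equal to MEMCG_CHARGE_BATCH so the final scaling factor is 1):

  usage = 110 MiB = 28160 pages, high = 100 MiB = 25600 pages
  overage         = ((28160 - 25600) << 20) / 25600        ~= 104857
  penalty_jiffies = (104857 * 104857 * 1000) >> 20 >> 14   ~= 639

which is roughly 639 ms of throttling per allocation batch, matching the "| 110M | 639 |" entry. The 118M-120M rows flatten at 2000 ms because mem_cgroup_handle_over_high() clamps the result to MEMCG_MAX_HIGH_DELAY_JIFFIES.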
2637
+
2638
+/*
21762639 * Scheduled by try_charge() to be executed from the userland return path
21772640 * and reclaims memory over the high limit.
21782641 */
21792642 void mem_cgroup_handle_over_high(void)
21802643 {
2644
+ unsigned long penalty_jiffies;
2645
+ unsigned long pflags;
2646
+ unsigned long nr_reclaimed;
21812647 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2648
+ int nr_retries = MAX_RECLAIM_RETRIES;
21822649 struct mem_cgroup *memcg;
2650
+ bool in_retry = false;
21832651
21842652 if (likely(!nr_pages))
21852653 return;
21862654
21872655 memcg = get_mem_cgroup_from_mm(current->mm);
2188
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
2189
- css_put(&memcg->css);
21902656 current->memcg_nr_pages_over_high = 0;
2657
+
2658
+retry_reclaim:
2659
+ /*
2660
+ * The allocating task should reclaim at least the batch size, but for
2661
+ * subsequent retries we only want to do what's necessary to prevent oom
2662
+ * or breaching resource isolation.
2663
+ *
2664
+ * This is distinct from memory.max or page allocator behaviour because
2665
+ * memory.high is currently batched, whereas memory.max and the page
2666
+ * allocator run every time an allocation is made.
2667
+ */
2668
+ nr_reclaimed = reclaim_high(memcg,
2669
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2670
+ GFP_KERNEL);
2671
+
2672
+ /*
2673
+ * memory.high is breached and reclaim is unable to keep up. Throttle
2674
+ * allocators proactively to slow down excessive growth.
2675
+ */
2676
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2677
+ mem_find_max_overage(memcg));
2678
+
2679
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2680
+ swap_find_max_overage(memcg));
2681
+
2682
+ /*
2683
+ * Clamp the max delay per usermode return so as to still keep the
2684
+ * application moving forwards and also permit diagnostics, albeit
2685
+ * extremely slowly.
2686
+ */
2687
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2688
+
2689
+ /*
2690
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
2691
+ * that it's not even worth doing, in an attempt to be nice to those who
2692
+ * go only a small amount over their memory.high value and maybe haven't
2693
+ * been aggressively reclaimed enough yet.
2694
+ */
2695
+ if (penalty_jiffies <= HZ / 100)
2696
+ goto out;
2697
+
2698
+ /*
2699
+ * If reclaim is making forward progress but we're still over
2700
+ * memory.high, we want to encourage that rather than doing allocator
2701
+ * throttling.
2702
+ */
2703
+ if (nr_reclaimed || nr_retries--) {
2704
+ in_retry = true;
2705
+ goto retry_reclaim;
2706
+ }
2707
+
2708
+ /*
2709
+ * If we exit early, we're guaranteed to die (since
2710
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2711
+ * need to account for any ill-begotten jiffies to pay them off later.
2712
+ */
2713
+ psi_memstall_enter(&pflags);
2714
+ schedule_timeout_killable(penalty_jiffies);
2715
+ psi_memstall_leave(&pflags);
2716
+
2717
+out:
2718
+ css_put(&memcg->css);
21912719 }
21922720
21932721 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
21942722 unsigned int nr_pages)
21952723 {
21962724 unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2197
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2725
+ int nr_retries = MAX_RECLAIM_RETRIES;
21982726 struct mem_cgroup *mem_over_limit;
21992727 struct page_counter *counter;
2728
+ enum oom_status oom_status;
22002729 unsigned long nr_reclaimed;
2730
+ bool passed_oom = false;
22012731 bool may_swap = true;
22022732 bool drained = false;
2203
- bool oomed = false;
2204
- enum oom_status oom_status;
2733
+ unsigned long pflags;
22052734
22062735 if (mem_cgroup_is_root(memcg))
22072736 return 0;
....@@ -2236,15 +2765,6 @@
22362765 goto force;
22372766
22382767 /*
2239
- * Unlike in global OOM situations, memcg is not in a physical
2240
- * memory shortage. Allow dying and OOM-killed tasks to
2241
- * bypass the last charges so that they can exit quickly and
2242
- * free their memory.
2243
- */
2244
- if (unlikely(should_force_charge()))
2245
- goto force;
2246
-
2247
- /*
22482768 * Prevent unbounded recursion when reclaim operations need to
22492769 * allocate memory. This might exceed the limits temporarily,
22502770 * but we prefer facilitating memory reclaim and getting back
....@@ -2261,8 +2781,10 @@
22612781
22622782 memcg_memory_event(mem_over_limit, MEMCG_MAX);
22632783
2784
+ psi_memstall_enter(&pflags);
22642785 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
22652786 gfp_mask, may_swap);
2787
+ psi_memstall_leave(&pflags);
22662788
22672789 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
22682790 goto retry;
....@@ -2296,16 +2818,15 @@
22962818 if (nr_retries--)
22972819 goto retry;
22982820
2299
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2821
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
23002822 goto nomem;
23012823
23022824 if (gfp_mask & __GFP_NOFAIL)
23032825 goto force;
23042826
2305
- if (fatal_signal_pending(current))
2306
- goto force;
2307
-
2308
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
2827
+ /* Avoid endless loop for tasks bypassed by the oom killer */
2828
+ if (passed_oom && task_is_dying())
2829
+ goto nomem;
23092830
23102831 /*
23112832 * keep retrying as long as the memcg oom killer is able to make
....@@ -2314,15 +2835,10 @@
23142835 */
23152836 oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
23162837 get_order(nr_pages * PAGE_SIZE));
2317
- switch (oom_status) {
2318
- case OOM_SUCCESS:
2319
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2320
- oomed = true;
2838
+ if (oom_status == OOM_SUCCESS) {
2839
+ passed_oom = true;
2840
+ nr_retries = MAX_RECLAIM_RETRIES;
23212841 goto retry;
2322
- case OOM_FAILED:
2323
- goto force;
2324
- default:
2325
- goto nomem;
23262842 }
23272843 nomem:
23282844 if (!(gfp_mask & __GFP_NOFAIL))
....@@ -2336,12 +2852,10 @@
23362852 page_counter_charge(&memcg->memory, nr_pages);
23372853 if (do_memsw_account())
23382854 page_counter_charge(&memcg->memsw, nr_pages);
2339
- css_get_many(&memcg->css, nr_pages);
23402855
23412856 return 0;
23422857
23432858 done_restock:
2344
- css_get_many(&memcg->css, batch);
23452859 if (batch > nr_pages)
23462860 refill_stock(memcg, batch - nr_pages);
23472861
....@@ -2355,12 +2869,32 @@
23552869 * reclaim, the cost of mismatch is negligible.
23562870 */
23572871 do {
2358
- if (page_counter_read(&memcg->memory) > memcg->high) {
2359
- /* Don't bother a random interrupted task */
2360
- if (in_interrupt()) {
2872
+ bool mem_high, swap_high;
2873
+
2874
+ mem_high = page_counter_read(&memcg->memory) >
2875
+ READ_ONCE(memcg->memory.high);
2876
+ swap_high = page_counter_read(&memcg->swap) >
2877
+ READ_ONCE(memcg->swap.high);
2878
+
2879
+ /* Don't bother a random interrupted task */
2880
+ if (in_interrupt()) {
2881
+ if (mem_high) {
23612882 schedule_work(&memcg->high_work);
23622883 break;
23632884 }
2885
+ continue;
2886
+ }
2887
+
2888
+ if (mem_high || swap_high) {
2889
+ /*
2890
+ * The allocating tasks in this cgroup will need to do
2891
+ * reclaim or be throttled to prevent further growth
2892
+ * of the memory or swap footprints.
2893
+ *
2894
+ * Target some best-effort fairness between the tasks,
2895
+ * and distribute reclaim work and delay penalties
2896
+ * based on how much each task is actually allocating.
2897
+ */
23642898 current->memcg_nr_pages_over_high += batch;
23652899 set_notify_resume(current);
23662900 break;
....@@ -2370,6 +2904,7 @@
23702904 return 0;
23712905 }
23722906
2907
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
23732908 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
23742909 {
23752910 if (mem_cgroup_is_root(memcg))
....@@ -2378,76 +2913,124 @@
23782913 page_counter_uncharge(&memcg->memory, nr_pages);
23792914 if (do_memsw_account())
23802915 page_counter_uncharge(&memcg->memsw, nr_pages);
2381
-
2382
- css_put_many(&memcg->css, nr_pages);
23832916 }
2917
+#endif
23842918
2385
-static void lock_page_lru(struct page *page, int *isolated)
2919
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
23862920 {
2387
- struct zone *zone = page_zone(page);
2388
-
2389
- spin_lock_irq(zone_lru_lock(zone));
2390
- if (PageLRU(page)) {
2391
- struct lruvec *lruvec;
2392
-
2393
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2394
- ClearPageLRU(page);
2395
- del_page_from_lru_list(page, lruvec, page_lru(page));
2396
- *isolated = 1;
2397
- } else
2398
- *isolated = 0;
2399
-}
2400
-
2401
-static void unlock_page_lru(struct page *page, int isolated)
2402
-{
2403
- struct zone *zone = page_zone(page);
2404
-
2405
- if (isolated) {
2406
- struct lruvec *lruvec;
2407
-
2408
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2409
- VM_BUG_ON_PAGE(PageLRU(page), page);
2410
- SetPageLRU(page);
2411
- add_page_to_lru_list(page, lruvec, page_lru(page));
2412
- }
2413
- spin_unlock_irq(zone_lru_lock(zone));
2414
-}
2415
-
2416
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2417
- bool lrucare)
2418
-{
2419
- int isolated;
2420
-
24212921 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2422
-
24232922 /*
2424
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2425
- * may already be on some other mem_cgroup's LRU. Take care of it.
2426
- */
2427
- if (lrucare)
2428
- lock_page_lru(page, &isolated);
2429
-
2430
- /*
2431
- * Nobody should be changing or seriously looking at
2432
- * page->mem_cgroup at this point:
2923
+ * Any of the following ensures page->mem_cgroup stability:
24332924 *
2434
- * - the page is uncharged
2435
- *
2436
- * - the page is off-LRU
2437
- *
2438
- * - an anonymous fault has exclusive page access, except for
2439
- * a locked page table
2440
- *
2441
- * - a page cache insertion, a swapin fault, or a migration
2442
- * have the page locked
2925
+ * - the page lock
2926
+ * - LRU isolation
2927
+ * - lock_page_memcg()
2928
+ * - exclusive reference
24432929 */
24442930 page->mem_cgroup = memcg;
2445
-
2446
- if (lrucare)
2447
- unlock_page_lru(page, isolated);
24482931 }
24492932
24502933 #ifdef CONFIG_MEMCG_KMEM
2934
+/*
2935
+ * The allocated objcg pointers array is not accounted directly.
2936
+ * Moreover, it should not come from a DMA buffer and is not readily
2937
+ * reclaimable. So those GFP bits should be masked off.
2938
+ */
2939
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
2940
+
2941
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
2942
+ gfp_t gfp)
2943
+{
2944
+ unsigned int objects = objs_per_slab_page(s, page);
2945
+ void *vec;
2946
+
2947
+ gfp &= ~OBJCGS_CLEAR_MASK;
2948
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2949
+ page_to_nid(page));
2950
+ if (!vec)
2951
+ return -ENOMEM;
2952
+
2953
+ if (cmpxchg(&page->obj_cgroups, NULL,
2954
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
2955
+ kfree(vec);
2956
+ else
2957
+ kmemleak_not_leak(vec);
2958
+
2959
+ return 0;
2960
+}
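The cmpxchg() above stores the vector in the word that otherwise holds page->mem_cgroup, with the low pointer bit (always clear for an aligned allocation) set as a type tag; mem_cgroup_from_obj() below uses that bit to tell a plain memcg pointer from an objcg vector. A minimal user-space sketch of the tagging idea, with placeholder types rather than the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct vec   { int dummy; };   /* stands in for the obj_cgroup * array     */
struct owner { int dummy; };   /* stands in for a plain struct mem_cgroup  */

int main(void)
{
    struct vec *v = malloc(sizeof(*v));
    struct owner *o = malloc(sizeof(*o));

    /* allocator results are at least word aligned, so bit 0 is free */
    uintptr_t tagged = (uintptr_t)v | 0x1UL;   /* "this is a vector"  */
    uintptr_t plain  = (uintptr_t)o;           /* "this is the owner" */

    printf("tagged has vector bit: %d\n", (int)(tagged & 0x1UL));
    printf("plain  has vector bit: %d\n", (int)(plain & 0x1UL));

    /* readers strip the tag before dereferencing */
    struct vec *back = (struct vec *)(tagged & ~0x1UL);
    printf("round trip ok: %d\n", back == v);

    free(v);
    free(o);
    return 0;
}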
2961
+
2962
+/*
2963
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
2964
+ *
2965
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2966
+ * cgroup_mutex, etc.
2967
+ */
2968
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
2969
+{
2970
+ struct page *page;
2971
+
2972
+ if (mem_cgroup_disabled())
2973
+ return NULL;
2974
+
2975
+ page = virt_to_head_page(p);
2976
+
2977
+ /*
2978
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
2979
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
2980
+ * bit of the pointer is set.
2981
+ * The page->mem_cgroup pointer can be asynchronously changed
2982
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
2983
+ * from a valid memcg pointer to objcg vector or back.
2984
+ */
2985
+ if (!page->mem_cgroup)
2986
+ return NULL;
2987
+
2988
+ /*
2989
+ * Slab objects are accounted individually, not per-page.
2990
+ * Memcg membership data for each individual object is saved in
2991
+ * the page->obj_cgroups.
2992
+ */
2993
+ if (page_has_obj_cgroups(page)) {
2994
+ struct obj_cgroup *objcg;
2995
+ unsigned int off;
2996
+
2997
+ off = obj_to_index(page->slab_cache, page, p);
2998
+ objcg = page_obj_cgroups(page)[off];
2999
+ if (objcg)
3000
+ return obj_cgroup_memcg(objcg);
3001
+
3002
+ return NULL;
3003
+ }
3004
+
3005
+ /* All other pages use page->mem_cgroup */
3006
+ return page->mem_cgroup;
3007
+}
3008
+
3009
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
3010
+{
3011
+ struct obj_cgroup *objcg = NULL;
3012
+ struct mem_cgroup *memcg;
3013
+
3014
+ if (memcg_kmem_bypass())
3015
+ return NULL;
3016
+
3017
+ rcu_read_lock();
3018
+ if (unlikely(active_memcg()))
3019
+ memcg = active_memcg();
3020
+ else
3021
+ memcg = mem_cgroup_from_task(current);
3022
+
3023
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
3024
+ objcg = rcu_dereference(memcg->objcg);
3025
+ if (objcg && obj_cgroup_tryget(objcg))
3026
+ break;
3027
+ objcg = NULL;
3028
+ }
3029
+ rcu_read_unlock();
3030
+
3031
+ return objcg;
3032
+}
3033
+
24513034 static int memcg_alloc_cache_id(void)
24523035 {
24533036 int id, size;
....@@ -2473,9 +3056,7 @@
24733056 else if (size > MEMCG_CACHES_MAX_SIZE)
24743057 size = MEMCG_CACHES_MAX_SIZE;
24753058
2476
- err = memcg_update_all_caches(size);
2477
- if (!err)
2478
- err = memcg_update_all_list_lrus(size);
3059
+ err = memcg_update_all_list_lrus(size);
24793060 if (!err)
24803061 memcg_nr_cache_ids = size;
24813062
....@@ -2493,152 +3074,17 @@
24933074 ida_simple_remove(&memcg_cache_ida, id);
24943075 }
24953076
2496
-struct memcg_kmem_cache_create_work {
2497
- struct mem_cgroup *memcg;
2498
- struct kmem_cache *cachep;
2499
- struct work_struct work;
2500
-};
2501
-
2502
-static void memcg_kmem_cache_create_func(struct work_struct *w)
2503
-{
2504
- struct memcg_kmem_cache_create_work *cw =
2505
- container_of(w, struct memcg_kmem_cache_create_work, work);
2506
- struct mem_cgroup *memcg = cw->memcg;
2507
- struct kmem_cache *cachep = cw->cachep;
2508
-
2509
- memcg_create_kmem_cache(memcg, cachep);
2510
-
2511
- css_put(&memcg->css);
2512
- kfree(cw);
2513
-}
2514
-
2515
-/*
2516
- * Enqueue the creation of a per-memcg kmem_cache.
2517
- */
2518
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2519
- struct kmem_cache *cachep)
2520
-{
2521
- struct memcg_kmem_cache_create_work *cw;
2522
-
2523
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2524
- if (!cw)
2525
- return;
2526
-
2527
- css_get(&memcg->css);
2528
-
2529
- cw->memcg = memcg;
2530
- cw->cachep = cachep;
2531
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2532
-
2533
- queue_work(memcg_kmem_cache_wq, &cw->work);
2534
-}
2535
-
2536
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2537
- struct kmem_cache *cachep)
2538
-{
2539
- /*
2540
- * We need to stop accounting when we kmalloc, because if the
2541
- * corresponding kmalloc cache is not yet created, the first allocation
2542
- * in __memcg_schedule_kmem_cache_create will recurse.
2543
- *
2544
- * However, it is better to enclose the whole function. Depending on
2545
- * the debugging options enabled, INIT_WORK(), for instance, can
2546
- * trigger an allocation. This too, will make us recurse. Because at
2547
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
2548
- * the safest choice is to do it like this, wrapping the whole function.
2549
- */
2550
- current->memcg_kmem_skip_account = 1;
2551
- __memcg_schedule_kmem_cache_create(memcg, cachep);
2552
- current->memcg_kmem_skip_account = 0;
2553
-}
2554
-
2555
-static inline bool memcg_kmem_bypass(void)
2556
-{
2557
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2558
- return true;
2559
- return false;
2560
-}
2561
-
25623077 /**
2563
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2564
- * @cachep: the original global kmem cache
2565
- *
2566
- * Return the kmem_cache we're supposed to use for a slab allocation.
2567
- * We try to use the current memcg's version of the cache.
2568
- *
2569
- * If the cache does not exist yet, if we are the first user of it, we
2570
- * create it asynchronously in a workqueue and let the current allocation
2571
- * go through with the original cache.
2572
- *
2573
- * This function takes a reference to the cache it returns to assure it
2574
- * won't get destroyed while we are working with it. Once the caller is
2575
- * done with it, memcg_kmem_put_cache() must be called to release the
2576
- * reference.
2577
- */
2578
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2579
-{
2580
- struct mem_cgroup *memcg;
2581
- struct kmem_cache *memcg_cachep;
2582
- int kmemcg_id;
2583
-
2584
- VM_BUG_ON(!is_root_cache(cachep));
2585
-
2586
- if (memcg_kmem_bypass())
2587
- return cachep;
2588
-
2589
- if (current->memcg_kmem_skip_account)
2590
- return cachep;
2591
-
2592
- memcg = get_mem_cgroup_from_current();
2593
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2594
- if (kmemcg_id < 0)
2595
- goto out;
2596
-
2597
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2598
- if (likely(memcg_cachep))
2599
- return memcg_cachep;
2600
-
2601
- /*
2602
- * If we are in a safe context (can wait, and not in interrupt
2603
- * context), we could be be predictable and return right away.
2604
- * This would guarantee that the allocation being performed
2605
- * already belongs in the new cache.
2606
- *
2607
- * However, there are some clashes that can arrive from locking.
2608
- * For instance, because we acquire the slab_mutex while doing
2609
- * memcg_create_kmem_cache, this means no further allocation
2610
- * could happen with the slab_mutex held. So it's better to
2611
- * defer everything.
2612
- */
2613
- memcg_schedule_kmem_cache_create(memcg, cachep);
2614
-out:
2615
- css_put(&memcg->css);
2616
- return cachep;
2617
-}
2618
-
2619
-/**
2620
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2621
- * @cachep: the cache returned by memcg_kmem_get_cache
2622
- */
2623
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
2624
-{
2625
- if (!is_root_cache(cachep))
2626
- css_put(&cachep->memcg_params.memcg->css);
2627
-}
2628
-
2629
-/**
2630
- * memcg_kmem_charge_memcg: charge a kmem page
2631
- * @page: page to charge
2632
- * @gfp: reclaim mode
2633
- * @order: allocation order
3078
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
26343079 * @memcg: memory cgroup to charge
3080
+ * @gfp: reclaim mode
3081
+ * @nr_pages: number of pages to charge
26353082 *
26363083 * Returns 0 on success, an error code on failure.
26373084 */
2638
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2639
- struct mem_cgroup *memcg)
3085
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
3086
+ unsigned int nr_pages)
26403087 {
2641
- unsigned int nr_pages = 1 << order;
26423088 struct page_counter *counter;
26433089 int ret;
26443090
....@@ -2661,43 +3107,54 @@
26613107 cancel_charge(memcg, nr_pages);
26623108 return -ENOMEM;
26633109 }
2664
-
2665
- page->mem_cgroup = memcg;
2666
-
26673110 return 0;
26683111 }
26693112
26703113 /**
2671
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
3114
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
3115
+ * @memcg: memcg to uncharge
3116
+ * @nr_pages: number of pages to uncharge
3117
+ */
3118
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
3119
+{
3120
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
3121
+ page_counter_uncharge(&memcg->kmem, nr_pages);
3122
+
3123
+ refill_stock(memcg, nr_pages);
3124
+}
3125
+
3126
+/**
3127
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
26723128 * @page: page to charge
26733129 * @gfp: reclaim mode
26743130 * @order: allocation order
26753131 *
26763132 * Returns 0 on success, an error code on failure.
26773133 */
2678
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
3134
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
26793135 {
26803136 struct mem_cgroup *memcg;
26813137 int ret = 0;
26823138
2683
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
2684
- return 0;
2685
-
26863139 memcg = get_mem_cgroup_from_current();
2687
- if (!mem_cgroup_is_root(memcg)) {
2688
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2689
- if (!ret)
3140
+ if (memcg && !mem_cgroup_is_root(memcg)) {
3141
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
3142
+ if (!ret) {
3143
+ page->mem_cgroup = memcg;
26903144 __SetPageKmemcg(page);
3145
+ return 0;
3146
+ }
3147
+ css_put(&memcg->css);
26913148 }
2692
- css_put(&memcg->css);
26933149 return ret;
26943150 }
3151
+
26953152 /**
2696
- * memcg_kmem_uncharge: uncharge a kmem page
3153
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
26973154 * @page: page to uncharge
26983155 * @order: allocation order
26993156 */
2700
-void memcg_kmem_uncharge(struct page *page, int order)
3157
+void __memcg_kmem_uncharge_page(struct page *page, int order)
27013158 {
27023159 struct mem_cgroup *memcg = page->mem_cgroup;
27033160 unsigned int nr_pages = 1 << order;
....@@ -2706,43 +3163,179 @@
27063163 return;
27073164
27083165 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2709
-
2710
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2711
- page_counter_uncharge(&memcg->kmem, nr_pages);
2712
-
2713
- page_counter_uncharge(&memcg->memory, nr_pages);
2714
- if (do_memsw_account())
2715
- page_counter_uncharge(&memcg->memsw, nr_pages);
2716
-
3166
+ __memcg_kmem_uncharge(memcg, nr_pages);
27173167 page->mem_cgroup = NULL;
3168
+ css_put(&memcg->css);
27183169
27193170 /* slab pages do not have PageKmemcg flag set */
27203171 if (PageKmemcg(page))
27213172 __ClearPageKmemcg(page);
2722
-
2723
- css_put_many(&memcg->css, nr_pages);
27243173 }
2725
-#endif /* CONFIG_MEMCG_KMEM */
27263174
2727
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2728
-
2729
-/*
2730
- * Because tail pages are not marked as "used", set it. We're under
2731
- * zone_lru_lock and migration entries setup in all page mappings.
2732
- */
2733
-void mem_cgroup_split_huge_fixup(struct page *head)
3175
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
27343176 {
2735
- int i;
3177
+ struct memcg_stock_pcp *stock;
3178
+ unsigned long flags;
3179
+ bool ret = false;
27363180
2737
- if (mem_cgroup_disabled())
3181
+ local_lock_irqsave(&memcg_stock.lock, flags);
3182
+
3183
+ stock = this_cpu_ptr(&memcg_stock);
3184
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3185
+ stock->nr_bytes -= nr_bytes;
3186
+ ret = true;
3187
+ }
3188
+
3189
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3190
+
3191
+ return ret;
3192
+}
3193
+
3194
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
3195
+{
3196
+ struct obj_cgroup *old = stock->cached_objcg;
3197
+
3198
+ if (!old)
27383199 return;
27393200
2740
- for (i = 1; i < HPAGE_PMD_NR; i++)
2741
- head[i].mem_cgroup = head->mem_cgroup;
3201
+ if (stock->nr_bytes) {
3202
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3203
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
27423204
2743
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
3205
+ if (nr_pages) {
3206
+ struct mem_cgroup *memcg;
3207
+
3208
+ rcu_read_lock();
3209
+retry:
3210
+ memcg = obj_cgroup_memcg(old);
3211
+ if (unlikely(!css_tryget(&memcg->css)))
3212
+ goto retry;
3213
+ rcu_read_unlock();
3214
+
3215
+ __memcg_kmem_uncharge(memcg, nr_pages);
3216
+ css_put(&memcg->css);
3217
+ }
3218
+
3219
+ /*
3220
+ * The leftover is flushed to the centralized per-memcg value.
3221
+ * On the next attempt to refill obj stock it will be moved
3222
+ * to a per-cpu stock (probably, on an other CPU), see
3223
+ * refill_obj_stock().
3224
+ *
3225
+ * How often it's flushed is a trade-off between the memory
3226
+ * limit enforcement accuracy and potential CPU contention,
3227
+ * so it might be changed in the future.
3228
+ */
3229
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
3230
+ stock->nr_bytes = 0;
3231
+ }
3232
+
3233
+ obj_cgroup_put(old);
3234
+ stock->cached_objcg = NULL;
27443235 }
2745
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3236
+
3237
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3238
+ struct mem_cgroup *root_memcg)
3239
+{
3240
+ struct mem_cgroup *memcg;
3241
+
3242
+ if (stock->cached_objcg) {
3243
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
3244
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3245
+ return true;
3246
+ }
3247
+
3248
+ return false;
3249
+}
3250
+
3251
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3252
+{
3253
+ struct memcg_stock_pcp *stock;
3254
+ unsigned long flags;
3255
+
3256
+ local_lock_irqsave(&memcg_stock.lock, flags);
3257
+
3258
+ stock = this_cpu_ptr(&memcg_stock);
3259
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
3260
+ drain_obj_stock(stock);
3261
+ obj_cgroup_get(objcg);
3262
+ stock->cached_objcg = objcg;
3263
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
3264
+ }
3265
+ stock->nr_bytes += nr_bytes;
3266
+
3267
+ if (stock->nr_bytes > PAGE_SIZE)
3268
+ drain_obj_stock(stock);
3269
+
3270
+ local_unlock_irqrestore(&memcg_stock.lock, flags);
3271
+}
3272
+
3273
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3274
+{
3275
+ struct mem_cgroup *memcg;
3276
+ unsigned int nr_pages, nr_bytes;
3277
+ int ret;
3278
+
3279
+ if (consume_obj_stock(objcg, size))
3280
+ return 0;
3281
+
3282
+ /*
3283
+ * In theory, memcg->nr_charged_bytes can have enough
3284
+ * pre-charged bytes to satisfy the allocation. However,
3285
+ * flushing memcg->nr_charged_bytes requires two atomic
3286
+ * operations, and memcg->nr_charged_bytes can't be big,
3287
+ * so it's better to ignore it and try grab some new pages.
3288
+ * memcg->nr_charged_bytes will be flushed in
3289
+ * refill_obj_stock(), called from this function or
3290
+ * independently later.
3291
+ */
3292
+ rcu_read_lock();
3293
+retry:
3294
+ memcg = obj_cgroup_memcg(objcg);
3295
+ if (unlikely(!css_tryget(&memcg->css)))
3296
+ goto retry;
3297
+ rcu_read_unlock();
3298
+
3299
+ nr_pages = size >> PAGE_SHIFT;
3300
+ nr_bytes = size & (PAGE_SIZE - 1);
3301
+
3302
+ if (nr_bytes)
3303
+ nr_pages += 1;
3304
+
3305
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
3306
+ if (!ret && nr_bytes)
3307
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
3308
+
3309
+ css_put(&memcg->css);
3310
+ return ret;
3311
+}
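obj_cgroup_charge() splits the request into whole pages, which are charged through __memcg_kmem_charge(), and a sub-page remainder; the unused tail of the last page goes back into the per-cpu stock so later small objects can be charged without touching the page counters. The split is plain masking, as in this standalone sketch (a 4 KiB PAGE_SIZE is assumed for illustration):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12                  /* 4 KiB pages, illustrative */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    size_t size = 700;                 /* e.g. one 700-byte slab object */
    size_t nr_pages = size >> PAGE_SHIFT;
    size_t nr_bytes = size & (PAGE_SIZE - 1);
    size_t refill = 0;

    if (nr_bytes) {
        nr_pages += 1;                 /* the charge is rounded up to pages  */
        refill = PAGE_SIZE - nr_bytes; /* leftover is kept in the obj stock  */
    }

    printf("size=%zu -> charge %zu page(s), stock keeps %zu byte(s)\n",
           size, nr_pages, refill);
    return 0;
}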
3312
+
3313
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3314
+{
3315
+ refill_obj_stock(objcg, size);
3316
+}
3317
+
3318
+#endif /* CONFIG_MEMCG_KMEM */
3319
+
3320
+/*
3321
+ * Because head->mem_cgroup is not set on tails, set it now.
3322
+ */
3323
+void split_page_memcg(struct page *head, unsigned int nr)
3324
+{
3325
+ struct mem_cgroup *memcg = head->mem_cgroup;
3326
+ int kmemcg = PageKmemcg(head);
3327
+ int i;
3328
+
3329
+ if (mem_cgroup_disabled() || !memcg)
3330
+ return;
3331
+
3332
+ for (i = 1; i < nr; i++) {
3333
+ head[i].mem_cgroup = memcg;
3334
+ if (kmemcg)
3335
+ __SetPageKmemcg(head + i);
3336
+ }
3337
+ css_get_many(&memcg->css, nr - 1);
3338
+}
27463339
27473340 #ifdef CONFIG_MEMCG_SWAP
27483341 /**
....@@ -2804,7 +3397,7 @@
28043397 * Make sure that the new limit (memsw or memory limit) doesn't
28053398 * break our basic invariant rule memory.max <= memsw.max.
28063399 */
2807
- limits_invariant = memsw ? max >= memcg->memory.max :
3400
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
28083401 max <= memcg->memsw.max;
28093402 if (!limits_invariant) {
28103403 mutex_unlock(&memcg_max_mutex);
....@@ -2925,7 +3518,7 @@
29253518 * Test whether @memcg has children, dead or alive. Note that this
29263519 * function doesn't care whether @memcg has use_hierarchy enabled and
29273520 * returns %true if there are child csses according to the cgroup
2928
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
3521
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
29293522 */
29303523 static inline bool memcg_has_children(struct mem_cgroup *memcg)
29313524 {
....@@ -2944,7 +3537,7 @@
29443537 */
29453538 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
29463539 {
2947
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3540
+ int nr_retries = MAX_RECLAIM_RETRIES;
29483541
29493542 /* we call try-to-free pages for make this cgroup empty */
29503543 lru_add_drain_all();
....@@ -3018,50 +3611,15 @@
30183611 return retval;
30193612 }
30203613
3021
-struct accumulated_stats {
3022
- unsigned long stat[MEMCG_NR_STAT];
3023
- unsigned long events[NR_VM_EVENT_ITEMS];
3024
- unsigned long lru_pages[NR_LRU_LISTS];
3025
- const unsigned int *stats_array;
3026
- const unsigned int *events_array;
3027
- int stats_size;
3028
- int events_size;
3029
-};
3030
-
3031
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
3032
- struct accumulated_stats *acc)
3033
-{
3034
- struct mem_cgroup *mi;
3035
- int i;
3036
-
3037
- for_each_mem_cgroup_tree(mi, memcg) {
3038
- for (i = 0; i < acc->stats_size; i++)
3039
- acc->stat[i] += memcg_page_state(mi,
3040
- acc->stats_array ? acc->stats_array[i] : i);
3041
-
3042
- for (i = 0; i < acc->events_size; i++)
3043
- acc->events[i] += memcg_sum_events(mi,
3044
- acc->events_array ? acc->events_array[i] : i);
3045
-
3046
- for (i = 0; i < NR_LRU_LISTS; i++)
3047
- acc->lru_pages[i] +=
3048
- mem_cgroup_nr_lru_pages(mi, BIT(i));
3049
- }
3050
-}
3051
-
30523614 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
30533615 {
3054
- unsigned long val = 0;
3616
+ unsigned long val;
30553617
30563618 if (mem_cgroup_is_root(memcg)) {
3057
- struct mem_cgroup *iter;
3058
-
3059
- for_each_mem_cgroup_tree(iter, memcg) {
3060
- val += memcg_page_state(iter, MEMCG_CACHE);
3061
- val += memcg_page_state(iter, MEMCG_RSS);
3062
- if (swap)
3063
- val += memcg_page_state(iter, MEMCG_SWAP);
3064
- }
3619
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
3620
+ memcg_page_state(memcg, NR_ANON_MAPPED);
3621
+ if (swap)
3622
+ val += memcg_page_state(memcg, MEMCG_SWAP);
30653623 } else {
30663624 if (!swap)
30673625 val = page_counter_read(&memcg->memory);
....@@ -3122,9 +3680,61 @@
31223680 }
31233681 }
31243682
3683
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
3684
+{
3685
+ unsigned long stat[MEMCG_NR_STAT] = {0};
3686
+ struct mem_cgroup *mi;
3687
+ int node, cpu, i;
3688
+
3689
+ for_each_online_cpu(cpu)
3690
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3691
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
3692
+
3693
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3694
+ for (i = 0; i < MEMCG_NR_STAT; i++)
3695
+ atomic_long_add(stat[i], &mi->vmstats[i]);
3696
+
3697
+ for_each_node(node) {
3698
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
3699
+ struct mem_cgroup_per_node *pi;
3700
+
3701
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3702
+ stat[i] = 0;
3703
+
3704
+ for_each_online_cpu(cpu)
3705
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3706
+ stat[i] += per_cpu(
3707
+ pn->lruvec_stat_cpu->count[i], cpu);
3708
+
3709
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
3710
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
3711
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
3712
+ }
3713
+}
3714
+
3715
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
3716
+{
3717
+ unsigned long events[NR_VM_EVENT_ITEMS];
3718
+ struct mem_cgroup *mi;
3719
+ int cpu, i;
3720
+
3721
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3722
+ events[i] = 0;
3723
+
3724
+ for_each_online_cpu(cpu)
3725
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3726
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
3727
+ cpu);
3728
+
3729
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
3730
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
3731
+ atomic_long_add(events[i], &mi->vmevents[i]);
3732
+}
3733
+
31253734 #ifdef CONFIG_MEMCG_KMEM
31263735 static int memcg_online_kmem(struct mem_cgroup *memcg)
31273736 {
3737
+ struct obj_cgroup *objcg;
31283738 int memcg_id;
31293739
31303740 if (cgroup_memory_nokmem)
....@@ -3137,7 +3747,16 @@
31373747 if (memcg_id < 0)
31383748 return memcg_id;
31393749
3140
- static_branch_inc(&memcg_kmem_enabled_key);
3750
+ objcg = obj_cgroup_alloc();
3751
+ if (!objcg) {
3752
+ memcg_free_cache_id(memcg_id);
3753
+ return -ENOMEM;
3754
+ }
3755
+ objcg->memcg = memcg;
3756
+ rcu_assign_pointer(memcg->objcg, objcg);
3757
+
3758
+ static_branch_enable(&memcg_kmem_enabled_key);
3759
+
31413760 /*
31423761 * A memory cgroup is considered kmem-online as soon as it gets
31433762 * kmemcg_id. Setting the id after enabling static branching will
....@@ -3146,7 +3765,6 @@
31463765 */
31473766 memcg->kmemcg_id = memcg_id;
31483767 memcg->kmem_state = KMEM_ONLINE;
3149
- INIT_LIST_HEAD(&memcg->kmem_caches);
31503768
31513769 return 0;
31523770 }
....@@ -3159,22 +3777,17 @@
31593777
31603778 if (memcg->kmem_state != KMEM_ONLINE)
31613779 return;
3162
- /*
3163
- * Clear the online state before clearing memcg_caches array
3164
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3165
- * guarantees that no cache will be created for this cgroup
3166
- * after we are done (see memcg_create_kmem_cache()).
3167
- */
3780
+
31683781 memcg->kmem_state = KMEM_ALLOCATED;
3169
-
3170
- memcg_deactivate_kmem_caches(memcg);
3171
-
3172
- kmemcg_id = memcg->kmemcg_id;
3173
- BUG_ON(kmemcg_id < 0);
31743782
31753783 parent = parent_mem_cgroup(memcg);
31763784 if (!parent)
31773785 parent = root_mem_cgroup;
3786
+
3787
+ memcg_reparent_objcgs(memcg, parent);
3788
+
3789
+ kmemcg_id = memcg->kmemcg_id;
3790
+ BUG_ON(kmemcg_id < 0);
31783791
31793792 /*
31803793 * Change kmemcg_id of this cgroup and all its descendants to the
....@@ -3204,12 +3817,6 @@
32043817 /* css_alloc() failed, offlining didn't happen */
32053818 if (unlikely(memcg->kmem_state == KMEM_ONLINE))
32063819 memcg_offline_kmem(memcg);
3207
-
3208
- if (memcg->kmem_state == KMEM_ALLOCATED) {
3209
- memcg_destroy_kmem_caches(memcg);
3210
- static_branch_dec(&memcg_kmem_enabled_key);
3211
- WARN_ON(page_counter_read(&memcg->kmem));
3212
- }
32133820 }
32143821 #else
32153822 static int memcg_online_kmem(struct mem_cgroup *memcg)
....@@ -3300,6 +3907,9 @@
33003907 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
33013908 break;
33023909 case _KMEM:
3910
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3911
+ "Please report your usecase to linux-mm@kvack.org if you "
3912
+ "depend on this functionality.\n");
33033913 ret = memcg_update_kmem_max(memcg, nr_pages);
33043914 break;
33053915 case _TCP:
....@@ -3385,6 +3995,49 @@
33853995 #endif
33863996
33873997 #ifdef CONFIG_NUMA
3998
+
3999
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
4000
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
4001
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
4002
+
4003
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
4004
+ int nid, unsigned int lru_mask, bool tree)
4005
+{
4006
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
4007
+ unsigned long nr = 0;
4008
+ enum lru_list lru;
4009
+
4010
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
4011
+
4012
+ for_each_lru(lru) {
4013
+ if (!(BIT(lru) & lru_mask))
4014
+ continue;
4015
+ if (tree)
4016
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
4017
+ else
4018
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
4019
+ }
4020
+ return nr;
4021
+}
4022
+
4023
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
4024
+ unsigned int lru_mask,
4025
+ bool tree)
4026
+{
4027
+ unsigned long nr = 0;
4028
+ enum lru_list lru;
4029
+
4030
+ for_each_lru(lru) {
4031
+ if (!(BIT(lru) & lru_mask))
4032
+ continue;
4033
+ if (tree)
4034
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
4035
+ else
4036
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
4037
+ }
4038
+ return nr;
4039
+}
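The lru_mask arguments are bitmaps over the LRU lists, and LRU_ALL_FILE, LRU_ALL_ANON and LRU_ALL simply select which per-list counters are summed. A minimal sketch of the mask-driven summation, with invented counts:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };

#define BIT(n)       (1U << (n))
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL      ((1U << NR_LRU_LISTS) - 1)

static unsigned long sum_lists(const unsigned long *counts, unsigned int mask)
{
    unsigned long nr = 0;
    int lru;

    for (lru = 0; lru < NR_LRU_LISTS; lru++)
        if (BIT(lru) & mask)           /* only the selected lists */
            nr += counts[lru];
    return nr;
}

int main(void)
{
    /* made-up page counts per LRU list */
    unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

    printf("file pages: %lu\n", sum_lists(counts, LRU_ALL_FILE));
    printf("all pages:  %lu\n", sum_lists(counts, LRU_ALL));
    return 0;
}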
4040
+
33884041 static int memcg_numa_stat_show(struct seq_file *m, void *v)
33894042 {
33904043 struct numa_stat {
....@@ -3400,40 +4053,60 @@
34004053 };
34014054 const struct numa_stat *stat;
34024055 int nid;
3403
- unsigned long nr;
3404
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4056
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34054057
34064058 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3407
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3408
- seq_printf(m, "%s=%lu", stat->name, nr);
3409
- for_each_node_state(nid, N_MEMORY) {
3410
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3411
- stat->lru_mask);
3412
- seq_printf(m, " N%d=%lu", nid, nr);
3413
- }
4059
+ seq_printf(m, "%s=%lu", stat->name,
4060
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4061
+ false));
4062
+ for_each_node_state(nid, N_MEMORY)
4063
+ seq_printf(m, " N%d=%lu", nid,
4064
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4065
+ stat->lru_mask, false));
34144066 seq_putc(m, '\n');
34154067 }
34164068
34174069 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3418
- struct mem_cgroup *iter;
34194070
3420
- nr = 0;
3421
- for_each_mem_cgroup_tree(iter, memcg)
3422
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3423
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3424
- for_each_node_state(nid, N_MEMORY) {
3425
- nr = 0;
3426
- for_each_mem_cgroup_tree(iter, memcg)
3427
- nr += mem_cgroup_node_nr_lru_pages(
3428
- iter, nid, stat->lru_mask);
3429
- seq_printf(m, " N%d=%lu", nid, nr);
3430
- }
4071
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
4072
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
4073
+ true));
4074
+ for_each_node_state(nid, N_MEMORY)
4075
+ seq_printf(m, " N%d=%lu", nid,
4076
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
4077
+ stat->lru_mask, true));
34314078 seq_putc(m, '\n');
34324079 }
34334080
34344081 return 0;
34354082 }
34364083 #endif /* CONFIG_NUMA */
4084
+
4085
+static const unsigned int memcg1_stats[] = {
4086
+ NR_FILE_PAGES,
4087
+ NR_ANON_MAPPED,
4088
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4089
+ NR_ANON_THPS,
4090
+#endif
4091
+ NR_SHMEM,
4092
+ NR_FILE_MAPPED,
4093
+ NR_FILE_DIRTY,
4094
+ NR_WRITEBACK,
4095
+ MEMCG_SWAP,
4096
+};
4097
+
4098
+static const char *const memcg1_stat_names[] = {
4099
+ "cache",
4100
+ "rss",
4101
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4102
+ "rss_huge",
4103
+#endif
4104
+ "shmem",
4105
+ "mapped_file",
4106
+ "dirty",
4107
+ "writeback",
4108
+ "swap",
4109
+};
34374110
34384111 /* Universal VM events cgroup1 shows, original sort order */
34394112 static const unsigned int memcg1_events[] = {
....@@ -3443,45 +4116,42 @@
34434116 PGMAJFAULT,
34444117 };
34454118
3446
-static const char *const memcg1_event_names[] = {
3447
- "pgpgin",
3448
- "pgpgout",
3449
- "pgfault",
3450
- "pgmajfault",
3451
-};
3452
-
34534119 static int memcg_stat_show(struct seq_file *m, void *v)
34544120 {
3455
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4121
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
34564122 unsigned long memory, memsw;
34574123 struct mem_cgroup *mi;
34584124 unsigned int i;
3459
- struct accumulated_stats acc;
34604125
34614126 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3462
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
34634127
34644128 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4129
+ unsigned long nr;
4130
+
34654131 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
34664132 continue;
3467
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3468
- memcg_page_state(memcg, memcg1_stats[i]) *
3469
- PAGE_SIZE);
4133
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4134
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4135
+ if (memcg1_stats[i] == NR_ANON_THPS)
4136
+ nr *= HPAGE_PMD_NR;
4137
+#endif
4138
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
34704139 }
34714140
34724141 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3473
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3474
- memcg_sum_events(memcg, memcg1_events[i]));
4142
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4143
+ memcg_events_local(memcg, memcg1_events[i]));
34754144
34764145 for (i = 0; i < NR_LRU_LISTS; i++)
3477
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3478
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4146
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
4147
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4148
+ PAGE_SIZE);
34794149
34804150 /* Hierarchical information */
34814151 memory = memsw = PAGE_COUNTER_MAX;
34824152 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3483
- memory = min(memory, mi->memory.max);
3484
- memsw = min(memsw, mi->memsw.max);
4153
+ memory = min(memory, READ_ONCE(mi->memory.max));
4154
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
34854155 }
34864156 seq_printf(m, "hierarchical_memory_limit %llu\n",
34874157 (u64)memory * PAGE_SIZE);
....@@ -3489,49 +4159,45 @@
34894159 seq_printf(m, "hierarchical_memsw_limit %llu\n",
34904160 (u64)memsw * PAGE_SIZE);
34914161
3492
- memset(&acc, 0, sizeof(acc));
3493
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
3494
- acc.stats_array = memcg1_stats;
3495
- acc.events_size = ARRAY_SIZE(memcg1_events);
3496
- acc.events_array = memcg1_events;
3497
- accumulate_memcg_tree(memcg, &acc);
3498
-
34994162 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4163
+ unsigned long nr;
4164
+
35004165 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
35014166 continue;
4167
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
4168
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4169
+ if (memcg1_stats[i] == NR_ANON_THPS)
4170
+ nr *= HPAGE_PMD_NR;
4171
+#endif
35024172 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3503
- (u64)acc.stat[i] * PAGE_SIZE);
4173
+ (u64)nr * PAGE_SIZE);
35044174 }
35054175
35064176 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3507
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3508
- (u64)acc.events[i]);
4177
+ seq_printf(m, "total_%s %llu\n",
4178
+ vm_event_name(memcg1_events[i]),
4179
+ (u64)memcg_events(memcg, memcg1_events[i]));
35094180
35104181 for (i = 0; i < NR_LRU_LISTS; i++)
3511
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3512
- (u64)acc.lru_pages[i] * PAGE_SIZE);
4182
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4183
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4184
+ PAGE_SIZE);
35134185
35144186 #ifdef CONFIG_DEBUG_VM
35154187 {
35164188 pg_data_t *pgdat;
35174189 struct mem_cgroup_per_node *mz;
3518
- struct zone_reclaim_stat *rstat;
3519
- unsigned long recent_rotated[2] = {0, 0};
3520
- unsigned long recent_scanned[2] = {0, 0};
4190
+ unsigned long anon_cost = 0;
4191
+ unsigned long file_cost = 0;
35214192
35224193 for_each_online_pgdat(pgdat) {
35234194 mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3524
- rstat = &mz->lruvec.reclaim_stat;
35254195
3526
- recent_rotated[0] += rstat->recent_rotated[0];
3527
- recent_rotated[1] += rstat->recent_rotated[1];
3528
- recent_scanned[0] += rstat->recent_scanned[0];
3529
- recent_scanned[1] += rstat->recent_scanned[1];
4196
+ anon_cost += mz->lruvec.anon_cost;
4197
+ file_cost += mz->lruvec.file_cost;
35304198 }
3531
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3532
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3533
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3534
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4199
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
4200
+ seq_printf(m, "file_cost %lu\n", file_cost);
35354201 }
35364202 #endif
35374203
....@@ -3690,8 +4356,7 @@
36904356 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
36914357
36924358 /* Allocate memory for new array of thresholds */
3693
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3694
- GFP_KERNEL);
4359
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
36954360 if (!new) {
36964361 ret = -ENOMEM;
36974362 goto unlock;
....@@ -3699,17 +4364,16 @@
36994364 new->size = size;
37004365
37014366 /* Copy thresholds (if any) to new array */
3702
- if (thresholds->primary) {
3703
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3704
- sizeof(struct mem_cgroup_threshold));
3705
- }
4367
+ if (thresholds->primary)
4368
+ memcpy(new->entries, thresholds->primary->entries,
4369
+ flex_array_size(new, entries, size - 1));
37064370
37074371 /* Add new threshold */
37084372 new->entries[size - 1].eventfd = eventfd;
37094373 new->entries[size - 1].threshold = threshold;
37104374
37114375 /* Sort thresholds. Registering of new threshold isn't time-critical */
3712
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4376
+ sort(new->entries, size, sizeof(*new->entries),
37134377 compare_thresholds, NULL);
37144378
37154379 /* Find current threshold */
....@@ -3891,7 +4555,7 @@
38914555
38924556 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
38934557 {
3894
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4558
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
38954559
38964560 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
38974561 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
....@@ -3917,6 +4581,8 @@
39174581 }
39184582
39194583 #ifdef CONFIG_CGROUP_WRITEBACK
4584
+
4585
+#include <trace/events/writeback.h>
39204586
39214587 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
39224588 {
....@@ -3949,11 +4615,11 @@
39494615 */
39504616 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
39514617 {
3952
- long x = atomic_long_read(&memcg->stat[idx]);
4618
+ long x = atomic_long_read(&memcg->vmstats[idx]);
39534619 int cpu;
39544620
39554621 for_each_online_cpu(cpu)
3956
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
4622
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
39574623 if (x < 0)
39584624 x = 0;
39594625 return x;
....@@ -3986,18 +4652,142 @@
39864652
39874653 *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
39884654
3989
- /* this should eventually include NR_UNSTABLE_NFS */
39904655 *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
3991
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3992
- (1 << LRU_ACTIVE_FILE));
4656
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
4657
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
39934658 *pheadroom = PAGE_COUNTER_MAX;
39944659
39954660 while ((parent = parent_mem_cgroup(memcg))) {
3996
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
4661
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4662
+ READ_ONCE(memcg->memory.high));
39974663 unsigned long used = page_counter_read(&memcg->memory);
39984664
39994665 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
40004666 memcg = parent;
4667
+ }
4668
+}
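The headroom loop takes, at each level from this cgroup up to (but not including) the root, the gap between the effective ceiling (the smaller of memory.max and memory.high) and current usage, clamped at zero, and keeps the tightest value for the writeback code. A small sketch of that clamped-minimum walk, with invented numbers:

#include <stdio.h>

struct level {
    unsigned long max, high, used;     /* page counts at one ancestor */
};

int main(void)
{
    /* the cgroup first, then its ancestors towards the root */
    struct level chain[] = {
        { .max = 1000, .high =  800, .used =  750 },
        { .max = 4000, .high = 3900, .used = 3880 },
    };
    unsigned long headroom = (unsigned long)-1; /* PAGE_COUNTER_MAX stand-in */
    unsigned int i;

    for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
        unsigned long ceiling = chain[i].max < chain[i].high ?
                                chain[i].max : chain[i].high;
        unsigned long used = chain[i].used;
        /* ceiling - min(ceiling, used) clamps negative gaps to zero */
        unsigned long room = ceiling - (ceiling < used ? ceiling : used);

        headroom = room < headroom ? room : headroom;
    }
    printf("headroom = %lu pages\n", headroom);
    return 0;
}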
4669
+
4670
+/*
4671
+ * Foreign dirty flushing
4672
+ *
4673
+ * There's an inherent mismatch between memcg and writeback. The former
4674
+ * tracks ownership per-page while the latter per-inode. This was a
4675
+ * deliberate design decision because honoring per-page ownership in the
4676
+ * writeback path is complicated, may lead to higher CPU and IO overheads
4677
+ * and deemed unnecessary given that write-sharing an inode across
4678
+ * different cgroups isn't a common use-case.
4679
+ *
4680
+ * Combined with inode majority-writer ownership switching, this works well
4681
+ * enough in most cases but there are some pathological cases. For
4682
+ * example, let's say there are two cgroups A and B which keep writing to
4683
+ * different but confined parts of the same inode. B owns the inode and
4684
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
4685
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4686
+ * triggering background writeback. A will be slowed down without a way to
4687
+ * make writeback of the dirty pages happen.
4688
+ *
4689
+ * Conditions like the above can lead to a cgroup getting repeatedly and
4690
+ * severely throttled after making some progress after each
4691
+ * dirty_expire_interval while the underlying IO device is almost
4692
+ * completely idle.
4693
+ *
4694
+ * Solving this problem completely requires matching the ownership tracking
4695
+ * granularities between memcg and writeback in either direction. However,
4696
+ * the more egregious behaviors can be avoided by simply remembering the
4697
+ * most recent foreign dirtying events and initiating remote flushes on
4698
+ * them when local writeback isn't enough to keep the memory clean enough.
4699
+ *
4700
+ * The following two functions implement such mechanism. When a foreign
4701
+ * page - a page whose memcg and writeback ownerships don't match - is
4702
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4703
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
4704
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
4705
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
4706
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
4707
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4708
+ * limited to MEMCG_CGWB_FRN_CNT.
4709
+ *
4710
+ * The mechanism only remembers IDs and doesn't hold any object references.
4711
+ * As being wrong occasionally doesn't matter, updates and accesses to the
4712
+ * records are lockless and racy.
4713
+ */
4714
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
4715
+ struct bdi_writeback *wb)
4716
+{
4717
+ struct mem_cgroup *memcg = page->mem_cgroup;
4718
+ struct memcg_cgwb_frn *frn;
4719
+ u64 now = get_jiffies_64();
4720
+ u64 oldest_at = now;
4721
+ int oldest = -1;
4722
+ int i;
4723
+
4724
+ trace_track_foreign_dirty(page, wb);
4725
+
4726
+ /*
4727
+ * Pick the slot to use. If there is already a slot for @wb, keep
4728
+ * using it. If not replace the oldest one which isn't being
4729
+ * written out.
4730
+ */
4731
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4732
+ frn = &memcg->cgwb_frn[i];
4733
+ if (frn->bdi_id == wb->bdi->id &&
4734
+ frn->memcg_id == wb->memcg_css->id)
4735
+ break;
4736
+ if (time_before64(frn->at, oldest_at) &&
4737
+ atomic_read(&frn->done.cnt) == 1) {
4738
+ oldest = i;
4739
+ oldest_at = frn->at;
4740
+ }
4741
+ }
4742
+
4743
+ if (i < MEMCG_CGWB_FRN_CNT) {
4744
+ /*
4745
+ * Re-using an existing one. Update timestamp lazily to
4746
+ * avoid making the cacheline hot. We want them to be
4747
+ * reasonably up-to-date and significantly shorter than
4748
+ * dirty_expire_interval as that's what expires the record.
4749
+ * Use the shorter of 1s and dirty_expire_interval / 8.
4750
+ */
4751
+ unsigned long update_intv =
4752
+ min_t(unsigned long, HZ,
4753
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4754
+
4755
+ if (time_before64(frn->at, now - update_intv))
4756
+ frn->at = now;
4757
+ } else if (oldest >= 0) {
4758
+ /* replace the oldest free one */
4759
+ frn = &memcg->cgwb_frn[oldest];
4760
+ frn->bdi_id = wb->bdi->id;
4761
+ frn->memcg_id = wb->memcg_css->id;
4762
+ frn->at = now;
4763
+ }
4764
+}
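The refresh interval referred to in the comment above is derived from dirty_expire_interval, which is kept in centiseconds (hence the multiplication by 10 to get milliseconds). A quick sketch of the arithmetic, assuming HZ=1000 and the common 30-second expiry setting:

#include <stdio.h>

#define HZ 1000U                        /* illustrative config value */

/* user-space stand-in for msecs_to_jiffies() at HZ=1000 */
static unsigned long msecs_to_jiffies(unsigned long ms)
{
    return ms * HZ / 1000;
}

int main(void)
{
    unsigned long dirty_expire_interval = 3000;    /* centisecs, i.e. 30 s */

    /* shorter of 1 s and dirty_expire_interval / 8, as in the comment */
    unsigned long update_intv = msecs_to_jiffies(dirty_expire_interval * 10) / 8;

    if (update_intv > HZ)
        update_intv = HZ;

    printf("record refreshed at most every %lu jiffies (%lu ms)\n",
           update_intv, update_intv * 1000 / HZ);
    printf("record expires after %lu jiffies\n",
           msecs_to_jiffies(dirty_expire_interval * 10));
    return 0;
}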
4765
+
4766
+/* issue foreign writeback flushes for recorded foreign dirtying events */
4767
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4768
+{
4769
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4770
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4771
+ u64 now = jiffies_64;
4772
+ int i;
4773
+
4774
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4775
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4776
+
4777
+ /*
4778
+ * If the record is older than dirty_expire_interval,
4779
+ * writeback on it has already started. No need to kick it
4780
+ * off again. Also, don't start a new one if there's
4781
+ * already one in flight.
4782
+ */
4783
+ if (time_after64(frn->at, now - intv) &&
4784
+ atomic_read(&frn->done.cnt) == 1) {
4785
+ frn->at = 0;
4786
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4787
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
4788
+ WB_REASON_FOREIGN_FLUSH,
4789
+ &frn->done);
4790
+ }
40014791 }
40024792 }
40034793
....@@ -4120,6 +4910,7 @@
41204910 unsigned int efd, cfd;
41214911 struct fd efile;
41224912 struct fd cfile;
4913
+ struct dentry *cdentry;
41234914 const char *name;
41244915 char *endp;
41254916 int ret;
....@@ -4171,6 +4962,16 @@
41714962 goto out_put_cfile;
41724963
41734964 /*
4965
+ * The control file must be a regular cgroup1 file. As a regular cgroup
4966
+ * file can't be renamed, it's safe to access its name afterwards.
4967
+ */
4968
+ cdentry = cfile.file->f_path.dentry;
4969
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
4970
+ ret = -EINVAL;
4971
+ goto out_put_cfile;
4972
+ }
4973
+
4974
+ /*
41744975 * Determine the event callbacks and set them in @event. This used
41754976 * to be done via struct cftype but cgroup core no longer knows
41764977 * about these events. The following is crude but the whole thing
....@@ -4178,7 +4979,7 @@
41784979 *
41794980 * DO NOT ADD NEW FILES.
41804981 */
4181
- name = cfile.file->f_path.dentry->d_name.name;
4982
+ name = cdentry->d_name.name;
41824983
41834984 if (!strcmp(name, "memory.usage_in_bytes")) {
41844985 event->register_event = mem_cgroup_usage_register_event;
....@@ -4202,7 +5003,7 @@
42025003 * automatically removed on cgroup destruction but the removal is
42035004 * asynchronous, so take an extra ref on @css.
42045005 */
4205
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
5006
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
42065007 &memory_cgrp_subsys);
42075008 ret = -EINVAL;
42085009 if (IS_ERR(cfile_css))
....@@ -4337,12 +5138,10 @@
43375138 .write = mem_cgroup_reset,
43385139 .read_u64 = mem_cgroup_read_u64,
43395140 },
4340
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
5141
+#if defined(CONFIG_MEMCG_KMEM) && \
5142
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
43415143 {
43425144 .name = "kmem.slabinfo",
4343
- .seq_start = memcg_slab_start,
4344
- .seq_next = memcg_slab_next,
4345
- .seq_stop = memcg_slab_stop,
43465145 .seq_show = memcg_slab_show,
43475146 },
43485147 #endif
....@@ -4380,7 +5179,7 @@
43805179 * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
43815180 * memory-controlled cgroups to 64k.
43825181 *
4383
- * However, there usually are many references to the oflline CSS after
5182
+ * However, there usually are many references to the offline CSS after
43845183 * the cgroup has been destroyed, such as page cache or reclaimable
43855184 * slab objects, that don't need to hang on to the ID. We want to keep
43865185 * those dead CSS from occupying IDs, or we might quickly exhaust the
....@@ -4401,31 +5200,26 @@
44015200 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
44025201 {
44035202 if (memcg->id.id > 0) {
5203
+ trace_android_vh_mem_cgroup_id_remove(memcg);
44045204 idr_remove(&mem_cgroup_idr, memcg->id.id);
44055205 memcg->id.id = 0;
44065206 }
44075207 }
44085208
4409
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
5209
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
5210
+ unsigned int n)
44105211 {
4411
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4412
- atomic_add(n, &memcg->id.ref);
5212
+ refcount_add(n, &memcg->id.ref);
44135213 }
44145214
44155215 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
44165216 {
4417
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4418
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
5217
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
44195218 mem_cgroup_id_remove(memcg);
44205219
44215220 /* Memcg ID pins CSS */
44225221 css_put(&memcg->css);
44235222 }
4424
-}
4425
-
4426
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4427
-{
4428
- mem_cgroup_id_get_many(memcg, 1);
44295223 }
44305224
44315225 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
....@@ -4444,6 +5238,7 @@
44445238 WARN_ON_ONCE(!rcu_read_lock_held());
44455239 return idr_find(&mem_cgroup_idr, id);
44465240 }
5241
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
44475242
44485243 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
44495244 {
....@@ -4463,8 +5258,17 @@
44635258 if (!pn)
44645259 return 1;
44655260
4466
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
5261
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5262
+ GFP_KERNEL_ACCOUNT);
5263
+ if (!pn->lruvec_stat_local) {
5264
+ kfree(pn);
5265
+ return 1;
5266
+ }
5267
+
5268
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
5269
+ GFP_KERNEL_ACCOUNT);
44675270 if (!pn->lruvec_stat_cpu) {
5271
+ free_percpu(pn->lruvec_stat_local);
44685272 kfree(pn);
44695273 return 1;
44705274 }
....@@ -4486,6 +5290,7 @@
44865290 return;
44875291
44885292 free_percpu(pn->lruvec_stat_cpu);
5293
+ free_percpu(pn->lruvec_stat_local);
44895294 kfree(pn);
44905295 }
44915296
....@@ -4493,39 +5298,57 @@
44935298 {
44945299 int node;
44955300
5301
+ trace_android_vh_mem_cgroup_free(memcg);
44965302 for_each_node(node)
44975303 free_mem_cgroup_per_node_info(memcg, node);
4498
- free_percpu(memcg->stat_cpu);
5304
+ free_percpu(memcg->vmstats_percpu);
5305
+ free_percpu(memcg->vmstats_local);
44995306 kfree(memcg);
45005307 }
45015308
45025309 static void mem_cgroup_free(struct mem_cgroup *memcg)
45035310 {
45045311 memcg_wb_domain_exit(memcg);
5312
+ /*
5313
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
5314
+ * on parent's and all ancestor levels.
5315
+ */
5316
+ memcg_flush_percpu_vmstats(memcg);
5317
+ memcg_flush_percpu_vmevents(memcg);
45055318 __mem_cgroup_free(memcg);
45065319 }
45075320
45085321 static struct mem_cgroup *mem_cgroup_alloc(void)
45095322 {
45105323 struct mem_cgroup *memcg;
4511
- size_t size;
5324
+ unsigned int size;
45125325 int node;
5326
+ int __maybe_unused i;
5327
+ long error = -ENOMEM;
45135328
45145329 size = sizeof(struct mem_cgroup);
45155330 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
45165331
45175332 memcg = kzalloc(size, GFP_KERNEL);
45185333 if (!memcg)
4519
- return NULL;
5334
+ return ERR_PTR(error);
45205335
45215336 memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
45225337 1, MEM_CGROUP_ID_MAX,
45235338 GFP_KERNEL);
4524
- if (memcg->id.id < 0)
5339
+ if (memcg->id.id < 0) {
5340
+ error = memcg->id.id;
5341
+ goto fail;
5342
+ }
5343
+
5344
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5345
+ GFP_KERNEL_ACCOUNT);
5346
+ if (!memcg->vmstats_local)
45255347 goto fail;
45265348
4527
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4528
- if (!memcg->stat_cpu)
5349
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5350
+ GFP_KERNEL_ACCOUNT);
5351
+ if (!memcg->vmstats_percpu)
45295352 goto fail;
45305353
45315354 for_each_node(node)
....@@ -4536,7 +5359,6 @@
45365359 goto fail;
45375360
45385361 INIT_WORK(&memcg->high_work, high_work_func);
4539
- memcg->last_scanned_node = MAX_NUMNODES;
45405362 INIT_LIST_HEAD(&memcg->oom_notify);
45415363 mutex_init(&memcg->thresholds_lock);
45425364 spin_lock_init(&memcg->move_lock);
....@@ -4546,48 +5368,64 @@
45465368 memcg->socket_pressure = jiffies;
45475369 #ifdef CONFIG_MEMCG_KMEM
45485370 memcg->kmemcg_id = -1;
5371
+ INIT_LIST_HEAD(&memcg->objcg_list);
45495372 #endif
45505373 #ifdef CONFIG_CGROUP_WRITEBACK
45515374 INIT_LIST_HEAD(&memcg->cgwb_list);
5375
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5376
+ memcg->cgwb_frn[i].done =
5377
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5378
+#endif
5379
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5380
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5381
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5382
+ memcg->deferred_split_queue.split_queue_len = 0;
45525383 #endif
45535384 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5385
+ trace_android_vh_mem_cgroup_alloc(memcg);
45545386 return memcg;
45555387 fail:
45565388 mem_cgroup_id_remove(memcg);
45575389 __mem_cgroup_free(memcg);
4558
- return NULL;
5390
+ return ERR_PTR(error);
45595391 }
45605392
45615393 static struct cgroup_subsys_state * __ref
45625394 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
45635395 {
45645396 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4565
- struct mem_cgroup *memcg;
5397
+ struct mem_cgroup *memcg, *old_memcg;
45665398 long error = -ENOMEM;
45675399
5400
+ old_memcg = set_active_memcg(parent);
45685401 memcg = mem_cgroup_alloc();
4569
- if (!memcg)
4570
- return ERR_PTR(error);
5402
+ set_active_memcg(old_memcg);
5403
+ if (IS_ERR(memcg))
5404
+ return ERR_CAST(memcg);
45715405
4572
- memcg->high = PAGE_COUNTER_MAX;
5406
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
45735407 memcg->soft_limit = PAGE_COUNTER_MAX;
5408
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
45745409 if (parent) {
45755410 memcg->swappiness = mem_cgroup_swappiness(parent);
45765411 memcg->oom_kill_disable = parent->oom_kill_disable;
45775412 }
4578
- if (parent && parent->use_hierarchy) {
5413
+ if (!parent) {
5414
+ page_counter_init(&memcg->memory, NULL);
5415
+ page_counter_init(&memcg->swap, NULL);
5416
+ page_counter_init(&memcg->kmem, NULL);
5417
+ page_counter_init(&memcg->tcpmem, NULL);
5418
+ } else if (parent->use_hierarchy) {
45795419 memcg->use_hierarchy = true;
45805420 page_counter_init(&memcg->memory, &parent->memory);
45815421 page_counter_init(&memcg->swap, &parent->swap);
4582
- page_counter_init(&memcg->memsw, &parent->memsw);
45835422 page_counter_init(&memcg->kmem, &parent->kmem);
45845423 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
45855424 } else {
4586
- page_counter_init(&memcg->memory, NULL);
4587
- page_counter_init(&memcg->swap, NULL);
4588
- page_counter_init(&memcg->memsw, NULL);
4589
- page_counter_init(&memcg->kmem, NULL);
4590
- page_counter_init(&memcg->tcpmem, NULL);
5425
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
5426
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
5427
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
5428
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
45915429 /*
45925430 	 * Deeper hierarchy with use_hierarchy == false doesn't make
45935431 * much sense so let cgroup subsystem know about this
....@@ -4614,7 +5452,7 @@
46145452 fail:
46155453 mem_cgroup_id_remove(memcg);
46165454 mem_cgroup_free(memcg);
4617
- return ERR_PTR(-ENOMEM);
5455
+ return ERR_PTR(error);
46185456 }
46195457
46205458 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
....@@ -4632,8 +5470,9 @@
46325470 }
46335471
46345472 /* Online state pins memcg ID, memcg ID pins CSS */
4635
- atomic_set(&memcg->id.ref, 1);
5473
+ refcount_set(&memcg->id.ref, 1);
46365474 css_get(css);
5475
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
46375476 return 0;
46385477 }
46395478
....@@ -4642,6 +5481,7 @@
46425481 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
46435482 struct mem_cgroup_event *event, *tmp;
46445483
5484
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
46455485 /*
46465486 * Unregister events and notify userspace.
46475487 * Notify userspace about cgroup removing only after rmdir of cgroup
....@@ -4660,6 +5500,8 @@
46605500 memcg_offline_kmem(memcg);
46615501 wb_memcg_offline(memcg);
46625502
5503
+ drain_all_stock(memcg);
5504
+
46635505 mem_cgroup_id_put(memcg);
46645506 }
46655507
....@@ -4673,7 +5515,12 @@
46735515 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
46745516 {
46755517 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5518
+ int __maybe_unused i;
46765519
5520
+#ifdef CONFIG_CGROUP_WRITEBACK
5521
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5522
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
5523
+#endif
46775524 if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
46785525 static_branch_dec(&memcg_sockets_enabled_key);
46795526
....@@ -4707,13 +5554,13 @@
47075554
47085555 page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
47095556 page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4710
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
47115557 page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
47125558 page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
47135559 page_counter_set_min(&memcg->memory, 0);
47145560 page_counter_set_low(&memcg->memory, 0);
4715
- memcg->high = PAGE_COUNTER_MAX;
5561
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
47165562 memcg->soft_limit = PAGE_COUNTER_MAX;
5563
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
47175564 memcg_wb_domain_size_changed(memcg);
47185565 }
47195566
....@@ -4756,7 +5603,7 @@
47565603 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
47575604 unsigned long addr, pte_t ptent)
47585605 {
4759
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
5606
+ struct page *page = vm_normal_page(vma, addr, ptent);
47605607
47615608 if (!page || !page_mapped(page))
47625609 return NULL;
....@@ -4807,8 +5654,7 @@
48075654 * we call find_get_page() with swapper_space directly.
48085655 */
48095656 page = find_get_page(swap_address_space(ent), swp_offset(ent));
4810
- if (do_memsw_account())
4811
- entry->val = ent.val;
5657
+ entry->val = ent.val;
48125658
48135659 return page;
48145660 }
....@@ -4823,36 +5669,15 @@
48235669 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
48245670 unsigned long addr, pte_t ptent, swp_entry_t *entry)
48255671 {
4826
- struct page *page = NULL;
4827
- struct address_space *mapping;
4828
- pgoff_t pgoff;
4829
-
48305672 if (!vma->vm_file) /* anonymous vma */
48315673 return NULL;
48325674 if (!(mc.flags & MOVE_FILE))
48335675 return NULL;
48345676
4835
- mapping = vma->vm_file->f_mapping;
4836
- pgoff = linear_page_index(vma, addr);
4837
-
48385677 /* page is moved even if it's not RSS of this task(page-faulted). */
4839
-#ifdef CONFIG_SWAP
48405678 /* shmem/tmpfs may report page out on swap: account for that too. */
4841
- if (shmem_mapping(mapping)) {
4842
- page = find_get_entry(mapping, pgoff);
4843
- if (radix_tree_exceptional_entry(page)) {
4844
- swp_entry_t swp = radix_to_swp_entry(page);
4845
- if (do_memsw_account())
4846
- *entry = swp;
4847
- page = find_get_page(swap_address_space(swp),
4848
- swp_offset(swp));
4849
- }
4850
- } else
4851
- page = find_get_page(mapping, pgoff);
4852
-#else
4853
- page = find_get_page(mapping, pgoff);
4854
-#endif
4855
- return page;
5679
+ return find_get_incore_page(vma->vm_file->f_mapping,
5680
+ linear_page_index(vma, addr));
48565681 }
48575682
48585683 /**
....@@ -4872,10 +5697,10 @@
48725697 struct mem_cgroup *from,
48735698 struct mem_cgroup *to)
48745699 {
4875
- unsigned long flags;
4876
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5700
+ struct lruvec *from_vec, *to_vec;
5701
+ struct pglist_data *pgdat;
5702
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
48775703 int ret;
4878
- bool anon;
48795704
48805705 VM_BUG_ON(from == to);
48815706 VM_BUG_ON_PAGE(PageLRU(page), page);
....@@ -4893,52 +5718,83 @@
48935718 if (page->mem_cgroup != from)
48945719 goto out_unlock;
48955720
4896
- anon = PageAnon(page);
5721
+ pgdat = page_pgdat(page);
5722
+ from_vec = mem_cgroup_lruvec(from, pgdat);
5723
+ to_vec = mem_cgroup_lruvec(to, pgdat);
48975724
4898
- spin_lock_irqsave(&from->move_lock, flags);
5725
+ lock_page_memcg(page);
48995726
4900
- if (!anon && page_mapped(page)) {
4901
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4902
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4903
- }
5727
+ if (PageAnon(page)) {
5728
+ if (page_mapped(page)) {
5729
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
5730
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
5731
+ if (PageTransHuge(page)) {
5732
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
5733
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
5734
+ }
49045735
4905
- /*
4906
- * move_lock grabbed above and caller set from->moving_account, so
4907
- * mod_memcg_page_state will serialize updates to PageDirty.
4908
- * So mapping should be stable for dirty pages.
4909
- */
4910
- if (!anon && PageDirty(page)) {
4911
- struct address_space *mapping = page_mapping(page);
5736
+ }
5737
+ } else {
5738
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
5739
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
49125740
4913
- if (mapping_cap_account_dirty(mapping)) {
4914
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4915
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
5741
+ if (PageSwapBacked(page)) {
5742
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
5743
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
5744
+ }
5745
+
5746
+ if (page_mapped(page)) {
5747
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
5748
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
5749
+ }
5750
+
5751
+ if (PageDirty(page)) {
5752
+ struct address_space *mapping = page_mapping(page);
5753
+
5754
+ if (mapping_can_writeback(mapping)) {
5755
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
5756
+ -nr_pages);
5757
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
5758
+ nr_pages);
5759
+ }
49165760 }
49175761 }
49185762
49195763 if (PageWriteback(page)) {
4920
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4921
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5764
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
5765
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
49225766 }
49235767
49245768 /*
5769
+ * All state has been migrated, let's switch to the new memcg.
5770
+ *
49255771 * It is safe to change page->mem_cgroup here because the page
4926
- * is referenced, charged, and isolated - we can't race with
4927
- * uncharging, charging, migration, or LRU putback.
5772
+ * is referenced, charged, isolated, and locked: we can't race
5773
+ * with (un)charging, migration, LRU putback, or anything else
5774
+ * that would rely on a stable page->mem_cgroup.
5775
+ *
5776
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
5777
+ * to save space. As soon as we switch page->mem_cgroup to a
5778
+ * new memcg that isn't locked, the above state can change
5779
+ * concurrently again. Make sure we're truly done with it.
49285780 */
5781
+ smp_mb();
49295782
4930
- /* caller should have done css_get */
5783
+ css_get(&to->css);
5784
+ css_put(&from->css);
5785
+
49315786 page->mem_cgroup = to;
4932
- spin_unlock_irqrestore(&from->move_lock, flags);
5787
+
5788
+ __unlock_page_memcg(from);
49335789
49345790 ret = 0;
49355791
4936
- local_irq_disable();
4937
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
5792
+ local_lock_irq(&event_lock.l);
5793
+ mem_cgroup_charge_statistics(to, page, nr_pages);
49385794 memcg_check_events(to, page);
4939
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
5795
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
49405796 memcg_check_events(from, page);
4941
- local_irq_enable();
5797
+ local_unlock_irq(&event_lock.l);
49425798 out_unlock:
49435799 unlock_page(page);
49445800 out:
....@@ -4960,8 +5816,8 @@
49605816 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
49615817 * target for charge migration. if @target is not NULL, the entry is stored
49625818 * in target->ent.
4963
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4964
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
5819
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
5820
+ * (so ZONE_DEVICE page and thus not on the lru).
49655821 	 * For now such a page is charged like a regular page would be, as for all
49665822 	 * intents and purposes it is just special memory taking the place of a
49675823 * regular page.
....@@ -4995,8 +5851,7 @@
49955851 */
49965852 if (page->mem_cgroup == mc.from) {
49975853 ret = MC_TARGET_PAGE;
4998
- if (is_device_private_page(page) ||
4999
- is_device_public_page(page))
5854
+ if (is_device_private_page(page))
50005855 ret = MC_TARGET_DEVICE;
50015856 if (target)
50025857 target->page = page;
....@@ -5067,8 +5922,8 @@
50675922 if (ptl) {
50685923 /*
50695924 	 * Note there cannot be MC_TARGET_DEVICE for now as we do not
5070
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
5071
- * MEMORY_DEVICE_PRIVATE but this might change.
5925
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
5926
+ * this might change.
50725927 */
50735928 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
50745929 mc.precharge += HPAGE_PMD_NR;
....@@ -5088,18 +5943,17 @@
50885943 return 0;
50895944 }
50905945
5946
+static const struct mm_walk_ops precharge_walk_ops = {
5947
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
5948
+};
5949
+
50915950 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
50925951 {
50935952 unsigned long precharge;
50945953
5095
- struct mm_walk mem_cgroup_count_precharge_walk = {
5096
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
5097
- .mm = mm,
5098
- };
5099
- down_read(&mm->mmap_sem);
5100
- walk_page_range(0, mm->highest_vm_end,
5101
- &mem_cgroup_count_precharge_walk);
5102
- up_read(&mm->mmap_sem);
5954
+ mmap_read_lock(mm);
5955
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5956
+ mmap_read_unlock(mm);
51035957
51045958 precharge = mc.precharge;
51055959 mc.precharge = 0;
....@@ -5149,8 +6003,6 @@
51496003 */
51506004 if (!mem_cgroup_is_root(mc.to))
51516005 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5152
-
5153
- css_put_many(&mc.to->css, mc.moved_swap);
51546006
51556007 mc.moved_swap = 0;
51566008 }
....@@ -5312,7 +6164,7 @@
53126164 switch (get_mctgt_type(vma, addr, ptent, &target)) {
53136165 case MC_TARGET_DEVICE:
53146166 device = true;
5315
- /* fall through */
6167
+ fallthrough;
53166168 case MC_TARGET_PAGE:
53176169 page = target.page;
53186170 /*
....@@ -5367,13 +6219,12 @@
53676219 return ret;
53686220 }
53696221
6222
+static const struct mm_walk_ops charge_walk_ops = {
6223
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
6224
+};
6225
+
53706226 static void mem_cgroup_move_charge(void)
53716227 {
5372
- struct mm_walk mem_cgroup_move_charge_walk = {
5373
- .pmd_entry = mem_cgroup_move_charge_pte_range,
5374
- .mm = mc.mm,
5375
- };
5376
-
53776228 lru_add_drain_all();
53786229 /*
53796230 * Signal lock_page_memcg() to take the memcg's move_lock
....@@ -5383,9 +6234,9 @@
53836234 atomic_inc(&mc.from->moving_account);
53846235 synchronize_rcu();
53856236 retry:
5386
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
6237
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
53876238 /*
5388
- * Someone who are holding the mmap_sem might be waiting in
6239
+		 * Someone who is holding the mmap_lock might be waiting in
53896240 * waitq. So we cancel all extra charges, wake up all waiters,
53906241 * and retry. Because we cancel precharges, we might not be able
53916242 * to move enough charges, but moving charge is a best-effort
....@@ -5399,9 +6250,10 @@
53996250 * When we have consumed all precharges and failed in doing
54006251 * additional charge, the page walk just aborts.
54016252 */
5402
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
6253
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6254
+ NULL);
54036255
5404
- up_read(&mc.mm->mmap_sem);
6256
+ mmap_read_unlock(mc.mm);
54056257 atomic_dec(&mc.from->moving_account);
54066258 }
54076259
....@@ -5443,6 +6295,16 @@
54436295 root_mem_cgroup->use_hierarchy = false;
54446296 }
54456297
6298
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6299
+{
6300
+ if (value == PAGE_COUNTER_MAX)
6301
+ seq_puts(m, "max\n");
6302
+ else
6303
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6304
+
6305
+ return 0;
6306
+}
6307
+
54466308 static u64 memory_current_read(struct cgroup_subsys_state *css,
54476309 struct cftype *cft)
54486310 {
....@@ -5453,15 +6315,8 @@
54536315
54546316 static int memory_min_show(struct seq_file *m, void *v)
54556317 {
5456
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5457
- unsigned long min = READ_ONCE(memcg->memory.min);
5458
-
5459
- if (min == PAGE_COUNTER_MAX)
5460
- seq_puts(m, "max\n");
5461
- else
5462
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5463
-
5464
- return 0;
6318
+ return seq_puts_memcg_tunable(m,
6319
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
54656320 }
54666321
54676322 static ssize_t memory_min_write(struct kernfs_open_file *of,
....@@ -5483,15 +6338,8 @@
54836338
54846339 static int memory_low_show(struct seq_file *m, void *v)
54856340 {
5486
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5487
- unsigned long low = READ_ONCE(memcg->memory.low);
5488
-
5489
- if (low == PAGE_COUNTER_MAX)
5490
- seq_puts(m, "max\n");
5491
- else
5492
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5493
-
5494
- return 0;
6341
+ return seq_puts_memcg_tunable(m,
6342
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
54956343 }
54966344
54976345 static ssize_t memory_low_write(struct kernfs_open_file *of,
....@@ -5513,22 +6361,16 @@
55136361
55146362 static int memory_high_show(struct seq_file *m, void *v)
55156363 {
5516
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5517
- unsigned long high = READ_ONCE(memcg->high);
5518
-
5519
- if (high == PAGE_COUNTER_MAX)
5520
- seq_puts(m, "max\n");
5521
- else
5522
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5523
-
5524
- return 0;
6364
+ return seq_puts_memcg_tunable(m,
6365
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
55256366 }
55266367
55276368 static ssize_t memory_high_write(struct kernfs_open_file *of,
55286369 char *buf, size_t nbytes, loff_t off)
55296370 {
55306371 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5531
- unsigned long nr_pages;
6372
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6373
+ bool drained = false;
55326374 unsigned long high;
55336375 int err;
55346376
....@@ -5537,12 +6379,30 @@
55376379 if (err)
55386380 return err;
55396381
5540
- memcg->high = high;
6382
+ page_counter_set_high(&memcg->memory, high);
55416383
5542
- nr_pages = page_counter_read(&memcg->memory);
5543
- if (nr_pages > high)
5544
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5545
- GFP_KERNEL, true);
6384
+ for (;;) {
6385
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
6386
+ unsigned long reclaimed;
6387
+
6388
+ if (nr_pages <= high)
6389
+ break;
6390
+
6391
+ if (signal_pending(current))
6392
+ break;
6393
+
6394
+ if (!drained) {
6395
+ drain_all_stock(memcg);
6396
+ drained = true;
6397
+ continue;
6398
+ }
6399
+
6400
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6401
+ GFP_KERNEL, true);
6402
+
6403
+ if (!reclaimed && !nr_retries--)
6404
+ break;
6405
+ }
55466406
55476407 memcg_wb_domain_size_changed(memcg);
55486408 return nbytes;
....@@ -5550,22 +6410,15 @@
55506410
55516411 static int memory_max_show(struct seq_file *m, void *v)
55526412 {
5553
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5554
- unsigned long max = READ_ONCE(memcg->memory.max);
5555
-
5556
- if (max == PAGE_COUNTER_MAX)
5557
- seq_puts(m, "max\n");
5558
- else
5559
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5560
-
5561
- return 0;
6413
+ return seq_puts_memcg_tunable(m,
6414
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
55626415 }
55636416
55646417 static ssize_t memory_max_write(struct kernfs_open_file *of,
55656418 char *buf, size_t nbytes, loff_t off)
55666419 {
55676420 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5568
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
6421
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
55696422 bool drained = false;
55706423 unsigned long max;
55716424 int err;
....@@ -5583,10 +6436,8 @@
55836436 if (nr_pages <= max)
55846437 break;
55856438
5586
- if (signal_pending(current)) {
5587
- err = -EINTR;
6439
+ if (signal_pending(current))
55886440 break;
5589
- }
55906441
55916442 if (!drained) {
55926443 drain_all_stock(memcg);
....@@ -5610,104 +6461,77 @@
56106461 return nbytes;
56116462 }
56126463
6464
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6465
+{
6466
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6467
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6468
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6469
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6470
+ seq_printf(m, "oom_kill %lu\n",
6471
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
6472
+}
6473
+
56136474 static int memory_events_show(struct seq_file *m, void *v)
56146475 {
5615
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6476
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
56166477
5617
- seq_printf(m, "low %lu\n",
5618
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5619
- seq_printf(m, "high %lu\n",
5620
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5621
- seq_printf(m, "max %lu\n",
5622
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5623
- seq_printf(m, "oom %lu\n",
5624
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5625
- seq_printf(m, "oom_kill %lu\n",
5626
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
6478
+ __memory_events_show(m, memcg->memory_events);
6479
+ return 0;
6480
+}
56276481
6482
+static int memory_events_local_show(struct seq_file *m, void *v)
6483
+{
6484
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6485
+
6486
+ __memory_events_show(m, memcg->memory_events_local);
56286487 return 0;
56296488 }
56306489
56316490 static int memory_stat_show(struct seq_file *m, void *v)
56326491 {
5633
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5634
- struct accumulated_stats acc;
5635
- int i;
6492
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6493
+ char *buf;
56366494
5637
- /*
5638
- * Provide statistics on the state of the memory subsystem as
5639
- * well as cumulative event counters that show past behavior.
5640
- *
5641
- * This list is ordered following a combination of these gradients:
5642
- * 1) generic big picture -> specifics and details
5643
- * 2) reflecting userspace activity -> reflecting kernel heuristics
5644
- *
5645
- * Current memory state:
5646
- */
5647
-
5648
- memset(&acc, 0, sizeof(acc));
5649
- acc.stats_size = MEMCG_NR_STAT;
5650
- acc.events_size = NR_VM_EVENT_ITEMS;
5651
- accumulate_memcg_tree(memcg, &acc);
5652
-
5653
- seq_printf(m, "anon %llu\n",
5654
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5655
- seq_printf(m, "file %llu\n",
5656
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5657
- seq_printf(m, "kernel_stack %llu\n",
5658
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5659
- seq_printf(m, "slab %llu\n",
5660
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5661
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5662
- seq_printf(m, "sock %llu\n",
5663
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5664
-
5665
- seq_printf(m, "shmem %llu\n",
5666
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5667
- seq_printf(m, "file_mapped %llu\n",
5668
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5669
- seq_printf(m, "file_dirty %llu\n",
5670
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5671
- seq_printf(m, "file_writeback %llu\n",
5672
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5673
-
5674
- for (i = 0; i < NR_LRU_LISTS; i++)
5675
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5676
- (u64)acc.lru_pages[i] * PAGE_SIZE);
5677
-
5678
- seq_printf(m, "slab_reclaimable %llu\n",
5679
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5680
- seq_printf(m, "slab_unreclaimable %llu\n",
5681
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5682
-
5683
- /* Accumulated memory events */
5684
-
5685
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5686
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5687
-
5688
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5689
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5690
- acc.events[PGSCAN_DIRECT]);
5691
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5692
- acc.events[PGSTEAL_DIRECT]);
5693
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5694
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5695
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5696
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5697
-
5698
- seq_printf(m, "workingset_refault %lu\n",
5699
- acc.stat[WORKINGSET_REFAULT]);
5700
- seq_printf(m, "workingset_activate %lu\n",
5701
- acc.stat[WORKINGSET_ACTIVATE]);
5702
- seq_printf(m, "workingset_nodereclaim %lu\n",
5703
- acc.stat[WORKINGSET_NODERECLAIM]);
5704
-
6495
+ buf = memory_stat_format(memcg);
6496
+ if (!buf)
6497
+ return -ENOMEM;
6498
+ seq_puts(m, buf);
6499
+ kfree(buf);
57056500 return 0;
57066501 }
57076502
6503
+#ifdef CONFIG_NUMA
6504
+static int memory_numa_stat_show(struct seq_file *m, void *v)
6505
+{
6506
+ int i;
6507
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6508
+
6509
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6510
+ int nid;
6511
+
6512
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6513
+ continue;
6514
+
6515
+ seq_printf(m, "%s", memory_stats[i].name);
6516
+ for_each_node_state(nid, N_MEMORY) {
6517
+ u64 size;
6518
+ struct lruvec *lruvec;
6519
+
6520
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6521
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
6522
+ size *= memory_stats[i].ratio;
6523
+ seq_printf(m, " N%d=%llu", nid, size);
6524
+ }
6525
+ seq_putc(m, '\n');
6526
+ }
6527
+
6528
+ return 0;
6529
+}
6530
+#endif
6531
+
57086532 static int memory_oom_group_show(struct seq_file *m, void *v)
57096533 {
5710
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6534
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
57116535
57126536 seq_printf(m, "%d\n", memcg->oom_group);
57136537
....@@ -5773,10 +6597,21 @@
57736597 .seq_show = memory_events_show,
57746598 },
57756599 {
5776
- .name = "stat",
6600
+ .name = "events.local",
57776601 .flags = CFTYPE_NOT_ON_ROOT,
6602
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
6603
+ .seq_show = memory_events_local_show,
6604
+ },
6605
+ {
6606
+ .name = "stat",
57786607 .seq_show = memory_stat_show,
57796608 },
6609
+#ifdef CONFIG_NUMA
6610
+ {
6611
+ .name = "numa_stat",
6612
+ .seq_show = memory_numa_stat_show,
6613
+ },
6614
+#endif
57806615 {
57816616 .name = "oom.group",
57826617 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
....@@ -5802,6 +6637,122 @@
58026637 .early_init = 0,
58036638 };
58046639
6640
+/*
6641
+ * This function calculates an individual cgroup's effective
6642
+ * protection which is derived from its own memory.min/low, its
6643
+ * parent's and siblings' settings, as well as the actual memory
6644
+ * distribution in the tree.
6645
+ *
6646
+ * The following rules apply to the effective protection values:
6647
+ *
6648
+ * 1. At the first level of reclaim, effective protection is equal to
6649
+ * the declared protection in memory.min and memory.low.
6650
+ *
6651
+ * 2. To enable safe delegation of the protection configuration, at
6652
+ * subsequent levels the effective protection is capped to the
6653
+ * parent's effective protection.
6654
+ *
6655
+ * 3. To make complex and dynamic subtrees easier to configure, the
6656
+ * user is allowed to overcommit the declared protection at a given
6657
+ * level. If that is the case, the parent's effective protection is
6658
+ * distributed to the children in proportion to how much protection
6659
+ * they have declared and how much of it they are utilizing.
6660
+ *
6661
+ * This makes distribution proportional, but also work-conserving:
6662
+ * if one cgroup claims much more protection than it uses memory,
6663
+ * the unused remainder is available to its siblings.
6664
+ *
6665
+ * 4. Conversely, when the declared protection is undercommitted at a
6666
+ * given level, the distribution of the larger parental protection
6667
+ * budget is NOT proportional. A cgroup's protection from a sibling
6668
+ * is capped to its own memory.min/low setting.
6669
+ *
6670
+ * 5. However, to allow protecting recursive subtrees from each other
6671
+ * without having to declare each individual cgroup's fixed share
6672
+ * of the ancestor's claim to protection, any unutilized -
6673
+ * "floating" - protection from up the tree is distributed in
6674
+ * proportion to each cgroup's *usage*. This makes the protection
6675
+ * neutral wrt sibling cgroups and lets them compete freely over
6676
+ * the shared parental protection budget, but it protects the
6677
+ * subtree as a whole from neighboring subtrees.
6678
+ *
6679
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
6680
+ * against immediate siblings whereas 5. is about protecting against
6681
+ * neighboring subtrees.
6682
+ */
6683
+static unsigned long effective_protection(unsigned long usage,
6684
+ unsigned long parent_usage,
6685
+ unsigned long setting,
6686
+ unsigned long parent_effective,
6687
+ unsigned long siblings_protected)
6688
+{
6689
+ unsigned long protected;
6690
+ unsigned long ep;
6691
+
6692
+ protected = min(usage, setting);
6693
+ /*
6694
+ * If all cgroups at this level combined claim and use more
6695
+	 * protection than what the parent affords them, distribute
6696
+ * shares in proportion to utilization.
6697
+ *
6698
+ * We are using actual utilization rather than the statically
6699
+ * claimed protection in order to be work-conserving: claimed
6700
+ * but unused protection is available to siblings that would
6701
+ * otherwise get a smaller chunk than what they claimed.
6702
+ */
6703
+ if (siblings_protected > parent_effective)
6704
+ return protected * parent_effective / siblings_protected;
6705
+
6706
+ /*
6707
+ * Ok, utilized protection of all children is within what the
6708
+ * parent affords them, so we know whatever this child claims
6709
+ * and utilizes is effectively protected.
6710
+ *
6711
+ * If there is unprotected usage beyond this value, reclaim
6712
+ * will apply pressure in proportion to that amount.
6713
+ *
6714
+ * If there is unutilized protection, the cgroup will be fully
6715
+ * shielded from reclaim, but we do return a smaller value for
6716
+ * protection than what the group could enjoy in theory. This
6717
+ * is okay. With the overcommit distribution above, effective
6718
+ * protection is always dependent on how memory is actually
6719
+ * consumed among the siblings anyway.
6720
+ */
6721
+ ep = protected;
6722
+
6723
+ /*
6724
+ * If the children aren't claiming (all of) the protection
6725
+ * afforded to them by the parent, distribute the remainder in
6726
+ * proportion to the (unprotected) memory of each cgroup. That
6727
+ * way, cgroups that aren't explicitly prioritized wrt each
6728
+ * other compete freely over the allowance, but they are
6729
+ * collectively protected from neighboring trees.
6730
+ *
6731
+ * We're using unprotected memory for the weight so that if
6732
+ * some cgroups DO claim explicit protection, we don't protect
6733
+ * the same bytes twice.
6734
+ *
6735
+ * Check both usage and parent_usage against the respective
6736
+ * protected values. One should imply the other, but they
6737
+ * aren't read atomically - make sure the division is sane.
6738
+ */
6739
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6740
+ return ep;
6741
+ if (parent_effective > siblings_protected &&
6742
+ parent_usage > siblings_protected &&
6743
+ usage > protected) {
6744
+ unsigned long unclaimed;
6745
+
6746
+ unclaimed = parent_effective - siblings_protected;
6747
+ unclaimed *= usage - protected;
6748
+ unclaimed /= parent_usage - siblings_protected;
6749
+
6750
+ ep += unclaimed;
6751
+ }
6752
+
6753
+ return ep;
6754
+}
6755
+
58056756 /**
58066757 * mem_cgroup_protected - check if memory consumption is in the normal range
58076758 * @root: the top ancestor of the sub-tree being checked
....@@ -5809,259 +6760,125 @@
58096760 *
58106761 * WARNING: This function is not stateless! It can only be used as part
58116762 * of a top-down tree iteration, not for isolated queries.
5812
- *
5813
- * Returns one of the following:
5814
- * MEMCG_PROT_NONE: cgroup memory is not protected
5815
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
5816
- * an unprotected supply of reclaimable memory from other cgroups.
5817
- * MEMCG_PROT_MIN: cgroup memory is protected
5818
- *
5819
- * @root is exclusive; it is never protected when looked at directly
5820
- *
5821
- * To provide a proper hierarchical behavior, effective memory.min/low values
5822
- * are used. Below is the description of how effective memory.low is calculated.
5823
- * Effective memory.min values is calculated in the same way.
5824
- *
5825
- * Effective memory.low is always equal or less than the original memory.low.
5826
- * If there is no memory.low overcommittment (which is always true for
5827
- * top-level memory cgroups), these two values are equal.
5828
- * Otherwise, it's a part of parent's effective memory.low,
5829
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
5830
- * memory.low usages, where memory.low usage is the size of actually
5831
- * protected memory.
5832
- *
5833
- * low_usage
5834
- * elow = min( memory.low, parent->elow * ------------------ ),
5835
- * siblings_low_usage
5836
- *
5837
- * | memory.current, if memory.current < memory.low
5838
- * low_usage = |
5839
- | 0, otherwise.
5840
- *
5841
- *
5842
- * Such definition of the effective memory.low provides the expected
5843
- * hierarchical behavior: parent's memory.low value is limiting
5844
- * children, unprotected memory is reclaimed first and cgroups,
5845
- * which are not using their guarantee do not affect actual memory
5846
- * distribution.
5847
- *
5848
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5849
- *
5850
- * A A/memory.low = 2G, A/memory.current = 6G
5851
- * //\\
5852
- * BC DE B/memory.low = 3G B/memory.current = 2G
5853
- * C/memory.low = 1G C/memory.current = 2G
5854
- * D/memory.low = 0 D/memory.current = 2G
5855
- * E/memory.low = 10G E/memory.current = 0
5856
- *
5857
- * and the memory pressure is applied, the following memory distribution
5858
- * is expected (approximately):
5859
- *
5860
- * A/memory.current = 2G
5861
- *
5862
- * B/memory.current = 1.3G
5863
- * C/memory.current = 0.6G
5864
- * D/memory.current = 0
5865
- * E/memory.current = 0
5866
- *
5867
- * These calculations require constant tracking of the actual low usages
5868
- * (see propagate_protected_usage()), as well as recursive calculation of
5869
- * effective memory.low values. But as we do call mem_cgroup_protected()
5870
- * path for each memory cgroup top-down from the reclaim,
5871
- * it's possible to optimize this part, and save calculated elow
5872
- * for next usage. This part is intentionally racy, but it's ok,
5873
- * as memory.low is a best-effort mechanism.
58746763 */
5875
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5876
- struct mem_cgroup *memcg)
6764
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6765
+ struct mem_cgroup *memcg)
58776766 {
6767
+ unsigned long usage, parent_usage;
58786768 struct mem_cgroup *parent;
5879
- unsigned long emin, parent_emin;
5880
- unsigned long elow, parent_elow;
5881
- unsigned long usage;
58826769
58836770 if (mem_cgroup_disabled())
5884
- return MEMCG_PROT_NONE;
6771
+ return;
58856772
58866773 if (!root)
58876774 root = root_mem_cgroup;
6775
+
6776
+ /*
6777
+ * Effective values of the reclaim targets are ignored so they
6778
+ * can be stale. Have a look at mem_cgroup_protection for more
6779
+ * details.
6780
+ * TODO: calculation should be more robust so that we do not need
6781
+ * that special casing.
6782
+ */
58886783 if (memcg == root)
5889
- return MEMCG_PROT_NONE;
6784
+ return;
58906785
58916786 usage = page_counter_read(&memcg->memory);
58926787 if (!usage)
5893
- return MEMCG_PROT_NONE;
5894
-
5895
- emin = memcg->memory.min;
5896
- elow = memcg->memory.low;
6788
+ return;
58976789
58986790 parent = parent_mem_cgroup(memcg);
58996791 /* No parent means a non-hierarchical mode on v1 memcg */
59006792 if (!parent)
5901
- return MEMCG_PROT_NONE;
6793
+ return;
59026794
5903
- if (parent == root)
5904
- goto exit;
5905
-
5906
- parent_emin = READ_ONCE(parent->memory.emin);
5907
- emin = min(emin, parent_emin);
5908
- if (emin && parent_emin) {
5909
- unsigned long min_usage, siblings_min_usage;
5910
-
5911
- min_usage = min(usage, memcg->memory.min);
5912
- siblings_min_usage = atomic_long_read(
5913
- &parent->memory.children_min_usage);
5914
-
5915
- if (min_usage && siblings_min_usage)
5916
- emin = min(emin, parent_emin * min_usage /
5917
- siblings_min_usage);
6795
+ if (parent == root) {
6796
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
6797
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
6798
+ return;
59186799 }
59196800
5920
- parent_elow = READ_ONCE(parent->memory.elow);
5921
- elow = min(elow, parent_elow);
5922
- if (elow && parent_elow) {
5923
- unsigned long low_usage, siblings_low_usage;
6801
+ parent_usage = page_counter_read(&parent->memory);
59246802
5925
- low_usage = min(usage, memcg->memory.low);
5926
- siblings_low_usage = atomic_long_read(
5927
- &parent->memory.children_low_usage);
6803
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6804
+ READ_ONCE(memcg->memory.min),
6805
+ READ_ONCE(parent->memory.emin),
6806
+ atomic_long_read(&parent->memory.children_min_usage)));
59286807
5929
- if (low_usage && siblings_low_usage)
5930
- elow = min(elow, parent_elow * low_usage /
5931
- siblings_low_usage);
5932
- }
5933
-
5934
-exit:
5935
- memcg->memory.emin = emin;
5936
- memcg->memory.elow = elow;
5937
-
5938
- if (usage <= emin)
5939
- return MEMCG_PROT_MIN;
5940
- else if (usage <= elow)
5941
- return MEMCG_PROT_LOW;
5942
- else
5943
- return MEMCG_PROT_NONE;
6808
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6809
+ READ_ONCE(memcg->memory.low),
6810
+ READ_ONCE(parent->memory.elow),
6811
+ atomic_long_read(&parent->memory.children_low_usage)));
59446812 }
59456813
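A hedged sketch of the top-down call pattern the WARNING above refers to; the walk helper below is hypothetical, while mem_cgroup_iter() is the existing pre-order iterator in this file:

static void example_protection_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg = NULL;

	/*
	 * Pre-order walk: parents are visited before their children, so
	 * parent->memory.emin/elow are already filled in when the child
	 * is calculated below.
	 */
	while ((memcg = mem_cgroup_iter(root, memcg, NULL))) {
		mem_cgroup_calculate_protection(root, memcg);
		/* ... reclaim decisions based on memcg->memory.emin/elow ... */
	}
}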
59466814 /**
5947
- * mem_cgroup_try_charge - try charging a page
6815
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
59486816 * @page: page to charge
59496817 * @mm: mm context of the victim
59506818 * @gfp_mask: reclaim mode
5951
- * @memcgp: charged memcg return
5952
- * @compound: charge the page as compound or small page
59536819 *
59546820 * Try to charge @page to the memcg that @mm belongs to, reclaiming
59556821 * pages according to @gfp_mask if necessary.
59566822 *
5957
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5958
- * Otherwise, an error code is returned.
5959
- *
5960
- * After page->mapping has been set up, the caller must finalize the
5961
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
5962
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
6823
+ * Returns 0 on success. Otherwise, an error code is returned.
59636824 */
5964
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5965
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
5966
- bool compound)
6825
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
6826
+ gfp_t gfp_mask)
59676827 {
6828
+ unsigned int nr_pages = thp_nr_pages(page);
59686829 struct mem_cgroup *memcg = NULL;
5969
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
59706830 int ret = 0;
59716831
5972
- if (mem_cgroup_disabled())
5973
- goto out;
5974
-
59756832 if (PageSwapCache(page)) {
6833
+ swp_entry_t ent = { .val = page_private(page), };
6834
+ unsigned short id;
6835
+
59766836 /*
59776837 * Every swap fault against a single page tries to charge the
59786838 * page, bail as early as possible. shmem_unuse() encounters
5979
- * already charged pages, too. The USED bit is protected by
5980
- * the page lock, which serializes swap cache removal, which
6839
+ * already charged pages, too. page->mem_cgroup is protected
6840
+ * by the page lock, which serializes swap cache removal, which
59816841 * in turn serializes uncharging.
59826842 */
59836843 VM_BUG_ON_PAGE(!PageLocked(page), page);
59846844 if (compound_head(page)->mem_cgroup)
59856845 goto out;
59866846
5987
- if (do_swap_account) {
5988
- swp_entry_t ent = { .val = page_private(page), };
5989
- unsigned short id = lookup_swap_cgroup_id(ent);
5990
-
5991
- rcu_read_lock();
5992
- memcg = mem_cgroup_from_id(id);
5993
- if (memcg && !css_tryget_online(&memcg->css))
5994
- memcg = NULL;
5995
- rcu_read_unlock();
5996
- }
6847
+ id = lookup_swap_cgroup_id(ent);
6848
+ rcu_read_lock();
6849
+ memcg = mem_cgroup_from_id(id);
6850
+ if (memcg && !css_tryget_online(&memcg->css))
6851
+ memcg = NULL;
6852
+ rcu_read_unlock();
59976853 }
59986854
59996855 if (!memcg)
60006856 memcg = get_mem_cgroup_from_mm(mm);
60016857
60026858 ret = try_charge(memcg, gfp_mask, nr_pages);
6859
+ if (ret)
6860
+ goto out_put;
60036861
6004
- css_put(&memcg->css);
6005
-out:
6006
- *memcgp = memcg;
6007
- return ret;
6008
-}
6862
+ css_get(&memcg->css);
6863
+ commit_charge(page, memcg);
60096864
6010
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
6011
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
6012
- bool compound)
6013
-{
6014
- struct mem_cgroup *memcg;
6015
- int ret;
6016
-
6017
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
6018
- memcg = *memcgp;
6019
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
6020
- return ret;
6021
-}
6022
-
6023
-/**
6024
- * mem_cgroup_commit_charge - commit a page charge
6025
- * @page: page to charge
6026
- * @memcg: memcg to charge the page to
6027
- * @lrucare: page might be on LRU already
6028
- * @compound: charge the page as compound or small page
6029
- *
6030
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
6031
- * after page->mapping has been set up. This must happen atomically
6032
- * as part of the page instantiation, i.e. under the page table lock
6033
- * for anonymous pages, under the page lock for page and swap cache.
6034
- *
6035
- * In addition, the page must not be on the LRU during the commit, to
6036
- * prevent racing with task migration. If it might be, use @lrucare.
6037
- *
6038
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6039
- */
6040
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6041
- bool lrucare, bool compound)
6042
-{
6043
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6044
-
6045
- VM_BUG_ON_PAGE(!page->mapping, page);
6046
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6047
-
6048
- if (mem_cgroup_disabled())
6049
- return;
6050
- /*
6051
- * Swap faults will attempt to charge the same page multiple
6052
- * times. But reuse_swap_page() might have removed the page
6053
- * from swapcache already, so we can't check PageSwapCache().
6054
- */
6055
- if (!memcg)
6056
- return;
6057
-
6058
- commit_charge(page, memcg, lrucare);
6059
-
6060
- local_irq_disable();
6061
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6865
+ local_lock_irq(&event_lock.l);
6866
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
60626867 memcg_check_events(memcg, page);
6063
- local_irq_enable();
6868
+ local_unlock_irq(&event_lock.l);
60646869
6870
+ /*
6871
+ * Cgroup1's unified memory+swap counter has been charged with the
6872
+ * new swapcache page, finish the transfer by uncharging the swap
6873
+ * slot. The swap slot would also get uncharged when it dies, but
6874
+ * it can stick around indefinitely and we'd count the page twice
6875
+ * the entire time.
6876
+ *
6877
+ * Cgroup2 has separate resource counters for memory and swap,
6878
+ * so this is a non-issue here. Memory and swap charge lifetimes
6879
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
6880
+ * page to memory here, and uncharge swap when the slot is freed.
6881
+ */
60656882 if (do_memsw_account() && PageSwapCache(page)) {
60666883 swp_entry_t entry = { .val = page_private(page) };
60676884 /*
....@@ -6071,42 +6888,18 @@
60716888 */
60726889 mem_cgroup_uncharge_swap(entry, nr_pages);
60736890 }
6074
-}
60756891
6076
-/**
6077
- * mem_cgroup_cancel_charge - cancel a page charge
6078
- * @page: page to charge
6079
- * @memcg: memcg to charge the page to
6080
- * @compound: charge the page as compound or small page
6081
- *
6082
- * Cancel a charge transaction started by mem_cgroup_try_charge().
6083
- */
6084
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
6085
- bool compound)
6086
-{
6087
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
6088
-
6089
- if (mem_cgroup_disabled())
6090
- return;
6091
- /*
6092
- * Swap faults will attempt to charge the same page multiple
6093
- * times. But reuse_swap_page() might have removed the page
6094
- * from swapcache already, so we can't check PageSwapCache().
6095
- */
6096
- if (!memcg)
6097
- return;
6098
-
6099
- cancel_charge(memcg, nr_pages);
6892
+out_put:
6893
+ css_put(&memcg->css);
6894
+out:
6895
+ return ret;
61006896 }
61016897
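A hedged sketch of the caller pattern this single-call API implies; the wrapper below is hypothetical, only __mem_cgroup_charge() comes from the code above:

static int example_instantiate_page(struct page *page, struct mm_struct *mm,
				    gfp_t gfp)
{
	int err;

	/* either fully charges the page (and sets page->mem_cgroup) or fails */
	err = __mem_cgroup_charge(page, mm, gfp);
	if (err)
		return err;	/* nothing to unwind, no separate commit/cancel step */

	/* ... add the page to the page cache or map it ... */
	return 0;
}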
61026898 struct uncharge_gather {
61036899 struct mem_cgroup *memcg;
6900
+ unsigned long nr_pages;
61046901 unsigned long pgpgout;
6105
- unsigned long nr_anon;
6106
- unsigned long nr_file;
61076902 unsigned long nr_kmem;
6108
- unsigned long nr_huge;
6109
- unsigned long nr_shmem;
61106903 struct page *dummy_page;
61116904 };
61126905
....@@ -6117,37 +6910,32 @@
61176910
61186911 static void uncharge_batch(const struct uncharge_gather *ug)
61196912 {
6120
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
61216913 unsigned long flags;
61226914
61236915 if (!mem_cgroup_is_root(ug->memcg)) {
6124
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
6916
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
61256917 if (do_memsw_account())
6126
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6918
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
61276919 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
61286920 page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
61296921 memcg_oom_recover(ug->memcg);
61306922 }
61316923
6132
- local_irq_save(flags);
6133
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6134
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6135
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6136
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6924
+ local_lock_irqsave(&event_lock.l, flags);
61376925 __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6138
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6926
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
61396927 memcg_check_events(ug->memcg, ug->dummy_page);
6140
- local_irq_restore(flags);
6928
+ local_unlock_irqrestore(&event_lock.l, flags);
61416929
6142
- if (!mem_cgroup_is_root(ug->memcg))
6143
- css_put_many(&ug->memcg->css, nr_pages);
6930
+ /* drop reference from uncharge_page */
6931
+ css_put(&ug->memcg->css);
61446932 }
61456933
61466934 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
61476935 {
6936
+ unsigned long nr_pages;
6937
+
61486938 VM_BUG_ON_PAGE(PageLRU(page), page);
6149
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6150
- !PageHWPoison(page) , page);
61516939
61526940 if (!page->mem_cgroup)
61536941 return;
....@@ -6164,30 +6952,24 @@
61646952 uncharge_gather_clear(ug);
61656953 }
61666954 ug->memcg = page->mem_cgroup;
6955
+
6956
+ /* pairs with css_put in uncharge_batch */
6957
+ css_get(&ug->memcg->css);
61676958 }
61686959
6169
- if (!PageKmemcg(page)) {
6170
- unsigned int nr_pages = 1;
6960
+ nr_pages = compound_nr(page);
6961
+ ug->nr_pages += nr_pages;
61716962
6172
- if (PageTransHuge(page)) {
6173
- nr_pages <<= compound_order(page);
6174
- ug->nr_huge += nr_pages;
6175
- }
6176
- if (PageAnon(page))
6177
- ug->nr_anon += nr_pages;
6178
- else {
6179
- ug->nr_file += nr_pages;
6180
- if (PageSwapBacked(page))
6181
- ug->nr_shmem += nr_pages;
6182
- }
6963
+ if (!PageKmemcg(page)) {
61836964 ug->pgpgout++;
61846965 } else {
6185
- ug->nr_kmem += 1 << compound_order(page);
6966
+ ug->nr_kmem += nr_pages;
61866967 __ClearPageKmemcg(page);
61876968 }
61886969
61896970 ug->dummy_page = page;
61906971 page->mem_cgroup = NULL;
6972
+ css_put(&ug->memcg->css);
61916973 }
61926974
61936975 static void uncharge_list(struct list_head *page_list)
....@@ -6216,18 +6998,14 @@
62166998 }
62176999
62187000 /**
6219
- * mem_cgroup_uncharge - uncharge a page
7001
+ * __mem_cgroup_uncharge - uncharge a page
62207002 * @page: page to uncharge
62217003 *
6222
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
6223
- * mem_cgroup_commit_charge().
7004
+ * Uncharge a page previously charged with __mem_cgroup_charge().
62247005 */
6225
-void mem_cgroup_uncharge(struct page *page)
7006
+void __mem_cgroup_uncharge(struct page *page)
62267007 {
62277008 struct uncharge_gather ug;
6228
-
6229
- if (mem_cgroup_disabled())
6230
- return;
62317009
62327010 /* Don't touch page->lru of any random page, pre-check: */
62337011 if (!page->mem_cgroup)
....@@ -6239,17 +7017,14 @@
62397017 }
62407018
62417019 /**
6242
- * mem_cgroup_uncharge_list - uncharge a list of page
7020
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
62437021 * @page_list: list of pages to uncharge
62447022 *
62457023 * Uncharge a list of pages previously charged with
6246
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
7024
+ * __mem_cgroup_charge().
62477025 */
6248
-void mem_cgroup_uncharge_list(struct list_head *page_list)
7026
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
62497027 {
6250
- if (mem_cgroup_disabled())
6251
- return;
6252
-
62537028 if (!list_empty(page_list))
62547029 uncharge_list(page_list);
62557030 }
....@@ -6268,7 +7043,6 @@
62687043 {
62697044 struct mem_cgroup *memcg;
62707045 unsigned int nr_pages;
6271
- bool compound;
62727046 unsigned long flags;
62737047
62747048 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
....@@ -6290,20 +7064,19 @@
62907064 return;
62917065
62927066 /* Force-charge the new page. The old one will be freed soon */
6293
- compound = PageTransHuge(newpage);
6294
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
7067
+ nr_pages = thp_nr_pages(newpage);
62957068
62967069 page_counter_charge(&memcg->memory, nr_pages);
62977070 if (do_memsw_account())
62987071 page_counter_charge(&memcg->memsw, nr_pages);
6299
- css_get_many(&memcg->css, nr_pages);
63007072
6301
- commit_charge(newpage, memcg, false);
7073
+ css_get(&memcg->css);
7074
+ commit_charge(newpage, memcg);
63027075
6303
- local_irq_save(flags);
6304
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
7076
+ local_lock_irqsave(&event_lock.l, flags);
7077
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
63057078 memcg_check_events(memcg, newpage);
6306
- local_irq_restore(flags);
7079
+ local_unlock_irqrestore(&event_lock.l, flags);
63077080 }
63087081
63097082 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
....@@ -6326,7 +7099,7 @@
63267099 goto out;
63277100 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
63287101 goto out;
6329
- if (css_tryget_online(&memcg->css))
7102
+ if (css_tryget(&memcg->css))
63307103 sk->sk_memcg = memcg;
63317104 out:
63327105 rcu_read_unlock();
....@@ -6404,7 +7177,7 @@
64047177 if (!strcmp(token, "nokmem"))
64057178 cgroup_memory_nokmem = true;
64067179 }
6407
- return 0;
7180
+ return 1;
64087181 }
64097182 __setup("cgroup.memory=", cgroup_memory);
64107183
....@@ -6420,23 +7193,16 @@
64207193 {
64217194 int cpu, node;
64227195
6423
-#ifdef CONFIG_MEMCG_KMEM
6424
- /*
6425
- * Kmem cache creation is mostly done with the slab_mutex held,
6426
- * so use a workqueue with limited concurrency to avoid stalling
6427
- * all worker threads in case lots of cgroups are created and
6428
- * destroyed simultaneously.
6429
- */
6430
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
6431
- BUG_ON(!memcg_kmem_cache_wq);
6432
-#endif
6433
-
64347196 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
64357197 memcg_hotplug_cpu_dead);
64367198
6437
- for_each_possible_cpu(cpu)
6438
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
6439
- drain_local_stock);
7199
+ for_each_possible_cpu(cpu) {
7200
+ struct memcg_stock_pcp *stock;
7201
+
7202
+ stock = per_cpu_ptr(&memcg_stock, cpu);
7203
+ INIT_WORK(&stock->work, drain_local_stock);
7204
+ local_lock_init(&stock->lock);
7205
+ }
64407206
64417207 for_each_node(node) {
64427208 struct mem_cgroup_tree_per_node *rtpn;
....@@ -6457,7 +7223,7 @@
64577223 #ifdef CONFIG_MEMCG_SWAP
64587224 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
64597225 {
6460
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
7226
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
64617227 /*
64627228 		 * The root cgroup cannot be destroyed, so its refcount must
64637229 * always be >= 1.
....@@ -6485,11 +7251,15 @@
64857251 struct mem_cgroup *memcg, *swap_memcg;
64867252 unsigned int nr_entries;
64877253 unsigned short oldid;
7254
+ unsigned long flags;
64887255
64897256 VM_BUG_ON_PAGE(PageLRU(page), page);
64907257 VM_BUG_ON_PAGE(page_count(page), page);
64917258
6492
- if (!do_memsw_account())
7259
+ if (mem_cgroup_disabled())
7260
+ return;
7261
+
7262
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
64937263 return;
64947264
64957265 memcg = page->mem_cgroup;
....@@ -6504,7 +7274,7 @@
65047274 * ancestor for the swap instead and transfer the memory+swap charge.
65057275 */
65067276 swap_memcg = mem_cgroup_id_get_online(memcg);
6507
- nr_entries = hpage_nr_pages(page);
7277
+ nr_entries = thp_nr_pages(page);
65087278 /* Get references for the tail pages, too */
65097279 if (nr_entries > 1)
65107280 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
....@@ -6518,7 +7288,7 @@
65187288 if (!mem_cgroup_is_root(memcg))
65197289 page_counter_uncharge(&memcg->memory, nr_entries);
65207290
6521
- if (memcg != swap_memcg) {
7291
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
65227292 if (!mem_cgroup_is_root(swap_memcg))
65237293 page_counter_charge(&swap_memcg->memsw, nr_entries);
65247294 page_counter_uncharge(&memcg->memsw, nr_entries);
....@@ -6530,17 +7300,19 @@
65307300 * important here to have the interrupts disabled because it is the
65317301 * only synchronisation we have for updating the per-CPU variables.
65327302 */
7303
+ local_lock_irqsave(&event_lock.l, flags);
7304
+#ifndef CONFIG_PREEMPT_RT
65337305 VM_BUG_ON(!irqs_disabled());
6534
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
6535
- -nr_entries);
7306
+#endif
7307
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
65367308 memcg_check_events(memcg, page);
7309
+ local_unlock_irqrestore(&event_lock.l, flags);
65377310
6538
- if (!mem_cgroup_is_root(memcg))
6539
- css_put_many(&memcg->css, nr_entries);
7311
+ css_put(&memcg->css);
65407312 }
65417313
65427314 /**
6543
- * mem_cgroup_try_charge_swap - try charging swap space for a page
7315
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
65447316 * @page: page being added to swap
65457317 * @entry: swap entry to charge
65467318 *
....@@ -6548,14 +7320,14 @@
65487320 *
65497321 * Returns 0 on success, -ENOMEM on failure.
65507322 */
6551
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7323
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
65527324 {
6553
- unsigned int nr_pages = hpage_nr_pages(page);
7325
+ unsigned int nr_pages = thp_nr_pages(page);
65547326 struct page_counter *counter;
65557327 struct mem_cgroup *memcg;
65567328 unsigned short oldid;
65577329
6558
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
7330
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
65597331 return 0;
65607332
65617333 memcg = page->mem_cgroup;
....@@ -6571,7 +7343,7 @@
65717343
65727344 memcg = mem_cgroup_id_get_online(memcg);
65737345
6574
- if (!mem_cgroup_is_root(memcg) &&
7346
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
65757347 !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
65767348 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
65777349 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
....@@ -6590,23 +7362,20 @@
65907362 }
65917363
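/*
 * Illustrative sketch of the usual reason for the mem_cgroup_*_swap ->
 * __mem_cgroup_*_swap renames in this area: the cheap "is this even
 * enabled?" check moves into a static inline wrapper in a header, so
 * callers pay nothing when the memory controller is disabled. The wrapper
 * below is a sketch of that pattern, not the exact header change.
 */
static inline int mem_cgroup_try_charge_swap(struct page *page,
                                             swp_entry_t entry)
{
        if (mem_cgroup_disabled())
                return 0;
        return __mem_cgroup_try_charge_swap(page, entry);
}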
65927364 /**
6593
- * mem_cgroup_uncharge_swap - uncharge swap space
7365
+ * __mem_cgroup_uncharge_swap - uncharge swap space
65947366 * @entry: swap entry to uncharge
65957367 * @nr_pages: the amount of swap space to uncharge
65967368 */
6597
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7369
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
65987370 {
65997371 struct mem_cgroup *memcg;
66007372 unsigned short id;
6601
-
6602
- if (!do_swap_account)
6603
- return;
66047373
66057374 id = swap_cgroup_record(entry, 0, nr_pages);
66067375 rcu_read_lock();
66077376 memcg = mem_cgroup_from_id(id);
66087377 if (memcg) {
6609
- if (!mem_cgroup_is_root(memcg)) {
7378
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
66107379 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
66117380 page_counter_uncharge(&memcg->swap, nr_pages);
66127381 else
....@@ -6622,7 +7391,7 @@
66227391 {
66237392 long nr_swap_pages = get_nr_swap_pages();
66247393
6625
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7394
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66267395 return nr_swap_pages;
66277396 for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
66287397 nr_swap_pages = min_t(long, nr_swap_pages,
....@@ -6639,36 +7408,33 @@
66397408
66407409 if (vm_swap_full())
66417410 return true;
6642
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7411
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
66437412 return false;
66447413
66457414 memcg = page->mem_cgroup;
66467415 if (!memcg)
66477416 return false;
66487417
6649
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
6650
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
7418
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7419
+ unsigned long usage = page_counter_read(&memcg->swap);
7420
+
7421
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7422
+ usage * 2 >= READ_ONCE(memcg->swap.max))
66517423 return true;
7424
+ }
66527425
66537426 return false;
66547427 }
66557428
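/*
 * Illustrative sketch of the threshold logic added above: the per-cgroup
 * analogue of vm_swap_full(), swap is considered effectively full once
 * usage crosses half of either swap.high or swap.max. The helper name is
 * hypothetical.
 */
static bool demo_swap_nearly_full(unsigned long usage,
                                  unsigned long high, unsigned long max)
{
        /* usage * 2 >= limit  <=>  usage >= half the limit, rounded up */
        return usage * 2 >= high || usage * 2 >= max;
}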
6656
-/* for remember boot option*/
6657
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
6658
-static int really_do_swap_account __initdata = 1;
6659
-#else
6660
-static int really_do_swap_account __initdata;
6661
-#endif
6662
-
6663
-static int __init enable_swap_account(char *s)
7429
+static int __init setup_swap_account(char *s)
66647430 {
66657431 if (!strcmp(s, "1"))
6666
- really_do_swap_account = 1;
7432
+ cgroup_memory_noswap = 0;
66677433 else if (!strcmp(s, "0"))
6668
- really_do_swap_account = 0;
7434
+ cgroup_memory_noswap = 1;
66697435 return 1;
66707436 }
6671
-__setup("swapaccount=", enable_swap_account);
7437
+__setup("swapaccount=", setup_swap_account);
66727438
66737439 static u64 swap_current_read(struct cgroup_subsys_state *css,
66747440 struct cftype *cft)
....@@ -6678,17 +7444,33 @@
66787444 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
66797445 }
66807446
7447
+static int swap_high_show(struct seq_file *m, void *v)
7448
+{
7449
+ return seq_puts_memcg_tunable(m,
7450
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7451
+}
7452
+
7453
+static ssize_t swap_high_write(struct kernfs_open_file *of,
7454
+ char *buf, size_t nbytes, loff_t off)
7455
+{
7456
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7457
+ unsigned long high;
7458
+ int err;
7459
+
7460
+ buf = strstrip(buf);
7461
+ err = page_counter_memparse(buf, "max", &high);
7462
+ if (err)
7463
+ return err;
7464
+
7465
+ page_counter_set_high(&memcg->swap, high);
7466
+
7467
+ return nbytes;
7468
+}
7469
+
66817470 static int swap_max_show(struct seq_file *m, void *v)
66827471 {
6683
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6684
- unsigned long max = READ_ONCE(memcg->swap.max);
6685
-
6686
- if (max == PAGE_COUNTER_MAX)
6687
- seq_puts(m, "max\n");
6688
- else
6689
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6690
-
6691
- return 0;
7472
+ return seq_puts_memcg_tunable(m,
7473
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
66927474 }
66937475
66947476 static ssize_t swap_max_write(struct kernfs_open_file *of,
....@@ -6710,8 +7492,10 @@
67107492
67117493 static int swap_events_show(struct seq_file *m, void *v)
67127494 {
6713
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
7495
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
67147496
7497
+ seq_printf(m, "high %lu\n",
7498
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
67157499 seq_printf(m, "max %lu\n",
67167500 atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
67177501 seq_printf(m, "fail %lu\n",
....@@ -6725,6 +7509,12 @@
67257509 .name = "swap.current",
67267510 .flags = CFTYPE_NOT_ON_ROOT,
67277511 .read_u64 = swap_current_read,
7512
+ },
7513
+ {
7514
+ .name = "swap.high",
7515
+ .flags = CFTYPE_NOT_ON_ROOT,
7516
+ .seq_show = swap_high_show,
7517
+ .write = swap_high_write,
67287518 },
67297519 {
67307520 .name = "swap.max",
....@@ -6741,7 +7531,7 @@
67417531 { } /* terminate */
67427532 };
67437533
6744
-static struct cftype memsw_cgroup_files[] = {
7534
+static struct cftype memsw_files[] = {
67457535 {
67467536 .name = "memsw.usage_in_bytes",
67477537 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
....@@ -6768,17 +7558,27 @@
67687558 { }, /* terminate */
67697559 };
67707560
7561
+/*
7562
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7563
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
7564
+ * remains set to false even when memcg is disabled via the
7565
+ * "cgroup_disable=memory" boot parameter. This may result in a premature
7566
+ * oops inside the mem_cgroup_get_nr_swap_pages() function in corner cases.
7567
+ */
67717568 static int __init mem_cgroup_swap_init(void)
67727569 {
6773
- if (!mem_cgroup_disabled() && really_do_swap_account) {
6774
- do_swap_account = 1;
6775
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6776
- swap_files));
6777
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6778
- memsw_cgroup_files));
6779
- }
7570
+ /* No memory control -> no swap control */
7571
+ if (mem_cgroup_disabled())
7572
+ cgroup_memory_noswap = true;
7573
+
7574
+ if (cgroup_memory_noswap)
7575
+ return 0;
7576
+
7577
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7578
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7579
+
67807580 return 0;
67817581 }
6782
-subsys_initcall(mem_cgroup_swap_init);
7582
+core_initcall(mem_cgroup_swap_init);
67837583
67847584 #endif /* CONFIG_MEMCG_SWAP */
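/*
 * Illustrative sketch of the initcall ordering that the comment before
 * mem_cgroup_swap_init() relies on: core_initcall() (level 1) runs before
 * subsys_initcall() (level 4), so moving the registration earlier lets
 * cgroup_memory_noswap be settled before later init code can reach the
 * swap accounting helpers. Function names are hypothetical.
 */
#include <linux/init.h>

static int __init demo_core_setup(void)
{
        /* Runs at initcall level 1, before any subsys_initcall(). */
        return 0;
}
core_initcall(demo_core_setup);

static int __init demo_subsys_setup(void)
{
        /* Runs at level 4 and can rely on demo_core_setup() having run. */
        return 0;
}
subsys_initcall(demo_subsys_setup);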