From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/mm/memcontrol.c | 3521 ++++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 2,150 insertions(+), 1,371 deletions(-)
diff --git a/kernel/mm/memcontrol.c b/kernel/mm/memcontrol.c
index 921e450..b6f6bfc 100644
--- a/kernel/mm/memcontrol.c
+++ b/kernel/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -65,22 +57,26 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
-#include <linux/locallock.h>
#include <linux/uaccess.h>
#include <trace/events/vmscan.h>
+#include <trace/hooks/mm.h>
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL_GPL(root_mem_cgroup);
-#define MEM_CGROUP_RECLAIM_RETRIES 5
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -90,30 +86,23 @@
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
#else
-#define do_swap_account 0
+#define cgroup_memory_noswap 1
#endif
-static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
-
-static const char *const mem_cgroup_lru_names[] = {
- "inactive_anon",
- "active_anon",
- "inactive_file",
- "active_file",
- "unevictable",
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
-#define NUMAINFO_EVENTS_TARGET 1024
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -213,14 +202,6 @@
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-enum charge_type {
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_ANON,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
- NR_CHARGE_TYPE,
-};
-
/* for encoding cft->private value on file */
enum res_type {
_MEM,
@@ -251,7 +232,7 @@
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
@@ -271,8 +252,100 @@
}
#ifdef CONFIG_MEMCG_KMEM
+static DEFINE_SPINLOCK(objcg_lock);
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ struct mem_cgroup *memcg;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ /*
+ * At this point all allocated objects are freed, and
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+ *
+ * The following sequence can lead to it:
+ * 1) CPU0: objcg == stock->cached_objcg
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+ * PAGE_SIZE bytes are charged
+ * 3) CPU1: a process from another memcg is allocating something,
+ * the stock is flushed,
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
+ * 4) CPU0: we do release this object,
+ * 92 bytes are added to stock->nr_bytes
+ * 5) CPU0: stock is flushed,
+ * 92 bytes are added to objcg->nr_charged_bytes
+ *
+ * In the result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release().
+ */
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&objcg_lock, flags);
+ memcg = obj_cgroup_memcg(objcg);
+ if (nr_pages)
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ list_del(&objcg->list);
+ mem_cgroup_put(memcg);
+ spin_unlock_irqrestore(&objcg_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+ struct obj_cgroup *objcg;
+ int ret;
+
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
+ return NULL;
+
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(objcg);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&objcg->list);
+ return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct obj_cgroup *objcg, *iter;
+
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+ spin_lock_irq(&objcg_lock);
+
+ /* Move active objcg to the parent's list */
+ xchg(&objcg->memcg, parent);
+ css_get(&parent->css);
+ list_add(&objcg->list, &parent->objcg_list);
+
+ /* Move already reparented objcgs to the parent's list */
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
+ css_get(&parent->css);
+ xchg(&iter->memcg, parent);
+ css_put(&memcg->css);
+ }
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+ spin_unlock_irq(&objcg_lock);
+
+ percpu_ref_kill(&objcg->refcnt);
+}
+
/*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -315,14 +388,13 @@
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -347,7 +419,7 @@
if (!old)
return 0;
- new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+ new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
if (!new)
return -ENOMEM;
@@ -391,7 +463,7 @@
mutex_lock(&memcg_shrinker_map_mutex);
size = memcg_shrinker_map_size;
for_each_node(nid) {
- map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+ map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
if (!map) {
memcg_free_shrinker_maps(memcg);
ret = -ENOMEM;
@@ -448,14 +520,6 @@
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -498,7 +562,17 @@
unsigned long ino = 0;
rcu_read_lock();
- memcg = READ_ONCE(page->mem_cgroup);
+ memcg = page->mem_cgroup;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but an obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
+
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -674,7 +748,7 @@
*/
__mem_cgroup_remove_exceeded(mz, mctz);
if (!soft_limit_excess(mz->memcg) ||
- !css_tryget_online(&mz->memcg->css))
+ !css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -691,33 +765,186 @@
return mz;
}
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- return atomic_long_read(&memcg->events[event]);
+ long x, threshold = MEMCG_CHARGE_BATCH;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (memcg_stat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+ if (unlikely(abs(x) > threshold)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+ struct mem_cgroup *parent;
+
+ parent = parent_mem_cgroup(pn->memcg);
+ if (!parent)
+ return NULL;
+ return mem_cgroup_nodeinfo(parent, nid);
+}
+
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ long x, threshold = MEMCG_CHARGE_BATCH;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
+ /* Update memcg */
+ __mod_memcg_state(memcg, idx, val);
+
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > threshold)) {
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pi;
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ /* Update node */
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+ /* Update memcg and lruvec */
+ if (!mem_cgroup_disabled())
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL_GPL(__mod_lruvec_state);
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+ pg_data_t *pgdat = page_pgdat(virt_to_page(p));
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+
+ /*
+ * Untracked pages have no memcg, no lruvec. Update only the
+ * node. If we reparent the slab objects to the root memcg,
+ * when we free the slab object, we need to update the per-memcg
+ * vmstats to keep it correct for the root memcg.
+ */
+ if (!memcg) {
+ __mod_node_page_state(pgdat, idx, val);
+ } else {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_lruvec_state(lruvec, idx, val);
+ }
+ rcu_read_unlock();
+}
+
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
+ rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count)
+{
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+ return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool compound, int nr_pages)
+ int nr_pages)
{
- /*
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
- * counted as CACHE even if it's on ANON LRU.
- */
- if (PageAnon(page))
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
- else {
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
- if (PageSwapBacked(page))
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
- }
-
- if (compound) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
- }
-
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
@@ -726,35 +953,7 @@
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -762,8 +961,8 @@
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -773,13 +972,10 @@
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
- case MEM_CGROUP_TARGET_NUMAINFO:
- next = val + NUMAINFO_EVENTS_TARGET;
- break;
default:
break;
}
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
return true;
}
return false;
@@ -795,21 +991,12 @@
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- bool do_numainfo __maybe_unused;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
-#if MAX_NUMNODES > 1
- do_numainfo = mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_NUMAINFO);
-#endif
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
-#if MAX_NUMNODES > 1
- if (unlikely(do_numainfo))
- atomic_inc(&memcg->numainfo_events);
-#endif
}
}
@@ -877,27 +1064,60 @@
return NULL;
rcu_read_lock();
- if (!memcg || !css_tryget_online(&memcg->css))
+ /* The page must not be uncharged, nor its memcg freed, under us. */
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
rcu_read_unlock();
return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
+
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = active_memcg();
+ /* remote memcg must hold a ref. */
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+
/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
+ * If active memcg is set, do not fallback to current->mm->memcg.
*/
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
- if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg = root_mem_cgroup;
+ if (memcg_kmem_bypass())
+ return NULL;
- rcu_read_lock();
- if (css_tryget_online(&current->active_memcg->css))
- memcg = current->active_memcg;
- rcu_read_unlock();
- return memcg;
- }
+ if (unlikely(active_memcg()))
+ return get_active_memcg();
+
return get_mem_cgroup_from_mm(current->mm);
}
@@ -914,15 +1134,15 @@
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *iter;
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -948,7 +1168,7 @@
struct mem_cgroup_per_node *mz;
mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
- iter = &mz->iter[reclaim->priority];
+ iter = &mz->iter;
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
@@ -1048,15 +1268,11 @@
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
- int i;
for_each_node(nid) {
mz = mem_cgroup_nodeinfo(from, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ iter = &mz->iter;
+ cmpxchg(&iter->position, dead_memcg, NULL);
}
}
@@ -1106,7 +1322,7 @@
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, 0, &it);
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1123,9 +1339,8 @@
* @page: the page
* @pgdat: pgdat of the page
*
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
@@ -1134,7 +1349,7 @@
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
- lruvec = &pgdat->lruvec;
+ lruvec = &pgdat->__lruvec;
goto out;
}
@@ -1158,6 +1373,38 @@
lruvec->pgdat = pgdat;
return lruvec;
}
+
+struct lruvec *page_to_lruvec(struct page *page, pg_data_t *pgdat)
+{
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
+ return lruvec;
+}
+EXPORT_SYMBOL_GPL(page_to_lruvec);
+
+void do_traversal_all_lruvec(void)
+{
+ pg_data_t *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ struct mem_cgroup *memcg = NULL;
+
+ spin_lock_irq(&pgdat->lru_lock);
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ trace_android_vh_do_traversal_lruvec(lruvec);
+
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ } while (memcg);
+
+ spin_unlock_irq(&pgdat->lru_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(do_traversal_all_lruvec);
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
@@ -1197,32 +1444,7 @@
if (nr_pages > 0)
*lru_size += nr_pages;
}
-
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
+EXPORT_SYMBOL_GPL(mem_cgroup_update_lru_size);
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
@@ -1245,7 +1467,7 @@
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
@@ -1299,85 +1521,199 @@
return false;
}
-static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- NR_SHMEM,
- NR_FILE_MAPPED,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
- MEMCG_SWAP,
+struct memory_stat {
+ const char *name;
+ unsigned int ratio;
+ unsigned int idx;
};
-static const char *const memcg1_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "shmem",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio will be initialized in memory_stats_init(), because
+ * on some architectures the HPAGE_PMD_SIZE macro is not a
+ * constant (e.g. powerpc).
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: The slab_reclaimable and slab_unreclaimable must be
+ * together and slab_reclaimable must be in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
};
+
+static int __init memory_stats_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio);
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+ }
+
+ return 0;
+}
+pure_initcall(memory_stats_init);
+
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
+
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
+
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
+ memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
+ memcg_events(memcg, PGMAJFAULT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
+ memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
+ memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
+ memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
+ memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
+ memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct mem_cgroup *iter;
- unsigned int i;
-
rcu_read_lock();
+ if (memcg) {
+ pr_cont(",oom_memcg=");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ } else
+ pr_cont(",global_oom");
if (p) {
- pr_info("Task in ");
+ pr_cont(",task_memcg=");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
- } else {
- pr_info("Memory limit reached of cgroup ");
}
-
- pr_cont_cgroup_path(memcg->css.cgroup);
- pr_cont("\n");
-
rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+ char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.max), memcg->memory.failcnt);
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
-
- pr_cont("\n");
+ K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
}
/*
@@ -1385,19 +1721,26 @@
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max;
+ unsigned long max = READ_ONCE(memcg->memory.max);
- max = memcg->memory.max;
- if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_max;
- unsigned long swap_max;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
- memsw_max = memcg->memsw.max;
- swap_max = memcg->swap.max;
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
- max = min(max + swap_max, memsw_max);
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
}
return max;
+}
+
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
}
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -1410,112 +1753,24 @@
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
if (mutex_lock_killable(&oom_lock))
return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
-
-#if MAX_NUMNODES > 1
-
-/**
- * test_mem_cgroup_node_reclaimable
- * @memcg: the target memcg
- * @nid: the node ID to be checked.
- * @noswap : specify true here if the user wants flle only information.
- *
- * This function returns whether the specified memcg contains any
- * reclaimable pages on a node. Returns true if there are any reclaimable
- * pages in the node.
- */
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
- int nid, bool noswap)
-{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
- return true;
- if (noswap || !total_swap_pages)
- return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
- return true;
- return false;
-
-}
-
-/*
- * Always updating the nodemask is not very good - even if we have an empty
- * list or the wrong list here, we can start from some node and traverse all
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
- *
- */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
-{
- int nid;
- /*
- * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
- * pagein/pageout changes since the last update.
- */
- if (!atomic_read(&memcg->numainfo_events))
- return;
- if (atomic_inc_return(&memcg->numainfo_updating) > 1)
- return;
-
- /* make a nodemask where this memcg uses memory from */
- memcg->scan_nodes = node_states[N_MEMORY];
-
- for_each_node_mask(nid, node_states[N_MEMORY]) {
-
- if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
- node_clear(nid, memcg->scan_nodes);
- }
-
- atomic_set(&memcg->numainfo_events, 0);
- atomic_set(&memcg->numainfo_updating, 0);
-}
-
-/*
- * Selecting a node where we start reclaim from. Because what we need is just
- * reducing usage counter, start from anywhere is O,K. Considering
- * memory reclaim from current node, there are pros. and cons.
- *
- * Freeing memory from current node means freeing memory from a node which
- * we'll use or we've used. So, it may make LRU bad. And if several threads
- * hit limits, it will see a contention on a node. But freeing from remote
- * node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- int node;
-
- mem_cgroup_may_update_nodemask(memcg);
- node = memcg->last_scanned_node;
-
- node = next_node_in(node, memcg->scan_nodes);
- /*
- * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
- * last time it really checked all the LRUs due to rate limiting.
- * Fallback to the current node in that case for simplicity.
- */
- if (unlikely(node == MAX_NUMNODES))
- node = numa_node_id();
-
- memcg->last_scanned_node = node;
- return node;
-}
-#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
-{
- return 0;
-}
-#endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
@@ -1529,7 +1784,6 @@
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
- .priority = 0,
};
excess = soft_limit_excess(root_memcg);
@@ -1624,7 +1878,7 @@
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+ mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -1645,8 +1899,8 @@
struct mem_cgroup *iter;
/*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+ * Be careful about under_oom underflows because a child memcg
+ * could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
@@ -1706,6 +1960,8 @@
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+
+ memcg_memory_event(memcg, MEMCG_OOM);
/*
* We are in the middle of the charge context here, so we
@@ -1854,6 +2110,14 @@
goto out;
/*
+ * If the victim task has been asynchronously moved to a different
+ * memory cgroup, we might end up killing tasks outside oom_domain.
+ * In this case it's better to ignore memory.group.oom.
+ */
+ if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
+ goto out;
+
+ /*
* Traverse the memory cgroup hierarchy from the victim task's
* cgroup up to the OOMing cgroup (or root) to find the
* highest-level memory cgroup with oom.group set.
@@ -1894,6 +2158,7 @@
*/
struct mem_cgroup *lock_page_memcg(struct page *page)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1913,7 +2178,7 @@
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = page->mem_cgroup;
+ memcg = head->mem_cgroup;
if (unlikely(!memcg))
return NULL;
@@ -1921,7 +2186,7 @@
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != page->mem_cgroup) {
+ if (memcg != head->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -1964,19 +2229,43 @@
*/
void unlock_page_memcg(struct page *page)
{
- __unlock_page_memcg(page->mem_cgroup);
+ struct page *head = compound_head(page);
+
+ __unlock_page_memcg(head->mem_cgroup);
}
EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cached_objcg;
+ unsigned int nr_bytes;
+#endif
+
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ return false;
+}
+#endif
/**
* consume_stock: Try to consume stocked charge on this cpu.
@@ -2018,13 +2307,17 @@
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
- css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2040,6 +2333,7 @@
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
+ drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2060,6 +2354,7 @@
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -2087,34 +2382,37 @@
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu_light();
+ curcpu = get_cpu();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ if (obj_stock_flush_required(stock, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
- put_cpu_light();
+ put_cpu();
mutex_unlock(&percpu_charge_mutex);
}
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
@@ -2126,9 +2424,10 @@
int nid;
long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
if (x)
- atomic_long_add(x, &memcg->stat[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[i]); /* credit each ancestor once */
if (i >= NR_VM_NODE_STAT_ITEMS)
continue;
@@ -2139,32 +2438,48 @@
pn = mem_cgroup_nodeinfo(memcg, nid);
x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
if (x)
- atomic_long_add(x, &pn->lruvec_stat[i]);
+ do {
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
}
}
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
long x;
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
if (x)
- atomic_long_add(x, &memcg->events[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[i]); /* credit each ancestor once */
}
}
return 0;
}
-static void reclaim_high(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ unsigned long nr_reclaimed = 0;
+
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
+ unsigned long pflags;
+
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
continue;
+
memcg_memory_event(memcg, MEMCG_HIGH);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
- } while ((memcg = parent_mem_cgroup(memcg)));
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
@@ -2176,35 +2491,238 @@
}
/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+static u64 calculate_overage(unsigned long usage, unsigned long high)
+{
+ u64 overage;
+
+ if (usage <= high)
+ return 0;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
+
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
+
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
+
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
+ if (!max_overage)
+ return 0;
+
+ /*
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+ penalty_jiffies = max_overage * max_overage * HZ;
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge batch is than that.
+ */
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+}
+
+/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long penalty_jiffies;
+ unsigned long pflags;
+ unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
+ bool in_retry = false;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ */
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
+ enum oom_status oom_status;
unsigned long nr_reclaimed;
+ bool passed_oom = false;
bool may_swap = true;
bool drained = false;
- bool oomed = false;
- enum oom_status oom_status;
+ unsigned long pflags;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -2239,15 +2757,6 @@
goto force;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2264,8 +2773,10 @@
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
@@ -2299,16 +2810,15 @@
if (nr_retries--)
goto retry;
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
goto force;
- if (fatal_signal_pending(current))
- goto force;
-
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
@@ -2317,15 +2827,10 @@
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- oomed = true;
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
+ nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -2339,12 +2844,10 @@
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
return 0;
done_restock:
- css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
@@ -2358,12 +2861,32 @@
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > memcg->high) {
- /* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
schedule_work(&memcg->high_work);
break;
}
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
@@ -2373,6 +2896,7 @@
return 0;
}
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
@@ -2381,76 +2905,124 @@
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
}
+#endif
-static void lock_page_lru(struct page *page, int *isolated)
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- struct zone *zone = page_zone(page);
-
- spin_lock_irq(zone_lru_lock(zone));
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- *isolated = 1;
- } else
- *isolated = 0;
-}
-
-static void unlock_page_lru(struct page *page, int isolated)
-{
- struct zone *zone = page_zone(page);
-
- if (isolated) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(zone_lru_lock(zone));
-}
-
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
-{
- int isolated;
-
VM_BUG_ON_PAGE(page->mem_cgroup, page);
-
/*
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
- * may already be on some other mem_cgroup's LRU. Take care of it.
- */
- if (lrucare)
- lock_page_lru(page, &isolated);
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point:
+ * Any of the following ensures page->mem_cgroup stability:
*
- * - the page is uncharged
- *
- * - the page is off-LRU
- *
- * - an anonymous fault has exclusive page access, except for
- * a locked page table
- *
- * - a page cache insertion, a swapin fault, or a migration
- * have the page locked
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
*/
page->mem_cgroup = memcg;
-
- if (lrucare)
- unlock_page_lru(page, isolated);
}
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
+
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ gfp &= ~OBJCGS_CLEAR_MASK;
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct page *page;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ page = virt_to_head_page(p);
+
+ /*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
+ */
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
+
+ /* All other pages use page->mem_cgroup */
+ return page->mem_cgroup;
+}
+
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ rcu_read_lock();
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2476,9 +3048,7 @@
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- err = memcg_update_all_caches(size);
- if (!err)
- err = memcg_update_all_list_lrus(size);
+ err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;
@@ -2496,152 +3066,17 @@
ida_simple_remove(&memcg_cache_ida, id);
}
-struct memcg_kmem_cache_create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
- struct memcg_kmem_cache_create_work *cw =
- container_of(w, struct memcg_kmem_cache_create_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
-
- memcg_create_kmem_cache(memcg, cachep);
-
- css_put(&memcg->css);
- kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct memcg_kmem_cache_create_work *cw;
-
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw)
- return;
-
- css_get(&memcg->css);
-
- cw->memcg = memcg;
- cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
- queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
- return true;
- return false;
-}
-
/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
- struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
- int kmemcg_id;
-
- VM_BUG_ON(!is_root_cache(cachep));
-
- if (memcg_kmem_bypass())
- return cachep;
-
- if (current->memcg_kmem_skip_account)
- return cachep;
-
- memcg = get_mem_cgroup_from_current();
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
- if (kmemcg_id < 0)
- goto out;
-
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
- return memcg_cachep;
-
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
- */
- memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
- css_put(&memcg->css);
- return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
- if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
-}
-
-/**
- * memcg_kmem_charge_memcg: charge a kmem page
- * @page: page to charge
- * @gfp: reclaim mode
- * @order: allocation order
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg)
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages)
{
- unsigned int nr_pages = 1 << order;
struct page_counter *counter;
int ret;
@@ -2664,43 +3099,54 @@
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ refill_stock(memcg, nr_pages);
+}
+
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
- if (mem_cgroup_disabled() || memcg_kmem_bypass())
- return 0;
-
memcg = get_mem_cgroup_from_current();
- if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ if (memcg && !mem_cgroup_is_root(memcg)) {
+ ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ return 0;
+ }
+ css_put(&memcg->css);
}
- css_put(&memcg->css);
return ret;
}
+
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge_page(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2709,43 +3155,179 @@
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
+ __memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
+ css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
-
- css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_MEMCG_KMEM */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-/*
- * Because tail pages are not marked as "used", set it. We're under
- * zone_lru_lock and migration entries setup in all page mappings.
- */
-void mem_cgroup_split_huge_fixup(struct page *head)
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
- int i;
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
- if (mem_cgroup_disabled())
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->cached_objcg;
+
+ if (!old)
return;
- for (i = 1; i < HPAGE_PMD_NR; i++)
- head[i].mem_cgroup = head->mem_cgroup;
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(old);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ css_put(&memcg->css);
+ }
+
+ /*
+ * The leftover is flushed to the centralized per-memcg value.
+ * On the next attempt to refill obj stock it will be moved
+ * to a per-cpu stock (probably, on another CPU), see
+ * refill_obj_stock().
+ *
+ * How often it's flushed is a trade-off between the memory
+ * limit enforcement accuracy and potential CPU contention,
+ * so it might be changed in the future.
+ */
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
+ stock->nr_bytes = 0;
+ }
+
+ obj_cgroup_put(old);
+ stock->cached_objcg = NULL;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+
+ return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->cached_objcg = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > PAGE_SIZE)
+ drain_obj_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages, nr_bytes;
+ int ret;
+
+ if (consume_obj_stock(objcg, size))
+ return 0;
+
+ /*
+ * In theory, objcg->nr_charged_bytes can have enough
+ * pre-charged bytes to satisfy the allocation. However,
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big,
+ * so it's better to ignore it and try to grab some new pages.
+ * objcg->nr_charged_bytes will be flushed in
+ * refill_obj_stock(), called from this function or
+ * independently later.
+ */
+ rcu_read_lock();
+retry:
+ memcg = obj_cgroup_memcg(objcg);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
+ rcu_read_unlock();
+
+ nr_pages = size >> PAGE_SHIFT;
+ nr_bytes = size & (PAGE_SIZE - 1);
+
+ if (nr_bytes)
+ nr_pages += 1;
+
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ if (!ret && nr_bytes)
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+ css_put(&memcg->css);
+ return ret;
+}
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+ refill_obj_stock(objcg, size);
+}
+
+#endif /* CONFIG_MEMCG_KMEM */
+
+/*
+ * Because head->mem_cgroup is not set on tails, set it now.
+ */
+void split_page_memcg(struct page *head, unsigned int nr)
+{
+ struct mem_cgroup *memcg = head->mem_cgroup;
+ int kmemcg = PageKmemcg(head);
+ int i;
+
+ if (mem_cgroup_disabled() || !memcg)
+ return;
+
+ for (i = 1; i < nr; i++) {
+ head[i].mem_cgroup = memcg;
+ if (kmemcg)
+ __SetPageKmemcg(head + i);
+ }
+ css_get_many(&memcg->css, nr - 1);
+}
#ifdef CONFIG_MEMCG_SWAP
/**
@@ -2807,7 +3389,7 @@
* Make sure that the new limit (memsw or memory limit) doesn't
* break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? max >= memcg->memory.max :
+ limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
max <= memcg->memsw.max;
if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
@@ -2928,7 +3510,7 @@
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
* returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
@@ -2947,7 +3529,7 @@
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -3021,50 +3603,15 @@
return retval;
}
-struct accumulated_stats {
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
- unsigned long lru_pages[NR_LRU_LISTS];
- const unsigned int *stats_array;
- const unsigned int *events_array;
- int stats_size;
- int events_size;
-};
-
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
- struct accumulated_stats *acc)
-{
- struct mem_cgroup *mi;
- int i;
-
- for_each_mem_cgroup_tree(mi, memcg) {
- for (i = 0; i < acc->stats_size; i++)
- acc->stat[i] += memcg_page_state(mi,
- acc->stats_array ? acc->stats_array[i] : i);
-
- for (i = 0; i < acc->events_size; i++)
- acc->events[i] += memcg_sum_events(mi,
- acc->events_array ? acc->events_array[i] : i);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
- }
-}
-
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val = 0;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
- if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
- }
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
+ memcg_page_state(memcg, NR_ANON_MAPPED);
+ if (swap)
+ val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3125,9 +3672,61 @@
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+ unsigned long stat[MEMCG_NR_STAT] = {0};
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
+ struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
@@ -3140,7 +3739,16 @@
if (memcg_id < 0)
return memcg_id;
- static_branch_inc(&memcg_kmem_enabled_key);
+ objcg = obj_cgroup_alloc();
+ if (!objcg) {
+ memcg_free_cache_id(memcg_id);
+ return -ENOMEM;
+ }
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ static_branch_enable(&memcg_kmem_enabled_key);
+
/*
* A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
@@ -3149,7 +3757,6 @@
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
- INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -3162,22 +3769,17 @@
if (memcg->kmem_state != KMEM_ONLINE)
return;
- /*
- * Clear the online state before clearing memcg_caches array
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
- * guarantees that no cache will be created for this cgroup
- * after we are done (see memcg_create_kmem_cache()).
- */
+
memcg->kmem_state = KMEM_ALLOCATED;
-
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
+
+ memcg_reparent_objcgs(memcg, parent);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
/*
* Change kmemcg_id of this cgroup and all its descendants to the
@@ -3207,12 +3809,6 @@
/* css_alloc() failed, offlining didn't happen */
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
memcg_offline_kmem(memcg);
-
- if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
- static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
- }
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3303,6 +3899,9 @@
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
@@ -3367,6 +3966,10 @@
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -3388,6 +3991,49 @@
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask, bool tree)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask,
+ bool tree)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3403,40 +4049,60 @@
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}
return 0;
}
#endif /* CONFIG_NUMA */
+
+static const unsigned int memcg1_stats[] = {
+ NR_FILE_PAGES,
+ NR_ANON_MAPPED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ NR_ANON_THPS,
+#endif
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "rss_huge",
+#endif
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
@@ -3446,45 +4112,42 @@
PGMAJFAULT,
};
-static const char *const memcg1_event_names[] = {
- "pgpgin",
- "pgpgout",
- "pgfault",
- "pgmajfault",
-};
-
static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
- struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
- PAGE_SIZE);
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", memcg1_event_names[i],
- memcg_sum_events(memcg, memcg1_events[i]));
+ seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ seq_printf(m, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.max);
- memsw = min(memsw, mi->memsw.max);
+ memory = min(memory, READ_ONCE(mi->memory.max));
+ memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -3492,49 +4155,45 @@
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
- acc.stats_array = memcg1_stats;
- acc.events_size = ARRAY_SIZE(memcg1_events);
- acc.events_array = memcg1_events;
- accumulate_memcg_tree(memcg, &acc);
-
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
+ nr = memcg_page_state(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)acc.stat[i] * PAGE_SIZE);
+ (u64)nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)acc.events[i]);
+ seq_printf(m, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
+ seq_printf(m, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
- struct zone_reclaim_stat *rstat;
- unsigned long recent_rotated[2] = {0, 0};
- unsigned long recent_scanned[2] = {0, 0};
+ unsigned long anon_cost = 0;
+ unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
+ anon_cost += mz->lruvec.anon_cost;
+ file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
+ seq_printf(m, "file_cost %lu\n", file_cost);
}
#endif
@@ -3554,7 +4213,7 @@
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val > 100)
+ if (val > 200)
return -EINVAL;
if (css->parent)
@@ -3693,8 +4352,7 @@
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -3702,17 +4360,16 @@
new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds->primary) {
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
- sizeof(struct mem_cgroup_threshold));
- }
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
@@ -3894,7 +4551,7 @@
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -3920,6 +4577,8 @@
}
#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <trace/events/writeback.h>
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
@@ -3952,11 +4611,11 @@
*/
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = atomic_long_read(&memcg->stat[idx]);
+ long x = atomic_long_read(&memcg->vmstats[idx]);
int cpu;
for_each_online_cpu(cpu)
- x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
if (x < 0)
x = 0;
return x;
@@ -3989,18 +4648,142 @@
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
+ unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
+ READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
+ }
+}
+
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * tracks ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repeatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underlying IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such a mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
}
}
@@ -4123,6 +4906,7 @@
unsigned int efd, cfd;
struct fd efile;
struct fd cfile;
+ struct dentry *cdentry;
const char *name;
char *endp;
int ret;
@@ -4174,6 +4958,16 @@
goto out_put_cfile;
/*
+ * The control file must be a regular cgroup1 file. As a regular cgroup
+ * file can't be renamed, it's safe to access its name afterwards.
+ */
+ cdentry = cfile.file->f_path.dentry;
+ if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
* Determine the event callbacks and set them in @event. This used
* to be done via struct cftype but cgroup core no longer knows
* about these events. The following is crude but the whole thing
@@ -4181,7 +4975,7 @@
*
* DO NOT ADD NEW FILES.
*/
- name = cfile.file->f_path.dentry->d_name.name;
+ name = cdentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
@@ -4205,7 +4999,7 @@
* automatically removed on cgroup destruction but the removal is
* asynchronous, so take an extra ref on @css.
*/
- cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
+ cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
@@ -4340,12 +5134,10 @@
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#if defined(CONFIG_MEMCG_KMEM) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_start = memcg_slab_start,
- .seq_next = memcg_slab_next,
- .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -4383,7 +5175,7 @@
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
* memory-controlled cgroups to 64k.
*
- * However, there usually are many references to the oflline CSS after
+ * However, there usually are many references to the offline CSS after
* the cgroup has been destroyed, such as page cache or reclaimable
* slab objects, that don't need to hang on to the ID. We want to keep
* those dead CSS from occupying IDs, or we might quickly exhaust the
@@ -4404,31 +5196,26 @@
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
+ trace_android_vh_mem_cgroup_id_remove(memcg);
idr_remove(&mem_cgroup_idr, memcg->id.id);
memcg->id.id = 0;
}
}
-static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
+static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
+ unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
}
-}
-
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
- mem_cgroup_id_get_many(memcg, 1);
}
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
@@ -4447,6 +5234,7 @@
WARN_ON_ONCE(!rcu_read_lock_held());
return idr_find(&mem_cgroup_idr, id);
}
+EXPORT_SYMBOL_GPL(mem_cgroup_from_id);
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
@@ -4466,8 +5254,17 @@
if (!pn)
return 1;
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
+ if (!pn->lruvec_stat_local) {
+ kfree(pn);
+ return 1;
+ }
+
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
return 1;
}
@@ -4489,6 +5286,7 @@
return;
free_percpu(pn->lruvec_stat_cpu);
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
}
@@ -4496,39 +5294,57 @@
{
int node;
+ trace_android_vh_mem_cgroup_free(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat_cpu);
+ free_percpu(memcg->vmstats_percpu);
+ free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ unsigned int size;
int node;
+ int __maybe_unused i;
+ long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return NULL;
+ return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
- if (memcg->id.id < 0)
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
+ goto fail;
+ }
+
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!memcg->vmstats_local)
goto fail;
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat_cpu)
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!memcg->vmstats_percpu)
goto fail;
for_each_node(node)
@@ -4539,7 +5355,6 @@
goto fail;
INIT_WORK(&memcg->high_work, high_work_func);
- memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
@@ -4549,48 +5364,64 @@
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->objcg_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ trace_android_vh_mem_cgroup_alloc(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
- return NULL;
+ return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
long error = -ENOMEM;
+ old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- if (!memcg)
- return ERR_PTR(error);
+ set_active_memcg(old_memcg);
+ if (IS_ERR(memcg))
+ return ERR_CAST(memcg);
- memcg->high = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
}
- if (parent && parent->use_hierarchy) {
+ if (!parent) {
+ page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->swap, NULL);
+ page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
+ } else if (parent->use_hierarchy) {
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
- page_counter_init(&memcg->memory, NULL);
- page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcpmem, NULL);
+ page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
+ page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
+ page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+ page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4617,7 +5448,7 @@
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
@@ -4635,8 +5466,9 @@
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
+ trace_android_vh_mem_cgroup_css_online(css, memcg);
return 0;
}
@@ -4645,6 +5477,7 @@
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
+ trace_android_vh_mem_cgroup_css_offline(css, memcg);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
@@ -4663,6 +5496,8 @@
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
@@ -4676,7 +5511,12 @@
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
@@ -4710,13 +5550,13 @@
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- memcg->high = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
@@ -4759,7 +5599,7 @@
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
+ struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
@@ -4810,8 +5650,7 @@
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
- if (do_memsw_account())
- entry->val = ent.val;
+ entry->val = ent.val;
return page;
}
@@ -4826,36 +5665,15 @@
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- struct page *page = NULL;
- struct address_space *mapping;
- pgoff_t pgoff;
-
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- mapping = vma->vm_file->f_mapping;
- pgoff = linear_page_index(vma, addr);
-
/* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- if (do_memsw_account())
- *entry = swp;
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
- return page;
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
}
/**
@@ -4875,10 +5693,10 @@
struct mem_cgroup *from,
struct mem_cgroup *to)
{
- unsigned long flags;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
int ret;
- bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4896,52 +5714,83 @@
if (page->mem_cgroup != from)
goto out_unlock;
- anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(from, pgdat);
+ to_vec = mem_cgroup_lruvec(to, pgdat);
- spin_lock_irqsave(&from->move_lock, flags);
+ lock_page_memcg(page);
- if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
- }
+ if (PageAnon(page)) {
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
+ if (PageTransHuge(page)) {
+ __dec_lruvec_state(from_vec, NR_ANON_THPS);
+ __inc_lruvec_state(to_vec, NR_ANON_THPS);
+ }
- /*
- * move_lock grabbed above and caller set from->moving_account, so
- * mod_memcg_page_state will serialize updates to PageDirty.
- * So mapping should be stable for dirty pages.
- */
- if (!anon && PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ }
+ } else {
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
- if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
+ }
+
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
+ }
+
+ if (PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_can_writeback(mapping)) {
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
+ nr_pages);
+ }
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
/*
+ * All state has been migrated, let's switch to the new memcg.
+ *
* It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
*/
+ smp_mb();
- /* caller should have done css_get */
+ css_get(&to->css);
+ css_put(&from->css);
+
page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
+
+ __unlock_page_memcg(from);
ret = 0;
- local_lock_irq(event_lock);
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
- local_unlock_irq(event_lock);
+ local_irq_enable();
out_unlock:
unlock_page(page);
out:
@@ -4963,8 +5812,8 @@
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -4998,8 +5847,7 @@
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_public_page(page))
+ if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -5070,8 +5918,8 @@
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
- * MEMORY_DEVICE_PRIVATE but this might change.
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
@@ -5091,18 +5939,17 @@
return 0;
}
+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
- down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
- up_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ mmap_read_unlock(mm);
precharge = mc.precharge;
mc.precharge = 0;
@@ -5152,8 +5999,6 @@
*/
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
-
- css_put_many(&mc.to->css, mc.moved_swap);
mc.moved_swap = 0;
}
@@ -5315,7 +6160,7 @@
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_DEVICE:
device = true;
- /* fall through */
+ fallthrough;
case MC_TARGET_PAGE:
page = target.page;
/*
@@ -5370,13 +6215,12 @@
return ret;
}
+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
@@ -5386,9 +6230,9 @@
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
retry:
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
/*
- * Someone who are holding the mmap_sem might be waiting in
+ * Someone who is holding the mmap_lock might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
@@ -5402,9 +6246,10 @@
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
- up_read(&mc.mm->mmap_sem);
+ mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -5446,6 +6291,16 @@
root_mem_cgroup->use_hierarchy = false;
}
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5456,15 +6311,8 @@
static int memory_min_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long min = READ_ONCE(memcg->memory.min);
-
- if (min == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5486,15 +6334,8 @@
static int memory_low_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->memory.low);
-
- if (low == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5516,22 +6357,16 @@
static int memory_high_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
-
- if (high == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long high;
int err;
@@ -5540,12 +6375,30 @@
if (err)
return err;
- memcg->high = high;
+ page_counter_set_high(&memcg->memory, high);
- nr_pages = page_counter_read(&memcg->memory);
- if (nr_pages > high)
- try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long reclaimed;
+
+ if (nr_pages <= high)
+ break;
+
+ if (signal_pending(current))
+ break;
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ break;
+ }
memcg_wb_domain_size_changed(memcg);
return nbytes;
@@ -5553,22 +6406,15 @@
static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
@@ -5586,10 +6432,8 @@
if (nr_pages <= max)
break;
- if (signal_pending(current)) {
- err = -EINTR;
+ if (signal_pending(current))
break;
- }
if (!drained) {
drain_all_stock(memcg);
@@ -5613,104 +6457,77 @@
return nbytes;
}
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "low %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
- seq_printf(m, "high %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
- seq_printf(m, "max %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
- seq_printf(m, "oom %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+ __memory_events_show(m, memcg->memory_events);
+ return 0;
+}
+static int memory_events_local_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ __memory_events_show(m, memcg->memory_events_local);
return 0;
}
static int memory_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- struct accumulated_stats acc;
- int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
- /*
- * Provide statistics on the state of the memory subsystem as
- * well as cumulative event counters that show past behavior.
- *
- * This list is ordered following a combination of these gradients:
- * 1) generic big picture -> specifics and details
- * 2) reflecting userspace activity -> reflecting kernel heuristics
- *
- * Current memory state:
- */
-
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = MEMCG_NR_STAT;
- acc.events_size = NR_VM_EVENT_ITEMS;
- accumulate_memcg_tree(memcg, &acc);
-
- seq_printf(m, "anon %llu\n",
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
- seq_printf(m, "file %llu\n",
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
- seq_printf(m, "kernel_stack %llu\n",
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
- seq_printf(m, "slab %llu\n",
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
- seq_printf(m, "sock %llu\n",
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
-
- seq_printf(m, "shmem %llu\n",
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
- seq_printf(m, "file_mapped %llu\n",
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
- seq_printf(m, "file_dirty %llu\n",
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
- seq_printf(m, "file_writeback %llu\n",
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
-
- seq_printf(m, "slab_reclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
- seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
-
- /* Accumulated memory events */
-
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
-
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
- acc.events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
- acc.events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
-
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
return 0;
}
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name);
+ for_each_node_state(nid, N_MEMORY) {
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif
+
static int memory_oom_group_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
@@ -5776,10 +6593,21 @@
.seq_show = memory_events_show,
},
{
- .name = "stat",
+ .name = "events.local",
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
+ .name = "stat",
.seq_show = memory_stat_show,
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
@@ -5805,6 +6633,122 @@
.early_init = 0,
};
+/*
+ * This function calculates an individual cgroup's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ * the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ * subsequent levels the effective protection is capped to the
+ * parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ * user is allowed to overcommit the declared protection at a given
+ * level. If that is the case, the parent's effective protection is
+ * distributed to the children in proportion to how much protection
+ * they have declared and how much of it they are utilizing.
+ *
+ * This makes distribution proportional, but also work-conserving:
+ * if one cgroup claims much more protection than it uses memory,
+ * the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ * given level, the distribution of the larger parental protection
+ * budget is NOT proportional. A cgroup's protection from a sibling
+ * is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ * without having to declare each individual cgroup's fixed share
+ * of the ancestor's claim to protection, any unutilized -
+ * "floating" - protection from up the tree is distributed in
+ * proportion to each cgroup's *usage*. This makes the protection
+ * neutral wrt sibling cgroups and lets them compete freely over
+ * the shared parental protection budget, but it protects the
+ * subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+ unsigned long parent_usage,
+ unsigned long setting,
+ unsigned long parent_effective,
+ unsigned long siblings_protected)
+{
+ unsigned long protected;
+ unsigned long ep;
+
+ protected = min(usage, setting);
+ /*
+ * If all cgroups at this level combined claim and use more
+ * protection than what the parent affords them, distribute
+ * shares in proportion to utilization.
+ *
+ * We are using actual utilization rather than the statically
+ * claimed protection in order to be work-conserving: claimed
+ * but unused protection is available to siblings that would
+ * otherwise get a smaller chunk than what they claimed.
+ */
+ if (siblings_protected > parent_effective)
+ return protected * parent_effective / siblings_protected;
+
+ /*
+ * Ok, utilized protection of all children is within what the
+ * parent affords them, so we know whatever this child claims
+ * and utilizes is effectively protected.
+ *
+ * If there is unprotected usage beyond this value, reclaim
+ * will apply pressure in proportion to that amount.
+ *
+ * If there is unutilized protection, the cgroup will be fully
+ * shielded from reclaim, but we do return a smaller value for
+ * protection than what the group could enjoy in theory. This
+ * is okay. With the overcommit distribution above, effective
+ * protection is always dependent on how memory is actually
+ * consumed among the siblings anyway.
+ */
+ ep = protected;
+
+ /*
+ * If the children aren't claiming (all of) the protection
+ * afforded to them by the parent, distribute the remainder in
+ * proportion to the (unprotected) memory of each cgroup. That
+ * way, cgroups that aren't explicitly prioritized wrt each
+ * other compete freely over the allowance, but they are
+ * collectively protected from neighboring trees.
+ *
+ * We're using unprotected memory for the weight so that if
+ * some cgroups DO claim explicit protection, we don't protect
+ * the same bytes twice.
+ *
+ * Check both usage and parent_usage against the respective
+ * protected values. One should imply the other, but they
+ * aren't read atomically - make sure the division is sane.
+ */
+ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
+ return ep;
+ if (parent_effective > siblings_protected &&
+ parent_usage > siblings_protected &&
+ usage > protected) {
+ unsigned long unclaimed;
+
+ unclaimed = parent_effective - siblings_protected;
+ unclaimed *= usage - protected;
+ unclaimed /= parent_usage - siblings_protected;
+
+ ep += unclaimed;
+ }
+
+ return ep;
+}
+
/**
* mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
@@ -5812,259 +6756,125 @@
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- * MEMCG_PROT_NONE: cgroup memory is not protected
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
- * an unprotected supply of reclaimable memory from other cgroups.
- * MEMCG_PROT_MIN: cgroup memory is protected
- *
- * @root is exclusive; it is never protected when looked at directly
- *
- * To provide a proper hierarchical behavior, effective memory.min/low values
- * are used. Below is the description of how effective memory.low is calculated.
- * Effective memory.min values is calculated in the same way.
- *
- * Effective memory.low is always equal or less than the original memory.low.
- * If there is no memory.low overcommittment (which is always true for
- * top-level memory cgroups), these two values are equal.
- * Otherwise, it's a part of parent's effective memory.low,
- * calculated as a cgroup's memory.low usage divided by sum of sibling's
- * memory.low usages, where memory.low usage is the size of actually
- * protected memory.
- *
- * low_usage
- * elow = min( memory.low, parent->elow * ------------------ ),
- * siblings_low_usage
- *
- * | memory.current, if memory.current < memory.low
- * low_usage = |
- | 0, otherwise.
- *
- *
- * Such definition of the effective memory.low provides the expected
- * hierarchical behavior: parent's memory.low value is limiting
- * children, unprotected memory is reclaimed first and cgroups,
- * which are not using their guarantee do not affect actual memory
- * distribution.
- *
- * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
- *
- * A A/memory.low = 2G, A/memory.current = 6G
- * //\\
- * BC DE B/memory.low = 3G B/memory.current = 2G
- * C/memory.low = 1G C/memory.current = 2G
- * D/memory.low = 0 D/memory.current = 2G
- * E/memory.low = 10G E/memory.current = 0
- *
- * and the memory pressure is applied, the following memory distribution
- * is expected (approximately):
- *
- * A/memory.current = 2G
- *
- * B/memory.current = 1.3G
- * C/memory.current = 0.6G
- * D/memory.current = 0
- * E/memory.current = 0
- *
- * These calculations require constant tracking of the actual low usages
- * (see propagate_protected_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_protected()
- * path for each memory cgroup top-down from the reclaim,
- * it's possible to optimize this part, and save calculated elow
- * for next usage. This part is intentionally racy, but it's ok,
- * as memory.low is a best-effort mechanism.
*/
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
+ unsigned long usage, parent_usage;
struct mem_cgroup *parent;
- unsigned long emin, parent_emin;
- unsigned long elow, parent_elow;
- unsigned long usage;
if (mem_cgroup_disabled())
- return MEMCG_PROT_NONE;
+ return;
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
- return MEMCG_PROT_NONE;
+ return;
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
-
- emin = memcg->memory.min;
- elow = memcg->memory.low;
+ return;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
- return MEMCG_PROT_NONE;
+ return;
- if (parent == root)
- goto exit;
-
- parent_emin = READ_ONCE(parent->memory.emin);
- emin = min(emin, parent_emin);
- if (emin && parent_emin) {
- unsigned long min_usage, siblings_min_usage;
-
- min_usage = min(usage, memcg->memory.min);
- siblings_min_usage = atomic_long_read(
- &parent->memory.children_min_usage);
-
- if (min_usage && siblings_min_usage)
- emin = min(emin, parent_emin * min_usage /
- siblings_min_usage);
+ if (parent == root) {
+ memcg->memory.emin = READ_ONCE(memcg->memory.min);
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
+ return;
}
- parent_elow = READ_ONCE(parent->memory.elow);
- elow = min(elow, parent_elow);
- if (elow && parent_elow) {
- unsigned long low_usage, siblings_low_usage;
+ parent_usage = page_counter_read(&parent->memory);
- low_usage = min(usage, memcg->memory.low);
- siblings_low_usage = atomic_long_read(
- &parent->memory.children_low_usage);
+ WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
+ READ_ONCE(memcg->memory.min),
+ READ_ONCE(parent->memory.emin),
+ atomic_long_read(&parent->memory.children_min_usage)));
- if (low_usage && siblings_low_usage)
- elow = min(elow, parent_elow * low_usage /
- siblings_low_usage);
- }
-
-exit:
- memcg->memory.emin = emin;
- memcg->memory.elow = elow;
-
- if (usage <= emin)
- return MEMCG_PROT_MIN;
- else if (usage <= elow)
- return MEMCG_PROT_LOW;
- else
- return MEMCG_PROT_NONE;
+ WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
+ READ_ONCE(memcg->memory.low),
+ READ_ONCE(parent->memory.elow),
+ atomic_long_read(&parent->memory.children_low_usage)));
}
/**
- * mem_cgroup_try_charge - try charging a page
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
- * @memcgp: charged memcg return
- * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
- * Otherwise, an error code is returned.
- *
- * After page->mapping has been set up, the caller must finalize the
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ * Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
+ unsigned int nr_pages = thp_nr_pages(page);
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
- if (mem_cgroup_disabled())
- goto out;
-
if (PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id;
+
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
+ * already charged pages, too. page->mem_cgroup is protected
+ * by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
goto out;
- if (do_swap_account) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id = lookup_swap_cgroup_id(ent);
-
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
}
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
+ if (ret)
+ goto out_put;
- css_put(&memcg->css);
-out:
- *memcgp = memcg;
- return ret;
-}
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
-{
- struct mem_cgroup *memcg;
- int ret;
-
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
- memcg = *memcgp;
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
- return ret;
-}
-
-/**
- * mem_cgroup_commit_charge - commit a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @lrucare: page might be on LRU already
- * @compound: charge the page as compound or small page
- *
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
- * after page->mapping has been set up. This must happen atomically
- * as part of the page instantiation, i.e. under the page table lock
- * for anonymous pages, under the page lock for page and swap cache.
- *
- * In addition, the page must not be on the LRU during the commit, to
- * prevent racing with task migration. If it might be, use @lrucare.
- *
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
- */
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- VM_BUG_ON_PAGE(!page->mapping, page);
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- commit_charge(page, memcg, lrucare);
-
- local_lock_irq(event_lock);
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
- local_unlock_irq(event_lock);
+ local_irq_enable();
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
@@ -6074,42 +6884,18 @@
*/
mem_cgroup_uncharge_swap(entry, nr_pages);
}
-}
-/**
- * mem_cgroup_cancel_charge - cancel a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @compound: charge the page as compound or small page
- *
- * Cancel a charge transaction started by mem_cgroup_try_charge().
- */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- cancel_charge(memcg, nr_pages);
+out_put:
+ css_put(&memcg->css);
+out:
+ return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
+ unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_kmem;
- unsigned long nr_huge;
- unsigned long nr_shmem;
struct page *dummy_page;
};
@@ -6120,37 +6906,32 @@
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
- local_lock_irqsave(event_lock, flags);
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
+ local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
- local_unlock_irqrestore(event_lock, flags);
+ local_irq_restore(flags);
- if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, nr_pages);
+ /* drop reference from uncharge_page */
+ css_put(&ug->memcg->css);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
+ unsigned long nr_pages;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
- !PageHWPoison(page) , page);
if (!page->mem_cgroup)
return;
@@ -6167,30 +6948,24 @@
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
+
+ /* pairs with css_put in uncharge_batch */
+ css_get(&ug->memcg->css);
}
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
+ nr_pages = compound_nr(page);
+ ug->nr_pages += nr_pages;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- ug->nr_huge += nr_pages;
- }
- if (PageAnon(page))
- ug->nr_anon += nr_pages;
- else {
- ug->nr_file += nr_pages;
- if (PageSwapBacked(page))
- ug->nr_shmem += nr_pages;
- }
+ if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
ug->dummy_page = page;
page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
}
static void uncharge_list(struct list_head *page_list)
@@ -6219,18 +6994,14 @@
}
/**
- * mem_cgroup_uncharge - uncharge a page
+ * __mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
- * mem_cgroup_commit_charge().
+ * Uncharge a page previously charged with __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct page *page)
{
struct uncharge_gather ug;
-
- if (mem_cgroup_disabled())
- return;
/* Don't touch page->lru of any random page, pre-check: */
if (!page->mem_cgroup)
@@ -6242,17 +7013,14 @@
}
/**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of pages
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ * __mem_cgroup_charge().
*/
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
- if (mem_cgroup_disabled())
- return;
-
if (!list_empty(page_list))
uncharge_list(page_list);
}
@@ -6271,7 +7039,6 @@
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
- bool compound;
unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6293,20 +7060,19 @@
return;
/* Force-charge the new page. The old one will be freed soon */
- compound = PageTransHuge(newpage);
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+ nr_pages = thp_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, false);
+ css_get(&memcg->css);
+ commit_charge(newpage, memcg);
- local_lock_irqsave(event_lock, flags);
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ local_irq_save(flags);
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
memcg_check_events(memcg, newpage);
- local_unlock_irqrestore(event_lock, flags);
+ local_irq_restore(flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -6329,7 +7095,7 @@
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
- if (css_tryget_online(&memcg->css))
+ if (css_tryget(&memcg->css))
sk->sk_memcg = memcg;
out:
rcu_read_unlock();
@@ -6407,7 +7173,7 @@
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
}
- return 0;
+ return 1;
}
__setup("cgroup.memory=", cgroup_memory);
@@ -6422,17 +7188,6 @@
static int __init mem_cgroup_init(void)
{
int cpu, node;
-
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Kmem cache creation is mostly done with the slab_mutex held,
- * so use a workqueue with limited concurrency to avoid stalling
- * all worker threads in case lots of cgroups are created and
- * destroyed simultaneously.
- */
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
- BUG_ON(!memcg_kmem_cache_wq);
-#endif
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
@@ -6460,7 +7215,7 @@
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
@@ -6488,12 +7243,14 @@
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
- unsigned long flags;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_memsw_account())
+ if (mem_cgroup_disabled())
+ return;
+
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
@@ -6508,7 +7265,7 @@
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = hpage_nr_pages(page);
+ nr_entries = thp_nr_pages(page);
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
@@ -6522,7 +7279,7 @@
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (memcg != swap_memcg) {
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -6534,21 +7291,15 @@
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
- local_lock_irqsave(event_lock, flags);
-#ifndef CONFIG_PREEMPT_RT_BASE
VM_BUG_ON(!irqs_disabled());
-#endif
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
- -nr_entries);
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
- local_unlock_irqrestore(event_lock, flags);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_entries);
+ css_put(&memcg->css);
}
/**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
@@ -6556,14 +7307,14 @@
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
- unsigned int nr_pages = hpage_nr_pages(page);
+ unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
@@ -6579,7 +7330,7 @@
memcg = mem_cgroup_id_get_online(memcg);
- if (!mem_cgroup_is_root(memcg) &&
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -6598,23 +7349,20 @@
}
/**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
-
- if (!do_swap_account)
- return;
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg)) {
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, nr_pages);
else
@@ -6630,7 +7378,7 @@
{
long nr_swap_pages = get_nr_swap_pages();
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
@@ -6647,36 +7395,33 @@
if (vm_swap_full())
return true;
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
memcg = page->mem_cgroup;
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
return true;
+ }
return false;
}
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
{
if (!strcmp(s, "1"))
- really_do_swap_account = 1;
+ cgroup_memory_noswap = 0;
else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
+ cgroup_memory_noswap = 1;
return 1;
}
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -6686,17 +7431,33 @@
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
static int swap_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6718,8 +7479,10 @@
static int swap_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
seq_printf(m, "fail %lu\n",
@@ -6733,6 +7496,12 @@
.name = "swap.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
},
{
.name = "swap.max",
@@ -6749,7 +7518,7 @@
{ } /* terminate */
};
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -6776,17 +7545,27 @@
{ }, /* terminate */
};
+/*
+ * If mem_cgroup_swap_init() were implemented as a subsys_initcall()
+ * instead of a core_initcall(), cgroup_memory_noswap could still be
+ * false even when memcg is disabled via the "cgroup_disable=memory"
+ * boot parameter. In corner cases this may result in a premature oops
+ * inside mem_cgroup_get_nr_swap_pages().
+ */
static int __init mem_cgroup_swap_init(void)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
- swap_files));
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
- }
+ /* No memory control -> no swap control */
+ if (mem_cgroup_disabled())
+ cgroup_memory_noswap = true;
+
+ if (cgroup_memory_noswap)
+ return 0;
+
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
return 0;
}
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_MEMCG_SWAP */
--
Gitblit v1.6.2