From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
Squashed backport of upstream SLUB updates: KFENCE hooks in the
allocation and free paths, the slub_debug static key and multi-block
option parsing, the freelist pointer relocated to the middle of the
object, obj_cgroup-based slab accounting hooks, debugfs groundwork for
the allocation/free tracking code, and an exported
get_each_object_track() helper with an Android vendor hook for saving
track hashes.
---
kernel/mm/slub.c | 1601 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 775 insertions(+), 826 deletions(-)
diff --git a/kernel/mm/slub.c b/kernel/mm/slub.c
index 6b58391..3acf083 100644
--- a/kernel/mm/slub.c
+++ b/kernel/mm/slub.c
@@ -28,6 +28,7 @@
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
@@ -36,7 +37,9 @@
#include <linux/memcontrol.h>
#include <linux/random.h>
+#include <linux/debugfs.h>
#include <trace/events/kmem.h>
+#include <trace/hooks/mm.h>
#include "internal.h"
@@ -59,10 +62,11 @@
* D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
- * on any list. The processor that froze the slab is the one who can
- * perform list operations on the page. Other processors may put objects
- * onto the freelist but the processor that froze the slab is the only
- * one that can retrieve the objects from the page's freelist.
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -93,9 +97,7 @@
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
- * Overloading of page flags that are otherwise used for LRU management.
- *
- * PageActive The slab is frozen and exempt from list processing.
+ * page->frozen The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
@@ -111,23 +113,27 @@
* free objects in addition to the regular freelist
* that requires the slab lock.
*
- * PageError Slab requires special handling due to debug
+ * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
* options set. This moves slab handling out of
* the fast path and disables lockless freelists.
*/
-static inline int kmem_cache_debug(struct kmem_cache *s)
-{
#ifdef CONFIG_SLUB_DEBUG
- return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
- return 0;
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
+#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
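
For reference, the kmem_cache_debug_flags() helper this hunk starts using is
not part of the diff; upstream defines it in mm/slab.h, gated on the static
key declared above, roughly as:

    /* Sketch of the mm/slab.h helper (not included in this patch). */
    static inline bool kmem_cache_debug_flags(struct kmem_cache *s,
                                              slab_flags_t flags)
    {
            VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
    #ifdef CONFIG_SLUB_DEBUG
            if (static_branch_unlikely(&slub_debug_enabled))
                    return s->flags & flags;
    #endif
            return false;
    }

With the static key off, every such call compiles down to a patched-out
branch, so the debug checks cost nothing on production kernels.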
void *fixup_red_left(struct kmem_cache *s, void *p)
{
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
p += s->red_left_pad;
return p;
@@ -197,33 +203,19 @@
/* Use cmpxchg_double */
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
-/*
- * Tracking user of a slab.
- */
-#define TRACK_ADDRS_COUNT 16
-struct track {
- unsigned long addr; /* Called from address */
-#ifdef CONFIG_STACKTRACE
- unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
-#endif
- int cpu; /* Was running on cpu */
- int pid; /* Pid context */
- unsigned long when; /* When did the operation occur */
-};
-
-enum track_item { TRACK_ALLOC, TRACK_FREE };
-
#ifdef CONFIG_SLUB_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
-static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
+#endif
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -251,7 +243,7 @@
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
/*
- * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
* Normally, this doesn't cause any issues, as both set_freepointer()
* and get_freepointer() are called with a pointer with the same tag.
* However, there are some issues with CONFIG_SLUB_DEBUG code. For
@@ -277,6 +269,7 @@
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ object = kasan_reset_tag(object);
return freelist_dereference(s, object + s->offset);
}
@@ -290,11 +283,12 @@
unsigned long freepointer_addr;
void *p;
- if (!debug_pagealloc_enabled())
+ if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
- probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
+ copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
}
@@ -306,6 +300,7 @@
BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif
+ freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
@@ -314,12 +309,6 @@
for (__p = fixup_red_left(__s, __addr); \
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
- return (kasan_reset_tag(p) - addr) / s->size;
-}
static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
@@ -441,19 +430,43 @@
}
#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_SPINLOCK(object_map_lock);
+
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
+ struct page *page)
+{
+ void *addr = page_address(page);
+ void *p;
+
+ bitmap_zero(obj_map, page->objects);
+
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(__obj_to_index(s, addr, p), obj_map);
+}
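
__fill_map() indexes objects with __obj_to_index(), which replaces the
slab_index() helper deleted earlier. Upstream defines it in mm/slab.h
roughly as below, using the reciprocal_size that calculate_sizes() caches
later in this patch so the per-object division becomes a multiply:

    /* Sketch of __obj_to_index() (mm/slab.h upstream, not in this diff). */
    static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
                                              void *addr, void *obj)
    {
            return reciprocal_divide(kasan_reset_tag(obj) - addr,
                                     cache->reciprocal_size);
    }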
+
/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
-static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+ __acquires(&object_map_lock)
{
- void *p;
- void *addr = page_address(page);
+ VM_BUG_ON(!irqs_disabled());
- for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), map);
+ spin_lock(&object_map_lock);
+
+ __fill_map(object_map, s, page);
+
+ return object_map;
+}
+
+static void put_map(unsigned long *map) __releases(&object_map_lock)
+{
+ VM_BUG_ON(map != object_map);
+ spin_unlock(&object_map_lock);
}
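
A caller is expected to hold the node's list_lock with interrupts disabled
(get_map() asserts this) and to bracket its scan with get_map()/put_map().
An illustrative walk of the allocated objects in a page:

    unsigned long *map;
    void *addr = page_address(page);
    void *p;

    slab_lock(page);
    map = get_map(s, page);             /* takes object_map_lock */
    for_each_object(p, s, addr, page->objects) {
            if (test_bit(__obj_to_index(s, addr, p), map))
                    continue;           /* bit set: object is on the freelist */
            /* ... inspect the allocated object at p ... */
    }
    put_map(map);                       /* drops object_map_lock */
    slab_unlock(page);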
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -476,12 +489,12 @@
* Debug settings:
*/
#if defined(CONFIG_SLUB_DEBUG_ON)
-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
+slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
-static slab_flags_t slub_debug;
+slab_flags_t slub_debug;
#endif
-static char *slub_debug_slabs;
+static char *slub_debug_string;
static int disable_higher_order_debug;
/*
@@ -528,9 +541,29 @@
unsigned int length)
{
metadata_access_enable();
- print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
- length, 1);
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
+ 16, 1, kasan_reset_tag((void *)addr), length, 1);
metadata_access_disable();
+}
+
+/*
+ * See comment in calculate_sizes().
+ */
+static inline bool freeptr_outside_object(struct kmem_cache *s)
+{
+ return s->offset >= s->inuse;
+}
+
+/*
+ * Return offset of the end of info block which is inuse + free pointer if
+ * not overlapping with object.
+ */
+static inline unsigned int get_info_end(struct kmem_cache *s)
+{
+ if (freeptr_outside_object(s))
+ return s->inuse + sizeof(void *);
+ else
+ return s->inuse;
}
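
A worked example of the two cases: a SLAB_POISON cache with a 24-byte object
on a 64-bit kernel relocates the free pointer, so s->offset (24) >= s->inuse
(24) and get_info_end() returns 24 + 8 = 32; a plain kmalloc-64 cache keeps
the pointer mid-object (offset 32 < inuse 64), so get_info_end() is simply
s->inuse (64) and the tracking data starts right after the object.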
static struct track *get_track(struct kmem_cache *s, void *object,
@@ -538,13 +571,45 @@
{
struct track *p;
- if (s->offset)
- p = object + s->offset + sizeof(void *);
- else
- p = object + s->inuse;
+ p = object + get_info_end(s);
- return p + alloc;
+ return kasan_reset_tag(p + alloc);
}
+
+/*
+ * Iterate over all objects in a page, handing each object's track
+ * structure for @alloc to the callback @fn, which can extract the info
+ * it needs into its @private data. The return value is the number of
+ * track structures processed; @fn can stop the walk early by returning
+ * a negative value.
+ */
+unsigned long get_each_object_track(struct kmem_cache *s,
+ struct page *page, enum track_item alloc,
+ int (*fn)(const struct kmem_cache *, const void *,
+ const struct track *, void *), void *private)
+{
+ void *p;
+ struct track *t;
+ int ret;
+ unsigned long num_track = 0;
+
+ if (!slub_debug || !(s->flags & SLAB_STORE_USER))
+ return 0;
+
+ slab_lock(page);
+ for_each_object(p, s, page_address(page), page->objects) {
+ t = get_track(s, p, alloc);
+ metadata_access_enable();
+ ret = fn(s, p, t, private);
+ metadata_access_disable();
+ if (ret < 0)
+ break;
+ num_track += 1;
+ }
+ slab_unlock(page);
+ return num_track;
+}
+EXPORT_SYMBOL_GPL(get_each_object_track);
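
A minimal (hypothetical) consumer of this export, counting how many objects
in a page carry a recorded allocation site:

    /* Hypothetical callback; the names are illustrative only. */
    static int count_allocs(const struct kmem_cache *s, const void *object,
                            const struct track *t, void *private)
    {
            unsigned long *n = private;

            if (t->addr)            /* a track entry was recorded */
                    (*n)++;
            return 0;               /* a negative return stops the walk */
    }

    unsigned long n = 0;

    get_each_object_track(s, page, TRACK_ALLOC, count_allocs, &n);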
static void set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
@@ -553,31 +618,25 @@
if (addr) {
#ifdef CONFIG_STACKTRACE
- struct stack_trace trace;
- int i;
+ unsigned int nr_entries;
- trace.nr_entries = 0;
- trace.max_entries = TRACK_ADDRS_COUNT;
- trace.entries = p->addrs;
- trace.skip = 3;
metadata_access_enable();
- save_stack_trace(&trace);
+ nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
+ TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
- /* See rant in lockdep.c */
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries - 1] == ULONG_MAX)
- trace.nr_entries--;
-
- for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
- p->addrs[i] = 0;
+ if (nr_entries < TRACK_ADDRS_COUNT)
+ p->addrs[nr_entries] = 0;
+ trace_android_vh_save_track_hash(alloc == TRACK_ALLOC,
+ (unsigned long)p);
#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
- } else
+ } else {
memset(p, 0, sizeof(struct track));
+ }
}
static void init_tracking(struct kmem_cache *s, void *object)
@@ -608,7 +667,7 @@
#endif
}
-static void print_tracking(struct kmem_cache *s, void *object)
+void print_tracking(struct kmem_cache *s, void *object)
{
unsigned long pr_time = jiffies;
if (!(s->flags & SLAB_STORE_USER))
@@ -636,8 +695,6 @@
pr_err("=============================================================================\n");
pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
-
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
va_end(args);
}
@@ -691,10 +748,7 @@
print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
- if (s->offset)
- off = s->offset + sizeof(void *);
- else
- off = s->inuse;
+ off = get_info_end(s);
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
@@ -714,6 +768,7 @@
{
slab_bug(s, "%s", reason);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
@@ -728,11 +783,12 @@
slab_bug(s, "%s", buf);
print_page_info(page);
dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
- u8 *p = object;
+ u8 *p = kasan_reset_tag(object);
if (s->flags & SLAB_RED_ZONE)
memset(p - s->red_left_pad, val, s->red_left_pad);
@@ -759,9 +815,10 @@
{
u8 *fault;
u8 *end;
+ u8 *addr = page_address(page);
metadata_access_enable();
- fault = memchr_inv(start, value, bytes);
+ fault = memchr_inv(kasan_reset_tag(start), value, bytes);
metadata_access_disable();
if (!fault)
return 1;
@@ -771,9 +828,11 @@
end--;
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault[0], value);
+ pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ fault, end - 1, fault - addr,
+ fault[0], value);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
restore_bytes(s, what, value, fault, end);
return 0;
@@ -785,7 +844,7 @@
* object address
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
- * pointer is the first word of the object.
+ * pointer is in the middle of the object.
*
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
@@ -819,11 +878,7 @@
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
- unsigned long off = s->inuse; /* The end of info */
-
- if (s->offset)
- /* Freepointer is placed after the object. */
- off += sizeof(void *);
+ unsigned long off = get_info_end(s); /* The end of info */
if (s->flags & SLAB_STORE_USER)
/* We also have user information there */
@@ -852,7 +907,7 @@
return 1;
start = page_address(page);
- length = PAGE_SIZE << compound_order(page);
+ length = page_size(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -860,14 +915,15 @@
pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(pad, POISON_INUSE, remainder);
+ fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+ slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
@@ -909,7 +965,7 @@
check_pad_bytes(s, page, p);
}
- if (!s->offset && val == SLUB_RED_ACTIVE)
+ if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -1038,7 +1094,7 @@
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->lru, &n->full);
+ list_add(&page->slab_list, &n->full);
}
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1047,7 +1103,7 @@
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1090,26 +1146,26 @@
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
- if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
}
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
- if (!(s->flags & SLAB_POISON))
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
metadata_access_disable();
}
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page,
- void *object, unsigned long addr)
+ struct page *page, void *object)
{
if (!check_slab(s, page))
return 0;
@@ -1130,7 +1186,7 @@
void *object, unsigned long addr)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object, addr))
+ if (!alloc_consistency_checks(s, page, object))
goto bad;
}
@@ -1196,7 +1252,7 @@
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
@@ -1240,69 +1296,138 @@
return ret;
}
-static int __init setup_slub_debug(char *str)
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str: start of block
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs: return start of list of slabs, or NULL when there's no list
+ * @init: assume this is initial parsing and not per-kmem-create parsing
+ *
+ * returns the start of the next block if there's any, or NULL
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
- slub_debug = DEBUG_DEFAULT_FLAGS;
- if (*str++ != '=' || !*str)
- /*
- * No options specified. Switch on full debugging.
- */
- goto out;
+ bool higher_order_disable = false;
- if (*str == ',')
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (*str == ',') {
/*
* No options but restriction on slabs. This means full
* debugging for slabs matching a pattern.
*/
+ *flags = DEBUG_DEFAULT_FLAGS;
goto check_slabs;
+ }
+ *flags = 0;
- slub_debug = 0;
- if (*str == '-')
- /*
- * Switch off all debugging measures.
- */
- goto out;
-
- /*
- * Determine which debug features should be switched on
- */
- for (; *str && *str != ','; str++) {
+ /* Determine which debug features should be switched on */
+ for (; *str && *str != ',' && *str != ';'; str++) {
switch (tolower(*str)) {
+ case '-':
+ *flags = 0;
+ break;
case 'f':
- slub_debug |= SLAB_CONSISTENCY_CHECKS;
+ *flags |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
- slub_debug |= SLAB_RED_ZONE;
+ *flags |= SLAB_RED_ZONE;
break;
case 'p':
- slub_debug |= SLAB_POISON;
+ *flags |= SLAB_POISON;
break;
case 'u':
- slub_debug |= SLAB_STORE_USER;
+ *flags |= SLAB_STORE_USER;
break;
case 't':
- slub_debug |= SLAB_TRACE;
+ *flags |= SLAB_TRACE;
break;
case 'a':
- slub_debug |= SLAB_FAILSLAB;
+ *flags |= SLAB_FAILSLAB;
break;
case 'o':
/*
* Avoid enabling debugging on caches if its minimum
* order would increase as a result.
*/
- disable_higher_order_debug = 1;
+ higher_order_disable = true;
break;
default:
- pr_err("slub_debug option '%c' unknown. skipped\n",
- *str);
+ if (init)
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
+ }
+ }
+check_slabs:
+ if (*str == ',')
+ *slabs = ++str;
+ else
+ *slabs = NULL;
+
+ /* Skip over the slab list */
+ while (*str && *str != ';')
+ str++;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (init && higher_order_disable)
+ disable_higher_order_debug = 1;
+
+ if (*str)
+ return str;
+ else
+ return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ slab_flags_t flags;
+ slab_flags_t global_flags;
+ char *saved_str;
+ char *slab_list;
+ bool global_slub_debug_changed = false;
+ bool slab_list_specified = false;
+
+ global_flags = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ saved_str = str;
+ while (str) {
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+ if (!slab_list) {
+ global_flags = flags;
+ global_slub_debug_changed = true;
+ } else {
+ slab_list_specified = true;
}
}
-check_slabs:
- if (*str == ',')
- slub_debug_slabs = str + 1;
+ /*
+ * For backwards compatibility, a single list of flags with list of
+ * slabs means debugging is only changed for those slabs, so the global
+ * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+ * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
+ * long as there is no option specifying flags without a slab list.
+ */
+ if (slab_list_specified) {
+ if (!global_slub_debug_changed)
+ global_flags = slub_debug;
+ slub_debug_string = saved_str;
+ }
out:
+ slub_debug = global_flags;
+ if (slub_debug != 0 || slub_debug_string)
+ static_branch_enable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
@@ -1312,24 +1437,65 @@
__setup("slub_debug", setup_slub_debug);
+/*
+ * kmem_cache_flags - apply debugging options to the cache
+ * @object_size: the size of an object without meta data
+ * @flags: flags to set
+ * @name: name of the cache
+ *
+ * Debug option(s) are applied to @flags. In addition to the debug
+ * option(s), if a slab name (or multiple) is specified i.e.
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * then only the select slabs will receive the debug option(s).
+ */
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
- flags |= slub_debug;
+ char *iter;
+ size_t len;
+ char *next_block;
+ slab_flags_t block_flags;
- return flags;
+ len = strlen(name);
+ next_block = slub_debug_string;
+ /* Go through all blocks of debug options, see if any matches our slab's name */
+ while (next_block) {
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+ if (!iter)
+ continue;
+ /* Found a block that has a slab list, search it */
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+ if (next_block && next_block < end)
+ end = next_block - 1;
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
+
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= block_flags;
+ return flags;
+ }
+
+ if (!*end || *end == ';')
+ break;
+ iter = end + 1;
+ }
+ }
+
+ return flags | slub_debug;
}
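
As an example of the matching above: with slub_debug=Z,kmalloc-*, a cache
named kmalloc-128 compares only the eight characters before the '*'
(cmplen = glob - iter), matches, and returns with SLAB_RED_ZONE added. A
matching block returns immediately, so the trailing "flags | slub_debug"
fallback applies only to caches that match no slab list.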
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
-static inline void setup_page_debug(struct kmem_cache *s,
- void *addr, int order) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1348,8 +1514,7 @@
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -1380,6 +1545,7 @@
static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ptr, size, 1, flags);
return ptr;
}
@@ -1387,10 +1553,11 @@
static __always_inline void kfree_hook(void *x)
{
kmemleak_free(x);
- kasan_kfree_large(x, _RET_IP_);
+ kasan_kfree_large(x);
}
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+ void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
@@ -1411,8 +1578,30 @@
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
- /* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x, _RET_IP_);
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(x, s->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
+ /*
+ * As memory initialization might be integrated into KASAN,
+ * kasan_slab_free and initialization memset's must be
+ * kept together to avoid discrepancies in behavior.
+ *
+ * The initialization memset's clear the object and the metadata,
+ * but don't touch the SLAB redzone.
+ */
+ if (init) {
+ int rsize;
+
+ if (!kasan_has_integrated_init())
+ memset(kasan_reset_tag(x), 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+ memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+ s->size - s->inuse - rsize);
+ }
+ /* KASAN might put x into memory quarantine, delaying its reuse. */
+ return kasan_slab_free(s, x, init);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1423,7 +1612,11 @@
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
- int rsize;
+
+ if (is_kfence_address(next)) {
+ slab_free_hook(s, next, false);
+ return true;
+ }
/* Head and tail of the reconstructed freelist */
*head = NULL;
@@ -1433,20 +1626,8 @@
object = next;
next = get_freepointer(s, object);
- if (slab_want_init_on_free(s)) {
- /*
- * Clear the object and the metadata, but don't touch
- * the redzone.
- */
- memset(object, 0, s->object_size);
- rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
- : 0;
- memset((char *)object + s->inuse, 0,
- s->size - s->inuse - rsize);
-
- }
/* If object's reuse doesn't have to be delayed */
- if (!slab_free_hook(s, object)) {
+ if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
@@ -1494,10 +1675,8 @@
else
page = __alloc_pages_node(node, flags, order);
- if (page && memcg_charge_slab(page, flags, order, s)) {
- __free_pages(page, order);
- page = NULL;
- }
+ if (page)
+ account_slab_page(page, order, s);
return page;
}
@@ -1617,7 +1796,7 @@
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
- int idx, order;
+ int idx;
bool shuffle;
flags &= gfp_allowed_mask;
@@ -1651,7 +1830,6 @@
page->objects = oo_objects(oo);
- order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
@@ -1661,7 +1839,7 @@
start = page_address(page);
- setup_page_debug(s, start, order);
+ setup_page_debug(s, page, start);
shuffle = shuffle_freelist(s, page);
@@ -1687,11 +1865,6 @@
if (!page)
return NULL;
- mod_lruvec_page_state(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- 1 << oo_order(oo));
-
inc_slabs_node(s, page_to_nid(page), page->objects);
return page;
@@ -1699,13 +1872,8 @@
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1716,7 +1884,7 @@
int order = compound_order(page);
int pages = 1 << order;
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;
slab_pad_check(s, page);
@@ -1725,18 +1893,13 @@
check_object(s, page, p, SLUB_RED_INACTIVE);
}
- mod_lruvec_page_state(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- -pages);
-
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- memcg_uncharge_slab(page, order, s);
+ unaccount_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -1769,9 +1932,9 @@
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->lru, &n->partial);
+ list_add_tail(&page->slab_list, &n->partial);
else
- list_add(&page->lru, &n->partial);
+ list_add(&page->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -1785,7 +1948,7 @@
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
n->nr_partial--;
}
@@ -1852,14 +2015,14 @@
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
if (!pfmemalloc_match(page, flags))
@@ -1897,7 +2060,7 @@
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -1926,7 +2089,7 @@
do {
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
@@ -1947,7 +2110,7 @@
}
}
} while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
return NULL;
}
@@ -1970,9 +2133,9 @@
return get_any_partial(s, flags, c);
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
- * Calculate the next globally unique transaction for disambiguiation
+ * Calculate the next globally unique transaction for disambiguation
* during cmpxchg. The transactions start with the cpu number and are then
* incremented by CONFIG_NR_CPUS.
*/
@@ -1990,6 +2153,7 @@
return tid + TID_STEP;
}
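
Upstream defines TID_STEP as roundup_pow_of_two(CONFIG_NR_CPUS) under
CONFIG_PREEMPTION (and 1 otherwise), so a tid encodes both the owning cpu
and a transaction count. For example, with NR_CPUS=4, cpu 2's tids run
2, 6, 10, ...; tid_to_cpu(10) == 10 % 4 == 2 and tid_to_event(10) == 10 / 4
== 2.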
+#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
return tid % TID_STEP;
@@ -1999,6 +2163,7 @@
{
return tid / TID_STEP;
}
+#endif
static inline unsigned int init_tid(int cpu)
{
@@ -2013,7 +2178,7 @@
pr_info("%s %s: cmpxchg redo ", n, s->name);
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
pr_warn("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
@@ -2131,7 +2296,7 @@
if (!lock) {
lock = 1;
/*
- * Taking the spinlock removes the possiblity
+ * Taking the spinlock removes the possibility
* that acquire_slab() will see a slab page that
* is frozen
*/
@@ -2139,7 +2304,8 @@
}
} else {
m = M_FULL;
- if (kmem_cache_debug(s) && !lock) {
+#ifdef CONFIG_SLUB_DEBUG
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
lock = 1;
/*
* This also ensures that the scanning of full
@@ -2148,29 +2314,19 @@
*/
spin_lock(&n->list_lock);
}
+#endif
}
if (l != m) {
-
if (l == M_PARTIAL)
-
remove_partial(n, page);
-
else if (l == M_FULL)
-
remove_full(s, n, page);
- if (m == M_PARTIAL) {
-
+ if (m == M_PARTIAL)
add_partial(n, page, tail);
- stat(s, tail);
-
- } else if (m == M_FULL) {
-
- stat(s, DEACTIVATE_FULL);
+ else if (m == M_FULL)
add_full(s, n, page);
-
- }
}
l = m;
@@ -2183,7 +2339,11 @@
if (lock)
spin_unlock(&n->list_lock);
- if (m == M_FREE) {
+ if (m == M_PARTIAL)
+ stat(s, tail);
+ else if (m == M_FULL)
+ stat(s, DEACTIVATE_FULL);
+ else if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, page);
stat(s, FREE_SLAB);
@@ -2191,6 +2351,7 @@
c->page = NULL;
c->freelist = NULL;
+ c->tid = next_tid(c->tid);
}
/*
@@ -2207,11 +2368,11 @@
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct page *page, *discard_page = NULL;
- while ((page = c->partial)) {
+ while ((page = slub_percpu_partial(c))) {
struct page new;
struct page old;
- c->partial = page->next;
+ slub_set_percpu_partial(c, page);
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
@@ -2258,12 +2419,12 @@
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
- * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available.
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -2284,7 +2445,7 @@
if (oldpage) {
pobjects = oldpage->pobjects;
pages = oldpage->pages;
- if (drain && pobjects > s->cpu_partial) {
+ if (drain && pobjects > slub_cpu_partial(s)) {
unsigned long flags;
/*
* partial array is full. Move the existing
@@ -2309,7 +2470,7 @@
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
- if (unlikely(!s->cpu_partial)) {
+ if (unlikely(!slub_cpu_partial(s))) {
unsigned long flags;
local_irq_save(flags);
@@ -2317,15 +2478,13 @@
local_irq_restore(flags);
}
preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
deactivate_slab(s, c->page, c->freelist, c);
-
- c->tid = next_tid(c->tid);
}
/*
@@ -2337,12 +2496,10 @@
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c)) {
- if (c->page)
- flush_slab(s, c);
+ if (c->page)
+ flush_slab(s, c);
- unfreeze_partials(s, c);
- }
+ unfreeze_partials(s, c);
}
static void flush_cpu_slab(void *d)
@@ -2362,7 +2519,7 @@
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
}
/*
@@ -2391,7 +2548,7 @@
static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
- if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
return 0;
#endif
return 1;
@@ -2418,7 +2575,7 @@
struct page *page;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
@@ -2492,8 +2649,7 @@
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
- } else
- freelist = NULL;
+ }
return freelist;
}
@@ -2565,6 +2721,8 @@
void *freelist;
struct page *page;
+ stat(s, ALLOC_SLOWPATH);
+
page = c->page;
if (!page) {
/*
@@ -2612,6 +2770,7 @@
if (!freelist) {
c->page = NULL;
+ c->tid = next_tid(c->tid);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -2669,7 +2828,7 @@
unsigned long flags;
local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* We may have been preempted and rescheduled on a different
* cpu before disabling interrupts. Need to reload cpu area
@@ -2691,7 +2850,8 @@
void *obj)
{
if (unlikely(slab_want_init_on_free(s)) && obj)
- memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
}
/*
@@ -2705,16 +2865,23 @@
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
- gfp_t gfpflags, int node, unsigned long addr)
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
+ struct obj_cgroup *objcg = NULL;
+ bool init = false;
- s = slab_pre_alloc_hook(s, gfpflags);
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2723,13 +2890,13 @@
* as we end up on the original cpu again when doing the cmpxchg.
*
* We should guarantee that tid and kmem_cache are retrieved on
- * the same cpu. It could be different if CONFIG_PREEMPT so we need
+ * the same cpu. It could be different if CONFIG_PREEMPTION so we need
* to check if it is matched or not.
*/
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
unlikely(tid != READ_ONCE(c->tid)));
/*
@@ -2751,9 +2918,8 @@
object = c->freelist;
page = c->page;
- if (unlikely(!object || !node_match(page, node))) {
+ if (unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
- stat(s, ALLOC_SLOWPATH);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -2784,24 +2950,23 @@
}
maybe_wipe_obj_freeptr(s, object);
+ init = slab_want_init_on_alloc(gfpflags, s);
- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(object, 0, s->object_size);
-
- slab_post_alloc_hook(s, gfpflags, 1, &object);
+out:
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
return object;
}
static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, unsigned long addr)
+ gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
s->size, gfpflags);
@@ -2813,7 +2978,7 @@
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
@@ -2824,7 +2989,7 @@
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2838,7 +3003,7 @@
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -2848,7 +3013,7 @@
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif
+#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -2868,9 +3033,12 @@
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
stat(s, FREE_SLOWPATH);
+
+ if (kfence_free(head))
+ return;
if (kmem_cache_debug(s) &&
!free_debug_processing(s, page, head, tail, cnt, addr))
@@ -2922,20 +3090,21 @@
if (likely(!n)) {
- /*
- * If we just froze the page then put it onto the
- * per cpu partial list.
- */
- if (new.frozen && !was_frozen) {
+ if (likely(was_frozen)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ stat(s, FREE_FROZEN);
+ } else if (new.frozen) {
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
put_cpu_partial(s, page, 1);
stat(s, CPU_PARTIAL_FREE);
}
- /*
- * The list lock was not taken therefore no list
- * activity can be necessary.
- */
- if (was_frozen)
- stat(s, FREE_FROZEN);
+
return;
}
@@ -2947,8 +3116,7 @@
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -2994,6 +3162,10 @@
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+
+ /* memcg_slab_free_hook() is already called for bulk free. */
+ if (!tail)
+ memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3004,7 +3176,7 @@
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
unlikely(tid != READ_ONCE(c->tid)));
/* Same with comment on barrier() in slab_alloc_node() */
@@ -3114,6 +3286,13 @@
df->s = cache_from_obj(s, object); /* Support for memcg */
}
+ if (is_kfence_address(object)) {
+ slab_free_hook(df->s, object, false);
+ __kfence_free(object);
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+
/* Start new detached freelist */
df->page = page;
set_freepointer(df->s, object, NULL);
@@ -3155,6 +3334,7 @@
if (WARN_ON(!size))
return;
+ memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
@@ -3173,9 +3353,10 @@
{
struct kmem_cache_cpu *c;
int i;
+ struct obj_cgroup *objcg = NULL;
/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -3187,8 +3368,14 @@
c = this_cpu_ptr(s->cpu_slab);
for (i = 0; i < size; i++) {
- void *object = c->freelist;
+ void *object = kfence_alloc(s, s->object_size, flags);
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ object = c->freelist;
if (unlikely(!object)) {
/*
* We may have removed an object from c->freelist using
@@ -3220,20 +3407,16 @@
c->tid = next_tid(c->tid);
local_irq_enable();
- /* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(slab_want_init_on_alloc(flags, s))) {
- int j;
-
- for (j = 0; j < i; j++)
- memset(p[j], 0, s->object_size);
- }
-
- /* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, flags, size, p);
+ /*
+ * memcg and kmem_cache debug support and memory initialization.
+ * Done outside of the IRQ disabled fastpath loop.
+ */
+ slab_post_alloc_hook(s, objcg, flags, size, p,
+ slab_want_init_on_alloc(flags, s));
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p, false);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
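
An illustrative use of the bulk API; note it is all-or-nothing here, with
kmem_cache_alloc_bulk() returning the full count on success and 0 on failure:

    void *objs[16];
    int n;

    n = kmem_cache_alloc_bulk(s, GFP_KERNEL, ARRAY_SIZE(objs), objs);
    if (!n)
            return -ENOMEM;
    /* ... use objs[0..n-1] ... */
    kmem_cache_free_bulk(s, n, objs);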
@@ -3429,8 +3612,7 @@
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
- GFP_KERNEL);
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse = 1;
page->frozen = 0;
@@ -3518,15 +3700,15 @@
* 50% to keep some capacity around for frees.
*/
if (!kmem_cache_has_cpu_partial(s))
- s->cpu_partial = 0;
+ slub_set_cpu_partial(s, 0);
else if (s->size >= PAGE_SIZE)
- s->cpu_partial = 2;
+ slub_set_cpu_partial(s, 2);
else if (s->size >= 1024)
- s->cpu_partial = 6;
+ slub_set_cpu_partial(s, 6);
else if (s->size >= 256)
- s->cpu_partial = 13;
+ slub_set_cpu_partial(s, 13);
else
- s->cpu_partial = 30;
+ slub_set_cpu_partial(s, 30);
#endif
}
@@ -3571,22 +3753,36 @@
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
+ *
+ * The assumption that s->offset >= s->inuse means free
+ * pointer is outside of the object is used in the
+ * freeptr_outside_object() function. If that is no
+ * longer true, the function needs to be modified.
*/
s->offset = size;
size += sizeof(void *);
+ } else {
+ /*
+ * Store freelist pointer near middle of object to keep
+ * it away from the edges of the object to avoid small
+ * sized over/underflows from neighboring allocations.
+ */
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
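
A worked example on a 64-bit kernel: a plain 64-byte cache with no
constructor, RCU, or poisoning gets s->offset = ALIGN_DOWN(64 / 2, 8) = 32,
putting the freelist pointer out of reach of small over/underflows from
either neighbouring object; a 24-byte object gets ALIGN_DOWN(12, 8) = 8.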
#ifdef CONFIG_SLUB_DEBUG
@@ -3623,6 +3819,7 @@
*/
size = ALIGN(size, s->align);
s->size = size;
+ s->reciprocal_size = reciprocal_value(size);
if (forced_order >= 0)
order = forced_order;
else
@@ -3657,7 +3854,7 @@
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
@@ -3708,39 +3905,32 @@
if (alloc_kmem_cache_cpus(s))
return 0;
- free_kmem_cache_nodes(s);
error:
- if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
- s->name, s->size, s->size,
- oo_order(s->oo), s->offset, (unsigned long)flags);
+ __kmem_cache_release(s);
return -EINVAL;
}
static void list_slab_objects(struct kmem_cache *s, struct page *page,
- const char *text)
+ const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
+ unsigned long *map;
void *p;
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
- sizeof(long),
- GFP_ATOMIC);
- if (!map)
- return;
+
slab_err(s, page, text, s->name);
slab_lock(page);
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- if (!test_bit(slab_index(p, s, addr), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
+ put_map(map);
slab_unlock(page);
- kfree(map);
#endif
}
@@ -3756,18 +3946,18 @@
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, lru) {
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
- list_add(&page->lru, &discard);
+ list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, lru)
+ list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -3797,7 +3987,6 @@
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- sysfs_slab_remove(s);
return 0;
}
@@ -3846,7 +4035,7 @@
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, _RET_IP_);
+ ret = slab_alloc(s, flags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -3861,11 +4050,15 @@
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+ }
return kmalloc_large_node_hook(ptr, size, flags);
}
@@ -3890,7 +4083,7 @@
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, flags, node, _RET_IP_);
+ ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -3899,7 +4092,7 @@
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -3916,6 +4109,7 @@
struct kmem_cache *s;
unsigned int offset;
size_t object_size;
+ bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
@@ -3928,10 +4122,13 @@
to_user, 0, n);
/* Find offset within object. */
- offset = (ptr - page_address(page)) % s->size;
+ if (is_kfence)
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -3961,7 +4158,7 @@
}
#endif /* CONFIG_HARDENED_USERCOPY */
-static size_t __ksize(const void *object)
+size_t __ksize(const void *object)
{
struct page *page;
@@ -3972,22 +4169,12 @@
if (unlikely(!PageSlab(page))) {
WARN_ON(!PageCompound(page));
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
return slab_ksize(page->slab_cache);
}
-
-size_t ksize(const void *object)
-{
- size_t size = __ksize(object);
- /* We assume that ksize callers could use whole allocated area,
- * so we need to unpoison this area.
- */
- kasan_unpoison_shadow(object, size);
- return size;
-}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
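
The generic ksize() wrapper deleted here, including the KASAN unpoisoning of
the whole allocated area, lives in mm/slab_common.c upstream and calls this
now-exported __ksize().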
void kfree(const void *x)
{
@@ -4001,9 +4188,13 @@
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -4047,7 +4238,7 @@
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
int free = page->objects - page->inuse;
/* Do not reread page->inuse */
@@ -4057,10 +4248,10 @@
BUG_ON(free <= 0);
if (free == page->objects) {
- list_move(&page->lru, &discard);
+ list_move(&page->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->lru, promote + free - 1);
+ list_move(&page->slab_list, promote + free - 1);
}
/*
@@ -4073,7 +4264,7 @@
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, lru)
+ list_for_each_entry_safe(page, t, &discard, slab_list)
discard_slab(s, page);
if (slabs_node(s, node))
@@ -4082,42 +4273,6 @@
return ret;
}
-
-#ifdef CONFIG_MEMCG
-static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
-{
- /*
- * Called with all the locks held after a sched RCU grace period.
- * Even if @s becomes empty after shrinking, we can't know that @s
- * doesn't have allocations already in-flight and thus can't
- * destroy @s until the associated memcg is released.
- *
- * However, let's remove the sysfs files for empty caches here.
- * Each cache has a lot of interface files which aren't
- * particularly useful for empty draining caches; otherwise, we can
- * easily end up with millions of unnecessary sysfs files on
- * systems which have a lot of memory and transient cgroups.
- */
- if (!__kmem_cache_shrink(s))
- sysfs_slab_remove(s);
-}
-
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- slub_set_cpu_partial(s, 0);
- s->min_partial = 0;
-
- /*
- * s->cpu_partial is checked locklessly (see put_cpu_partial), so
- * we have to make sure the change is visible before shrinking.
- */
- slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
-}
-#endif
static int slab_mem_going_offline_callback(void *arg)
{
@@ -4265,17 +4420,15 @@
for_each_kmem_cache_node(s, node, n) {
struct page *p;
- list_for_each_entry(p, &n->partial, lru)
+ list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- list_for_each_entry(p, &n->full, lru)
+ list_for_each_entry(p, &n->full, slab_list)
p->slab_cache = s;
#endif
}
- slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s);
return s;
}
@@ -4316,7 +4469,7 @@
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
@@ -4330,7 +4483,7 @@
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
- struct kmem_cache *s, *c;
+ struct kmem_cache *s;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
@@ -4342,11 +4495,6 @@
*/
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
-
- for_each_memcg_cache(c, s) {
- c->object_size = s->object_size;
- c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
- }
if (sysfs_slab_alias(s, name)) {
s->refcount--;
@@ -4369,12 +4517,16 @@
if (slab_state <= UP)
return 0;
- memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
- if (err)
+ if (err) {
__kmem_cache_release(s);
+ return err;
+ }
- return err;
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
+ return 0;
}
void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
@@ -4390,7 +4542,7 @@
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, caller);
+ ret = slab_alloc(s, gfpflags, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4421,7 +4573,7 @@
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, gfpflags, node, caller);
+ ret = slab_alloc_node(s, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -4444,43 +4596,33 @@
#endif
#ifdef CONFIG_SLUB_DEBUG
-static int validate_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
+static void validate_slab(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
+ unsigned long *map;
- if (!check_slab(s, page) ||
- !on_freelist(s, page, NULL))
- return 0;
+ slab_lock(page);
+
+ if (!check_slab(s, page) || !on_freelist(s, page, NULL))
+ goto unlock;
/* Now we know that a valid freelist exists */
- bitmap_zero(map, page->objects);
-
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- if (test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
- return 0;
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
+
+ if (!check_object(s, page, p, val))
+ break;
}
-
- for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
- if (!check_object(s, page, p, SLUB_RED_ACTIVE))
- return 0;
- return 1;
-}
-
-static void validate_slab_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
-{
- slab_lock(page);
- validate_slab(s, page, map);
+ put_map(map);
+unlock:
slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n, unsigned long *map)
+ struct kmem_cache_node *n)
{
unsigned long count = 0;
struct page *page;
@@ -4488,8 +4630,8 @@
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru) {
- validate_slab_slab(s, page, map);
+ list_for_each_entry(page, &n->partial, slab_list) {
+ validate_slab(s, page);
count++;
}
if (count != n->nr_partial)
@@ -4499,8 +4641,8 @@
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, lru) {
- validate_slab_slab(s, page, map);
+ list_for_each_entry(page, &n->full, slab_list) {
+ validate_slab(s, page);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4516,20 +4658,16 @@
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
-
- if (!map)
- return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n, map);
- kfree(map);
+ count += validate_slab_node(s, n);
+
return count;
}
+
+#ifdef CONFIG_DEBUG_FS
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
@@ -4551,7 +4689,10 @@
unsigned long max;
unsigned long count;
struct location *loc;
+ loff_t idx;
};
+
+static struct dentry *slab_debugfs_root;
static void free_loc_track(struct loc_track *t)
{
@@ -4658,105 +4799,19 @@
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc,
- unsigned long *map)
+ unsigned long *obj_map)
{
void *addr = page_address(page);
void *p;
- bitmap_zero(map, page->objects);
- get_map(s, page, map);
+ __fill_map(obj_map, s, page);
for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
+ if (!test_bit(__obj_to_index(s, addr, p), obj_map))
add_location(t, s, get_track(s, p, alloc));
}
-
-static int list_locations(struct kmem_cache *s, char *buf,
- enum track_item alloc)
-{
- int len = 0;
- unsigned long i;
- struct loc_track t = { 0, 0, NULL };
- int node;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
- struct kmem_cache_node *n;
-
- if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- kfree(map);
- return sprintf(buf, "Out of memory\n");
- }
- /* Push back cpu slabs */
- flush_all(s);
-
- for_each_kmem_cache_node(s, node, n) {
- unsigned long flags;
- struct page *page;
-
- if (!atomic_long_read(&n->nr_slabs))
- continue;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
- process_slab(&t, s, page, alloc, map);
- list_for_each_entry(page, &n->full, lru)
- process_slab(&t, s, page, alloc, map);
- spin_unlock_irqrestore(&n->list_lock, flags);
- }
-
- for (i = 0; i < t.count; i++) {
- struct location *l = &t.loc[i];
-
- if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
- break;
- len += sprintf(buf + len, "%7ld ", l->count);
-
- if (l->addr)
- len += sprintf(buf + len, "%pS", (void *)l->addr);
- else
- len += sprintf(buf + len, "<not-available>");
-
- if (l->sum_time != l->min_time) {
- len += sprintf(buf + len, " age=%ld/%ld/%ld",
- l->min_time,
- (long)div_u64(l->sum_time, l->count),
- l->max_time);
- } else
- len += sprintf(buf + len, " age=%ld",
- l->min_time);
-
- if (l->min_pid != l->max_pid)
- len += sprintf(buf + len, " pid=%ld-%ld",
- l->min_pid, l->max_pid);
- else
- len += sprintf(buf + len, " pid=%ld",
- l->min_pid);
-
- if (num_online_cpus() > 1 &&
- !cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " cpus=%*pbl",
- cpumask_pr_args(to_cpumask(l->cpus)));
-
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " nodes=%*pbl",
- nodemask_pr_args(&l->nodes));
-
- len += sprintf(buf + len, "\n");
- }
-
- free_loc_track(&t);
- kfree(map);
- if (!t.count)
- len += sprintf(buf, "No data\n");
- return len;
-}
-#endif
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_SLUB_DEBUG */
#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
@@ -4816,7 +4871,7 @@
#ifdef CONFIG_SLUB_SYSFS
static void resiliency_test(void) {};
#endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
#ifdef CONFIG_SLUB_SYSFS
enum slab_stat_type {
@@ -4955,20 +5010,6 @@
return x + sprintf(buf + x, "\n");
}
-#ifdef CONFIG_SLUB_DEBUG
-static int any_slab_objects(struct kmem_cache *s)
-{
- int node;
- struct kmem_cache_node *n;
-
- for_each_kmem_cache_node(s, node, n)
- if (atomic_long_read(&n->total_objects))
- return 1;
-
- return 0;
-}
-#endif
-
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
@@ -5010,28 +5051,11 @@
}
SLAB_ATTR_RO(objs_per_slab);
-static ssize_t order_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- unsigned int order;
- int err;
-
- err = kstrtouint(buf, 10, &order);
- if (err)
- return err;
-
- if (order > slub_max_order || order < slub_min_order)
- return -EINVAL;
-
- calculate_sizes(s, order);
- return length;
-}
-
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", oo_order(s->oo));
}
-SLAB_ATTR(order);
+SLAB_ATTR_RO(order);
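
With order_store() dropped, the attribute is declared via SLAB_ATTR_RO(), which leaves .store NULL so that slab_attr_store() (context below) refuses writes with -EIO. A simplified view of what the macro produces (the real macro token-pastes the attribute name):

    /* simplified expansion of SLAB_ATTR_RO(order) */
    static struct slab_attribute order_attr = {
        .attr   = { .name = "order", .mode = 0400 },
        .show   = order_show,
        /* .store left NULL: slab_attr_store() returns -EIO */
    };
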
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
@@ -5153,16 +5177,7 @@
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
-}
-SLAB_ATTR(reclaim_account);
+SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
@@ -5207,104 +5222,34 @@
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
-
-static ssize_t sanity_checks_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_CONSISTENCY_CHECKS;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_CONSISTENCY_CHECKS;
- }
- return length;
-}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- /*
- * Tracing a merged cache is going to give confusing results
- * as well as cause other issues like converting a mergeable
- * cache into an umergeable one.
- */
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_TRACE;
- }
- return length;
-}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
-static ssize_t red_zone_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_RED_ZONE;
- if (buf[0] == '1') {
- s->flags |= SLAB_RED_ZONE;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(red_zone);
+SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
-static ssize_t poison_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_POISON;
- if (buf[0] == '1') {
- s->flags |= SLAB_POISON;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(poison);
+SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
-static ssize_t store_user_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_STORE_USER;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_STORE_USER;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(store_user);
+SLAB_ATTR_RO(store_user);
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
@@ -5325,21 +5270,6 @@
}
SLAB_ATTR(validate);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */
#ifdef CONFIG_FAILSLAB
@@ -5347,19 +5277,7 @@
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
-}
-SLAB_ATTR(failslab);
+SLAB_ATTR_RO(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5482,7 +5400,7 @@
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
@@ -5512,8 +5430,6 @@
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &alloc_calls_attr.attr,
- &free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
@@ -5595,96 +5511,7 @@
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG
- if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- struct kmem_cache *c;
-
- mutex_lock(&slab_mutex);
- if (s->max_attr_size < len)
- s->max_attr_size = len;
-
- /*
- * This is a best effort propagation, so this function's return
- * value will be determined by the parent cache only. This is
- * basically because not all attributes will have a well
- * defined semantics for rollbacks - most of the actions will
- * have permanent effects.
- *
- * Returning the error value of any of the children that fail
- * is not 100 % defined, in the sense that users seeing the
- * error code won't be able to know anything about the state of
- * the cache.
- *
- * Only returning the error code for the parent cache at least
- * has well defined semantics. The cache being written to
- * directly either failed or succeeded, in which case we loop
- * through the descendants with best-effort propagation.
- */
- for_each_memcg_cache(c, s)
- attribute->store(c, buf, len);
- mutex_unlock(&slab_mutex);
- }
-#endif
return err;
-}
-
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG
- int i;
- char *buffer = NULL;
- struct kmem_cache *root_cache;
-
- if (is_root_cache(s))
- return;
-
- root_cache = s->memcg_params.root_cache;
-
- /*
- * This mean this cache had no attribute written. Therefore, no point
- * in copying default values around
- */
- if (!root_cache->max_attr_size)
- return;
-
- for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
- char mbuf[64];
- char *buf;
- struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
- ssize_t len;
-
- if (!attr || !attr->store || !attr->show)
- continue;
-
- /*
- * It is really bad that we have to allocate here, so we will
- * do it only as a fallback. If we actually allocate, though,
- * we can just use the allocated buffer until the end.
- *
- * Most of the slub attributes will tend to be very small in
- * size, but sysfs allows buffers up to a page, so they can
- * theoretically happen.
- */
- if (buffer)
- buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
- !IS_ENABLED(CONFIG_SLUB_STATS))
- buf = mbuf;
- else {
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (WARN_ON(!buffer))
- continue;
- buf = buffer;
- }
-
- len = attr->show(root_cache, buf);
- if (len > 0)
- attr->store(s, buf, len);
- }
-
- if (buffer)
- free_page((unsigned long)buffer);
-#endif
}
static void kmem_cache_release(struct kobject *k)
@@ -5702,27 +5529,10 @@
.release = kmem_cache_release,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
-{
- struct kobj_type *ktype = get_ktype(kobj);
-
- if (ktype == &slab_ktype)
- return 1;
- return 0;
-}
-
-static const struct kset_uevent_ops slab_uevent_ops = {
- .filter = uevent_filter,
-};
-
static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG
- if (!is_root_cache(s))
- return s->memcg_params.root_cache->memcg_kset;
-#endif
return slab_kset;
}
@@ -5737,7 +5547,8 @@
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5765,36 +5576,12 @@
return name;
}
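
Replacing BUG_ON() with ERR_PTR(-ENOMEM) turns an allocation failure into an error the caller can unwind; sysfs_slab_add() unpacks it with PTR_ERR() below. A userspace-flavored model of the <linux/err.h> convention:

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long error) { return (void *)error; }
    static long PTR_ERR(const void *ptr) { return (long)ptr; }
    static int IS_ERR(const void *ptr)
    {
        /* errnos live in the top page of the address space */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
        void *name = ERR_PTR(-ENOMEM);  /* as if kmalloc() had failed */

        if (IS_ERR(name))
            printf("create_unique_id: error %ld\n", PTR_ERR(name));
        return 0;
    }
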
-static void sysfs_slab_remove_workfn(struct work_struct *work)
-{
- struct kmem_cache *s =
- container_of(work, struct kmem_cache, kobj_remove_work);
-
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- goto out;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
-out:
- kobject_put(&s->kobj);
-}
-
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
-
- INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
@@ -5819,6 +5606,8 @@
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
@@ -5830,17 +5619,6 @@
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG
- if (is_root_cache(s) && memcg_sysfs_enabled) {
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
- if (!s->memcg_kset) {
- err = -ENOMEM;
- goto out_del_kobj;
- }
- }
-#endif
-
- kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5852,19 +5630,6 @@
out_del_kobj:
kobject_del(&s->kobj);
goto out;
-}
-
-static void sysfs_slab_remove(struct kmem_cache *s)
-{
- if (slab_state < FULL)
- /*
- * Sysfs has not been setup yet so no need to remove the
- * cache from sysfs.
- */
- return;
-
- kobject_get(&s->kobj);
- schedule_work(&s->kobj_remove_work);
}
void sysfs_slab_unlink(struct kmem_cache *s)
@@ -5921,7 +5686,7 @@
mutex_lock(&slab_mutex);
- slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
@@ -5956,6 +5721,189 @@
__initcall(slab_sysfs_init);
#endif /* CONFIG_SLUB_SYSFS */
+#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int slab_debugfs_show(struct seq_file *seq, void *v)
+{
+ struct loc_track *t = seq->private;
+ struct location *l;
+ unsigned long idx;
+
+ idx = (unsigned long) t->idx;
+ if (idx < t->count) {
+ l = &t->loc[idx];
+
+ seq_printf(seq, "%7ld ", l->count);
+
+ if (l->addr)
+ seq_printf(seq, "%pS", (void *)l->addr);
+ else
+ seq_puts(seq, "<not-available>");
+
+ if (l->sum_time != l->min_time) {
+ seq_printf(seq, " age=%ld/%llu/%ld",
+ l->min_time, div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ seq_printf(seq, " age=%ld", l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
+ else
+ seq_printf(seq, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
+ seq_printf(seq, " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
+ seq_printf(seq, " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+ seq_puts(seq, "\n");
+ }
+
+ if (!idx && !t->count)
+ seq_puts(seq, "No data\n");
+
+ return 0;
+}
+
+static void slab_debugfs_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ t->idx = ++(*ppos);
+ if (*ppos <= t->count)
+ return ppos;
+
+ return NULL;
+}
+
+static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ t->idx = *ppos;
+ return ppos;
+}
+
+static const struct seq_operations slab_debugfs_sops = {
+ .start = slab_debugfs_start,
+ .next = slab_debugfs_next,
+ .stop = slab_debugfs_stop,
+ .show = slab_debugfs_show,
+};
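
The seq_file core calls these hooks as start(), then alternating show()/next() until next() returns NULL, then stop(). slab_debugfs_next() intentionally keeps the iterator alive while *ppos <= t->count: the one extra pass is what lets an empty trace emit the "No data" line from slab_debugfs_show(). A compilable userspace model of the same cursor logic (names hypothetical):

    #include <stdio.h>

    struct loc_track { unsigned long count; long idx; const char **loc; };

    static void *lt_start(struct loc_track *t, long *ppos)
    {
        t->idx = *ppos;
        return ppos;
    }

    static void *lt_next(struct loc_track *t, long *ppos)
    {
        t->idx = ++(*ppos);
        return (unsigned long)*ppos <= t->count ? ppos : NULL;
    }

    static void lt_show(struct loc_track *t)
    {
        if ((unsigned long)t->idx < t->count)
            printf("%s\n", t->loc[t->idx]);
        else if (!t->idx && !t->count)
            printf("No data\n");    /* only on the very first, empty read */
    }

    int main(void)
    {
        const char *entries[] = { "alloc_site_a", "alloc_site_b" };
        struct loc_track t = { 2, 0, entries };
        long pos = 0;
        void *v;

        for (v = lt_start(&t, &pos); v; v = lt_next(&t, &pos))
            lt_show(&t);
        return 0;
    }
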
+
+static int slab_debug_trace_open(struct inode *inode, struct file *filep)
+{
+
+ struct kmem_cache_node *n;
+ enum track_item alloc;
+ int node;
+ struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
+ sizeof(struct loc_track));
+ struct kmem_cache *s = file_inode(filep)->i_private;
+ unsigned long *obj_map;
+
+ if (!t)
+ return -ENOMEM;
+
+ obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
+ if (!obj_map) {
+ seq_release_private(inode, filep);
+ return -ENOMEM;
+ }
+
+ if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
+ alloc = TRACK_ALLOC;
+ else
+ alloc = TRACK_FREE;
+
+ if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
+ bitmap_free(obj_map);
+ seq_release_private(inode, filep);
+ return -ENOMEM;
+ }
+
+ /* Push back cpu slabs */
+ flush_all(s);
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long flags;
+ struct page *page;
+
+ if (!atomic_long_read(&n->nr_slabs))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, slab_list)
+ process_slab(t, s, page, alloc, obj_map);
+ list_for_each_entry(page, &n->full, slab_list)
+ process_slab(t, s, page, alloc, obj_map);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ bitmap_free(obj_map);
+ return 0;
+}
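
The object bitmap only has to survive the scan inside open(); the per-location results are kept in the seq_file's private loc_track, so obj_map is released before the function returns. The alloc/scan/free pairing, as a userspace sketch:

    #include <stdlib.h>

    #define BITS_PER_LONG    (8 * sizeof(long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    int main(void)
    {
        unsigned int nbits = 512;   /* cf. oo_objects(s->oo) */
        unsigned long *map = calloc(BITS_TO_LONGS(nbits), sizeof(*map));

        if (!map)
            return 1;
        /* ... scan the slabs, recording results elsewhere ... */
        free(map);      /* the recorded results outlive the map */
        return 0;
    }
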
+
+static int slab_debug_trace_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct loc_track *t = seq->private;
+
+ free_loc_track(t);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations slab_debugfs_fops = {
+ .open = slab_debug_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = slab_debug_trace_release,
+};
+
+static void debugfs_slab_add(struct kmem_cache *s)
+{
+ struct dentry *slab_cache_dir;
+
+ if (unlikely(!slab_debugfs_root))
+ return;
+
+ slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
+
+ debugfs_create_file("alloc_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+
+ debugfs_create_file("free_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+}
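
After registration the files show up under /sys/kernel/debug/slab/<cache>/, and only caches created with SLAB_STORE_USER (e.g. booted with slub_debug=U) are added by slab_debugfs_init() below. A hedged reader, with the cache name purely illustrative:

    #include <stdio.h>

    int main(void)
    {
        /* requires debugfs mounted at /sys/kernel/debug */
        FILE *f = fopen("/sys/kernel/debug/slab/kmalloc-64/alloc_traces", "r");
        char line[256];

        if (!f) {
            perror("fopen");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }
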
+
+void debugfs_slab_release(struct kmem_cache *s)
+{
+ debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
+}
+
+static int __init slab_debugfs_init(void)
+{
+ struct kmem_cache *s;
+
+ slab_debugfs_root = debugfs_create_dir("slab", NULL);
+
+ list_for_each_entry(s, &slab_caches, list)
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
+ return 0;
+
+}
+__initcall(slab_debugfs_init);
+#endif
/*
* The /proc/slabinfo ABI
*/
@@ -5981,6 +5929,7 @@
sinfo->objects_per_slab = oo_objects(s->oo);
sinfo->cache_order = oo_order(s->oo);
}
+EXPORT_SYMBOL_GPL(get_slabinfo);
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{