From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/mm/slub.c | 1736 +++++++++++++++++++++++++++-------------------------------- 1 files changed, 804 insertions(+), 932 deletions(-) diff --git a/kernel/mm/slub.c b/kernel/mm/slub.c index f35fa1a..3acf083 100644 --- a/kernel/mm/slub.c +++ b/kernel/mm/slub.c @@ -28,6 +28,7 @@ #include <linux/ctype.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> +#include <linux/kfence.h> #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> @@ -36,7 +37,9 @@ #include <linux/memcontrol.h> #include <linux/random.h> +#include <linux/debugfs.h> #include <trace/events/kmem.h> +#include <trace/hooks/mm.h> #include "internal.h" @@ -59,10 +62,11 @@ * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not - * on any list. The processor that froze the slab is the one who can - * perform list operations on the page. Other processors may put objects - * onto the freelist but the processor that froze the slab is the only - * one that can retrieve the objects from the page's freelist. + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the page. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -93,9 +97,7 @@ * minimal so we rely on the page allocators per cpu caches for * fast frees and allocs. * - * Overloading of page flags that are otherwise used for LRU management. - * - * PageActive The slab is frozen and exempt from list processing. + * page->frozen The slab is frozen and exempt from list processing. * This means that the slab is dedicated to a purpose * such as satisfying allocations for a specific * processor. Objects may be freed in the slab while @@ -111,23 +113,27 @@ * free objects in addition to the regular freelist * that requires the slab lock. * - * PageError Slab requires special handling due to debug + * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of * the fast path and disables lockless freelists. */ -static inline int kmem_cache_debug(struct kmem_cache *s) -{ #ifdef CONFIG_SLUB_DEBUG - return unlikely(s->flags & SLAB_DEBUG_FLAGS); +#ifdef CONFIG_SLUB_DEBUG_ON +DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); #else - return 0; +DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif +#endif + +static inline bool kmem_cache_debug(struct kmem_cache *s) +{ + return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } void *fixup_red_left(struct kmem_cache *s, void *p) { - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) p += s->red_left_pad; return p; @@ -197,33 +203,19 @@ /* Use cmpxchg_double */ #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) -/* - * Tracking user of a slab. 
- */ -#define TRACK_ADDRS_COUNT 16 -struct track { - unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ -#endif - int cpu; /* Was running on cpu */ - int pid; /* Pid context */ - unsigned long when; /* When did the operation occur */ -}; - -enum track_item { TRACK_ALLOC, TRACK_FREE }; - #ifdef CONFIG_SLUB_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void memcg_propagate_slab_attrs(struct kmem_cache *s); -static void sysfs_slab_remove(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } +#endif + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +static void debugfs_slab_add(struct kmem_cache *); +#else +static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -251,7 +243,7 @@ { #ifdef CONFIG_SLAB_FREELIST_HARDENED /* - * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. + * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged. * Normally, this doesn't cause any issues, as both set_freepointer() * and get_freepointer() are called with a pointer with the same tag. * However, there are some issues with CONFIG_SLUB_DEBUG code. For @@ -277,6 +269,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) { + object = kasan_reset_tag(object); return freelist_dereference(s, object + s->offset); } @@ -290,11 +283,12 @@ unsigned long freepointer_addr; void *p; - if (!debug_pagealloc_enabled()) + if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); + object = kasan_reset_tag(object); freepointer_addr = (unsigned long)object + s->offset; - probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); return freelist_ptr(s, p, freepointer_addr); } @@ -306,6 +300,7 @@ BUG_ON(object == fp); /* naive detection of double free or corruption */ #endif + freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } @@ -314,12 +309,6 @@ for (__p = fixup_red_left(__s, __addr); \ __p < (__addr) + (__objects) * (__s)->size; \ __p += (__s)->size) - -/* Determine object index from a given position */ -static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) -{ - return (kasan_reset_tag(p) - addr) / s->size; -} static inline unsigned int order_objects(unsigned int order, unsigned int size) { @@ -441,19 +430,43 @@ } #ifdef CONFIG_SLUB_DEBUG +static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +static DEFINE_SPINLOCK(object_map_lock); + +static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, + struct page *page) +{ + void *addr = page_address(page); + void *p; + + bitmap_zero(obj_map, page->objects); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(__obj_to_index(s, addr, p), obj_map); +} + /* * Determine a map of object in use on a page. * * Node listlock must be held to guarantee that the page does * not vanish from under us. 
*/ -static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +static unsigned long *get_map(struct kmem_cache *s, struct page *page) + __acquires(&object_map_lock) { - void *p; - void *addr = page_address(page); + VM_BUG_ON(!irqs_disabled()); - for (p = page->freelist; p; p = get_freepointer(s, p)) - set_bit(slab_index(p, s, addr), map); + spin_lock(&object_map_lock); + + __fill_map(object_map, s, page); + + return object_map; +} + +static void put_map(unsigned long *map) __releases(&object_map_lock) +{ + VM_BUG_ON(map != object_map); + spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) @@ -476,12 +489,12 @@ * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; +slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static slab_flags_t slub_debug; +slab_flags_t slub_debug; #endif -static char *slub_debug_slabs; +static char *slub_debug_string; static int disable_higher_order_debug; /* @@ -528,9 +541,29 @@ unsigned int length) { metadata_access_enable(); - print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, - length, 1); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); metadata_access_disable(); +} + +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; } static struct track *get_track(struct kmem_cache *s, void *object, @@ -538,13 +571,45 @@ { struct track *p; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + p = object + get_info_end(s); - return p + alloc; + return kasan_reset_tag(p + alloc); } + +/* + * This function will be used to loop through all the slab objects in + * a page to give track structure for each object, the function fn will + * be using this track structure and extract required info into its private + * data, the return value will be the number of track structures that are + * processed. 
+ */ +unsigned long get_each_object_track(struct kmem_cache *s, + struct page *page, enum track_item alloc, + int (*fn)(const struct kmem_cache *, const void *, + const struct track *, void *), void *private) +{ + void *p; + struct track *t; + int ret; + unsigned long num_track = 0; + + if (!slub_debug || !(s->flags & SLAB_STORE_USER)) + return 0; + + slab_lock(page); + for_each_object(p, s, page_address(page), page->objects) { + t = get_track(s, p, alloc); + metadata_access_enable(); + ret = fn(s, p, t, private); + metadata_access_disable(); + if (ret < 0) + break; + num_track += 1; + } + slab_unlock(page); + return num_track; +} +EXPORT_SYMBOL_GPL(get_each_object_track); static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) @@ -553,31 +618,25 @@ if (addr) { #ifdef CONFIG_STACKTRACE - struct stack_trace trace; - int i; + unsigned int nr_entries; - trace.nr_entries = 0; - trace.max_entries = TRACK_ADDRS_COUNT; - trace.entries = p->addrs; - trace.skip = 3; metadata_access_enable(); - save_stack_trace(&trace); + nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), + TRACK_ADDRS_COUNT, 3); metadata_access_disable(); - /* See rant in lockdep.c */ - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries - 1] == ULONG_MAX) - trace.nr_entries--; - - for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) - p->addrs[i] = 0; + if (nr_entries < TRACK_ADDRS_COUNT) + p->addrs[nr_entries] = 0; + trace_android_vh_save_track_hash(alloc == TRACK_ALLOC, + (unsigned long)p); #endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; p->when = jiffies; - } else + } else { memset(p, 0, sizeof(struct track)); + } } static void init_tracking(struct kmem_cache *s, void *object) @@ -608,7 +667,7 @@ #endif } -static void print_tracking(struct kmem_cache *s, void *object) +void print_tracking(struct kmem_cache *s, void *object) { unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) @@ -636,8 +695,6 @@ pr_err("=============================================================================\n"); pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); - - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); va_end(args); } @@ -691,10 +748,7 @@ print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + off = get_info_end(s); if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -714,6 +768,7 @@ { slab_bug(s, "%s", reason); print_trailer(s, page, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, @@ -728,11 +783,12 @@ slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static void init_object(struct kmem_cache *s, void *object, u8 val) { - u8 *p = object; + u8 *p = kasan_reset_tag(object); if (s->flags & SLAB_RED_ZONE) memset(p - s->red_left_pad, val, s->red_left_pad); @@ -759,9 +815,10 @@ { u8 *fault; u8 *end; + u8 *addr = page_address(page); metadata_access_enable(); - fault = memchr_inv(start, value, bytes); + fault = memchr_inv(kasan_reset_tag(start), value, bytes); metadata_access_disable(); if (!fault) return 1; @@ -771,9 +828,11 @@ end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault[0], value); + pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault - addr, + fault[0], value); print_trailer(s, page, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); restore_bytes(s, what, value, fault, end); return 0; @@ -785,7 +844,7 @@ * object address * Bytes of the object to be managed. * If the freepointer may overlay the object then the free - * pointer is the first word of the object. + * pointer is at the middle of the object. * * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) @@ -819,11 +878,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { - unsigned long off = s->inuse; /* The end of info */ - - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + unsigned long off = get_info_end(s); /* The end of info */ if (s->flags & SLAB_STORE_USER) /* We also have user information there */ @@ -852,7 +907,7 @@ return 1; start = page_address(page); - length = PAGE_SIZE << compound_order(page); + length = page_size(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -860,14 +915,15 @@ pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(pad, POISON_INUSE, remainder); + fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); + slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); @@ -909,7 +965,7 @@ check_pad_bytes(s, page, p); } - if (!s->offset && val == SLUB_RED_ACTIVE) + if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. 
@@ -1038,7 +1094,7 @@ return; lockdep_assert_held(&n->list_lock); - list_add(&page->lru, &n->full); + list_add(&page->slab_list, &n->full); } static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -1047,7 +1103,7 @@ return; lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); } /* Tracking of the number of slabs for debugging purposes */ @@ -1090,26 +1146,26 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { - if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) return; init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +static +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) { - if (!(s->flags & SLAB_POISON)) + if (!kmem_cache_debug_flags(s, SLAB_POISON)) return; metadata_access_enable(); - memset(addr, POISON_INUSE, PAGE_SIZE << order); + memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page)); metadata_access_disable(); } static inline int alloc_consistency_checks(struct kmem_cache *s, - struct page *page, - void *object, unsigned long addr) + struct page *page, void *object) { if (!check_slab(s, page)) return 0; @@ -1130,7 +1186,7 @@ void *object, unsigned long addr) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!alloc_consistency_checks(s, page, object, addr)) + if (!alloc_consistency_checks(s, page, object)) goto bad; } @@ -1196,10 +1252,10 @@ struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long uninitialized_var(flags); + unsigned long flags; int ret = 0; - raw_spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -1234,75 +1290,144 @@ bulk_cnt, cnt); slab_unlock(page); - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); return ret; } -static int __init setup_slub_debug(char *str) +/* + * Parse a block of slub_debug options. Blocks are delimited by ';' + * + * @str: start of block + * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified + * @slabs: return start of list of slabs, or NULL when there's no list + * @init: assume this is initial parsing and not per-kmem-create parsing + * + * returns the start of next block if there's any, or NULL + */ +static char * +parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) { - slub_debug = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) - /* - * No options specified. Switch on full debugging. - */ - goto out; + bool higher_order_disable = false; - if (*str == ',') + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (*str == ',') { /* * No options but restriction on slabs. This means full * debugging for slabs matching a pattern. */ + *flags = DEBUG_DEFAULT_FLAGS; goto check_slabs; + } + *flags = 0; - slub_debug = 0; - if (*str == '-') - /* - * Switch off all debugging measures. 
- */ - goto out; - - /* - * Determine which debug features should be switched on - */ - for (; *str && *str != ','; str++) { + /* Determine which debug features should be switched on */ + for (; *str && *str != ',' && *str != ';'; str++) { switch (tolower(*str)) { + case '-': + *flags = 0; + break; case 'f': - slub_debug |= SLAB_CONSISTENCY_CHECKS; + *flags |= SLAB_CONSISTENCY_CHECKS; break; case 'z': - slub_debug |= SLAB_RED_ZONE; + *flags |= SLAB_RED_ZONE; break; case 'p': - slub_debug |= SLAB_POISON; + *flags |= SLAB_POISON; break; case 'u': - slub_debug |= SLAB_STORE_USER; + *flags |= SLAB_STORE_USER; break; case 't': - slub_debug |= SLAB_TRACE; + *flags |= SLAB_TRACE; break; case 'a': - slub_debug |= SLAB_FAILSLAB; + *flags |= SLAB_FAILSLAB; break; case 'o': /* * Avoid enabling debugging on caches if its minimum * order would increase as a result. */ - disable_higher_order_debug = 1; + higher_order_disable = true; break; default: - pr_err("slub_debug option '%c' unknown. skipped\n", - *str); + if (init) + pr_err("slub_debug option '%c' unknown. skipped\n", *str); + } + } +check_slabs: + if (*str == ',') + *slabs = ++str; + else + *slabs = NULL; + + /* Skip over the slab list */ + while (*str && *str != ';') + str++; + + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (init && higher_order_disable) + disable_higher_order_debug = 1; + + if (*str) + return str; + else + return NULL; +} + +static int __init setup_slub_debug(char *str) +{ + slab_flags_t flags; + slab_flags_t global_flags; + char *saved_str; + char *slab_list; + bool global_slub_debug_changed = false; + bool slab_list_specified = false; + + global_flags = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + saved_str = str; + while (str) { + str = parse_slub_debug_flags(str, &flags, &slab_list, true); + + if (!slab_list) { + global_flags = flags; + global_slub_debug_changed = true; + } else { + slab_list_specified = true; } } -check_slabs: - if (*str == ',') - slub_debug_slabs = str + 1; + /* + * For backwards compatibility, a single list of flags with list of + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as + * long as there is no option specifying flags without a slab list. + */ + if (slab_list_specified) { + if (!global_slub_debug_changed) + global_flags = slub_debug; + slub_debug_string = saved_str; + } out: + slub_debug = global_flags; + if (slub_debug != 0 || slub_debug_string) + static_branch_enable(&slub_debug_enabled); if ((static_branch_unlikely(&init_on_alloc) || static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) @@ -1312,24 +1437,65 @@ __setup("slub_debug", setup_slub_debug); +/* + * kmem_cache_flags - apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * then only the select slabs will receive the debug option(s). 
+ */ slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) - flags |= slub_debug; + char *iter; + size_t len; + char *next_block; + slab_flags_t block_flags; - return flags; + len = strlen(name); + next_block = slub_debug_string; + /* Go through all blocks of debug options, see if any matches our slab's name */ + while (next_block) { + next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false); + if (!iter) + continue; + /* Found a block that has a slab list, search it */ + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchrnul(iter, ','); + if (next_block && next_block < end) + end = next_block - 1; + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= block_flags; + return flags; + } + + if (!*end || *end == ';') + break; + iter = end + 1; + } + } + + return flags | slub_debug; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} -static inline void setup_page_debug(struct kmem_cache *s, - void *addr, int order) {} +static inline +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1348,8 +1514,7 @@ static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } @@ -1373,12 +1538,6 @@ } #endif /* CONFIG_SLUB_DEBUG */ -struct slub_free_list { - raw_spinlock_t lock; - struct list_head list; -}; -static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); - /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. @@ -1386,6 +1545,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); return ptr; } @@ -1393,10 +1553,11 @@ static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x, _RET_IP_); + kasan_kfree_large(x); } -static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, + void *x, bool init) { kmemleak_free_recursive(x, s->flags); @@ -1417,8 +1578,30 @@ if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x, _RET_IP_); + /* Use KCSAN to help debug racy use-after-free. */ + if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + __kcsan_check_access(x, s->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. 
+ * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. + */ + if (init) { + int rsize; + + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + s->inuse, 0, + s->size - s->inuse - rsize); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return kasan_slab_free(s, x, init); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -1429,7 +1612,11 @@ void *object; void *next = *head; void *old_tail = *tail ? *tail : *head; - int rsize; + + if (is_kfence_address(next)) { + slab_free_hook(s, next, false); + return true; + } /* Head and tail of the reconstructed freelist */ *head = NULL; @@ -1439,20 +1626,8 @@ object = next; next = get_freepointer(s, object); - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch - * the redzone. - */ - memset(object, 0, s->object_size); - rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad - : 0; - memset((char *)object + s->inuse, 0, - s->size - s->inuse - rsize); - - } /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { + if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -1500,10 +1675,8 @@ else page = __alloc_pages_node(node, flags, order); - if (page && memcg_charge_slab(page, flags, order, s)) { - __free_pages(page, order); - page = NULL; - } + if (page) + account_slab_page(page, order, s); return page; } @@ -1623,19 +1796,12 @@ struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; - int idx, order; + int idx; bool shuffle; - bool enableirqs = false; flags &= gfp_allowed_mask; if (gfpflags_allow_blocking(flags)) - enableirqs = true; -#ifdef CONFIG_PREEMPT_RT_FULL - if (system_state > SYSTEM_BOOTING) - enableirqs = true; -#endif - if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1664,7 +1830,6 @@ page->objects = oo_objects(oo); - order = compound_order(page); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1674,7 +1839,7 @@ start = page_address(page); - setup_page_debug(s, start, order); + setup_page_debug(s, page, start); shuffle = shuffle_freelist(s, page); @@ -1695,15 +1860,10 @@ page->frozen = 1; out: - if (enableirqs) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; - - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); inc_slabs_node(s, page_to_nid(page), page->objects); @@ -1712,13 +1872,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; - flags &= ~GFP_SLAB_BUG_MASK; - pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). 
Fix your code!\n", - invalid_mask, &invalid_mask, flags, &flags); - dump_stack(); - } + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); @@ -1729,7 +1884,7 @@ int order = compound_order(page); int pages = 1 << order; - if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { void *p; slab_pad_check(s, page); @@ -1738,29 +1893,14 @@ check_object(s, page, p, SLUB_RED_INACTIVE); } - mod_lruvec_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -pages); - __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - memcg_uncharge_slab(page, order, s); + unaccount_slab_page(page, order, s); __free_pages(page, order); -} - -static void free_delayed(struct list_head *h) -{ - while (!list_empty(h)) { - struct page *page = list_first_entry(h, struct page, lru); - - list_del(&page->lru); - __free_slab(page->slab_cache, page); - } } static void rcu_free_slab(struct rcu_head *h) @@ -1774,12 +1914,6 @@ { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { call_rcu(&page->rcu_head, rcu_free_slab); - } else if (irqs_disabled()) { - struct slub_free_list *f = this_cpu_ptr(&slub_free_list); - - raw_spin_lock(&f->lock); - list_add(&page->lru, &f->list); - raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1798,9 +1932,9 @@ { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->lru, &n->partial); + list_add_tail(&page->slab_list, &n->partial); else - list_add(&page->lru, &n->partial); + list_add(&page->slab_list, &n->partial); } static inline void add_partial(struct kmem_cache_node *n, @@ -1814,7 +1948,7 @@ struct page *page) { lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); n->nr_partial--; } @@ -1881,14 +2015,14 @@ /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partials() + * partial slab and there is none available then get_partial() * will return NULL. 
*/ if (!n || !n->nr_partial) return NULL; - raw_spin_lock(&n->list_lock); - list_for_each_entry_safe(page, page2, &n->partial, lru) { + spin_lock(&n->list_lock); + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; if (!pfmemalloc_match(page, flags)) @@ -1912,7 +2046,7 @@ break; } - raw_spin_unlock(&n->list_lock); + spin_unlock(&n->list_lock); return object; } @@ -1926,7 +2060,7 @@ struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1955,7 +2089,7 @@ do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); @@ -1976,7 +2110,7 @@ } } } while (read_mems_allowed_retry(cpuset_mems_cookie)); -#endif +#endif /* CONFIG_NUMA */ return NULL; } @@ -1999,9 +2133,9 @@ return get_any_partial(s, flags, c); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. */ @@ -2019,6 +2153,7 @@ return tid + TID_STEP; } +#ifdef SLUB_DEBUG_CMPXCHG static inline unsigned int tid_to_cpu(unsigned long tid) { return tid % TID_STEP; @@ -2028,6 +2163,7 @@ { return tid / TID_STEP; } +#endif static inline unsigned int init_tid(int cpu) { @@ -2042,7 +2178,7 @@ pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); @@ -2160,46 +2296,37 @@ if (!lock) { lock = 1; /* - * Taking the spinlock removes the possiblity + * Taking the spinlock removes the possibility * that acquire_slab() will see a slab page that * is frozen */ - raw_spin_lock(&n->list_lock); + spin_lock(&n->list_lock); } } else { m = M_FULL; - if (kmem_cache_debug(s) && !lock) { +#ifdef CONFIG_SLUB_DEBUG + if ((s->flags & SLAB_STORE_USER) && !lock) { lock = 1; /* * This also ensures that the scanning of full * slabs from diagnostic functions will not see * any frozen slabs. 
*/ - raw_spin_lock(&n->list_lock); + spin_lock(&n->list_lock); } +#endif } if (l != m) { - if (l == M_PARTIAL) - remove_partial(n, page); - else if (l == M_FULL) - remove_full(s, n, page); - if (m == M_PARTIAL) { - + if (m == M_PARTIAL) add_partial(n, page, tail); - stat(s, tail); - - } else if (m == M_FULL) { - - stat(s, DEACTIVATE_FULL); + else if (m == M_FULL) add_full(s, n, page); - - } } l = m; @@ -2210,9 +2337,13 @@ goto redo; if (lock) - raw_spin_unlock(&n->list_lock); + spin_unlock(&n->list_lock); - if (m == M_FREE) { + if (m == M_PARTIAL) + stat(s, tail); + else if (m == M_FULL) + stat(s, DEACTIVATE_FULL); + else if (m == M_FREE) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, page); stat(s, FREE_SLAB); @@ -2220,6 +2351,7 @@ c->page = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); } /* @@ -2236,19 +2368,19 @@ struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; - while ((page = c->partial)) { + while ((page = slub_percpu_partial(c))) { struct page new; struct page old; - c->partial = page->next; + slub_set_percpu_partial(c, page); n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - raw_spin_unlock(&n->list_lock); + spin_unlock(&n->list_lock); n = n2; - raw_spin_lock(&n->list_lock); + spin_lock(&n->list_lock); } do { @@ -2277,7 +2409,7 @@ } if (n) - raw_spin_unlock(&n->list_lock); + spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -2287,12 +2419,12 @@ discard_slab(s, page); stat(s, FREE_SLAB); } -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } /* - * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. + * Put a page that was just frozen (in __slab_free|get_partial_node) into a + * partial page slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. @@ -2313,22 +2445,15 @@ if (oldpage) { pobjects = oldpage->pobjects; pages = oldpage->pages; - if (drain && pobjects > s->cpu_partial) { - struct slub_free_list *f; + if (drain && pobjects > slub_cpu_partial(s)) { unsigned long flags; - LIST_HEAD(tofree); /* * partial array is full. Move the existing * set to the per node partial list. 
*/ local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - f = this_cpu_ptr(&slub_free_list); - raw_spin_lock(&f->lock); - list_splice_init(&f->list, &tofree); - raw_spin_unlock(&f->lock); local_irq_restore(flags); - free_delayed(&tofree); oldpage = NULL; pobjects = 0; pages = 0; @@ -2345,7 +2470,7 @@ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - if (unlikely(!s->cpu_partial)) { + if (unlikely(!slub_cpu_partial(s))) { unsigned long flags; local_irq_save(flags); @@ -2353,15 +2478,13 @@ local_irq_restore(flags); } preempt_enable(); -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); deactivate_slab(s, c->page, c->freelist, c); - - c->tid = next_tid(c->tid); } /* @@ -2373,12 +2496,10 @@ { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c)) { - if (c->page) - flush_slab(s, c); + if (c->page) + flush_slab(s, c); - unfreeze_partials(s, c); - } + unfreeze_partials(s, c); } static void flush_cpu_slab(void *d) @@ -2398,19 +2519,7 @@ static void flush_all(struct kmem_cache *s) { - LIST_HEAD(tofree); - int cpu; - - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); - for_each_online_cpu(cpu) { - struct slub_free_list *f; - - f = &per_cpu(slub_free_list, cpu); - raw_spin_lock_irq(&f->lock); - list_splice_init(&f->list, &tofree); - raw_spin_unlock_irq(&f->lock); - free_delayed(&tofree); - } + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); } /* @@ -2439,7 +2548,7 @@ static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) + if (node != NUMA_NO_NODE && page_to_nid(page) != node) return 0; #endif return 1; @@ -2465,10 +2574,10 @@ unsigned long x = 0; struct page *page; - raw_spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) x += get_count(page); - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); return x; } #endif /* CONFIG_SLUB_DEBUG || CONFIG_SLUB_SYSFS */ @@ -2540,8 +2649,7 @@ stat(s, ALLOC_SLAB); c->page = page; *pc = c; - } else - freelist = NULL; + } return freelist; } @@ -2608,12 +2716,12 @@ * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, - struct list_head *to_free) + unsigned long addr, struct kmem_cache_cpu *c) { - struct slub_free_list *f; void *freelist; struct page *page; + + stat(s, ALLOC_SLOWPATH); page = c->page; if (!page) { @@ -2662,6 +2770,7 @@ if (!freelist) { c->page = NULL; + c->tid = next_tid(c->tid); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2677,13 +2786,6 @@ VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - -out: - f = this_cpu_ptr(&slub_free_list); - raw_spin_lock(&f->lock); - list_splice_init(&f->list, to_free); - raw_spin_unlock(&f->lock); - return freelist; new_slab: @@ -2699,7 +2801,7 @@ if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - goto out; + return NULL; } page = c->page; @@ -2712,7 +2814,7 @@ goto new_slab; /* Slab failed checks. 
Next slab needed */ deactivate_slab(s, page, get_freepointer(s, freelist), c); - goto out; + return freelist; } /* @@ -2724,10 +2826,9 @@ { void *p; unsigned long flags; - LIST_HEAD(tofree); local_irq_save(flags); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * We may have been preempted and rescheduled on a different * cpu before disabling interrupts. Need to reload cpu area @@ -2736,9 +2837,8 @@ c = this_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); + p = ___slab_alloc(s, gfpflags, node, addr, c); local_irq_restore(flags); - free_delayed(&tofree); return p; } @@ -2750,7 +2850,8 @@ void *obj) { if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)obj + s->offset), 0, sizeof(void *)); + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); } /* @@ -2764,16 +2865,23 @@ * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr) + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; + struct obj_cgroup *objcg = NULL; + bool init = false; - s = slab_pre_alloc_hook(s, gfpflags); + s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2782,13 +2890,13 @@ * as we end up on the original cpu again when doing the cmpxchg. * * We should guarantee that tid and kmem_cache are retrieved on - * the same cpu. It could be different if CONFIG_PREEMPT so we need + * the same cpu. It could be different if CONFIG_PREEMPTION so we need * to check if it is matched or not. 
*/ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* @@ -2810,9 +2918,8 @@ object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) { + if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - stat(s, ALLOC_SLOWPATH); } else { void *next_object = get_freepointer_safe(s, object); @@ -2843,24 +2950,23 @@ } maybe_wipe_obj_freeptr(s, object); + init = slab_want_init_on_alloc(gfpflags, s); - if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) - memset(object, 0, s->object_size); - - slab_post_alloc_hook(s, gfpflags, 1, &object); +out: + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); return object; } static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr) + gfp_t gfpflags, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); @@ -2872,7 +2978,7 @@ #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -2883,7 +2989,7 @@ #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); trace_kmem_cache_alloc_node(_RET_IP_, ret, s->object_size, s->size, gfpflags, node); @@ -2897,7 +3003,7 @@ gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2907,7 +3013,7 @@ } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -#endif +#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -2927,9 +3033,12 @@ struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; - unsigned long uninitialized_var(flags); + unsigned long flags; stat(s, FREE_SLOWPATH); + + if (kfence_free(head)) + return; if (kmem_cache_debug(s) && !free_debug_processing(s, page, head, tail, cnt, addr)) @@ -2937,7 +3046,7 @@ do { if (unlikely(n)) { - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; @@ -2969,7 +3078,7 @@ * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - raw_spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); } } @@ -2981,20 +3090,21 @@ if (likely(!n)) { - /* - * If we just froze the page then put it onto the - * per cpu partial list. - */ - if (new.frozen && !was_frozen) { + if (likely(was_frozen)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. 
+ */ + stat(s, FREE_FROZEN); + } else if (new.frozen) { + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ put_cpu_partial(s, page, 1); stat(s, CPU_PARTIAL_FREE); } - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - if (was_frozen) - stat(s, FREE_FROZEN); + return; } @@ -3006,12 +3116,11 @@ * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - if (kmem_cache_debug(s)) - remove_full(s, n, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -3026,7 +3135,7 @@ remove_full(s, n, page); } - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -3053,6 +3162,10 @@ void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + + /* memcg_slab_free_hook() is already called for bulk free. */ + if (!tail) + memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. @@ -3063,7 +3176,7 @@ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ @@ -3173,6 +3286,13 @@ df->s = cache_from_obj(s, object); /* Support for memcg */ } + if (is_kfence_address(object)) { + slab_free_hook(df->s, object, false); + __kfence_free(object); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Start new detached freelist */ df->page = page; set_freepointer(df->s, object, NULL); @@ -3214,6 +3334,7 @@ if (WARN_ON(!size)) return; + memcg_slab_free_hook(s, p, size); do { struct detached_freelist df; @@ -3231,11 +3352,11 @@ void **p) { struct kmem_cache_cpu *c; - LIST_HEAD(to_free); int i; + struct obj_cgroup *objcg = NULL; /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); + s = slab_pre_alloc_hook(s, &objcg, size, flags); if (unlikely(!s)) return false; /* @@ -3247,8 +3368,14 @@ c = this_cpu_ptr(s->cpu_slab); for (i = 0; i < size; i++) { - void *object = c->freelist; + void *object = kfence_alloc(s, s->object_size, flags); + if (unlikely(object)) { + p[i] = object; + continue; + } + + object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -3264,7 +3391,7 @@ * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c, &to_free); + _RET_IP_, c); if (unlikely(!p[i])) goto error; @@ -3279,23 +3406,17 @@ } c->tid = next_tid(c->tid); local_irq_enable(); - free_delayed(&to_free); - /* Clear memory outside IRQ disabled fastpath loop */ - if (unlikely(slab_want_init_on_alloc(flags, s))) { - int j; - - for (j = 0; j < i; j++) - memset(p[j], 0, s->object_size); - } - - /* memcg and kmem_cache debug support */ - slab_post_alloc_hook(s, flags, size, p); + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. 
+ */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); return i; error: local_irq_enable(); - free_delayed(&to_free); - slab_post_alloc_hook(s, flags, i, p); + slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; } @@ -3430,7 +3551,7 @@ init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - raw_spin_lock_init(&n->list_lock); + spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3491,8 +3612,7 @@ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse = 1; page->frozen = 0; @@ -3580,15 +3700,15 @@ * 50% to keep some capacity around for frees. */ if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; + slub_set_cpu_partial(s, 2); else if (s->size >= 1024) - s->cpu_partial = 6; + slub_set_cpu_partial(s, 6); else if (s->size >= 256) - s->cpu_partial = 13; + slub_set_cpu_partial(s, 13); else - s->cpu_partial = 30; + slub_set_cpu_partial(s, 30); #endif } @@ -3633,22 +3753,36 @@ /* * With that we have determined the number of bytes in actual use - * by the object. This is the potential offset to the free pointer. + * by the object and redzoning. */ s->inuse = size; - if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - s->ctor)) { + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || + s->ctor) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * * This is the case if we do RCU, have a constructor or - * destructor or are poisoning the objects. + * destructor, are poisoning the objects, or are + * redzoning an object smaller than sizeof(void *). + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified. */ s->offset = size; size += sizeof(void *); + } else { + /* + * Store freelist pointer near middle of object to keep + * it away from the edges of the object to avoid small + * sized over/underflows from neighboring allocations. 
+ */ + s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG @@ -3685,6 +3819,7 @@ */ size = ALIGN(size, s->align); s->size = size; + s->reciprocal_size = reciprocal_value(size); if (forced_order >= 0) order = forced_order; else @@ -3719,7 +3854,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { - s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -3770,45 +3905,32 @@ if (alloc_kmem_cache_cpus(s)) return 0; - free_kmem_cache_nodes(s); error: - if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", - s->name, s->size, s->size, - oo_order(s->oo), s->offset, (unsigned long)flags); + __kmem_cache_release(s); return -EINVAL; } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text) { #ifdef CONFIG_SLUB_DEBUG -#ifdef CONFIG_PREEMPT_RT_BASE - /* XXX move out of irq-off section */ - slab_err(s, page, text, s->name); -#else - void *addr = page_address(page); + unsigned long *map; void *p; - unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects), - sizeof(long), - GFP_ATOMIC); - if (!map) - return; + slab_err(s, page, text, s->name); slab_lock(page); - get_map(s, page, map); + map = get_map(s, page); for_each_object(p, s, addr, page->objects) { - if (!test_bit(slab_index(p, s, addr), map)) { + if (!test_bit(__obj_to_index(s, addr, p), map)) { pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } + put_map(map); slab_unlock(page); - kfree(map); -#endif #endif } @@ -3823,19 +3945,19 @@ struct page *page, *h; BUG_ON(irqs_disabled()); - raw_spin_lock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &n->partial, lru) { + spin_lock_irq(&n->list_lock); + list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); - list_add(&page->lru, &discard); + list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } - raw_spin_unlock_irq(&n->list_lock); + spin_unlock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &discard, lru) + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -3865,7 +3987,6 @@ if (n->nr_partial || slabs_node(s, node)) return 1; } - sysfs_slab_remove(s); return 0; } @@ -3914,7 +4035,7 @@ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_, size); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -3929,11 +4050,15 @@ { struct page *page; void *ptr = NULL; + unsigned int order = get_order(size); flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, get_order(size)); - if (page) + page = alloc_pages_node(node, flags, order); + if (page) { ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + } return kmalloc_large_node_hook(ptr, size, flags); } @@ -3958,7 +4083,7 @@ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_, size); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -3967,7 +4092,7 @@ return ret; } EXPORT_SYMBOL(__kmalloc_node); -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY 
/* @@ -3984,6 +4109,7 @@ struct kmem_cache *s; unsigned int offset; size_t object_size; + bool is_kfence = is_kfence_address(ptr); ptr = kasan_reset_tag(ptr); @@ -3996,10 +4122,13 @@ to_user, 0, n); /* Find offset within object. */ - offset = (ptr - page_address(page)) % s->size; + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - page_address(page)) % s->size; /* Adjust for redzone and reject if within the redzone. */ - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { if (offset < s->red_left_pad) usercopy_abort("SLUB object in left red zone", s->name, to_user, offset, n); @@ -4029,7 +4158,7 @@ } #endif /* CONFIG_HARDENED_USERCOPY */ -static size_t __ksize(const void *object) +size_t __ksize(const void *object) { struct page *page; @@ -4040,22 +4169,12 @@ if (unlikely(!PageSlab(page))) { WARN_ON(!PageCompound(page)); - return PAGE_SIZE << compound_order(page); + return page_size(page); } return slab_ksize(page->slab_cache); } - -size_t ksize(const void *object) -{ - size_t size = __ksize(object); - /* We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. - */ - kasan_unpoison_shadow(object, size); - return size; -} -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(__ksize); void kfree(const void *x) { @@ -4069,9 +4188,13 @@ page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + unsigned int order = compound_order(page); + BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_pages(page, compound_order(page)); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(page, order); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); @@ -4107,7 +4230,7 @@ for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); - raw_spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); /* * Build lists of slabs to discard or promote. @@ -4115,7 +4238,7 @@ * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ - list_for_each_entry_safe(page, t, &n->partial, lru) { + list_for_each_entry_safe(page, t, &n->partial, slab_list) { int free = page->objects - page->inuse; /* Do not reread page->inuse */ @@ -4125,10 +4248,10 @@ BUG_ON(free <= 0); if (free == page->objects) { - list_move(&page->lru, &discard); + list_move(&page->slab_list, &discard); n->nr_partial--; } else if (free <= SHRINK_PROMOTE_MAX) - list_move(&page->lru, promote + free - 1); + list_move(&page->slab_list, promote + free - 1); } /* @@ -4138,10 +4261,10 @@ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) list_splice(promote + i, &n->partial); - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, &discard, lru) + list_for_each_entry_safe(page, t, &discard, slab_list) discard_slab(s, page); if (slabs_node(s, node)) @@ -4150,42 +4273,6 @@ return ret; } - -#ifdef CONFIG_MEMCG -static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) -{ - /* - * Called with all the locks held after a sched RCU grace period. - * Even if @s becomes empty after shrinking, we can't know that @s - * doesn't have allocations already in-flight and thus can't - * destroy @s until the associated memcg is released. - * - * However, let's remove the sysfs files for empty caches here. 
- * Each cache has a lot of interface files which aren't - * particularly useful for empty draining caches; otherwise, we can - * easily end up with millions of unnecessary sysfs files on - * systems which have a lot of memory and transient cgroups. - */ - if (!__kmem_cache_shrink(s)) - sysfs_slab_remove(s); -} - -void __kmemcg_cache_deactivate(struct kmem_cache *s) -{ - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - slub_set_cpu_partial(s, 0); - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), so - * we have to make sure the change is visible before shrinking. - */ - slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); -} -#endif static int slab_mem_going_offline_callback(void *arg) { @@ -4333,17 +4420,15 @@ for_each_kmem_cache_node(s, node, n) { struct page *p; - list_for_each_entry(p, &n->partial, lru) + list_for_each_entry(p, &n->partial, slab_list) p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) + list_for_each_entry(p, &n->full, slab_list) p->slab_cache = s; #endif } - slab_init_memcg_params(s); list_add(&s->list, &slab_caches); - memcg_link_cache(s); return s; } @@ -4351,12 +4436,6 @@ { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; - int cpu; - - for_each_possible_cpu(cpu) { - raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); - INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); - } if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4390,7 +4469,7 @@ cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4404,7 +4483,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - struct kmem_cache *s, *c; + struct kmem_cache *s; s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -4416,11 +4495,6 @@ */ s->object_size = max(s->object_size, size); s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); - - for_each_memcg_cache(c, s) { - c->object_size = s->object_size; - c->inuse = max(c->inuse, ALIGN(size, sizeof(void *))); - } if (sysfs_slab_alias(s, name)) { s->refcount--; @@ -4443,12 +4517,16 @@ if (slab_state <= UP) return 0; - memcg_propagate_slab_attrs(s); err = sysfs_slab_add(s); - if (err) + if (err) { __kmem_cache_release(s); + return err; + } - return err; + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; } void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) @@ -4464,7 +4542,7 @@ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, caller); + ret = slab_alloc(s, gfpflags, caller, size); /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4495,7 +4573,7 @@ if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller, size); /* Honor the call site pointer we received. 
*/ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); @@ -4518,52 +4596,42 @@ #endif #ifdef CONFIG_SLUB_DEBUG -static int validate_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) +static void validate_slab(struct kmem_cache *s, struct page *page) { void *p; void *addr = page_address(page); + unsigned long *map; - if (!check_slab(s, page) || - !on_freelist(s, page, NULL)) - return 0; + slab_lock(page); + + if (!check_slab(s, page) || !on_freelist(s, page, NULL)) + goto unlock; /* Now we know that a valid freelist exists */ - bitmap_zero(map, page->objects); - - get_map(s, page, map); + map = get_map(s, page); for_each_object(p, s, addr, page->objects) { - if (test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_INACTIVE)) - return 0; + u8 val = test_bit(__obj_to_index(s, addr, p), map) ? + SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; + + if (!check_object(s, page, p, val)) + break; } - - for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_ACTIVE)) - return 0; - return 1; -} - -static void validate_slab_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) -{ - slab_lock(page); - validate_slab(s, page, map); + put_map(map); +unlock: slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, - struct kmem_cache_node *n, unsigned long *map) + struct kmem_cache_node *n) { unsigned long count = 0; struct page *page; unsigned long flags; - raw_spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) { - validate_slab_slab(s, page, map); + list_for_each_entry(page, &n->partial, slab_list) { + validate_slab(s, page); count++; } if (count != n->nr_partial) @@ -4573,8 +4641,8 @@ if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, lru) { - validate_slab_slab(s, page, map); + list_for_each_entry(page, &n->full, slab_list) { + validate_slab(s, page); count++; } if (count != atomic_long_read(&n->nr_slabs)) @@ -4582,7 +4650,7 @@ s->name, count, atomic_long_read(&n->nr_slabs)); out: - raw_spin_unlock_irqrestore(&n->list_lock, flags); + spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4590,20 +4658,16 @@ { int node; unsigned long count = 0; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); struct kmem_cache_node *n; - - if (!map) - return -ENOMEM; flush_all(s); for_each_kmem_cache_node(s, node, n) - count += validate_slab_node(s, n, map); - kfree(map); + count += validate_slab_node(s, n); + return count; } + +#ifdef CONFIG_DEBUG_FS /* * Generate lists of code addresses where slabcache objects are allocated * and freed. 
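
As an aside, both the reworked validate_slab() above and the __fill_map() helper depend on mapping an object pointer to its ordinal slot within the slab page, which is then used as a bit index into the object map. A minimal sketch of that mapping follows; it assumes the in-tree __obj_to_index() performs the same arithmetic while additionally stripping KASAN tags and dividing via a precomputed reciprocal, so treat it as an illustration rather than the actual helper.

/*
 * Sketch only, not part of the patch: compute the bit index used by
 * __fill_map()/validate_slab() for a given object. 'slab_base' is the
 * address returned by page_address(page) for the slab page; s->size is
 * the full per-object stride including any red zones, so plain integer
 * division lands on the object's ordinal position.
 */
static inline unsigned int obj_to_index_sketch(const struct kmem_cache *s,
					       const void *slab_base,
					       const void *obj)
{
	return ((unsigned long)obj - (unsigned long)slab_base) / s->size;
}
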
@@ -4625,7 +4689,10 @@ unsigned long max; unsigned long count; struct location *loc; + loff_t idx; }; + +static struct dentry *slab_debugfs_root; static void free_loc_track(struct loc_track *t) { @@ -4638,9 +4705,6 @@ { struct location *l; int order; - - if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC) - return 0; order = get_order(sizeof(struct location) * max); @@ -4735,105 +4799,19 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, struct page *page, enum track_item alloc, - unsigned long *map) + unsigned long *obj_map) { void *addr = page_address(page); void *p; - bitmap_zero(map, page->objects); - get_map(s, page, map); + __fill_map(obj_map, s, page); for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) + if (!test_bit(__obj_to_index(s, addr, p), obj_map)) add_location(t, s, get_track(s, p, alloc)); } - -static int list_locations(struct kmem_cache *s, char *buf, - enum track_item alloc) -{ - int len = 0; - unsigned long i; - struct loc_track t = { 0, 0, NULL }; - int node; - unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)), - sizeof(unsigned long), - GFP_KERNEL); - struct kmem_cache_node *n; - - if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_KERNEL)) { - kfree(map); - return sprintf(buf, "Out of memory\n"); - } - /* Push back cpu slabs */ - flush_all(s); - - for_each_kmem_cache_node(s, node, n) { - unsigned long flags; - struct page *page; - - if (!atomic_long_read(&n->nr_slabs)) - continue; - - raw_spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc, map); - list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc, map); - raw_spin_unlock_irqrestore(&n->list_lock, flags); - } - - for (i = 0; i < t.count; i++) { - struct location *l = &t.loc[i]; - - if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) - break; - len += sprintf(buf + len, "%7ld ", l->count); - - if (l->addr) - len += sprintf(buf + len, "%pS", (void *)l->addr); - else - len += sprintf(buf + len, "<not-available>"); - - if (l->sum_time != l->min_time) { - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - (long)div_u64(l->sum_time, l->count), - l->max_time); - } else - len += sprintf(buf + len, " age=%ld", - l->min_time); - - if (l->min_pid != l->max_pid) - len += sprintf(buf + len, " pid=%ld-%ld", - l->min_pid, l->max_pid); - else - len += sprintf(buf + len, " pid=%ld", - l->min_pid); - - if (num_online_cpus() > 1 && - !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " cpus=%*pbl", - cpumask_pr_args(to_cpumask(l->cpus))); - - if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " nodes=%*pbl", - nodemask_pr_args(&l->nodes)); - - len += sprintf(buf + len, "\n"); - } - - free_loc_track(&t); - kfree(map); - if (!t.count) - len += sprintf(buf, "No data\n"); - return len; -} -#endif +#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_SLUB_DEBUG */ #ifdef SLUB_RESILIENCY_TEST static void __init resiliency_test(void) @@ -4893,7 +4871,7 @@ #ifdef CONFIG_SLUB_SYSFS static void resiliency_test(void) {}; #endif -#endif +#endif /* SLUB_RESILIENCY_TEST */ #ifdef CONFIG_SLUB_SYSFS enum slab_stat_type { @@ -5032,20 +5010,6 @@ return x + sprintf(buf + x, "\n"); } -#ifdef CONFIG_SLUB_DEBUG -static int any_slab_objects(struct kmem_cache *s) -{ - int node; - struct kmem_cache_node *n; - - 
for_each_kmem_cache_node(s, node, n) - if (atomic_long_read(&n->total_objects)) - return 1; - - return 0; -} -#endif - #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj) @@ -5087,28 +5051,11 @@ } SLAB_ATTR_RO(objs_per_slab); -static ssize_t order_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - unsigned int order; - int err; - - err = kstrtouint(buf, 10, &order); - if (err) - return err; - - if (order > slub_max_order || order < slub_min_order) - return -EINVAL; - - calculate_sizes(s, order); - return length; -} - static ssize_t order_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%u\n", oo_order(s->oo)); } -SLAB_ATTR(order); +SLAB_ATTR_RO(order); static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { @@ -5230,16 +5177,7 @@ { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } - -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; -} -SLAB_ATTR(reclaim_account); +SLAB_ATTR_RO(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { @@ -5284,104 +5222,34 @@ { return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } - -static ssize_t sanity_checks_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_CONSISTENCY_CHECKS; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_CONSISTENCY_CHECKS; - } - return length; -} -SLAB_ATTR(sanity_checks); +SLAB_ATTR_RO(sanity_checks); static ssize_t trace_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } - -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - /* - * Tracing a merged cache is going to give confusing results - * as well as cause other issues like converting a mergeable - * cache into an umergeable one. 
- */ - if (s->refcount > 1) - return -EINVAL; - - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_TRACE; - } - return length; -} -SLAB_ATTR(trace); +SLAB_ATTR_RO(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); } -static ssize_t red_zone_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') { - s->flags |= SLAB_RED_ZONE; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(red_zone); +SLAB_ATTR_RO(red_zone); static ssize_t poison_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); } -static ssize_t poison_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_POISON; - if (buf[0] == '1') { - s->flags |= SLAB_POISON; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(poison); +SLAB_ATTR_RO(poison); static ssize_t store_user_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); } -static ssize_t store_user_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_STORE_USER; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(store_user); +SLAB_ATTR_RO(store_user); static ssize_t validate_show(struct kmem_cache *s, char *buf) { @@ -5402,21 +5270,6 @@ } SLAB_ATTR(validate); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); #endif /* CONFIG_SLUB_DEBUG */ #ifdef CONFIG_FAILSLAB @@ -5424,19 +5277,7 @@ { return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } - -static ssize_t failslab_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - if (s->refcount > 1) - return -EINVAL; - - s->flags &= ~SLAB_FAILSLAB; - if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; - return length; -} -SLAB_ATTR(failslab); +SLAB_ATTR_RO(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) @@ -5559,7 +5400,7 @@ STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); -#endif +#endif /* CONFIG_SLUB_STATS */ static struct attribute *slab_attrs[] = { &slab_size_attr.attr, @@ -5589,8 +5430,6 @@ &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &alloc_calls_attr.attr, - &free_calls_attr.attr, #endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, @@ -5672,96 +5511,7 @@ return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG - if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - struct kmem_cache *c; - - mutex_lock(&slab_mutex); - if (s->max_attr_size < len) - s->max_attr_size = len; - - /* - * This is a best effort propagation, so this function's return - * value will be determined by the parent cache only. 
This is - * basically because not all attributes will have a well - * defined semantics for rollbacks - most of the actions will - * have permanent effects. - * - * Returning the error value of any of the children that fail - * is not 100 % defined, in the sense that users seeing the - * error code won't be able to know anything about the state of - * the cache. - * - * Only returning the error code for the parent cache at least - * has well defined semantics. The cache being written to - * directly either failed or succeeded, in which case we loop - * through the descendants with best-effort propagation. - */ - for_each_memcg_cache(c, s) - attribute->store(c, buf, len); - mutex_unlock(&slab_mutex); - } -#endif return err; -} - -static void memcg_propagate_slab_attrs(struct kmem_cache *s) -{ -#ifdef CONFIG_MEMCG - int i; - char *buffer = NULL; - struct kmem_cache *root_cache; - - if (is_root_cache(s)) - return; - - root_cache = s->memcg_params.root_cache; - - /* - * This mean this cache had no attribute written. Therefore, no point - * in copying default values around - */ - if (!root_cache->max_attr_size) - return; - - for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { - char mbuf[64]; - char *buf; - struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); - ssize_t len; - - if (!attr || !attr->store || !attr->show) - continue; - - /* - * It is really bad that we have to allocate here, so we will - * do it only as a fallback. If we actually allocate, though, - * we can just use the allocated buffer until the end. - * - * Most of the slub attributes will tend to be very small in - * size, but sysfs allows buffers up to a page, so they can - * theoretically happen. - */ - if (buffer) - buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && - !IS_ENABLED(CONFIG_SLUB_STATS)) - buf = mbuf; - else { - buffer = (char *) get_zeroed_page(GFP_KERNEL); - if (WARN_ON(!buffer)) - continue; - buf = buffer; - } - - len = attr->show(root_cache, buf); - if (len > 0) - attr->store(s, buf, len); - } - - if (buffer) - free_page((unsigned long)buffer); -#endif } static void kmem_cache_release(struct kobject *k) @@ -5779,27 +5529,10 @@ .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG - if (!is_root_cache(s)) - return s->memcg_params.root_cache->memcg_kset; -#endif return slab_kset; } @@ -5814,7 +5547,8 @@ char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5842,36 +5576,12 @@ return name; } -static void sysfs_slab_remove_workfn(struct work_struct *work) -{ - struct kmem_cache *s = - container_of(work, struct kmem_cache, kobj_remove_work); - - if (!s->kobj.state_in_sysfs) - /* - * For a memcg cache, this may be called during - * deactivation and again on shutdown. Remove only once. - * A cache is never shut down before deactivation is - * complete, so no need to worry about synchronization. 
- */ - goto out; - -#ifdef CONFIG_MEMCG - kset_unregister(s->memcg_kset); -#endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); -out: - kobject_put(&s->kobj); -} - static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); - - INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn); if (!kset) { kobject_init(&s->kobj, &slab_ktype); @@ -5896,6 +5606,8 @@ * for the symlinks. */ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; @@ -5907,17 +5619,6 @@ if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG - if (is_root_cache(s) && memcg_sysfs_enabled) { - s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); - if (!s->memcg_kset) { - err = -ENOMEM; - goto out_del_kobj; - } - } -#endif - - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5929,19 +5630,6 @@ out_del_kobj: kobject_del(&s->kobj); goto out; -} - -static void sysfs_slab_remove(struct kmem_cache *s) -{ - if (slab_state < FULL) - /* - * Sysfs has not been setup yet so no need to remove the - * cache from sysfs. - */ - return; - - kobject_get(&s->kobj); - schedule_work(&s->kobj_remove_work); } void sysfs_slab_unlink(struct kmem_cache *s) @@ -5998,7 +5686,7 @@ mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); @@ -6033,6 +5721,189 @@ __initcall(slab_sysfs_init); #endif /* CONFIG_SLUB_SYSFS */ +#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) +static int slab_debugfs_show(struct seq_file *seq, void *v) +{ + struct loc_track *t = seq->private; + struct location *l; + unsigned long idx; + + idx = (unsigned long) t->idx; + if (idx < t->count) { + l = &t->loc[idx]; + + seq_printf(seq, "%7ld ", l->count); + + if (l->addr) + seq_printf(seq, "%pS", (void *)l->addr); + else + seq_puts(seq, "<not-available>"); + + if (l->sum_time != l->min_time) { + seq_printf(seq, " age=%ld/%llu/%ld", + l->min_time, div_u64(l->sum_time, l->count), + l->max_time); + } else + seq_printf(seq, " age=%ld", l->min_time); + + if (l->min_pid != l->max_pid) + seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid); + else + seq_printf(seq, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus))) + seq_printf(seq, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + seq_printf(seq, " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + + seq_puts(seq, "\n"); + } + + if (!idx && !t->count) + seq_puts(seq, "No data\n"); + + return 0; +} + +static void slab_debugfs_stop(struct seq_file *seq, void *v) +{ +} + +static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = ++(*ppos); + if (*ppos <= t->count) + return ppos; + + return NULL; +} + +static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = *ppos; + return ppos; +} + +static const struct seq_operations slab_debugfs_sops = { + .start = slab_debugfs_start, + .next = slab_debugfs_next, + .stop = slab_debugfs_stop, + .show = slab_debugfs_show, +}; + +static int slab_debug_trace_open(struct inode *inode, struct file *filep) +{ + + struct kmem_cache_node *n; + enum track_item alloc; + int node; + 
struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, + sizeof(struct loc_track)); + struct kmem_cache *s = file_inode(filep)->i_private; + unsigned long *obj_map; + + if (!t) + return -ENOMEM; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) { + seq_release_private(inode, filep); + return -ENOMEM; + } + + if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) + alloc = TRACK_ALLOC; + else + alloc = TRACK_FREE; + + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { + bitmap_free(obj_map); + seq_release_private(inode, filep); + return -ENOMEM; + } + + /* Push back cpu slabs */ + flush_all(s); + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct page *page; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + process_slab(t, s, page, alloc, obj_map); + list_for_each_entry(page, &n->full, slab_list) + process_slab(t, s, page, alloc, obj_map); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + bitmap_free(obj_map); + return 0; +} + +static int slab_debug_trace_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct loc_track *t = seq->private; + + free_loc_track(t); + return seq_release_private(inode, file); +} + +static const struct file_operations slab_debugfs_fops = { + .open = slab_debug_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = slab_debug_trace_release, +}; + +static void debugfs_slab_add(struct kmem_cache *s) +{ + struct dentry *slab_cache_dir; + + if (unlikely(!slab_debugfs_root)) + return; + + slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); + + debugfs_create_file("alloc_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); + + debugfs_create_file("free_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); +} + +void debugfs_slab_release(struct kmem_cache *s) +{ + debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root)); +} + +static int __init slab_debugfs_init(void) +{ + struct kmem_cache *s; + + slab_debugfs_root = debugfs_create_dir("slab", NULL); + + list_for_each_entry(s, &slab_caches, list) + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; + +} +__initcall(slab_debugfs_init); +#endif /* * The /proc/slabinfo ABI */ @@ -6058,6 +5929,7 @@ sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } +EXPORT_SYMBOL_GPL(get_slabinfo); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { -- Gitblit v1.6.2
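
For completeness: the alloc_traces and free_traces files created by debugfs_slab_add() above replace the sysfs alloc_calls/free_calls attributes removed earlier in this patch. With a cache that has SLAB_STORE_USER set (for example via slub_debug=U,<cache-name> on the kernel command line), each record aggregates a call site with a count plus the age, pid, cpu and node spread printed by slab_debugfs_show(). The userspace sketch below simply dumps one of these reports; the /sys/kernel/debug mount point and the slub_debug boot parameter are the usual conventions, not something introduced by this patch.

/*
 * Userspace sketch, not part of the patch: dump the allocation-trace
 * report exposed by the new debugfs interface. Assumes debugfs is
 * mounted at /sys/kernel/debug and the cache was created with
 * SLAB_STORE_USER.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256];
	char line[512];
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <cache-name>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/slab/%s/alloc_traces", argv[1]);

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}

	/*
	 * Each line is formatted by slab_debugfs_show():
	 *   <count> <call-site> [age=min/avg/max] [pid=...] [cpus=...] [nodes=...]
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}

Reading free_traces from the same per-cache directory works identically.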